In [1]:
import pandas as pd
import numpy as np
import os


In [2]:
# --- Step 1: Master list of indicators taken from the PDF ---
# Every indicator the report has to account for is declared here. The titles
# are collected in a set literal (so accidental repeats collapse) and then
# sorted, which gives the report a stable alphabetical order.
print("--- Step 1: Defining Master List of PDF Indicators ---")
pdf_indicators_list = sorted({
    "% of municipal WASH events with Haitian Rotarian participation",
    "% of water point committees (CPEs) and Professional Operators created which are functional",
    "% of intervention communities that achieved ODF status using the CLTS approach",
    "% of population in intervention communities paying for water service",
    "% of municipal WASH interventions implemented in alignment with the Commune Action Plan",
    "% of the intervention communes with functional WASH municipal committee",
    "% of service providers monitored according to DINEPA/OREPA guidelines as accepted by the Mayor's Office",
    "% of customers satisfied with the quality, affordability and reliability of the WASH services provided",
    "% of communes which have held an annual WASH service provider review with key stakeholders in the last year",
    # The next two titles are intentionally separate entries; an earlier
    # version of this list had them merged into a single string.
    "% of population in intervention communes with access to basic drinking water service",
    "% of population in intervention communes with access to safely managed drinking water service",
    "% of intervention water points and water systems which are functional, potable, and have at least a balanced budget after 2 years",
    "% of population in intervention communes with at least basic sanitation service",
    "% of population in intervention communes with at least basic hygiene service",
    "% of schools with at least basic drinking water, sanitation, and hygiene services",
    "% of healthcare facilities with at least basic drinking water, sanitation, and hygiene services",
    "# of partnerships made in alignment with HANWASH core values",
    "Cumulative amount of money committed in alignment with HANWASH Core Values",
})
print(f"Master list contains {len(pdf_indicators_list)} unique indicators to check.")

--- Step 1: Defining Master List of PDF Indicators ---
Master list contains 18 unique indicators to check.


In [3]:
# --- Step 1b: Known wording differences between the PDF and the source data ---
# Keys are indicator titles as they are written in the PDF list; values are the
# alternative wording to look for in the source data when the PDF wording
# itself is not present there.
known_variations_map = {
    "% of municipal WASH events with Haitian Rotarian participation":
        "% of commune WASH events with Rotarian participation",

    "% of municipal WASH interventions implemented in alignment with the Commune Action Plan":
        "% of approved interventions implemented in alignment with Commune Action Plans",

    "% of service providers monitored according to DINEPA/OREPA guidelines as accepted by the Mayor's Office":
        "% of service providers under HANWASH initiative monitored according to national guidelines",

    # PDF wording says 'access to'; the data wording says 'at least'.
    "% of population in intervention communes with access to basic drinking water service":
        "% of population in intervention communes with at least basic drinking water service",
}


In [4]:
# --- Step 2: Load the Merged Data (Our Source of Truth) ---
# Reads the merged indicator workbook into `df`, the frame every later step
# works on. The path is relative to the current working directory.
print("\n--- Step 2: Loading Data ---")
input_file = './output/merged_indicators.xlsx'
try:
    # Keep the try body to the read itself so only a missing file is handled here.
    df = pd.read_excel(input_file)
except FileNotFoundError:
    print(f"Error: The file '{input_file}' was not found. Please ensure it exists relative to the current working directory.")
    # Re-raise instead of calling exit(): exit() is an interactive-shell helper
    # that ends a script with a success status and shuts down a notebook
    # kernel. Propagating the exception stops execution with a clear traceback.
    raise
else:
    print(f"Successfully loaded '{input_file}'.")
    # Force object dtype so the column can hold strings alongside missing values.
    df['Indicator Type'] = df['Indicator Type'].astype(object)




--- Step 2: Loading Data ---
Successfully loaded './output/merged_indicators.xlsx'.


In [5]:
# Distinct, non-missing indicator names present in the loaded data, held in a
# set so the membership checks in the matching step are cheap.
source_indicators = set(df.loc[df['Indicator'].notna(), 'Indicator'])

In [6]:
# --- Step 3: Create the Matching Imbalances Report ---
# For every PDF indicator, record whether it exists in the source data and
# under which wording. Matched names are also collected for classification.
print("\n--- Step 3: Generating 'matching_imbalances.xlsx' report ---")
imbalance_report_data = []
evaluation_indicators_found = []

for pdf_title in pdf_indicators_list:
    # Wordings to try, in priority order: the exact PDF title first, then the
    # registered alternative wording when one exists for this title.
    candidate_names = [pdf_title]
    if pdf_title in known_variations_map:
        candidate_names.append(known_variations_map[pdf_title])

    # First wording that is present in the data; None when nothing matches.
    matched_name = next(
        (name for name in candidate_names if name in source_indicators),
        None,
    )
    if matched_name is not None:
        evaluation_indicators_found.append(matched_name)

    imbalance_report_data.append({
        'pdf_presentation_indicator': pdf_title,
        'is_found': "Yes" if matched_name is not None else "No",
        'indicator_found': matched_name,
    })

report_df = pd.DataFrame(imbalance_report_data)
print("Imbalance report created.")



--- Step 3: Generating 'matching_imbalances.xlsx' report ---
Imbalance report created.


In [7]:
# --- Step 4: Save Both Spreadsheets ---
# Writes the matching-imbalances report into the output directory, creating
# that directory first when it does not exist yet.
print("\n--- Step 4: Saving Files ---")
output_dir = 'output'
# exist_ok=True creates the directory only when needed, without a separate
# existence check that could race with another process creating it.
os.makedirs(output_dir, exist_ok=True)

# Save the matching imbalances report
report_filename = os.path.join(output_dir, 'matching_imbalances.xlsx')
report_df.to_excel(report_filename, index=False)
print(f"Imbalance report saved to '{report_filename}'")


--- Step 4: Saving Files ---
Imbalance report saved to 'output/matching_imbalances.xlsx'


In [8]:
# --- Step 5: Apply Classification Logic using ONLY the found indicators ---
print("\n--- Step 5: Classifying Indicators in the main spreadsheet ---")
# De-duplicate the matched names and put them in a stable order.
evaluation_indicators = sorted(set(evaluation_indicators_found))
print(f"Using {len(evaluation_indicators)} found indicators for classification.")

# Row masks: rows whose indicator was matched to the PDF list, and rows that
# carry any indicator name at all.
is_evaluation = df['Indicator'].isin(evaluation_indicators)
has_indicator = df['Indicator'].notna()

conditions = [
    is_evaluation,
    ~is_evaluation & has_indicator,
]
outcomes = ['Evaluation Indicator', 'Monitoring Indicator']
# Rows satisfying neither condition (no indicator name) receive None; a None
# default keeps the result free of mixed string/float values.
df['Indicator Type'] = np.select(conditions, outcomes, default=None)
print("Classification complete.")



--- Step 5: Classifying Indicators in the main spreadsheet ---
Using 7 found indicators for classification.
Classification complete.


In [9]:
# Write the classified main table to the output directory (no index column).
output_filename = os.path.join(output_dir, 'classified_indicators.xlsx')
df.to_excel(excel_writer=output_filename, index=False)
print(f"Updated main table saved to '{output_filename}'")


Updated main table saved to 'output/classified_indicators.xlsx'


In [10]:
# --- Verification ---
# Tally how many rows fall under each classification; dropna=False keeps the
# unclassified (missing) rows visible in the tally.
print("\n--- Verification of Main Table ---")
print("Value counts for 'Indicator Type':")
type_counts = df['Indicator Type'].value_counts(dropna=False)
print(type_counts)


--- Verification of Main Table ---
Value counts for 'Indicator Type':
Indicator Type
Monitoring Indicator    48
Evaluation Indicator     7
None                     4
Name: count, dtype: int64
