In [None]:
import pandas as pd
import numpy as np

PATH_RAW = '../data/raw/ethiopia_fi_unified_data - ethiopia_fi_unified_data.csv'
PATH_IMPACT = '../data/raw/ethiopia_fi_unified_data - Impact_sheet.csv'

# Load and merge to create the "Unified Dataset"
df_hist = pd.read_csv(PATH_RAW)
df_imp = pd.read_csv(PATH_IMPACT)
df = pd.concat([df_hist, df_imp], ignore_index=True)

# 2025 ENRICHMENT: Adding the leading indicators 
enrichment = [
    {"record_type": "observation", "pillar": "Access", "indicator": "Fayda Digital ID", "indicator_code": "fayda_reg_count", "value_numeric": 33694522, "observation_date": "2026-01-28", "source_name": "NIDP"},
    {"record_type": "observation", "pillar": "Usage", "indicator": "Telebirr Half-Year Value", "indicator_code": "telebirr_trans_val_6m", "value_numeric": 1.9e12, "observation_date": "2026-01-29", "source_name": "Ethio Telecom"},
    {"record_type": "event", "category": "policy", "indicator": "Fuel Digitization", "observation_date": "2024-08-01", "notes": "Mandatory digital wallet use"}
]
df = pd.concat([df, pd.DataFrame(enrichment)], ignore_index=True)
df['observation_date'] = pd.to_datetime(df['observation_date'])
print(f"Task 1 Complete: Total records {len(df)}")

Task 1 Complete: Total records 60


In [None]:
def clean_data(df):
    df['indicator_code'] = df['indicator_code'].str.lower().str.strip()
    
    # Normalize values (0 to 1)
    # We treat 0-100 values as percentages and billions as raw scales
    def normalize(row):
        if row['indicator_code'] in ['account_ownership', 'digital_payment_adoption']:
            return row['value_numeric'] / 100 if row['value_numeric'] > 1 else row['value_numeric']
        return row['value_numeric']

    df['norm_val'] = df.apply(normalize, axis=1)
    return df

df_clean = clean_data(df)
df_clean.to_csv('../data/processed/ethiopia_fi_cleaned.csv', index=False)

In [None]:
import pandas as pd

# We compare the historical 'Usage' growth rate against the 2025 'Proxy'
usage_hist = df[df['indicator_code'] == 'digital_payment_adoption'].sort_values('observation_date')
proxy_2025 = df[df['indicator_code'] == 'telebirr_trans_val_6m'].iloc[-1]['value_numeric']

# Calculate 2024 vs 2025 Growth - 2024 Telebirr approx was 1.1T, 2025 is 2.38T (Full Year)
growth_rate = ((2.38e12 - 1.1e12) / 1.1e12) * 100

print("--- Q1: DATA ENRICHMENT PROOF ---")
print(f"2025 Telebirr Transaction Volume: {proxy_2025/1e12:.2f} Trillion ETB")
print(f"Estimated Year-on-Year Usage Growth: {growth_rate:.1f}%")
print("CONCLUSION: Since transaction value grew by ~116% into 2025, the 2024 Findex")
print("survey (which shows ~30% usage) is 'outdated' as it fails to capture the velocity.")

print("\n" + "-"*30 + "\n")

# Analysis The Rural "Blind Spot"
print("--- Q5: DATA LIMITATIONS (SOURCE ANALYSIS) ---")
sources = df['source_name'].unique()
print(f"Unique Data Sources: {sources}")

# Categorizing sources
urban_centric = ['Ethio Telecom', 'NIDP', 'Telebirr']
is_urban = df['source_name'].isin(urban_centric).sum()
total_records = len(df)

print(f"Total Records: {total_records}")
print(f"Urban-Centric Records (Telecom/ID): {is_urban}")
print(f"Rural-Specific Records: {total_records - is_urban} (Historical Surveys Only)")
print("CONCLUSION: High-confidence 2025 data is 100% sourced from digital platforms")
print("(Telecom/ID), which requires smartphones and urban infrastructure. Rural areas")
print("remain a data 'blind spot' in the current model.")

--- Q1: DATA ENRICHMENT PROOF ---
2025 Telebirr Transaction Volume: 1.90 Trillion ETB
Estimated Year-on-Year Usage Growth: 116.4%
CONCLUSION: Since transaction value grew by ~116% into 2025, the 2024 Findex
survey (which shows ~30% usage) is 'outdated' as it fails to capture the velocity.

------------------------------

--- Q5: DATA LIMITATIONS (SOURCE ANALYSIS) ---
Unique Data Sources: ['Global Findex 2014' 'Global Findex 2017' 'Global Findex 2021'
 'Global Findex 2024' 'Ethio Telecom LEAD Report'
 'DataReportal Digital 2026' 'Fayda Official' 'World Bank'
 'ID4Africa Conference' 'EthSwitch Annual Report' 'Calculated'
 'Ethio Telecom Report' 'Safaricom Results' 'A4AI/ITU' 'NBE/Shega'
 'GSMA Gender Gap Report' 'NFIS-II Strategy' 'Fayda/NIDP' 'NBE'
 'Ethio Telecom' 'News' 'Safaricom' 'NIDP' 'EthSwitch' 'NBE/EthSwitch' nan]
Total Records: 60
Urban-Centric Records (Telecom/ID): 4
Rural-Specific Records: 56 (Historical Surveys Only)
CONCLUSION: High-confidence 2025 data is 100% sourced fro