In [2]:
# Task 1: Data Exploration and Enrichment (Complete Version)
from datetime import datetime
from pathlib import Path

import pandas as pd

# ----------------------------
# 1. Load datasets
# ----------------------------
main_file = "../data/raw/ethiopia_fi_unified_data.xlsx"
reference_file = "../data/raw/reference_codes.xlsx"

# Pull both sheets of the main workbook in a single read; passing a list of
# sheet names makes read_excel return a dict keyed by sheet name.
workbook_sheets = pd.read_excel(
    main_file,
    sheet_name=["ethiopia_fi_unified_data", "Impact_sheet"],
)
main_data = workbook_sheets["ethiopia_fi_unified_data"]
impact_links = workbook_sheets["Impact_sheet"]

# The reference workbook is read with the default (first) sheet.
reference_codes = pd.read_excel(reference_file)

# Quick sanity check on what was loaded.
print(f"Main dataset shape: {main_data.shape}")
print(f"Impact links shape: {impact_links.shape}")
print(f"Reference codes shape: {reference_codes.shape}")

# ----------------------------
# 2. Explore existing data
# ----------------------------
# Frequency tables for the main categorical columns.
record_type_counts = main_data['record_type'].value_counts()
pillar_counts = main_data['pillar'].value_counts()
source_type_counts = main_data['source_type'].value_counts()
confidence_counts = main_data['confidence'].value_counts()
print("\nRecords by record_type:\n", record_type_counts)
print("\nRecords by pillar:\n", pillar_counts)
print("\nRecords by source_type:\n", source_type_counts)
print("\nRecords by confidence:\n", confidence_counts)

# Earliest and latest observation_date among rows typed as observations.
is_observation = main_data['record_type'] == 'observation'
obs_dates = main_data.loc[is_observation, 'observation_date']
earliest_obs = obs_dates.min()
latest_obs = obs_dates.max()
print("\nTemporal range of observations:", earliest_obs, "to", latest_obs)

# Distinct indicator codes across every record type.
unique_indicators = main_data['indicator_code'].unique()
print("\nUnique indicators:", unique_indicators)

# Events already in the dataset, reduced to the columns worth eyeballing.
is_event = main_data['record_type'] == 'event'
event_columns = ['indicator_code', 'category', 'observation_date', 'source_name']
events = main_data.loc[is_event, event_columns]
print("\nExisting events and dates:\n", events)

# ----------------------------
# 3. Helper functions for enrichment
# ----------------------------
def _next_record_id(frame, prefix, floor=0):
    """Return the next free ``<prefix>_NNNN`` identifier for *frame*.

    The numeric part is one greater than the largest suffix already used with
    *prefix* in ``frame["record_id"]`` and is never lower than ``floor + 1``.
    A frame that is ``None``, has no ``record_id`` column, or holds no id with
    this prefix yields ``floor + 1``.
    """
    highest = floor
    if frame is not None and "record_id" in frame.columns:
        suffixes = (
            frame["record_id"]
            .astype(str)
            .str.extract(rf"^{prefix}_(\d+)$", expand=False)
            .dropna()
        )
        highest = max([floor, *(int(s) for s in suffixes)])
    return f"{prefix}_{highest + 1:04d}"


def _stamp_record(record, record_id, record_type):
    """Copy *record* and add its id, type and collection metadata."""
    return {
        **record,
        "record_id": record_id,
        "record_type": record_type,
        "collected_by": "Dereje Derib",
        "collection_date": datetime.today().strftime("%Y-%m-%d"),
    }


def create_observation_row(record, existing=None):
    """Build an observation row from *record*.

    Args:
        record: dict of column values; it is copied, not mutated.
        existing: frame whose ``record_id`` column is consulted to pick the
            next id. Defaults to the module-level ``main_data``.

    Returns:
        A new dict with ``record_id`` (``REC_NNNN``, starting at 1001),
        ``record_type``, ``collected_by`` and ``collection_date`` set.
    """
    frame = main_data if existing is None else existing
    return _stamp_record(record, _next_record_id(frame, "REC", floor=1000), "observation")


def create_event_row(record, existing=None):
    """Build an event row from *record*.

    Ids are numbered per prefix rather than from the total row count, so the
    first event added gets ``EVT_1001`` (when no higher EVT id exists)
    regardless of how many observations were appended before it. The impact
    links in this file reference events by exactly those ids.

    Args:
        record: dict of column values; it is copied, not mutated.
        existing: frame whose ``record_id`` column is consulted to pick the
            next id. Defaults to the module-level ``main_data``.

    Returns:
        A new dict with ``record_id`` (``EVT_NNNN``, starting at 1001),
        ``record_type``, ``collected_by`` and ``collection_date`` set.
    """
    frame = main_data if existing is None else existing
    return _stamp_record(record, _next_record_id(frame, "EVT", floor=1000), "event")


def create_impact_link(record, existing=None):
    """Build an impact-link row from *record*.

    Args:
        record: dict of column values; it is copied, not mutated.
        existing: frame whose ``record_id`` column is consulted to pick the
            next id. Defaults to the module-level ``impact_links``.

    Returns:
        A new dict with ``record_id`` (``IMP_NNNN``, continuing after the
        highest existing IMP id), ``record_type``, ``collected_by`` and
        ``collection_date`` set.
    """
    frame = impact_links if existing is None else existing
    return _stamp_record(record, _next_record_id(frame, "IMP"), "impact_link")

# ----------------------------
# 4. Add new observations (10)
# ----------------------------
# Ten hand-entered observation records to append to main_data. Each dict
# supplies the descriptive, value, period and source columns; record_id,
# record_type, collected_by and collection_date are added by
# create_observation_row in the loop below. Fields that do not apply are set
# to "" rather than omitted.
# NOTE(review): "pillar" is title-case here ("Access", "Usage", ...) while the
# impact-link rows later in this file use upper-case codes ("ACCESS", "USAGE",
# "TRUST") — confirm which spelling the dataset expects.
# NOTE(review): some source_type values ("report") may not be part of the
# dataset's existing vocabulary — verify against the reference codes.
new_observations = [
    {"category":"ACCESS","pillar":"Access","indicator":"Adults with Bank Account","indicator_code":"ACC_BANK",
     "indicator_direction":"higher_better","value_numeric":38,"value_text":"38%","value_type":"percentage","unit":"%",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"World Bank Global Findex 2024",
     "source_type":"survey","source_url":"https://www.worldbank.org/en/publication/globalfindex",
     "confidence":"high","related_indicator":"","relationship_type":"","impact_direction":"","impact_magnitude":"",
     "impact_estimate":"","lag_months":"","evidence_basis":"literature","comparable_country":"Kenya",
     "original_text":"Based on Findex methodology","notes":"Baseline adult account ownership in Ethiopia"},
     
    {"category":"USAGE","pillar":"Usage","indicator":"Avg Monthly Mobile Transactions per User","indicator_code":"USG_MM_TXN_AVG",
     "indicator_direction":"higher_better","value_numeric":15,"value_text":"15 txns/user","value_type":"count","unit":"transactions",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"NBE Annual Report 2024",
     "source_type":"operator","source_url":"https://nbe.gov.et","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"empirical",
     "comparable_country":"","original_text":"Average active mobile money user does 15 transactions/month","notes":"Derived from total mobile money transactions / active users"},
     
    {"category":"ACCESS","pillar":"Access","indicator":"Adults with National Digital ID","indicator_code":"ACC_DIGID",
     "indicator_direction":"higher_better","value_numeric":12000000,"value_text":"12M adults","value_type":"count","unit":"people",
     "observation_date":"2025-02-28","period_start":"2025-01-01","period_end":"2025-02-28","fiscal_year":2025,
     "gender":"all","location":"national","region":"","source_name":"National ID Program",
     "source_type":"regulator","source_url":"https://www.id.gov.et","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"empirical",
     "comparable_country":"India","original_text":"Aadhaar-inspired rollout","notes":"Enrollment progress for digital ID program"},
     
    {"category":"AFFORDABILITY","pillar":"Affordability","indicator":"Data Cost (% GNI per capita)","indicator_code":"AFF_DATA_INCOME",
     "indicator_direction":"lower_better","value_numeric":2,"value_text":"2% of GNI","value_type":"percentage","unit":"% of GNI",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"ITU / A4AI 2024",
     "source_type":"research","source_url":"https://a4ai.org","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"literature",
     "comparable_country":"Rwanda","original_text":"Meets UN target for data affordability","notes":"Data costs relative to income"},
     
    {"category":"GENDER","pillar":"Inclusion","indicator":"Female Mobile Money Account Share","indicator_code":"GEN_MM_SHARE",
     "indicator_direction":"higher_better","value_numeric":18,"value_text":"18%","value_type":"percentage","unit":"%",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"female","location":"national","region":"","source_name":"NBE / Shega 2024",
     "source_type":"regulator","source_url":"https://nbe.gov.et","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"literature",
     "comparable_country":"Kenya","original_text":"Gender share of mobile money accounts","notes":"Baseline for gender inclusion"},
     
    {"category":"USAGE","pillar":"Usage","indicator":"Avg Monthly Internet Data Usage per User","indicator_code":"USG_DATA_MB",
     "indicator_direction":"higher_better","value_numeric":750,"value_text":"750 MB","value_type":"numeric","unit":"MB",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"NBE Digital Finance Report 2024",
     "source_type":"report","source_url":"https://nbe.gov.et/reports","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"literature",
     "comparable_country":"Kenya","original_text":"Monthly average mobile internet consumption per user","notes":"Derived from mobile internet usage reports"},
     
    {"category":"ACCESS","pillar":"Access","indicator":"Financial Services Coverage: Rural vs Urban","indicator_code":"ACC_RURAL_URBAN",
     "indicator_direction":"higher_better","value_numeric":65,"value_text":"65%","value_type":"percentage","unit":"%",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"World Bank Findex 2024","source_type":"survey",
     "source_url":"https://www.worldbank.org/en/publication/globalfindex","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"literature",
     "comparable_country":"Uganda","original_text":"Rural population with access to basic financial services","notes":"Comparative access coverage by geography"},
     
    {"category":"USAGE","pillar":"Usage","indicator":"Agent Density per 10,000 Adults","indicator_code":"USG_AGENT_DENSITY",
     "indicator_direction":"higher_better","value_numeric":5,"value_text":"5 agents per 10k","value_type":"numeric","unit":"agents",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"NBE Agent Network Data 2024","source_type":"report",
     "source_url":"https://nbe.gov.et/reports","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"literature",
     "comparable_country":"Ghana","original_text":"Number of registered mobile money agents per 10k adults","notes":"Agent network coverage indicator"},
     
    {"category":"TRUST","pillar":"Trust","indicator":"Customer Satisfaction with Digital Financial Services","indicator_code":"TRS_CUST_SAT",
     "indicator_direction":"higher_better","value_numeric":78,"value_text":"78/100","value_type":"score","unit":"points",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"GSMA Mobile Money Survey 2024","source_type":"survey",
     "source_url":"https://www.gsma.com/mobilemoneysurvey","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"literature",
     "comparable_country":"Kenya","original_text":"Satisfaction index based on customer survey","notes":"Benchmark for service quality"},
     
    {"category":"INFRA","pillar":"Infrastructure","indicator":"Mobile Network Coverage","indicator_code":"INF_MOBILE_NET",
     "indicator_direction":"higher_better","value_numeric":92,"value_text":"92%","value_type":"percentage","unit":"%",
     "observation_date":"2024-12-31","period_start":"2024-01-01","period_end":"2024-12-31","fiscal_year":2024,
     "gender":"all","location":"national","region":"","source_name":"ITU ICT Indicators 2024","source_type":"report",
     "source_url":"https://www.itu.int/en/ITU-D/Statistics/","confidence":"high","related_indicator":"","relationship_type":"",
     "impact_direction":"","impact_magnitude":"","impact_estimate":"","lag_months":"","evidence_basis":"literature",
     "comparable_country":"Kenya","original_text":"National mobile network coverage","notes":"Baseline for connectivity infrastructure"}
]

# Append one row at a time: create_observation_row consults the current
# main_data when it assigns record_id, so each row has to be in the frame
# before the next id is generated.
for obs in new_observations:
    main_data = pd.concat([main_data, pd.DataFrame([create_observation_row(obs)])], ignore_index=True)

# ----------------------------
# 5. Add new events (6)
# ----------------------------
# Six hand-entered event records (product launches, infrastructure, policy).
# Only a subset of columns is supplied; record_id, record_type, collected_by
# and collection_date are added by create_event_row in the loop below, and any
# other main_data column is left empty for these rows by the concat.
# NOTE(review): indicator_code repeats the free-text indicator name instead of
# a short code like the ones used for observations — confirm this is intended.
# NOTE(review): source_type values such as "operator_report" and
# "policy_document" may not be part of the dataset's existing vocabulary —
# verify against the reference codes.
# NOTE(review): the impact links defined later in this file reference events
# as EVT_1001 and EVT_1003..EVT_1006, which looks like this list's order —
# confirm the ids assigned by create_event_row line up with those parent_ids.
new_events = [
    {"category":"product_launch","pillar":"","indicator":"Telebirr national launch","indicator_code":"Telebirr national launch",
     "observation_date":"2021-05-17","source_name":"Ethio Telecom","source_type":"operator_report",
     "source_url":"https://www.ethiotelecom.et/telebirr/","notes":"First nationwide mobile money service","gender":"all","location":"national","region":"Ethiopia"},
     
    {"category":"infrastructure","pillar":"","indicator":"National agent network expansion","indicator_code":"National agent network expansion",
     "observation_date":"2022-01-01","source_name":"National Bank of Ethiopia","source_type":"regulator_report",
     "source_url":"https://www.nbe.gov.et/mobile-and-agent-banking/","notes":"Enables last-mile access","gender":"all","location":"national","region":"Ethiopia"},
     
    {"category":"policy","pillar":"","indicator":"Mobile money interoperability mandate","indicator_code":"Mobile money interoperability mandate",
     "observation_date":"2023-07-01","source_name":"National Bank of Ethiopia","source_type":"policy_document",
     "source_url":"https://www.nbe.gov.et/payment-system/","notes":"Reduced fragmentation","gender":"all","location":"national","region":"Ethiopia"},
     
    {"category":"product_launch","pillar":"","indicator":"M-Pesa Ethiopia commercial launch","indicator_code":"M-Pesa Ethiopia commercial launch",
     "observation_date":"2023-08-01","source_name":"Safaricom Ethiopia","source_type":"operator_report",
     "source_url":"https://www.safaricom.co.et","notes":"Second major MNO wallet","gender":"all","location":"national","region":"Ethiopia"},
     
    {"category":"infrastructure","pillar":"","indicator":"4G network expansion","indicator_code":"4G network expansion",
     "observation_date":"2022-06-01","source_name":"ITU","source_type":"international_report",
     "source_url":"https://www.itu.int/itu-d/reports","notes":"Digital payments enabler","gender":"all","location":"national","region":"Ethiopia"},
     
    {"category":"policy","pillar":"","indicator":"Financial Consumer Protection Directive","indicator_code":"Financial Consumer Protection Directive",
     "observation_date":"2024-03-01","source_name":"National Bank of Ethiopia","source_type":"policy_document",
     "source_url":"https://www.nbe.gov.et","notes":"Trust-building reform","gender":"all","location":"national","region":"Ethiopia"}
]

# Append one row at a time: create_event_row consults the current main_data
# when it assigns record_id, so each event has to be in the frame before the
# next id is generated.
for evt in new_events:
    main_data = pd.concat([main_data, pd.DataFrame([create_event_row(evt)])], ignore_index=True)

# ----------------------------
# 6. Add impact links (6)
# ----------------------------
# Six hand-entered impact links. Each ties an event (parent_id) to the
# indicator it is expected to move (related_indicator) and records the
# direction, magnitude, lag in months and evidence basis of that effect.
# Observation-style fields (indicator, value_*, dates, fiscal_year) are set
# to "" because they do not apply to a link.
# NOTE(review): parent_id values are hard-coded (EVT_1001, EVT_1003..EVT_1006;
# EVT_1002 is not linked) — confirm they match record_ids actually present in
# main_data after the events above have been appended.
new_impact_links = [
    {"parent_id":"EVT_1001","pillar":"ACCESS","related_indicator":"ACC_OWNERSHIP",
     "impact_direction":"increase","impact_magnitude":"medium","lag_months":12,"evidence_basis":"literature",
     "notes":"Mobile money launches increase account ownership","indicator":"","indicator_code":"",
     "value_numeric":"","value_text":"","value_type":"","unit":"","observation_date":"","period_start":"","period_end":"",
     "fiscal_year":"","gender":"all","location":"national","region":"Ethiopia","source_name":"World Bank","source_type":"literature",
     "source_url":"https://www.worldbank.org/findex","confidence":"medium","original_text":"Mobile money launches increase account ownership"},
     
    {"parent_id":"EVT_1001","pillar":"USAGE","related_indicator":"USG_DIGITAL_PAYMENT",
     "impact_direction":"increase","impact_magnitude":"high","lag_months":12,"evidence_basis":"industry_report",
     "notes":"P2P payments dominate early mobile money usage","indicator":"","indicator_code":"",
     "value_numeric":"","value_text":"","value_type":"","unit":"","observation_date":"","period_start":"","period_end":"",
     "fiscal_year":"","gender":"all","location":"national","region":"Ethiopia","source_name":"GSMA","source_type":"industry_report",
     "source_url":"https://www.gsma.com/mobilemoney","confidence":"high","original_text":"P2P payments dominate early mobile money usage"},
     
    {"parent_id":"EVT_1003","pillar":"USAGE","related_indicator":"USG_DIGITAL_PAYMENT",
     "impact_direction":"increase","impact_magnitude":"medium","lag_months":9,"evidence_basis":"literature",
     "notes":"Interoperability increases transaction frequency","indicator":"","indicator_code":"",
     "value_numeric":"","value_text":"","value_type":"","unit":"","observation_date":"","period_start":"","period_end":"",
     "fiscal_year":"","gender":"all","location":"national","region":"Ethiopia","source_name":"World Bank","source_type":"literature",
     "source_url":"https://openknowledge.worldbank.org","confidence":"medium","original_text":"Interoperability increases transaction frequency"},
     
    {"parent_id":"EVT_1004","pillar":"ACCESS","related_indicator":"ACC_MM_ACCOUNT",
     "impact_direction":"increase","impact_magnitude":"medium","lag_months":12,"evidence_basis":"operator_report",
     "notes":"Competition accelerates customer acquisition","indicator":"","indicator_code":"",
     "value_numeric":"","value_text":"","value_type":"","unit":"","observation_date":"","period_start":"","period_end":"",
     "fiscal_year":"","gender":"all","location":"national","region":"Ethiopia","source_name":"Safaricom","source_type":"operator_report",
     "source_url":"https://www.safaricom.co.et","confidence":"medium","original_text":"Competition accelerates customer acquisition"},
     
    {"parent_id":"EVT_1005","pillar":"USAGE","related_indicator":"USG_DIGITAL_PAYMENT",
     "impact_direction":"increase","impact_magnitude":"low","lag_months":24,"evidence_basis":"literature",
     "notes":"Broadband enables app-based payments","indicator":"","indicator_code":"",
     "value_numeric":"","value_text":"","value_type":"","unit":"","observation_date":"","period_start":"","period_end":"",
     "fiscal_year":"","gender":"all","location":"national","region":"Ethiopia","source_name":"ITU","source_type":"literature",
     "source_url":"https://www.itu.int","confidence":"low","original_text":"Broadband enables app-based payments"},
     
    {"parent_id":"EVT_1006","pillar":"TRUST","related_indicator":"TRS_CUST_SAT",
     "impact_direction":"increase","impact_magnitude":"medium","lag_months":6,"evidence_basis":"policy_analysis",
     "notes":"Consumer protection improves trust and adoption","indicator":"","indicator_code":"",
     "value_numeric":"","value_text":"","value_type":"","unit":"","observation_date":"","period_start":"","period_end":"",
     "fiscal_year":"","gender":"all","location":"national","region":"Ethiopia","source_name":"NBE","source_type":"policy_document",
     "source_url":"https://www.nbe.gov.et","confidence":"medium","original_text":"Consumer protection improves trust and adoption"}
]

# Append one row at a time: create_impact_link consults the current
# impact_links when it assigns record_id, so each link has to be in the frame
# before the next id is generated.
for imp in new_impact_links:
    impact_links = pd.concat([impact_links, pd.DataFrame([create_impact_link(imp)])], ignore_index=True)

# ----------------------------
# 7. Save enriched datasets
# ----------------------------
# DataFrame.to_csv does not create missing directories, so make sure the
# output folder exists before writing (a fresh checkout may not have it).
processed_dir = Path("../data/processed")
processed_dir.mkdir(parents=True, exist_ok=True)
main_data.to_csv(processed_dir / "enriched_fi_data.csv", index=False)
impact_links.to_csv(processed_dir / "enriched_impact_links.csv", index=False)

# ----------------------------
# 8. Update enrichment log
# ----------------------------
# Append (never overwrite) a dated summary of this run to the log, which is
# resolved relative to the current working directory. The encoding is pinned
# so the file is written the same way on every platform.
log_entry = [
    f"\n## Task 1 Enrichment - {datetime.today().strftime('%Y-%m-%d')}\n",
    f"- Added {len(new_observations)} new observations\n",
    f"- Added {len(new_events)} new events\n",
    f"- Added {len(new_impact_links)} new impact links\n",
]
with open("data_enrichment_log.md", "a", encoding="utf-8") as f:
    f.writelines(log_entry)

print("\nTask 1 enrichment complete!")
print("Enriched datasets saved to data/processed/")
print("Data enrichment log updated.")


Main dataset shape: (43, 34)
Impact links shape: (14, 35)
Reference codes shape: (71, 4)

Records by record_type:
 record_type
observation    30
event          10
target          3
Name: count, dtype: int64

Records by pillar:
 pillar
ACCESS           16
USAGE            11
GENDER            5
AFFORDABILITY     1
Name: count, dtype: int64

Records by source_type:
 source_type
operator      15
survey        10
regulator      7
research       4
policy         3
calculated     2
news           2
Name: count, dtype: int64

Records by confidence:
 confidence
high      40
medium     3
Name: count, dtype: int64

Temporal range of observations: 2014-12-31 00:00:00 to 2025-12-31 00:00:00

Unique indicators: <StringArray>
[     'ACC_OWNERSHIP',     'ACC_MM_ACCOUNT',         'ACC_4G_COV',
     'ACC_MOBILE_PEN',          'ACC_FAYDA',      'USG_P2P_COUNT',
      'USG_P2P_VALUE',      'USG_ATM_COUNT',      'USG_ATM_VALUE',
      'USG_CROSSOVER', 'USG_TELEBIRR_USERS', 'USG_TELEBIRR_VALUE',
    'USG_M