In [2]:
from datetime import date
from pathlib import Path

import pandas as pd

# Read the raw unified dataset from disk into a DataFrame
raw_csv_path = "../data/raw/ethiopia_fi_unified_data.csv"
df = pd.read_csv(raw_csv_path)

# Report (rows, columns), then preview the first few records
print(df.shape)
df.head()


(43, 34)


Unnamed: 0,record_id,record_type,category,pillar,indicator,indicator_code,indicator_direction,value_numeric,value_text,value_type,...,impact_direction,impact_magnitude,impact_estimate,lag_months,evidence_basis,comparable_country,collected_by,collection_date,original_text,notes
0,REC_0001,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,22.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Baseline year,
1,REC_0002,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,35.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
2,REC_0003,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,46.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
3,REC_0004,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,56.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,
4,REC_0005,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,36.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,


In [8]:
def next_record_id(df, prefix="REC_", width=4):
    """
    Return the next free record_id, keeping the prefix and zero-padding.

    Only IDs that start with ``prefix`` are considered; the numeric part of the
    largest such ID is incremented by one.
    Example: highest existing ID REC_0043 → "REC_0044"

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a "record_id" column.
    prefix : str
        Leading text shared by the IDs to continue (default "REC_").
    width : int
        Minimum number of digits; shorter numbers are left-padded with zeros.

    Returns
    -------
    str
        The next ID, or the first ID (e.g. "REC_0001") when the frame is empty
        or contains no parsable ID with this prefix.

    Raises
    ------
    KeyError
        If a non-empty ``df`` has no "record_id" column.
    """
    first_id = f"{prefix}{'1'.zfill(width)}"
    if df.empty:
        return first_id

    # Missing IDs carry no information; cast the rest so the .str accessor is safe.
    ids = df["record_id"].dropna().astype(str)

    # Strip the prefix only where it is actually a prefix, so IDs belonging to
    # another scheme (or bare numbers) cannot influence the sequence.
    suffixes = ids[ids.str.startswith(prefix)].str[len(prefix):]
    numeric_ids = pd.to_numeric(suffixes, errors="coerce").dropna()

    # Without this guard max() is NaN and int(NaN) raises ValueError.
    if numeric_ids.empty:
        return first_id

    next_id_num = int(numeric_ids.max()) + 1
    # Format with leading zeros
    return f"{prefix}{str(next_id_num).zfill(width)}"


In [9]:
# New observation: national smartphone penetration, as reported by GSMA for end-2023.
new_observation_1 = {
    "record_id": next_record_id(df),
    "record_type": "observation",
    "category": None,
    # NOTE(review): the loaded rows preview an upper-case pillar ("ACCESS");
    # confirm that "enabler" is a valid pillar value and casing.
    "pillar": "enabler",
    "indicator": "Smartphone penetration rate",
    "indicator_code": "ENB_SMARTPHONE_PEN",
    # Use the direction vocabulary already present in the dataset ("higher_better")
    # so that filters/group-bys on this column treat old and new rows alike.
    "indicator_direction": "higher_better",
    "value_numeric": 28.0,
    "value_text": None,
    "value_type": "percentage",
    "unit": "%",
    "observation_date": "2023-12-31",
    "period_start": None,
    "period_end": None,
    "fiscal_year": None,
    "gender": "all",
    "location": "national",
    "region": "Ethiopia",
    "source_name": "GSMA",
    "source_type": "industry_report",
    "source_url": "https://www.gsma.com",
    "confidence": "medium",
    "related_indicator": None,
    "relationship_type": None,
    "impact_direction": None,
    "impact_magnitude": None,
    "impact_estimate": None,
    "lag_months": None,
    "evidence_basis": None,
    "comparable_country": None,
    "collected_by": "Helen",
    "collection_date": date.today().isoformat(),
    "original_text": "Smartphone adoption in Ethiopia reached approximately 28% in 2023.",
    "notes": "Smartphone access is a prerequisite for mobile money and digital payments."
}

# Re-running this cell must not append the same observation again under a
# fresh record_id, so only append when no matching row is already present.
observation_1_exists = (
    df["record_type"].eq(new_observation_1["record_type"])
    & df["indicator_code"].eq(new_observation_1["indicator_code"])
    & df["original_text"].eq(new_observation_1["original_text"])
).any()

if not observation_1_exists:
    df = pd.concat([df, pd.DataFrame([new_observation_1])], ignore_index=True)


In [5]:
# Inspect the dtype pandas assigned to the record_id column
df.dtypes["record_id"]


<StringDtype(storage='python', na_value=nan)>

In [10]:
# New observation: national mobile phone penetration, as reported by ITU for end-2023.
new_observation_2 = {
    "record_id": next_record_id(df),
    "record_type": "observation",
    "category": None,
    # NOTE(review): the loaded rows preview an upper-case pillar ("ACCESS");
    # confirm that "enabler" is a valid pillar value and casing.
    "pillar": "enabler",
    "indicator": "Mobile phone penetration rate",
    "indicator_code": "ENB_MOBILE_PEN",
    # Use the direction vocabulary already present in the dataset ("higher_better")
    # so that filters/group-bys on this column treat old and new rows alike.
    "indicator_direction": "higher_better",
    "value_numeric": 55.0,
    "value_text": None,
    "value_type": "percentage",
    "unit": "%",
    "observation_date": "2023-12-31",
    "period_start": None,
    "period_end": None,
    "fiscal_year": None,
    "gender": "all",
    "location": "national",
    "region": "Ethiopia",
    "source_name": "ITU",
    "source_type": "international_org",
    "source_url": "https://www.itu.int",
    "confidence": "medium",
    "related_indicator": None,
    "relationship_type": None,
    "impact_direction": None,
    "impact_magnitude": None,
    "impact_estimate": None,
    "lag_months": None,
    "evidence_basis": None,
    "comparable_country": None,
    "collected_by": "Helen",
    "collection_date": date.today().isoformat(),
    "original_text": "Mobile phone penetration in Ethiopia estimated at ~55%.",
    "notes": "Mobile access enables both banking and mobile money usage."
}

# Re-running this cell must not append the same observation again under a
# fresh record_id, so only append when no matching row is already present.
observation_2_exists = (
    df["record_type"].eq(new_observation_2["record_type"])
    & df["indicator_code"].eq(new_observation_2["indicator_code"])
    & df["original_text"].eq(new_observation_2["original_text"])
).any()

if not observation_2_exists:
    df = pd.concat([df, pd.DataFrame([new_observation_2])], ignore_index=True)


In [11]:
# ID reserved for the event row; kept in its own variable so it can be
# referred to after the row has been appended.
event_record_id = next_record_id(df)

# New event: EthSwitch nationwide interoperable P2P transfers (from 2022-06-01).
new_event = {
    "record_id": event_record_id,
    "record_type": "event",
    "category": "infrastructure",
    "pillar": None,
    "indicator": None,
    "indicator_code": None,
    "indicator_direction": None,
    "value_numeric": None,
    "value_text": None,
    "value_type": None,
    "unit": None,
    "observation_date": None,
    "period_start": "2022-06-01",
    "period_end": None,
    "fiscal_year": None,
    "gender": None,
    "location": "national",
    "region": "Ethiopia",
    "source_name": "EthSwitch",
    "source_type": "operator_report",
    "source_url": "https://ethswitch.com",
    "confidence": "high",
    "related_indicator": None,
    "relationship_type": None,
    "impact_direction": None,
    "impact_magnitude": None,
    "impact_estimate": None,
    "lag_months": None,
    "evidence_basis": None,
    "comparable_country": None,
    "collected_by": "Helen",
    "collection_date": date.today().isoformat(),
    "original_text": "EthSwitch expanded interoperable P2P transfers nationwide.",
    "notes": "Interoperability reduced friction between banks and mobile money platforms."
}

# Re-running this cell must not append the same event again under a fresh
# record_id, so only append when no matching event row is already present.
event_exists = (
    df["record_type"].eq(new_event["record_type"])
    & df["original_text"].eq(new_event["original_text"])
).any()

if not event_exists:
    df = pd.concat([df, pd.DataFrame([new_event])], ignore_index=True)


In [12]:
# New impact link: expected effect of payment interoperability on digital
# payment usage, based on evidence from Kenya.
# NOTE(review): no field of this record references `event_record_id`, so the
# link to the EthSwitch event is only implicit (via the notes text). Confirm
# which column, if any, should carry the event's record_id.
impact_link = {
    "record_id": next_record_id(df),
    "record_type": "impact_link",
    "category": None,
    # NOTE(review): the loaded rows preview an upper-case pillar ("ACCESS");
    # confirm the expected casing for "usage".
    "pillar": "usage",
    "indicator": None,
    "indicator_code": None,
    "indicator_direction": None,
    "value_numeric": None,
    "value_text": None,
    "value_type": None,
    "unit": None,
    "observation_date": None,
    "period_start": None,
    "period_end": None,
    "fiscal_year": None,
    "gender": None,
    "location": None,
    "region": None,
    "source_name": None,
    "source_type": None,
    "source_url": None,
    "confidence": "medium",
    "related_indicator": "USG_DIGITAL_PAYMENT",
    "relationship_type": "causal",
    "impact_direction": "positive",
    "impact_magnitude": "medium",
    "impact_estimate": None,
    "lag_months": 6,
    "evidence_basis": "Observed effects in Kenya after interoperability rollout",
    "comparable_country": "Kenya",
    "collected_by": "Helen",
    "collection_date": date.today().isoformat(),
    "original_text": None,
    "notes": "Interoperability increases transaction convenience and usage frequency."
}

# Re-running this cell must not append the same link again under a fresh
# record_id, so only append when no matching impact_link row is already present.
impact_link_exists = (
    df["record_type"].eq(impact_link["record_type"])
    & df["evidence_basis"].eq(impact_link["evidence_basis"])
    & df["notes"].eq(impact_link["notes"])
).any()

if not impact_link_exists:
    df = pd.concat([df, pd.DataFrame([impact_link])], ignore_index=True)


In [13]:
# Write the enriched dataset next to the raw one, under data/processed.
enriched_csv_path = Path("../data/processed/ethiopia_fi_unified_data_enriched.csv")

# to_csv does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
enriched_csv_path.parent.mkdir(parents=True, exist_ok=True)

df.to_csv(
    enriched_csv_path,
    index=False
)

print("Enriched dataset saved.")


Enriched dataset saved.
