In [3]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Configure visualization if needed
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


In [5]:
# Load Starter Dataset
# Read the three starter CSVs from the raw-data directory (the path is
# relative to the notebook's working directory), report the shape of each
# frame, and preview the first rows of the unified data.
data_path = '../data/raw/'

# The files are read in the order listed and bound to the names later cells use.
unified_data, impact_links, reference_codes = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in (
        'ethiopia_fi_unified_data.csv',
        'impact_links.csv',
        'reference_codes.csv',
    )
)

print("Unified Data Shape:", unified_data.shape)
print("Impact Links Shape:", impact_links.shape)
print("Reference Codes Shape:", reference_codes.shape)

# The last expression of the cell is rendered as the cell output.
unified_data.head(n=5)


Unified Data Shape: (43, 34)
Impact Links Shape: (14, 35)
Reference Codes Shape: (71, 4)


Unnamed: 0,record_id,record_type,category,pillar,indicator,indicator_code,indicator_direction,value_numeric,value_text,value_type,...,impact_direction,impact_magnitude,impact_estimate,lag_months,evidence_basis,comparable_country,collected_by,collection_date,original_text,notes
0,REC_0001,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,22.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Baseline year,
1,REC_0002,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,35.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
2,REC_0003,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,46.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,,
3,REC_0004,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,56.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,
4,REC_0005,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,36.0,,percentage,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,


In [7]:
# 1. Explore Data Summary
# Print record counts by type, pillar and source type, the span of
# observation dates, and the distinct indicator codes among observations.
# Reads `unified_data` from the loading cell; nothing is modified here.

# Each section prints its header and its table in one call; sep="\n" puts
# the table on the line after the header.
print("--- Records by Type ---",
      unified_data['record_type'].value_counts(),
      sep="\n")

print("\n--- Records by Pillar ---",
      unified_data['pillar'].value_counts(),
      sep="\n")

print("\n--- Records by Source Type ---",
      unified_data['source_type'].value_counts(),
      sep="\n")

print("\n--- Temporal Range ---")
# observation_date is NOT converted to datetime here, so min/max compare the
# column values exactly as loaded. If those are strings, the ordering is
# lexicographic and only matches chronological order when every value uses
# one consistent ISO-style (YYYY-MM-DD) format -- verify before relying on it.
# No record_type filter is applied, so the range covers every row.
print(
    unified_data['observation_date'].min(),
    "to",
    unified_data['observation_date'].max(),
)

print("\n--- Unique Indicators ---")
# Restrict to observation rows, then list indicator codes in first-seen order.
print(
    unified_data[unified_data['record_type'].eq('observation')]['indicator_code'].unique()
)


--- Records by Type ---
record_type
observation    30
event          10
target          3
Name: count, dtype: int64

--- Records by Pillar ---
pillar
ACCESS           16
USAGE            11
GENDER            5
AFFORDABILITY     1
Name: count, dtype: int64

--- Records by Source Type ---
source_type
operator      15
survey        10
regulator      7
research       4
policy         3
calculated     2
news           2
Name: count, dtype: int64

--- Temporal Range ---
2014-12-31 to 2030-12-31

--- Unique Indicators ---
['ACC_OWNERSHIP' 'ACC_MM_ACCOUNT' 'ACC_4G_COV' 'ACC_MOBILE_PEN'
 'ACC_FAYDA' 'USG_P2P_COUNT' 'USG_P2P_VALUE' 'USG_ATM_COUNT'
 'USG_ATM_VALUE' 'USG_CROSSOVER' 'USG_TELEBIRR_USERS' 'USG_TELEBIRR_VALUE'
 'USG_MPESA_USERS' 'USG_MPESA_ACTIVE' 'USG_ACTIVE_RATE' 'AFF_DATA_INCOME'
 'GEN_GAP_ACC' 'GEN_MM_SHARE' 'GEN_GAP_MOBILE']


In [None]:
# 2. Enrichment Data Loading
# Load the three enrichment tables from the same raw-data directory as the
# starter dataset (A: baselines, B: direct, C: indirect -- labels taken from
# the file names), list their columns, and preview each table.
# Requires `data_path` from the starter-dataset loading cell.
enrich_A = pd.read_csv(os.path.join(data_path, 'enrichment_A_baselines.csv'))
enrich_B = pd.read_csv(os.path.join(data_path, 'enrichment_B_direct.csv'))
enrich_C = pd.read_csv(os.path.join(data_path, 'enrichment_C_indirect.csv'))

print("Enrichment A Columns:", enrich_A.columns.tolist())
print("Enrichment B Columns:", enrich_B.columns.tolist())
print("Enrichment C Columns:", enrich_C.columns.tolist())

# Show first few rows to understand structure.
# All three tables are previewed so that C is inspected the same way as A and B.
display(enrich_A.head(2))
display(enrich_B.head(2))
display(enrich_C.head(2))


# Task 1: Data Exploration and Enrichment

## Objective
Explore the starter dataset (record types, pillars, source types, date range, and indicators) and enrich it with the additional data files found in `data/raw`.

## Data Sources
- Starter Dataset: `data/raw/ethiopia_fi_unified_data.csv`
- Impact Links: `data/raw/impact_links.csv`
- Reference Codes: `data/raw/reference_codes.csv`
- Additional Data Points: `data/raw/Additional Data Points Guide.xlsx - *.csv`
