# Task 1: Data Exploration and Enrichment

In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Read the unified dataset and the reference-code table from the raw data folder
data = pd.read_csv('../data/raw/ethiopia_fi_unified_data.csv')
ref_codes = pd.read_csv('../data/raw/reference_codes.csv')

# Structural overview: dimensions first, then row counts per category
print("Data shape:", data.shape)

for heading, column in (
    ("\nRecord type distribution:", 'record_type'),
    ("\nPillar distribution:", 'pillar'),
):
    print(heading)
    print(data[column].value_counts())

# Parse observation dates in place so min/max compare as dates, then report
# the earliest and latest observation date found in the data
print("\nTemporal range:")
data['observation_date'] = pd.to_datetime(data['observation_date'])
observation_dates = data['observation_date']
print("From:", observation_dates.min())
print("To:", observation_dates.max())

# Enrichment: add new records based on research.
# Each record is built as a dict and then registered in the list for its
# record type. Only records present in these lists are picked up when the
# lists are later combined into a DataFrame.

# Compute the collection date once so every record created in this run
# carries the same value.
collection_date = datetime.now().strftime('%Y-%m-%d')

# Example: mobile cellular subscriptions (World Bank / ITU indicator)
new_obs_1 = {
    'record_type': 'observation',
    'pillar': 'enabler',
    'indicator': 'Mobile Cellular Subscriptions per 100 people',
    'indicator_code': 'EN_MOBILE_SUBS',
    'value_numeric': 44.0,
    'observation_date': '2024-12-31',
    'source_name': 'World Bank, ITU',
    'source_url': 'https://data.worldbank.org/indicator/IT.CEL.SETS.P2',
    'confidence': 'medium',
    'collected_by': 'Analyst Name',
    'collection_date': collection_date,
    'notes': 'Mobile subscription rate as enabler for digital finance',
}

# Example: smartphone penetration (GSMA Mobile Economy Report)
new_obs_2 = {
    'record_type': 'observation',
    'pillar': 'enabler',
    'indicator': 'Smartphone Penetration Rate',
    'indicator_code': 'EN_SMARTPHONE',
    'value_numeric': 25.0,
    'observation_date': '2024-12-31',
    'source_name': 'GSMA Mobile Economy Report',
    'source_url': 'https://www.gsma.com/mobileeconomy/',
    'confidence': 'medium',
    'collected_by': 'Analyst Name',
    'collection_date': collection_date,
    'notes': 'Critical for mobile money adoption',
}

# Register the observations; without this step they are never exported
new_observations = [new_obs_1, new_obs_2]

# Example: EthSwitch national payment switch upgrade
new_event_1 = {
    'record_type': 'event',
    'category': 'infrastructure',
    'event_name': 'EthSwitch National Payment Switch Upgrade',
    'event_date': '2024-06-01',
    'source_name': 'National Bank of Ethiopia',
    'source_url': 'https://nbebank.com/payment-systems/',
    'confidence': 'high',
    'collected_by': 'Analyst Name',
    'collection_date': collection_date,
    'notes': 'Enhanced interoperability expected to boost digital payments',
}

# Register the events
new_events = [new_event_1]

# Example: link smartphone penetration to digital payments
new_link_1 = {
    'record_type': 'impact_link',
    # NOTE(review): this is a placeholder, not a real event identifier;
    # replace it with the id of the relevant event before using the link.
    'parent_id': 'event_id_for_smartphone_growth',
    'pillar': 'usage',
    'related_indicator': 'USG_DIGITAL_PAYMENT',
    'impact_direction': 'positive',
    'impact_magnitude': 0.3,
    'lag_months': 12,
    'evidence_basis': 'GSMA research on correlation between smartphone ownership and digital payment usage',
}

# Register the impact links
new_impact_links = [new_link_1]

from pathlib import Path

# Combine all new records into one DataFrame
new_data = pd.DataFrame(new_observations + new_events + new_impact_links)

# The loaded data holds observation_date as datetimes, while new records carry
# date strings. Parse them before concatenating so the combined column keeps a
# single datetime dtype; records without an observation date become NaT.
# The guard covers the case where no new record supplies the column.
if 'observation_date' in new_data.columns:
    new_data['observation_date'] = pd.to_datetime(new_data['observation_date'])

# Append the new records below the original rows with a fresh 0..n-1 index
enriched_data = pd.concat([data, new_data], ignore_index=True)

# Save enriched data; create the output folder first because to_csv does not
# create missing directories.
output_path = Path('../data/processed/enriched_data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
enriched_data.to_csv(output_path, index=False)

Data shape: (43, 34)

Record type distribution:
record_type
observation    30
event          10
target          3
Name: count, dtype: int64

Pillar distribution:
pillar
ACCESS           16
USAGE            11
GENDER            5
AFFORDABILITY     1
Name: count, dtype: int64

Temporal range:
From: 2014-12-31 00:00:00
To: 2030-12-31 00:00:00
