In [None]:
# Task 2: Exploratory Data Analysis
import sys
sys.path.append('../src')

from eda_analyzer import EDAAnalyzer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from IPython.display import display, HTML

# Build the analyzer (the banner suggests its constructor loads the enriched
# dataset -- confirm in eda_analyzer) and report how many rows each of its
# tables holds.
print("Loading enriched data...")
analyzer = EDAAnalyzer()

print("\n=== DATASET OVERVIEW ===")
# (line template, analyzer attribute) pairs, printed in this order.
overview_rows = (
    ("Total records: {}", "data"),
    ("Observations: {}", "observations"),
    ("Events: {}", "events"),
    ("Impact links: {}", "impact_links"),
)
for template, attr in overview_rows:
    # Look the attribute up lazily so each line prints before the next lookup.
    print(template.format(len(getattr(analyzer, attr))))

# 1. Access Analysis
# Print the account-ownership table returned by the analyzer, followed by the
# headline figures from the accompanying insights mapping.
print("\n=== ACCESS ANALYSIS ===")
access_data, access_insights = analyzer.analyze_access_trends()

print("Account ownership trajectory:")
trajectory_columns = ['observation_date', 'value_numeric', 'growth_pp']
trajectory_table = access_data[trajectory_columns]
print(trajectory_table.to_string())

print("\nKey Insights:")
current_rate = access_insights['current_rate']
print(f"- Current rate: {current_rate}%")
latest_growth = access_insights['latest_growth']
print(f"- Latest growth: {latest_growth} percentage points")
avg_annual_growth = access_insights['avg_annual_growth']
print(f"- Average annual growth: {avg_annual_growth:.1f} pp/year")
# The slowdown flag is shown as Yes/No based on its truthiness.
if access_insights['slowdown_2024']:
    slowdown_label = 'Yes'
else:
    slowdown_label = 'No'
print(f"- 2024 slowdown: {slowdown_label}")

# 2. Usage Analysis
# Print the most recent value of every digital-payment indicator and, when the
# analyzer returned mobile-money rows, the mobile-money series as well.
print("\n=== USAGE ANALYSIS ===")
payment_data, mm_data, usage_insights = analyzer.analyze_usage_trends()
print("Digital payment indicators:")

# "Latest" must mean the most recent observation, not whichever row happens to
# come last. Order chronologically when the date column is available; the
# stable sort keeps the existing order for rows that share a date.
if 'observation_date' in payment_data.columns:
    payment_by_date = payment_data.sort_values('observation_date', kind='stable')
else:
    # NOTE(review): no date column to order by -- falls back to row order.
    payment_by_date = payment_data

# Indicators are listed in their order of first appearance in the original
# frame, so the printed order does not change.
unique_indicators = payment_data['indicator'].unique()
for indicator in unique_indicators:
    indicator_rows = payment_by_date[payment_by_date['indicator'] == indicator]
    latest = indicator_rows['value_numeric'].iloc[-1]
    print(f"- {indicator}: {latest}%")

# The mobile-money result may be None or empty; skip the section in that case.
if mm_data is not None and len(mm_data) > 0:
    print("\nMobile money growth:")
    print(mm_data[['observation_date', 'value_numeric']].to_string())
    print(f"- Estimated active users: {usage_insights['registered_active_gap']:.1f}% gap")

# 3. Infrastructure Analysis
# Summarise each infrastructure indicator (latest value and mean) and print
# the growth figures reported by the analyzer.
print("\n=== INFRASTRUCTURE ANALYSIS ===")
infra_data, infra_insights = analyzer.analyze_infrastructure_correlations()
print("Infrastructure indicators:")

# The 'last' aggregation takes the final non-null row of each group, so the
# rows must be in chronological order for it to be the latest observation.
# The stable sort keeps the existing order for rows that share a date.
if 'observation_date' in infra_data.columns:
    infra_by_date = infra_data.sort_values('observation_date', kind='stable')
else:
    # NOTE(review): no date column to order by -- falls back to row order.
    infra_by_date = infra_data

infra_summary = infra_by_date.groupby('indicator')['value_numeric'].agg(['last', 'mean'])
print(infra_summary.to_string())
print(f"\nAgent network growth: {infra_insights['agent_density_trend']:.1f}% annually")
print(f"4G coverage growth: {infra_insights['coverage_growth']:.1f}% annually")

# 4. Gender Gap Analysis
# Dump the full gender-gap table from the analyzer, then the three headline
# figures taken from its insights mapping.
print("\n=== GENDER GAP ANALYSIS ===")
gender_data, gender_insights = analyzer.analyze_gender_gap()

print("Gender gap analysis:")
gender_table_text = gender_data.to_string()
print(gender_table_text)

print("\nKey findings:")
current_gender_gap = gender_insights['current_gap']
print(f"- Current gender gap: {current_gender_gap} percentage points")
gender_gap_trend = gender_insights['gap_trend']
print(f"- Gap trend: {gender_gap_trend}")
female_growth = gender_insights['female_growth']
print(f"- Female growth rate: {female_growth:.1f}% annually")

# 5. Urban-Rural Analysis
# Dump the urban/rural table from the analyzer, then the gap, rural growth and
# trend values from its insights mapping.
print("\n=== URBAN-RURAL ANALYSIS ===")
urban_rural_data, urban_insights = analyzer.analyze_urban_rural_gap()

print("Urban-rural divide:")
urban_rural_table_text = urban_rural_data.to_string()
print(urban_rural_table_text)

print("\nKey findings:")
current_urban_rural_gap = urban_insights['current_gap']
print(f"- Current urban-rural gap: {current_urban_rural_gap} percentage points")
rural_growth = urban_insights['rural_growth']
print(f"- Rural growth rate: {rural_growth:.1f}% annually")
urban_rural_trend = urban_insights['gap_trend']
print(f"- Gap trend: {urban_rural_trend}")

# 6. Event Timeline Analysis
# List every catalogued event as "- <name> (<year>)".
print("\n=== EVENT TIMELINE ===")
timeline_data = analyzer.create_event_timeline()
print("Key events cataloged:")
for event in timeline_data['events']:
    event_name = event['name']
    # Only the calendar year of the event date is shown.
    event_year = event['event_date'].year
    print(f"- {event_name} ({event_year})")

# 7. Generate Comprehensive Insights
# Fetch the analyzer's consolidated insights and print them as a numbered
# report, one topic per section.
print("\n=== COMPREHENSIVE INSIGHTS ===")
insights = analyzer.generate_key_insights()

# Section 1: current access rate and the listed reasons for the 2024 slowdown.
print("\n1. ACCESS DRIVERS AND CHALLENGES:")
access_summary = insights['access']
print(f"   Current rate: {access_summary['current_rate']}%")
print("   2024 slowdown explanation:")
for reason in access_summary['explanation_2024_slowdown']:
    print(f"   - {reason}")

# Section 2: usage growth figures and per-indicator payment adoption.
print("\n2. USAGE PATTERNS:")
usage_summary = insights['usage']
print(f"   Mobile money growth: {usage_summary['mobile_money_growth']:.1f}% annually")
print(f"   Registered-active gap: {usage_summary['registered_active_gap']:.1f}%")
print("   Payment adoption levels:")
for indicator, value in usage_summary['payment_adoption'].items():
    print(f"   - {indicator}: {value}%")

# Section 3: infrastructure growth and its reported correlation with access.
print("\n3. INFRASTRUCTURE CORRELATIONS:")
infrastructure_summary = insights['infrastructure']
print(f"   Agent density growth: {infrastructure_summary['agent_density_growth']:.1f}%")
print(f"   Correlation with access: {infrastructure_summary['correlation_with_access']}")

# Section 4: each gap is printed with its trend label in parentheses.
print("\n4. DISPARITIES:")
disparities = insights['disparities']
print(f"   Gender gap: {disparities['gender_gap']} pp ({disparities['gender_gap_trend']})")
print(f"   Urban-rural gap: {disparities['urban_rural_gap']} pp ({disparities['urban_rural_trend']})")

# Sections 5 and 6: bullet lists.
print("\n5. MARKET NUANCES:")
for nuance in insights['market_nuances']:
    print(f"   • {nuance}")

print("\n6. DATA LIMITATIONS:")
for limitation in insights['data_limitations']:
    print(f"   • {limitation}")

# Section 7 of the report: the fixed list of hypotheses, printed one per line
# with a three-space indent.
print("\n7. KEY HYPOTHESES FOR TESTING:")
hypotheses = [
    "H1: Mobile money growth drives account ownership but with diminishing returns",
    "H2: Infrastructure investments have 12-18 month lagged effects on inclusion",
    "H3: Interoperability increases usage more than access",
    "H4: Gender gap persists despite overall growth due to structural barriers",
    "H5: Economic factors (inflation, GDP) moderate inclusion growth",
]
# One print call; sep="\n" puts each indented hypothesis on its own line.
print(*[f"   {text}" for text in hypotheses], sep="\n")

# Create visualizations
# Ask the analyzer to produce its charts, then list each name/path pair it
# returned.
print("\n=== CREATING VISUALIZATIONS ===")
viz_files = analyzer.create_visualizations()
print("Visualizations created:")
for name, path in viz_files.items():
    if not path:
        # Entries with a falsy path (None, empty string) are not listed.
        continue
    print(f"- {name}: {path}")

# Create correlation matrix
# Pivot the observations into one column per indicator code, average them per
# calendar year, correlate the yearly series and save the result as a heatmap.
print("\n=== CORRELATION ANALYSIS ===")
from pathlib import Path  # local import so this section runs on its own

# Prepare data for correlation analysis: one row per observation date, one
# column per indicator code.
numeric_data = analyzer.observations.pivot_table(
    index='observation_date',
    columns='indicator_code',
    values='value_numeric'
).reset_index()

# Calculate year for aggregation
numeric_data['year'] = numeric_data['observation_date'].dt.year
# Drop the raw timestamp before averaging. Otherwise it is carried through
# mean() into corr(), where it is not a meaningful indicator and can make
# corr() fail on pandas versions that no longer drop non-numeric columns.
annual_data = numeric_data.drop(columns='observation_date').groupby('year').mean()

# Calculate correlations
correlation_matrix = annual_data.corr()

# Plot correlation heatmap
plt.figure(figsize=(12, 8))
# Hide the upper triangle (diagonal included); it mirrors the lower one.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt='.2f',
            cmap='coolwarm', center=0, square=True)
plt.title('Correlation Matrix of Financial Inclusion Indicators')
plt.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
figure_path = Path('reports/figures/correlation_matrix.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300)
plt.show()

# Closing banner: the fixed checklist of what this task produced, printed one
# line at a time.
completion_report = (
    "\n=== TASK 2 COMPLETED ===",
    "1. Comprehensive EDA performed",
    "2. Key insights generated and documented",
    "3. Visualizations created for all analysis dimensions",
    "4. Hypotheses formulated for impact modeling",
    "5. Data limitations documented",
)
for report_line in completion_report:
    print(report_line)