# In [None]:
# Task 1: Data Exploration and Enrichment
import sys
sys.path.append('../src')

from data_loader import DataLoader
from data_enricher import DataEnricher
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw dataset; the loader returns the data plus its reference codes
loader = DataLoader()
raw_data, reference_codes = loader.load_data()

# Structural overview of the raw frame: shape, columns, rows per record type
print("=== RAW DATA INFO ===")
print(f"Shape: {raw_data.shape}")
column_names = raw_data.columns.tolist()
print(f"\nColumns: {column_names}")
record_type_counts = raw_data['record_type'].value_counts()
print(f"\nRecord types: {record_type_counts}")

# Run the loader's own validation and report the verdict as PASS/FAIL
is_valid = loader.validate_data()
validation_label = 'PASS' if is_valid else 'FAIL'
print(f"\nData validation: {validation_label}")

# Print the loader's summary mapping, one "name: entry" line per item
summary = loader.get_data_summary()
print("\n=== DATA SUMMARY ===")
for name, entry in summary.items():
    print(f"{name}: {entry}")

# Show the first two rows of each record type.
# Series.unique() keeps NaN; a missing record_type is a float with no
# .upper() and matches no rows under ==, so missing labels are dropped
# before iterating instead of crashing the loop.
print("\n=== SAMPLE RECORDS ===")
sample_columns = ['indicator', 'value_numeric', 'observation_date']
for record_type in raw_data['record_type'].dropna().unique():
    sample = raw_data[raw_data['record_type'] == record_type].head(2)
    # str() keeps the header printable even if a label is not a string
    print(f"\n{str(record_type).upper()} records:")
    print(sample[sample_columns].to_string())

# Wrap the raw frame in an enricher
enricher = DataEnricher(raw_data)

# Apply every enrichment step, in this fixed order
print("\n=== ENRICHING DATA ===")
enrichment_steps = (
    enricher.add_mobile_money_data,
    enricher.add_infrastructure_data,
    enricher.add_additional_events,
    enricher.add_impact_links,
    enricher.add_economic_indicators,
)
for enrichment_step in enrichment_steps:
    enrichment_step()

# Pull the enriched frame back out and report its size
enriched_data = enricher.get_enriched_data()
enriched_shape = enriched_data.shape
print(f"\nEnriched data shape: {enriched_shape}")

# Summarise the enriched frame: rows per record type, observation rows per
# pillar, and the number of distinct indicators
print("\n=== ENRICHED DATA ANALYSIS ===")
print("Total records by type:")
print(enriched_data['record_type'].value_counts())

print("\nObservations by pillar:")
is_observation = enriched_data['record_type'] == 'observation'
obs_data = enriched_data[is_observation]
print(obs_data['pillar'].value_counts())

print("\nUnique indicators:")
print(enriched_data['indicator'].nunique())

# Persist the enriched data; the enricher returns two values, kept here as
# the data output path and the enrichment log path
output_path, log_path = enricher.save_enriched_data()

# Create summary visualization: a 2x2 grid of overview panels
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
(ax_types, ax_pillar), (ax_conf, ax_time) = axes

# 1. Record type distribution (share of rows per record_type)
record_counts = enriched_data['record_type'].value_counts()
ax_types.pie(record_counts.values, labels=record_counts.index, autopct='%1.1f%%')
ax_types.set_title('Distribution of Record Types')

# 2. Observations by pillar (only observation rows that have a pillar)
pillar_data = enriched_data[
    (enriched_data['record_type'] == 'observation') &
    (enriched_data['pillar'].notna())
]
# Title and label are set unconditionally so the panel is never left as an
# anonymous blank axes when there is nothing to plot.
ax_pillar.set_title('Observations by Pillar')
ax_pillar.set_ylabel('Count')
if pillar_data.empty:
    ax_pillar.text(
        0.5, 0.5, 'No pillar data available',
        ha='center', va='center', transform=ax_pillar.transAxes,
    )
else:
    pillar_counts = pillar_data['pillar'].value_counts()
    ax_pillar.bar(pillar_counts.index, pillar_counts.values)

# 3. Data quality by confidence level (rows per confidence value)
conf_counts = enriched_data['confidence'].value_counts()
ax_conf.bar(conf_counts.index, conf_counts.values)
ax_conf.set_title('Data Quality (Confidence Levels)')
ax_conf.set_ylabel('Count')

# 4. Temporal coverage (rows per calendar year of observation_date)
# NOTE: this adds a 'year' column to enriched_data in place; it is kept
# because code running after this cell may read it.
enriched_data['year'] = pd.to_datetime(enriched_data['observation_date']).dt.year
# value_counts() ignores missing years; sort_index() orders chronologically
year_counts = enriched_data['year'].value_counts().sort_index()
ax_time.plot(year_counts.index, year_counts.values, marker='o')
ax_time.set_title('Temporal Coverage of Data')
ax_time.set_xlabel('Year')
ax_time.set_ylabel('Number of Records')

# Finalise the layout, write the figure to disk, then display it.
# savefig does not create missing parent directories, so the target folder
# is created first to avoid a FileNotFoundError on a fresh checkout.
from pathlib import Path

# NOTE(review): the path is relative to the current working directory, while
# sys.path above uses '../src' -- confirm this resolves to the intended
# reports folder when the notebook is run from its own directory.
figure_path = Path('reports/figures/task1_data_overview.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)

plt.tight_layout()
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Closing recap of what this task produced, printed one line at a time
completion_report = (
    "\n=== TASK 1 COMPLETED ===",
    "1. Data loaded and validated",
    "2. Dataset enriched with additional sources",
    f"3. Enriched data saved to: {output_path}",
    f"4. Enrichment log saved to: {log_path}",
    "5. Visualizations created and saved",
)
for report_line in completion_report:
    print(report_line)