In [9]:
# Task 1: Data Exploration and Enrichment (All-in-One Cell)

import sys, os
sys.path.append(os.path.abspath(".."))  # Allow importing src modules

import pandas as pd
from datetime import datetime
from src.data_loader import load_raw_data, load_reference_codes
from src.preprocessing import clean_fi_data

# -----------------------------
# 1. Load Data (via src/data_loader.py)
# -----------------------------
fi_df, impact_df = load_raw_data("../data/raw/ethiopia_fi_unified_data.xlsx")
ref_df = load_reference_codes("../data/raw/reference_codes.xlsx")

print("Main dataset shape:", fi_df.shape)
print("Impact links shape:", impact_df.shape)
print("Reference codes shape:", ref_df.shape)

# -----------------------------
# 2. Clean / preprocess
# -----------------------------
fi_df = clean_fi_data(fi_df)

# -----------------------------
# 3. Explore Schema
# -----------------------------
print("\nRecords by record_type:")
print(fi_df['record_type'].value_counts())

print("\nRecords by pillar:")
print(fi_df['pillar'].value_counts())

print("\nRecords by source_type:")
if 'source_type' in fi_df.columns:
    print(fi_df['source_type'].value_counts())

print("\nRecords by confidence:")
if 'confidence' in fi_df.columns:
    print(fi_df['confidence'].value_counts())

print("\nTemporal range of observations:")
print(fi_df['observation_date'].min(), "to", fi_df['observation_date'].max())

print("\nUnique indicators:")
print(fi_df['indicator_code'].unique())

print("\nExisting events and dates:")
events_df = fi_df[fi_df['record_type']=='event']
print(events_df[['indicator_code','category','observation_date','source_name']])

print("\nImpact links overview:")
print(impact_df.head())

# -----------------------------
# 4. Enrich Dataset (example additions)
# -----------------------------
# Example: new observation
new_obs = {
    'record_type': 'observation',
    'pillar': 'Access',
    'indicator': 'Account Ownership',
    'indicator_code': 'ACC_OWNERSHIP',
    'value_numeric': 51.0,
    'observation_date': pd.Timestamp('2025-01-01'),
    'source_name': 'Findex Microdata Projection',
    'source_url': 'https://example.com/findex2025',
    'confidence': 'medium'
}

# Example: new event
new_event = {
    'record_type': 'event',
    'category': 'policy',
    'pillar': '',  # leave empty
    'indicator': 'ACC_OWNERSHIP',
    'indicator_code': 'ACC_OWNERSHIP',
    'value_numeric': None,
    'observation_date': pd.Timestamp('2025-06-01'),
    'source_name': 'New National Financial Policy',
    'source_url': 'https://example.com/nfip2025',
    'confidence': 'high'
}

# Example: new impact_link
new_impact = {
    'parent_id': 'EVT-2025-01',  # reference to the new event
    'pillar': 'Access',
    'related_indicator': 'ACC_OWNERSHIP',
    'impact_direction': 'positive',
    'impact_magnitude': 2.0,  # estimated pp impact
    'lag_months': 6,
    'evidence_basis': 'Comparable countries evidence + expert opinion'
}

# Append new records
fi_df = pd.concat([fi_df, pd.DataFrame([new_obs, new_event])], ignore_index=True)
impact_df = pd.concat([impact_df, pd.DataFrame([new_impact])], ignore_index=True)

# -----------------------------
# 5. Document Additions in data_enrichment_log.md
# -----------------------------
log_path = "../data_enrichment_log.md"
with open(log_path, "w") as f:
    f.write("# Task 1 Data Enrichment Log\n")
    f.write(f"Date: {datetime.today().date()}\n\n")
    f.write("## New Observation\n")
    f.write(str(new_obs) + "\n\n")
    f.write("## New Event\n")
    f.write(str(new_event) + "\n\n")
    f.write("## New Impact Link\n")
    f.write(str(new_impact) + "\n\n")
    f.write("Notes: Examples added for Task 1 demonstration.\n")

print(f"\nData enrichment log saved to {log_path}")

# -----------------------------
# 6. Save final enriched dataset
# -----------------------------
fi_df.to_csv("../data/processed/enriched_fi_data.csv", index=False)
impact_df.to_csv("../data/processed/enriched_impact_links.csv", index=False)
print("\nEnriched datasets saved to data/processed/")


Main dataset shape: (43, 34)
Impact links shape: (14, 35)
Reference codes shape: (71, 4)

Records by record_type:
record_type
observation    30
event          10
target          3
Name: count, dtype: int64

Records by pillar:
pillar
ACCESS           16
USAGE            11
unknown          10
GENDER            5
AFFORDABILITY     1
Name: count, dtype: int64

Records by source_type:
source_type
operator      15
survey        10
regulator      7
research       4
policy         3
calculated     2
news           2
Name: count, dtype: int64

Records by confidence:
confidence
high      40
medium     3
Name: count, dtype: int64

Temporal range of observations:
2014-12-31 00:00:00 to 2030-12-31 00:00:00

Unique indicators:
<StringArray>
[     'ACC_OWNERSHIP',     'ACC_MM_ACCOUNT',         'ACC_4G_COV',
     'ACC_MOBILE_PEN',          'ACC_FAYDA',      'USG_P2P_COUNT',
      'USG_P2P_VALUE',      'USG_ATM_COUNT',      'USG_ATM_VALUE',
      'USG_CROSSOVER', 'USG_TELEBIRR_USERS', 'USG_TELEBIRR_VA