In [25]:
# =========================================================
# V3 STEP 1: MACRO DATA REFINERY (Footer-Proof Edition) 🏭
# =========================================================
# Goal: Standardize 4 disparate data sources into Annual Time Series.

import io
import os

import numpy as np
import pandas as pd

# 1. SETUP PATHS
# Input and output folders both hang off BASE_DIR.
BASE_DIR = ".."
RAW_PATH, PROCESSED_PATH = (
    os.path.join(BASE_DIR, sub_dir)
    for sub_dir in ("data/raw_macro", "data/processed")
)

# Create the output folder up front so every module can write into it.
os.makedirs(PROCESSED_PATH, exist_ok=True)

print("üöÄ Starting Macro Refinery...")

# --- Helper: Robust Loader ---
def robust_load(filepath, keywords, skip_summary=False):
    """
    Load a CSV whose header row is preceded by free-form metadata lines.

    The file is read once as text. The first line that contains every string
    in ``keywords`` is taken as the header, and the table is parsed from that
    line onward, so the preamble never reaches the CSV parser.

    Args:
        filepath: Path of the CSV file to read.
        keywords: Substrings that must all occur in the header line.
        skip_summary: When True, a line containing "Summary" is never
            accepted as the header, even if it contains every keyword.

    Returns:
        A DataFrame whose columns come from the detected header line, or
        None when no line matches or the file cannot be read or parsed
        (a message is printed in either case).
    """
    file_label = os.path.basename(filepath)
    try:
        # 1. Read the file once. errors='replace' keeps a stray non-UTF-8
        #    byte from aborting the load; 'utf-8-sig' drops a leading BOM.
        with open(filepath, 'r', encoding='utf-8-sig', errors='replace') as f:
            lines = f.readlines()

        # 2. Find Header Row Index (0-based)
        header_row = next(
            (
                i
                for i, line in enumerate(lines)
                # Special check for BoC summary block
                if not (skip_summary and "Summary" in line)
                # Check if all keywords are in the line
                and all(k in line for k in keywords)
            ),
            -1,
        )

        if header_row == -1:
            print(f"   ‚ùå Header not found in {file_label}")
            return None
        print(f"   üìç Found header at line {header_row} in {file_label}")

        # 3. Parse the already-decoded text starting at the header line.
        #    Re-reading the file with header=<line number> would decode it
        #    strictly and count parser rows instead of physical lines, so a
        #    bad byte or a quoted multi-line field in the preamble broke it.
        #    skip_blank_lines=False keeps blank lines in the table as all-NaN
        #    rows, exactly as before.
        table_text = "".join(lines[header_row:])
        return pd.read_csv(io.StringIO(table_text), header=0, skip_blank_lines=False)

    except Exception as e:
        # Deliberate best-effort boundary: callers only test for None.
        print(f"   ‚ùå Error loading {file_label}: {e}")
        return None

# =========================================================
# MODULE A: INTEREST RATES (Gravity) 📉
# =========================================================
# Averages the rate column per calendar year and writes
# clean_interest_rates.csv with the columns Year and Interest_Rate.
print("\n[1/4] Processing Interest Rates...")
df_rates = robust_load(f"{RAW_PATH}/boc_interest_rates.csv", ["Date", "V39079"], skip_summary=True)

if df_rates is not None:
    try:
        # Keep Date and Rate. Copy so the column edits below act on an
        # independent frame, matching how the other modules slice.
        df_rates = df_rates.iloc[:, 0:2].copy()
        df_rates.columns = ['Date', 'Interest_Rate']

        # Rows whose first field is not a date become NaT and are dropped;
        # non-numeric rate values become NaN and are ignored by the mean.
        df_rates['Date'] = pd.to_datetime(df_rates['Date'], errors='coerce')
        df_rates.dropna(subset=['Date'], inplace=True)
        df_rates['Interest_Rate'] = pd.to_numeric(df_rates['Interest_Rate'], errors='coerce')

        # Annualize
        df_rates['Year'] = df_rates['Date'].dt.year
        df_rates_annual = df_rates.groupby('Year')['Interest_Rate'].mean().reset_index()

        # Patch 2014-2015: hard-coded annual values, used only when the file
        # has no 2015 data. Years already present are not added again, so a
        # file that contains 2014 but not 2015 no longer gets two 2014 rows.
        existing_years = set(df_rates_annual['Year'].tolist())
        if 2015 not in existing_years:
            backfill_rows = (
                {'Year': 2014, 'Interest_Rate': 1.00},
                {'Year': 2015, 'Interest_Rate': 0.63},
            )
            history = pd.DataFrame(
                [row for row in backfill_rows if row['Year'] not in existing_years]
            )
            df_rates_annual = pd.concat([history, df_rates_annual], ignore_index=True)

        df_rates_annual.sort_values('Year').to_csv(f"{PROCESSED_PATH}/clean_interest_rates.csv", index=False)
        print(f"   ‚úÖ Saved Interest Rates: {len(df_rates_annual)} years")
    except Exception as e:
        # Best-effort: report the failure and let the remaining modules run.
        print(f"   ‚ùå Error cleaning Interest Rates: {e}")

# =========================================================
# MODULE B: UNEMPLOYMENT (Ability to Pay) 💼
# =========================================================
# Reshapes the wide unemployment table into one average rate per
# (Region, Year) and writes clean_unemployment.csv.
print("\n[2/4] Processing Unemployment...")
df_unemp = robust_load(f"{RAW_PATH}/unemployment_stats.csv", ["Geography", "January"])

if df_unemp is not None:
    try:
        # The first column holds the region labels.
        df_unemp = df_unemp.rename(columns={df_unemp.columns[0]: 'Region'})

        # Drop Unit Row: skip the first data row when its second cell
        # mentions "Percent".
        if len(df_unemp) > 0:
            if "Percent" in str(df_unemp.iloc[0, 1]):
                df_unemp = df_unemp.iloc[1:].copy()

        # Wide -> long: one row per (Region, original column label).
        df_long = df_unemp.melt(
            id_vars=['Region'],
            var_name='Date_Str',
            value_name='Unemployment_Rate',
        )

        # Year is the first 4-digit run in the column label; values that
        # are not numeric turn into NaN.
        df_long['Year'] = df_long['Date_Str'].astype(str).str.extract(r'(\d{4})')
        df_long['Unemployment_Rate'] = pd.to_numeric(
            df_long['Unemployment_Rate'],
            errors='coerce',
        )

        # Rows lacking either a year or a numeric rate are discarded.
        df_long.dropna(subset=['Year', 'Unemployment_Rate'], inplace=True)
        df_long['Year'] = df_long['Year'].astype(int)

        # Annual Average
        df_unemp_annual = (
            df_long
            .groupby(['Region', 'Year'])['Unemployment_Rate']
            .mean()
            .reset_index()
        )

        # Clean Region Names: keep the text before the first comma.
        df_unemp_annual['City_Map'] = [
            str(region).split(',')[0].strip()
            for region in df_unemp_annual['Region']
        ]

        df_unemp_annual.to_csv(f"{PROCESSED_PATH}/clean_unemployment.csv", index=False)
        print(f"   ‚úÖ Saved Unemployment: {len(df_unemp_annual)} rows")
    except Exception as e:
        print(f"   ‚ùå Error cleaning Unemployment: {e}")

# =========================================================
# MODULE C: PROVINCIAL GDP (Optimism) 📈
# =========================================================
# Reshapes the wide GDP table into one row per (Province, Year), adds the
# year-over-year percentage change and writes clean_gdp.csv.
print("\n[3/4] Processing GDP...")
df_gdp = robust_load(f"{RAW_PATH}/provincial_gdp.csv", ["Geography", "2015"])

if df_gdp is not None:
    try:
        # The first column holds the province labels.
        df_gdp = df_gdp.rename(columns={df_gdp.columns[0]: 'Province'})

        # Drop Unit Row: skip the first data row when its second cell
        # mentions "Dollars".
        if len(df_gdp) > 0:
            if "Dollars" in str(df_gdp.iloc[0, 1]):
                df_gdp = df_gdp.iloc[1:].copy()

        # Only rows whose Province exactly matches one of these names survive.
        valid_provinces = [
            'Ontario',
            'British Columbia',
            'Quebec',
            'Alberta',
            'Nova Scotia',
            'Manitoba',
            'Saskatchewan',
            'New Brunswick',
            'Newfoundland and Labrador',
            'Prince Edward Island',
        ]
        is_valid_province = df_gdp['Province'].isin(valid_provinces)
        df_gdp = df_gdp.loc[is_valid_province].copy()

        # Melt: only columns whose label is purely digits count as years.
        year_cols = [col for col in df_gdp.columns if col.strip().isdigit()]
        df_gdp_long = df_gdp.melt(
            id_vars=['Province'],
            value_vars=year_cols,
            var_name='Year',
            value_name='GDP_Millions',
        )

        # Clean Numeric (Handle Commas & Footnotes): strip thousands
        # separators; whatever still fails to parse becomes NaN and is dropped.
        gdp_text = df_gdp_long['GDP_Millions'].astype(str).str.replace(',', '')
        df_gdp_long['GDP_Millions'] = pd.to_numeric(gdp_text, errors='coerce')
        df_gdp_long.dropna(subset=['GDP_Millions'], inplace=True)
        df_gdp_long['Year'] = df_gdp_long['Year'].astype(int)

        # Calculate Growth %: rows must be in year order within each province
        # before taking the change from the previous row.
        df_gdp_long.sort_values(['Province', 'Year'], inplace=True)
        gdp_by_province = df_gdp_long.groupby('Province')['GDP_Millions']
        df_gdp_long['GDP_Growth_Pct'] = gdp_by_province.pct_change().mul(100)

        df_gdp_long.to_csv(f"{PROCESSED_PATH}/clean_gdp.csv", index=False)
        print(f"   ‚úÖ Saved GDP: {len(df_gdp_long)} rows")
    except Exception as e:
        print(f"   ‚ùå Error cleaning GDP: {e}")

# =========================================================
# MODULE D: POPULATION (Demand) 👨‍👩‍👧‍👦
# =========================================================
# Reshapes the wide population table into one row per (Region, Year), adds
# the year-over-year percentage change and a cleaned city name, then writes
# clean_population.csv.
print("\n[4/4] Processing Population...")
df_pop = robust_load(f"{RAW_PATH}/Population.csv", ["Geography", "2015"])

if df_pop is not None:
    try:
        # The first column holds the region labels.
        df_pop = df_pop.rename(columns={df_pop.columns[0]: 'Region'})

        # Drop Unit Row: skip the first data row when its second cell
        # mentions "Persons".
        if len(df_pop) > 0:
            if "Persons" in str(df_pop.iloc[0, 1]):
                df_pop = df_pop.iloc[1:].copy()

        # Melt: only columns whose label is purely digits count as years.
        year_cols = [col for col in df_pop.columns if col.strip().isdigit()]
        df_pop_long = df_pop.melt(
            id_vars=['Region'],
            value_vars=year_cols,
            var_name='Year',
            value_name='Population',
        )

        # Clean Numeric: strip thousands separators, then coerce. Anything
        # that is not a number (such as footer text) becomes NaN.
        pop_text = df_pop_long['Population'].astype(str).str.replace(',', '')
        df_pop_long['Population'] = pd.to_numeric(pop_text, errors='coerce')

        # Drop rows that became NaN (the footer text rows)
        df_pop_long.dropna(subset=['Population'], inplace=True)

        df_pop_long['Year'] = df_pop_long['Year'].astype(int)

        # Calculate Growth %: rows must be in year order within each region
        # before taking the change from the previous row.
        df_pop_long.sort_values(['Region', 'Year'], inplace=True)
        pop_by_region = df_pop_long.groupby('Region')['Population']
        df_pop_long['Pop_Growth_Pct'] = pop_by_region.pct_change().mul(100)

        # Clean City Names
        def clean_pop_city(val):
            """Return the text before the first comma with the area markers removed."""
            city = str(val).split(',')[0].strip()
            for marker in ('(CMA)', '(CA)', ' part', 'metro'):
                city = city.replace(marker, '')
            return city.strip()

        df_pop_long['City_Map'] = df_pop_long['Region'].apply(clean_pop_city)

        df_pop_long.to_csv(f"{PROCESSED_PATH}/clean_population.csv", index=False)
        print(f"   ‚úÖ Saved Population: {len(df_pop_long)} rows")
    except Exception as e:
        print(f"   ‚ùå Error cleaning Population: {e}")

print("\n‚ú® REFINERY COMPLETE!")

üöÄ Starting Macro Refinery...

[1/4] Processing Interest Rates...
   üìç Found header at line 11 in boc_interest_rates.csv
   ‚úÖ Saved Interest Rates: 12 years

[2/4] Processing Unemployment...
   üìç Found header at line 10 in unemployment_stats.csv
   ‚úÖ Saved Unemployment: 836 rows

[3/4] Processing GDP...
   üìç Found header at line 10 in provincial_gdp.csv
   ‚úÖ Saved GDP: 100 rows

[4/4] Processing Population...
   üìç Found header at line 10 in Population.csv
   ‚úÖ Saved Population: 2321 rows

‚ú® REFINERY COMPLETE!
