In [26]:
# =========================================================
# V3 STEP 2: THE FUSION REACTOR (Fixed & Polished) ‚öõÔ∏è
# =========================================================
# Goal: Merge Housing Data with Interest Rates, GDP, Unemployment, and Population.

import pandas as pd
import numpy as np
import os

# 1. SETUP PATHS
BASE_DIR = ".." 
RAW_HOUSING_PATH = os.path.join(BASE_DIR, "data/raw_housing")
PROCESSED_PATH = os.path.join(BASE_DIR, "data/processed")

# Define File Map
FILES = {
    "Housing": os.path.join(RAW_HOUSING_PATH, "national_master_dataset.csv"),
    "Rates": os.path.join(PROCESSED_PATH, "clean_interest_rates.csv"),
    "Unemp": os.path.join(PROCESSED_PATH, "clean_unemployment.csv"),
    "GDP": os.path.join(PROCESSED_PATH, "clean_gdp.csv"),
    "Pop": os.path.join(PROCESSED_PATH, "clean_population.csv")
}

print("‚öõÔ∏è  Initializing Fusion Reactor...")

# ---------------------------------------------------------
# MODULE 1: AUTO-REPAIR
# ---------------------------------------------------------
def repair_macro_data():
    print("   üîß Checking Data Integrity...")
    for key, fpath in FILES.items():
        if not os.path.exists(fpath):
             raise FileNotFoundError(f"üö® CRITICAL: {key} file missing at: {fpath}")
        print(f"      ‚úÖ Found {key}")

repair_macro_data()

# ---------------------------------------------------------
# MODULE 2: GEOGRAPHY MAPPING (Mega-Dictionary) üó∫Ô∏è
# ---------------------------------------------------------
# Maps City -> (Province, Economic Region)
# Note: Territories are mapped to provinces as GDP proxies if local data missing
GEO_MAP = {
    # TERRITORIES (Proxies for GDP)
    'Yellowknife': ('Alberta', 'Yellowknife'), # Proxy Alberta
    'Whitehorse': ('British Columbia', 'Whitehorse'), # Proxy BC
    'Iqaluit': ('Quebec', 'Iqaluit'), # Proxy Quebec
    
    # ONTARIO
    'Toronto': ('Ontario', 'Toronto'), 'Ottawa': ('Ontario', 'Ottawa'), 'Hamilton': ('Ontario', 'Hamilton-Niagara Peninsula'),
    'Kitchener': ('Ontario', 'Kitchener-Waterloo-Barrie'), 'London': ('Ontario', 'London'), 'Windsor': ('Ontario', 'Windsor-Sarnia'),
    'Oshawa': ('Ontario', 'Toronto'), 'Barrie': ('Ontario', 'Kitchener-Waterloo-Barrie'), 'Kingston': ('Ontario', 'Kingston-Pembroke'),
    'Guelph': ('Ontario', 'Kitchener-Waterloo-Barrie'), 'Sudbury': ('Ontario', 'Northeast'), 'Thunder Bay': ('Ontario', 'Northwest'),
    'Peterborough': ('Ontario', 'Muskoka-Kawarthas'), 'Brantford': ('Ontario', 'Hamilton-Niagara Peninsula'),
    'Belleville': ('Ontario', 'Kingston-Pembroke'), 'Sarnia': ('Ontario', 'Windsor-Sarnia'), 'Sault Ste. Marie': ('Ontario', 'Northeast'),
    'St. Catharines-Niagara': ('Ontario', 'Hamilton-Niagara Peninsula'), 'St. Catharines': ('Ontario', 'Hamilton-Niagara Peninsula'),
    'Niagara Falls': ('Ontario', 'Hamilton-Niagara Peninsula'), 'Cambridge': ('Ontario', 'Kitchener-Waterloo-Barrie'),
    'Waterloo': ('Ontario', 'Kitchener-Waterloo-Barrie'), 'Mississauga': ('Ontario', 'Toronto'), 'Brampton': ('Ontario', 'Toronto'),
    'Markham': ('Ontario', 'Toronto'), 'Vaughan': ('Ontario', 'Toronto'), 'Richmond Hill': ('Ontario', 'Toronto'),
    'Oakville': ('Ontario', 'Toronto'), 'Burlington': ('Ontario', 'Hamilton-Niagara Peninsula'),
    # BC
    'Vancouver': ('British Columbia', 'Lower Mainland-Southwest'), 'Victoria': ('British Columbia', 'Vancouver Island and Coast'),
    'Kelowna': ('British Columbia', 'Thompson-Okanagan'), 'Abbotsford-Mission': ('British Columbia', 'Lower Mainland-Southwest'),
    'Abbotsford': ('British Columbia', 'Lower Mainland-Southwest'), 'Nanaimo': ('British Columbia', 'Vancouver Island and Coast'),
    'Kamloops': ('British Columbia', 'Thompson-Okanagan'), 'Chilliwack': ('British Columbia', 'Lower Mainland-Southwest'),
    'Prince George': ('British Columbia', 'Cariboo'), 'Surrey': ('British Columbia', 'Lower Mainland-Southwest'),
    'Burnaby': ('British Columbia', 'Lower Mainland-Southwest'), 'Richmond': ('British Columbia', 'Lower Mainland-Southwest'),
    # ALBERTA
    'Calgary': ('Alberta', 'Calgary'), 'Edmonton': ('Alberta', 'Edmonton'), 'Red Deer': ('Alberta', 'Red Deer'),
    'Lethbridge': ('Alberta', 'Lethbridge-Medicine Hat'), 'Medicine Hat': ('Alberta', 'Lethbridge-Medicine Hat'),
    'Wood Buffalo': ('Alberta', 'Wood Buffalo-Cold Lake'), 'Grande Prairie': ('Alberta', 'Banff-Jasper-Rocky Mountain House and Athabasca-Grande Prairie-Peace River'),
    # QUEBEC
    'Montreal': ('Quebec', 'Montreal'), 'Quebec': ('Quebec', 'Capitale-Nationale'), 'Gatineau': ('Quebec', 'Outaouais'),
    'Sherbrooke': ('Quebec', 'Estrie'), 'Trois-Rivieres': ('Quebec', 'Mauricie'), 'Saguenay': ('Quebec', 'Saguenay-Lac-Saint-Jean'),
    'Drummondville': ('Quebec', 'Mauricie'), 'Laval': ('Quebec', 'Laval'), 'Longueuil': ('Quebec', 'Mont√©r√©gie'),
    # PRAIRIES & ATLANTIC
    'Winnipeg': ('Manitoba', 'Winnipeg'), 'Saskatoon': ('Saskatchewan', 'Saskatoon-Biggar'),
    'Regina': ('Saskatchewan', 'Regina-Moose Mountain'), 'Halifax': ('Nova Scotia', 'Halifax'),
    'Moncton': ('New Brunswick', 'Moncton-Richibucto'), 'Saint John': ('New Brunswick', 'Saint John-St. Stephen'),
    'St. Johns': ('Newfoundland and Labrador', 'Avalon Peninsula'), 'Charlottetown': ('Prince Edward Island', 'Prince Edward Island')
}

# ---------------------------------------------------------
# MODULE 3: THE MERGE ENGINE üöÇ
# ---------------------------------------------------------
print("   üîÑ Loading Datasets...")
try:
    df_housing = pd.read_csv(FILES["Housing"])
    df_rates = pd.read_csv(FILES["Rates"])
    df_unemp = pd.read_csv(FILES["Unemp"])
    df_gdp = pd.read_csv(FILES["GDP"])
    df_pop = pd.read_csv(FILES["Pop"])
    
    # 1. Apply Geography Map
    print("   üó∫Ô∏è  Mapping Geography...")
    df_housing['City'] = df_housing['City'].str.strip()
    
    # Map Province and Region (Handle missing keys safely)
    df_housing['Province'] = df_housing['City'].apply(lambda c: GEO_MAP.get(c, ("Unknown", "Unknown"))[0])
    df_housing['Region_Map'] = df_housing['City'].apply(lambda c: GEO_MAP.get(c, (c, c))[1]) # Fallback to City Name
    
    # 2. Merge Interest Rates (on Year)
    print("   üîó Merging Interest Rates...")
    df_merged = pd.merge(df_housing, df_rates[['Year', 'Interest_Rate']], on='Year', how='left')
    
    # 3. Merge GDP (on Province + Year)
    print("   üîó Merging GDP...")
    df_merged = pd.merge(df_merged, df_gdp[['Province', 'Year', 'GDP_Growth_Pct']], on=['Province', 'Year'], how='left')
    
    # 4. Merge Unemployment (on Region + Year)
    print("   üîó Merging Unemployment...")
    df_merged = pd.merge(df_merged, df_unemp[['City_Map', 'Year', 'Unemployment_Rate']], 
                         left_on=['Region_Map', 'Year'], right_on=['City_Map', 'Year'], how='left')
    
    # 5. Merge Population (on City + Year)
    print("   üîó Merging Population...")
    # Using left_on='City' assumes Population file has clean names matching Housing file
    df_merged = pd.merge(df_merged, df_pop[['City_Map', 'Year', 'Pop_Growth_Pct']], 
                         left_on=['City', 'Year'], right_on=['City_Map', 'Year'], how='left')

    # ---------------------------------------------------------
    # MODULE 4: FINAL POLISH (Fixing the Error) ‚ú®
    # ---------------------------------------------------------
    print("   ‚ú® Final Polish...")
    
    # FIX: Use direct assignment instead of inplace=True to avoid ChainedAssignmentError
    
    # Unemployment: Fill missing with National Mean for that year
    means_unemp = df_merged.groupby('Year')['Unemployment_Rate'].transform('mean')
    df_merged['Unemployment_Rate'] = df_merged['Unemployment_Rate'].fillna(means_unemp)
    
    # GDP: Fill missing with 0.0
    df_merged['GDP_Growth_Pct'] = df_merged['GDP_Growth_Pct'].fillna(0.0)
    
    # Population: Fill missing with 0.0
    df_merged['Pop_Growth_Pct'] = df_merged['Pop_Growth_Pct'].fillna(0.0)
    
    # Drop Cleanup (Remove duplicate/helper columns)
    cols_to_drop = [c for c in df_merged.columns if 'City_Map' in c]
    if cols_to_drop:
        print(f"      Dropping helper columns: {cols_to_drop}")
        df_merged = df_merged.drop(columns=cols_to_drop)

    # Save
    out_file = os.path.join(PROCESSED_PATH, "hybrid_v3_dataset.csv")
    df_merged.to_csv(out_file, index=False)
    
    print("-" * 40)
    print(f"‚úÖ FUSION COMPLETE: {out_file}")
    print(f"   Total Rows: {len(df_merged)}")
    print(f"   Columns: {df_merged.columns.tolist()}")
    print("-" * 40)
    
    # Diagnostic Check
    sample_check = df_merged[['City', 'Year', 'GDP_Growth_Pct', 'Pop_Growth_Pct']].head()
    print("Sample Data Check:\n", sample_check)

except Exception as e:
    print(f"‚ùå FUSION ERROR: {e}")

‚öõÔ∏è  Initializing Fusion Reactor...
   üîß Checking Data Integrity...
      ‚úÖ Found Housing
      ‚úÖ Found Rates
      ‚úÖ Found Unemp
      ‚úÖ Found GDP
      ‚úÖ Found Pop
   üîÑ Loading Datasets...
   üó∫Ô∏è  Mapping Geography...
   üîó Merging Interest Rates...
   üîó Merging GDP...
   üîó Merging Unemployment...
   üîó Merging Population...
   ‚ú® Final Polish...
      Dropping helper columns: ['City_Map_x', 'City_Map_y']
----------------------------------------
‚úÖ FUSION COMPLETE: ../data/processed/hybrid_v3_dataset.csv
   Total Rows: 2189
   Columns: ['City', 'Turnover_Rate', 'Average rent ($)', 'Year', 'Total_Units', 'Buy_Price', 'Intl_Students_Prov', 'Province', 'Region_Map', 'Interest_Rate', 'GDP_Growth_Pct', 'Unemployment_Rate', 'Pop_Growth_Pct']
----------------------------------------
Sample Data Check:
                  City  Year  GDP_Growth_Pct  Pop_Growth_Pct
0  Abbotsford-Mission  2015        0.000000             0.0
1  Abbotsford-Mission  2016        2