# Phase 2: National Data Merger
**Project:** National Rent Intelligence Engine  
**Goal:** Clean, impute, and merge disparate data sources (CMHC, CREA, IRCC) into a single master training dataset.  
**Date:** February 2026

In [65]:
import pandas as pd
import numpy as np
import os
import glob

# Display options
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# ==========================================
# 1. DEFINE PATHS 
# ==========================================
if os.path.exists("data/raw"):
    RAW_PATH = "data/raw"
    PROCESSED_PATH = "data/processed"
elif os.path.exists("../data/raw"):
    RAW_PATH = "../data/raw"
    PROCESSED_PATH = "../data/processed"
else:
    # Fallback default
    RAW_PATH = "../data/raw"
    PROCESSED_PATH = "../data/processed"
    print("‚ö†Ô∏è Warning: Could not auto-detect data paths. Defaulting to '../data/raw'")

os.makedirs(PROCESSED_PATH, exist_ok=True)

print(f"Current Working Directory: {os.getcwd()}")
print(f"Using Raw Data Path:       {RAW_PATH}")

# ==========================================
# 2. DEFINE CITY MAPPING
# ==========================================
# We need to standardize City Names across 3 different agencies
city_map = {
    # CREA Name : Standard Name
    'GREATER_TORONTO': 'Toronto',
    'GREATER_VANCOUVER': 'Vancouver',
    'CALGARY': 'Calgary',
    'EDMONTON': 'Edmonton',
    'OTTAWA': 'Ottawa',
    'MONTREAL_CMA': 'Montreal',
    'HALIFAX_DARTMOUTH': 'Halifax',
    'WINNIPEG': 'Winnipeg',
    'VICTORIA': 'Victoria',
    'HAMILTON_BURLINGTON': 'Hamilton',
    'KITCHENER_WATERLOO': 'Kitchener',
    'LONDON_ST_THOMAS': 'London',
    'WINDSOR_ESSEX': 'Windsor',
    'QUEBEC_CMA': 'Quebec City',
    'ST_JOHNS_NL': 'St. Johns',
    'SASKATOON': 'Saskatoon',
    'REGINA': 'Regina'
}

# 3. DEFINE PROVINCE MAPPING (For Student Visa Broadcast)
prov_map = {
    'Toronto': 'Ontario',
    'Ottawa': 'Ontario',
    'Hamilton': 'Ontario',
    'Kitchener': 'Ontario',
    'London': 'Ontario',
    'Windsor': 'Ontario',
    'Vancouver': 'British Columbia',
    'Victoria': 'British Columbia',
    'Calgary': 'Alberta',
    'Edmonton': 'Alberta',
    'Montreal': 'Quebec',
    'Quebec City': 'Quebec',
    'Halifax': 'Nova Scotia',
    'Winnipeg': 'Manitoba',
    'Saskatoon': 'Saskatchewan',
    'Regina': 'Saskatchewan',
    'St. Johns': 'Newfoundland and Labrador'
}

print("Configuration loaded. Mappings defined.")

Current Working Directory: /Users/abigail/Downloads/DS Projects/RentPulse Ontario/National_Insight_Engine/src
Using Raw Data Path:       ../data/raw
Configuration loaded. Mappings defined.


## Process CMHC Data (Supply & Turnover)
We loop through the years (2019-2025) and extract:
* **Turnover Rate** from `Table 1.0`
* **Rental Universe (Total Units)** from `Table 4.1`

In [66]:
def clean_cmhc_table1_excel(file_path, year):
    """
    Extracts TURNOVER and AVERAGE RENT from Table 1.0 using Data Pattern Recognition.
    """
    try:
        # 1. Load and find header
        df_raw = pd.read_excel(file_path, sheet_name="Table 1.0", header=None)
        # Find row containing "Centre"
        header_row = df_raw[df_raw.astype(str).apply(lambda x: x.str.strip().eq('Centre').any(), axis=1)].index[0]
        df = pd.read_excel(file_path, sheet_name="Table 1.0", header=header_row)
        
        # 2. Standardize City Names
        df.rename(columns={df.columns[0]: 'Centre'}, inplace=True)
        df = df.dropna(subset=['Centre'])
        df = df[df['Centre'] != 'Centre'] # Remove repeated headers
        df['City'] = df['Centre'].astype(str).str.replace(' CMA', '').str.replace(' CA', '').str.strip()
        
        # 3. SMART COLUMN DETECTION
        turnover_col = None
        rent_col = None
        
        # Iterate through columns to find the best match based on DATA VALUES
        for col in df.columns:
            if col in ['Centre', 'City']: continue
            
            # Convert to numeric, forcing errors to NaN
            col_data = pd.to_numeric(df[col].astype(str).str.replace(',', '').str.replace('*', ''), errors='coerce')
            median_val = col_data.median()
            
            if pd.isna(median_val): continue
            
            # RULE 1: RENT is typically > $500 and < $5000
            if 500 < median_val < 5000:
                rent_col = col
                
            # RULE 2: TURNOVER is typically > 5% and < 50%
            elif 5 < median_val < 50:
                # Check header keyword to distinguish from other rates
                if "Turnover" in str(col) or "turnover" in str(col) or "Rotation" in str(col):
                    turnover_col = col
                elif turnover_col is None: 
                    turnover_col = col # Fallback

        # Fallback for Rent if logic missed it (Look for '$')
        if rent_col is None:
            dollar_cols = [c for c in df.columns if "$" in str(c)]
            if dollar_cols: rent_col = dollar_cols[-1]

        # Extract
        if turnover_col and rent_col:
            df_out = df[['City', turnover_col, rent_col]].copy()
            df_out.columns = ['City', 'Turnover_Rate', 'Average rent ($)']
            df_out['Year'] = year
            # Clean Numeric
            for c in ['Turnover_Rate', 'Average rent ($)']:
                df_out[c] = pd.to_numeric(df_out[c].astype(str).str.replace(',', '').str.replace('*', ''), errors='coerce')
            return df_out
        else:
            print(f"‚ö†Ô∏è Warning: Could not identify Rent/Turnover in {year} file.")
            return pd.DataFrame()
            
    except Exception as e:
        print(f"Error reading Table 1.0 in {os.path.basename(file_path)}: {e}")
        return pd.DataFrame()

def clean_cmhc_table4_excel(file_path, year):
    """
    Extracts UNIVERSE (Total Units) from Table 4.1 using Data Pattern Recognition.
    """
    try:
        df_raw = pd.read_excel(file_path, sheet_name="Table 4.1", header=None)
        header_row = df_raw[df_raw.astype(str).apply(lambda x: x.str.strip().eq('Centre').any(), axis=1)].index[0]
        df = pd.read_excel(file_path, sheet_name="Table 4.1", header=header_row)
        df.rename(columns={df.columns[0]: 'Centre'}, inplace=True)
        
        df['City'] = df['Centre'].astype(str).str.replace(' CMA', '').str.replace(' CA', '').str.strip()
        
        target_col = None
        
        # 3. SMART DETECTION for Universe
        for col in df.columns:
            if col in ['Centre', 'City']: continue
            
            col_data = pd.to_numeric(df[col].astype(str).str.replace(',', '').str.replace('*', ''), errors='coerce')
            median_val = col_data.median()
            
            if pd.isna(median_val): continue
            
            # RULE: Universe is typically > 1000 units
            if median_val > 1000:
                target_col = col
        
        if target_col:
            df_out = df[['City', target_col]].copy()
            df_out.columns = ['City', 'Total_Units']
            df_out['Year'] = year
            df_out['Total_Units'] = pd.to_numeric(df_out['Total_Units'].astype(str).str.replace(',', ''), errors='coerce')
            return df_out
        else:
            print(f"‚ö†Ô∏è Warning: Could not identify Universe in {year} file.")
            return pd.DataFrame()
            
    except Exception as e:
        print(f"Error reading Table 4.1 in {os.path.basename(file_path)}: {e}")
        return pd.DataFrame()

In [67]:
years = range(2019, 2026) # 2019 to 2025
cmhc_data = []

print(f"Starting merge process using path: {RAW_PATH} ...")

for y in years:
    file_name = f"Turnover-Rates-{y}.xlsx"
    file_path = os.path.join(RAW_PATH, file_name)
    
    if os.path.exists(file_path):
        print(f"Processing {y}...")
        
        try:
            # CALLING THE NEW PARSERS
            df_turn = clean_cmhc_table1_excel(file_path, y)
            df_univ = clean_cmhc_table4_excel(file_path, y)
            
            # Merge if both exist
            if not df_turn.empty and not df_univ.empty:
                df_merged = pd.merge(df_turn, df_univ, on=['City', 'Year'], how='inner')
                cmhc_data.append(df_merged)
            else:
                print(f"  -> ‚ö†Ô∏è Data incomplete for {y}")
                
        except Exception as e:
            print(f"  -> ‚ùå Unexpected error for {y}: {e}")
            
    else:
        print(f"  -> Skipping {y} (File not found)")

if cmhc_data:
    df_cmhc = pd.concat(cmhc_data)
    
    # City Name Cleanup
    city_corrections = {
        'St. John\'s': 'St. Johns', 
        'Montr√©al': 'Montreal', 
        'Qu√©bec': 'Quebec City',
        'Ottawa-Gatineau (Ont. part)': 'Ottawa',
        'Ottawa-Gatineau (Qu√©. part)': 'Gatineau'
    }
    df_cmhc['City'] = df_cmhc['City'].replace(city_corrections)
    
    print(f"\n‚úÖ SUCCESS: Total CMHC Rows: {len(df_cmhc)}")
    # Verify we have RENT data now
    print("Sample Data (Check 'Average rent' column):")
    display(df_cmhc.head())
else:
    print("\n‚ùå ERROR: No data was merged.")

Starting merge process using path: ../data/raw ...
Processing 2019...
Processing 2020...
Processing 2021...
Processing 2022...
Processing 2023...
Processing 2024...
Processing 2025...

‚úÖ SUCCESS: Total CMHC Rows: 161
Sample Data (Check 'Average rent' column):


Unnamed: 0,City,Turnover_Rate,Average rent ($),Year,Total_Units
0,Halifax,20.9,1202.0,2019,50769.0
1,Montreal,17.4,855.0,2019,590305.0
2,Gatineau,21.5,874.0,2019,23441.0
3,Quebec City,22.1,862.0,2019,91787.0
4,Hamilton,15.9,1219.0,2019,44012.0


Data audit

In [68]:
# --- DATA AUDIT BLOCK ---
print(f"Total Rows Loaded: {len(df_cmhc)}")
print(f"Years Covered: {df_cmhc['Year'].unique()}")
print("\n--- Row Counts per City (Should be ~7 for 2019-2025) ---")
print(df_cmhc['City'].value_counts())

print("\n--- Sample of Toronto Data ---")
# Check if Toronto data actually has numbers
check_city = df_cmhc[df_cmhc['City'].str.contains("Toronto")]
if not check_city.empty:
    display(check_city.sort_values('Year'))
else:
    print("‚ö†Ô∏è WARNING: Toronto data is missing!")

print("\n--- Check for Missing Values ---")
print(df_cmhc.isnull().sum())

Total Rows Loaded: 161
Years Covered: [2019 2020 2021 2022 2023 2024 2025]

--- Row Counts per City (Should be ~7 for 2019-2025) ---
City
Halifax                                                                                                                                   7
Montreal                                                                                                                                  7
Gatineau                                                                                                                                  7
Quebec City                                                                                                                               7
Hamilton                                                                                                                                  7
Kitchener-Cambridge-Waterloo                                                                                                              7
London                

Unnamed: 0,City,Turnover_Rate,Average rent ($),Year,Total_Units
8,Toronto,11.2,1562.0,2019,315630.0
8,Toronto,9.5,1635.0,2020,318613.0
8,Toronto,8.0,1679.0,2021,320152.0
8,Toronto,14.4,1779.0,2022,327263.0
8,Toronto,9.8,1961.0,2023,325494.0
8,Toronto,8.3,1974.0,2024,334748.0
8,Toronto,6.4,2046.0,2025,336180.0



--- Check for Missing Values ---
City                 0
Turnover_Rate       42
Average rent ($)    42
Year                 0
Total_Units         42
dtype: int64


### üìù Strategic Data Methodology

To ensure the "National Insight Engine" prioritizes **precision over noise**, we enforced the following two strategic data exclusions:

**Valuation Source: "Not Seasonally Adjusted (Monthly)" Only**
* **Decision:** We exclusively use the **Monthly, Non-Seasonally Adjusted (NSA)** price data from CREA.
* **Reason:** The CMHC Rental Survey occurs specifically in **October**. Annual averages smooth out this signal, and "Seasonally Adjusted" prices are statistical abstractions. Using raw October NSA prices ensures our **Cap Rate** calculation ($Rent \div Price$) reflects the actual market reality at the exact moment of survey.

In [69]:
# ---------------------------------------------------------
# 2. PROCESS CREA DATA (Handling Multi-Sheet Excel)
# ---------------------------------------------------------
crea_file_path = os.path.join(RAW_PATH, "Not Seasonally Adjusted (M).xlsx")
crea_data = []

if os.path.exists(crea_file_path):
    print(f"Found CREA Excel: {crea_file_path}")
    print("Reading sheets... (This might take a moment)")
    
    # Load the Excel file wrapper to list all sheet names
    xls = pd.ExcelFile(crea_file_path)
    print(f"Sheets found: {xls.sheet_names[:5]} ... and more")
    
    for sheet in xls.sheet_names:
        # Match Sheet Name to Our Standard City Name
        # We look for the sheet name in our city_map keys
        # E.g. Sheet "GREATER_TORONTO" -> Map to "Toronto"
        
        # Check if this sheet matches any of our target keys
        matched_city = None
        for key, std_name in city_map.items():
            if key in sheet: 
                matched_city = std_name
                break
        
        if matched_city:
            # Read that specific sheet
            df = pd.read_excel(xls, sheet_name=sheet)
            
            # Ensure Date parsing
            df['Date'] = pd.to_datetime(df['Date'])
            df['Year'] = df['Date'].dt.year
            
            # We need Apartment Benchmark. Priority: Apartment_Benchmark > Apartment_HPI
            if 'Apartment_Benchmark' in df.columns:
                # Group by Year to get Annual Average Price
                df_annual = df.groupby('Year')['Apartment_Benchmark'].mean().reset_index()
                df_annual['City'] = matched_city
                df_annual.columns = ['Year', 'Buy_Price', 'City']
                crea_data.append(df_annual)
            else:
                print(f"  -> Sheet {sheet} missing 'Apartment_Benchmark'")
                
    if crea_data:
        df_crea = pd.concat(crea_data)
        print(f"\n‚úÖ CREA Data Loaded! Found prices for: {df_crea['City'].unique()}")
        display(df_crea.head())
    else:
        print("‚ùå Error: No matching cities found in CREA Excel.")

else:
    print(f"‚ùå Error: CREA file not found at {crea_file_path}")

Found CREA Excel: ../data/raw/Not Seasonally Adjusted (M).xlsx
Reading sheets... (This might take a moment)
Sheets found: ['AGGREGATE', 'BRITISH_COLUMBIA', 'VANCOUVER_ISLAND', 'VICTORIA', 'LOWER_MAINLAND'] ... and more

‚úÖ CREA Data Loaded! Found prices for: <StringArray>
['Victoria', 'Vancouver', 'Calgary', 'Edmonton', 'Regina', 'Saskatoon', 'Winnipeg', 'Hamilton', 'Kitchener', 'London', 'Ottawa', 'Toronto', 'Windsor', 'Montreal', 'Quebec City', 'Halifax', 'St. Johns']
Length: 17, dtype: str


Unnamed: 0,Year,Buy_Price,City
0,2005,190116.666667,Victoria
1,2006,223050.0,Victoria
2,2007,247725.0,Victoria
3,2008,262925.0,Victoria
4,2009,252033.333333,Victoria


In [70]:
# ---------------------------------------------------------
# 3. PROCESS IRCC DATA (Student Visas)
# ---------------------------------------------------------
print("üéì Processing Student Visa Data...")

# 1. Find the file (Auto-detect Excel or CSV)
ircc_files = glob.glob(os.path.join(RAW_PATH, "EN_ODP*.xlsx"))
if not ircc_files:
     ircc_files = glob.glob(os.path.join(RAW_PATH, "EN_ODP*.csv"))

if ircc_files:
    ircc_file = ircc_files[0]
    print(f"   -> Found file: {os.path.basename(ircc_file)}")
    
    try:
        # Load Data
        # IRCC files usually have the header on row 3 (index 2)
        if ircc_file.endswith('.xlsx'):
            df_ircc = pd.read_excel(ircc_file, header=2)
        else:
            df_ircc = pd.read_csv(ircc_file, header=2)

        # 2. Clean Columns
        # The first column is Province, the second (often unnamed) is Study Level
        df_ircc.rename(columns={df_ircc.columns[0]: 'Province_Raw', df_ircc.columns[1]: 'Study_Level'}, inplace=True)
        
        df_ircc['Province_Raw'] = df_ircc['Province_Raw'].astype(str).str.replace(' Total', '', regex=False).str.strip()
        
        # 3. Fill Down Province Names
        target_provinces = ['Ontario', 'British Columbia', 'Alberta', 'Quebec', 'Nova Scotia', 'Manitoba', 'Saskatchewan', 'Newfoundland and Labrador']
        
        # Helper to identify if a row IS a province header
        df_ircc['Detected_Prov'] = df_ircc['Province_Raw'].apply(lambda x: x if x in target_provinces else None)
        df_ircc['Province'] = df_ircc['Detected_Prov'].ffill()
        
        # 4. Filter for "Post Secondary"
        # We only want university/college students. We check the 'Study_Level' column.
        # If 'Study_Level' is empty, we check if the Province row itself has data (some files differ).
        
        # Strategy: Keep rows where Study_Level contains "Post Secondary" OR "Total" if levels aren't present
        # First, ensure Study_Level is string
        df_ircc['Study_Level'] = df_ircc['Study_Level'].fillna('')
        
        df_filtered = df_ircc[
            df_ircc['Study_Level'].astype(str).str.contains("Post Secondary|Total post secondary", case=False)
        ].copy()
        
        # 5. Transform (Wide to Long)
        year_cols = [c for c in df_ircc.columns if str(c).startswith('20')]
        df_melt = df_filtered.melt(id_vars=['Province'], value_vars=year_cols, var_name='Year', value_name='Student_Count')
        
        # Clean Numeric
        df_melt['Year'] = pd.to_numeric(df_melt['Year'], errors='coerce')
        df_melt['Student_Count'] = pd.to_numeric(df_melt['Student_Count'].astype(str).str.replace(',', ''), errors='coerce')
        
        # 6. Broadcast to Cities
        student_data = []
        for city, province in prov_map.items():
            prov_data = df_melt[df_melt['Province'] == province].copy()
            
            if not prov_data.empty:
                # Group by Year to handle cases where there might be multiple rows (e.g. Master + Bachelor)
                prov_data = prov_data.groupby('Year')['Student_Count'].sum().reset_index()
                
                prov_data['City'] = city
                prov_data.rename(columns={'Student_Count': 'Intl_Students_Prov'}, inplace=True)
                student_data.append(prov_data)
        
        if student_data:
            df_students = pd.concat(student_data)
            print(f"‚úÖ Student Data Loaded! Years: {df_students['Year'].min()}-{df_students['Year'].max()}")
            print(f"   (Matched {len(student_data)} cities)")
            display(df_students.head(3))
        else:
            print("‚ö†Ô∏è Warning: Student data processed but no cities matched the 'prov_map'. Check spelling.")
            df_students = pd.DataFrame(columns=['City', 'Year', 'Intl_Students_Prov'])

    except Exception as e:
        print(f"‚ùå Error processing student file: {e}")
        df_students = pd.DataFrame(columns=['City', 'Year', 'Intl_Students_Prov'])
else:
    print(f"‚ùå Error: Could not find student data file (EN_ODP...) in {RAW_PATH}")
    df_students = pd.DataFrame(columns=['City', 'Year', 'Intl_Students_Prov'])

üéì Processing Student Visa Data...
   -> Found file: EN_ODP_annual-TR-Study-IS_PT_study_level_year_end.xlsx
‚úÖ Student Data Loaded! Years: 2000-2024
   (Matched 17 cities)


Unnamed: 0,Year,Intl_Students_Prov,City
0,2000,1545.0,Toronto
1,2001,1810.0,Toronto
2,2002,2205.0,Toronto


In [71]:
# ---------------------------------------------------------
# 3.5. PROCESS HISTORICAL DATA (Primary Engine) - FIXED
# ---------------------------------------------------------
print("üìú Processing Primary Engine (Historical 2015-2025)...")
import csv

def process_primary_engine(file_path):
    try:
        data_records = []
        
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            # 1. Skip metadata
            for _ in range(8): next(f)
            
            reader = csv.reader(f)
            cities_row = next(reader)  # Row 9
            units_row = next(reader)   # Row 10
            years_row = next(reader)   # Row 11
            next(reader)               # Skip dollars row
            
            # 2. Find Data Row
            data_row = None
            for row in reader:
                if row and "Row and apartment structures" in row[0]:
                    data_row = row
                    break
            
            if not data_row: return pd.DataFrame()

            # 3. Extract Data (With Double Forward Fill)
            current_city = None
            current_unit = None  # <--- NEW: Track unit type across columns
            
            for i in range(1, len(cities_row)):
                # Forward Fill City
                if i < len(cities_row) and cities_row[i].strip():
                    current_city = cities_row[i]
                
                # Forward Fill Unit Type (The Fix!)
                if i < len(units_row) and units_row[i].strip():
                    current_unit = units_row[i]
                
                # Safety checks
                if i >= len(years_row) or i >= len(data_row): continue

                year = years_row[i]
                val = data_row[i]
                
                # Filter: We want "Two bedroom units" and valid years
                if current_city and current_unit and "Two bedroom" in current_unit and year.isdigit():
                    clean_val = val.replace(',', '').replace('$', '').replace('F', '').strip()
                    if clean_val.replace('.', '').isdigit():
                        data_records.append({
                            'City_Raw': current_city,
                            'Year': int(year),
                            'Historical_Rent': float(clean_val)
                        })

        df_hist = pd.DataFrame(data_records)
        
        # 4. Clean City Names
        def clean_city_name(raw_name):
            name = raw_name.split(',')[0].strip()
            if "Ottawa" in raw_name and "Ontario part" in raw_name: return "Ottawa"
            if "Gatineau" in raw_name and "Quebec part" in raw_name: return "Gatineau"
            if "St. John's" in name: return "St. Johns"
            if "Quebec" in name and "Quebec" in raw_name: return "Quebec City"
            return name

        if not df_hist.empty:
            df_hist['City'] = df_hist['City_Raw'].apply(clean_city_name)
            return df_hist[['City', 'Year', 'Historical_Rent']]
        else:
            return pd.DataFrame()

    except Exception as e:
        print(f"‚ùå Error processing Primary Engine: {e}")
        return pd.DataFrame()

# Run the parser
file_path = os.path.join(RAW_PATH, "Primary Engine.csv")
if os.path.exists(file_path):
    df_history = process_primary_engine(file_path)
    print(f"‚úÖ Historical Data Loaded: {len(df_history)} rows")
    if not df_history.empty:
        print(f"   Years: {df_history['Year'].min()}-{df_history['Year'].max()}")
        display(df_history.head())
else:
    print("‚ö†Ô∏è Primary Engine.csv not found.")
    df_history = pd.DataFrame()

üìú Processing Primary Engine (Historical 2015-2025)...
‚úÖ Historical Data Loaded: 2051 rows
   Years: 2015-2025


Unnamed: 0,City,Year,Historical_Rent
0,Bay Roberts,2015,615.0
1,Bay Roberts,2016,621.0
2,Bay Roberts,2017,593.0
3,Bay Roberts,2018,642.0
4,Bay Roberts,2019,665.0


In [72]:
# ---------------------------------------------------------
# 4. THE MASTER MERGE & EXPORT (FINAL VERSION)
# ---------------------------------------------------------
print("üîó Starting Master Merge...")

# 1. Start with CMHC (Supply Side - 2019-2025)
df_master = pd.merge(df_cmhc, df_crea, on=['City', 'Year'], how='outer')

# 2. Merge Historical Rent (2015-2018)
# We check if df_history exists (from the Primary Engine step)
if 'df_history' in locals() and not df_history.empty:
    df_master = pd.merge(df_master, df_history, on=['City', 'Year'], how='outer')
    # Combine columns: Use 'Average rent ($)' if available, otherwise fill with 'Historical_Rent'
    df_master['Average rent ($)'] = df_master['Average rent ($)'].fillna(df_master['Historical_Rent'])
    df_master.drop(columns=['Historical_Rent'], inplace=True, errors='ignore')

# 3. Merge Student Data
df_master = pd.merge(df_master, df_students, on=['City', 'Year'], how='left')

# ---------------------------------------------------------
# JANITOR BLOCK: CLEAN CITY NAMES & REMOVE JUNK
# ---------------------------------------------------------
print("üßπ Cleaning City Names...")

# 1. Define Garbage Patterns (Footnotes)
# üü¢ FIX: We use double backslashes \\ to escape special characters like +
junk_keywords = [
    "Excellent", "Source", "Quality", "suppressed", "definitions", 
    "Copyright", "¬©", "Data Suppressed", "Change in rent", "No units exist",
    "Indicators", "\\+\\+", "--", "¬ß"
]

# Remove rows where City contains junk
# We force conversion to string first to avoid errors with numbers
df_master = df_master[~df_master['City'].astype(str).str.contains('|'.join(junk_keywords), case=False, regex=True)]

# 2. Standardize City Names (Merge duplicates)
city_mapping = {
    'Montr√©al': 'Montreal', 
    'Qu√©bec': 'Quebec City', 
    'Ottawa-Gatineau': 'Ottawa',
    'Kitchener-Cambridge-Waterloo': 'Kitchener', 
    'St. John\'s': 'St. Johns',
    'Saint John': 'Saint John',
    'Greater Sudbury': 'Sudbury',
    'Victoriaville': 'Victoriaville',
    'Victoria': 'Victoria'
}
df_master['City'] = df_master['City'].replace(city_mapping)

# ---------------------------------------------------------

# 4. Final Filter & Save
# Keep only valid years (2015+) and rows with Rent data
df_master = df_master[df_master['Year'] >= 2015]
df_master = df_master.dropna(subset=['Average rent ($)'])

output_path = os.path.join(PROCESSED_PATH, "national_master_dataset.csv")
df_master.to_csv(output_path, index=False)

print(f"üéâ SUCCESS! Master Dataset Saved: {len(df_master)} clean rows.")
print("Sample of Clean Data:")
display(df_master.sort_values(['City', 'Year']).tail())

üîó Starting Master Merge...
üßπ Cleaning City Names...
üéâ SUCCESS! Master Dataset Saved: 2057 clean rows.
Sample of Clean Data:


Unnamed: 0,City,Turnover_Rate,Average rent ($),Year,Total_Units,Buy_Price,Intl_Students_Prov
2275,Yorkton,,971.0,2021,,,
2276,Yorkton,,1001.0,2022,,,
2277,Yorkton,,1076.0,2023,,,
2278,Yorkton,,1146.0,2024,,,
2279,Yorkton,,1217.0,2025,,,
