# Processing data for Random Forest and SHAP

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path

# Data Processing: The Flexible Delta Approach
 
This script prepares the data for a Random Forest model by converting "snapshots" 
into "change events".
Logic:
1. Load data for all cities.
2. Calculate the Gentrification Gap (Gi) for every neighborhood/year.
3. Create 'Next Period' features to compare Year T vs Year T+n.
4. Calculate 'delta_t' (length of the time gap).
5. Define Target: Change in Gentrification Gap over delta_t.
6. Define Features: Initial state of the neighborhood at Year T.

# Define Paths


In [None]:
# Every per-city input file follows the same layout:
#   ../data/preprocessed/<city>/<city>_master_dataframe.csv
_MASTER_CSV_TEMPLATE = "../data/preprocessed/{city}/{city}_master_dataframe.csv"

BARCELONA_PATH = _MASTER_CSV_TEMPLATE.format(city="barcelona")
# TODO: Update these paths to where you saved your Paris and Milan preprocessed files
# (assign a plain string instead if your files do not follow the layout above)
PARIS_PATH = _MASTER_CSV_TEMPLATE.format(city="paris")
MILAN_PATH = _MASTER_CSV_TEMPLATE.format(city="milan")

# Where the model-ready table is written at the end of the notebook
OUTPUT_PATH = "../data/processed/modeling_dataframe_flexible.csv"


## 1. Load and Merge Data


In [None]:
def load_and_standardize(path, city_name):
    """
    Read one city's CSV and tag every row with the city it belongs to.

    Args:
        path: Location of the CSV file to read.
        city_name: Value written into the 'city' column of every row.

    Returns:
        The loaded DataFrame with a 'city' column added and any 'geometry'
        column removed, or None (after printing a warning) when nothing
        exists at `path`.
    """
    if Path(path).exists():
        frame = pd.read_csv(path)
        frame['city'] = city_name

        # Polygons are not needed for the Random Forest, so leave them out
        has_geometry = 'geometry' in frame.columns
        return frame.drop(columns=['geometry']) if has_geometry else frame

    print(f"Warning: File not found at {path}. Skipping {city_name}.")
    return None

# Load datasets (a city whose file is missing comes back as None)
df_bcn = load_and_standardize(BARCELONA_PATH, "Barcelona")
df_par = load_and_standardize(PARIS_PATH, "Paris")
df_mil = load_and_standardize(MILAN_PATH, "Milan")

# Keep only the cities that were actually found on disk
loaded_frames = [frame for frame in (df_bcn, df_par, df_mil) if frame is not None]
if not loaded_frames:
    # pd.concat([]) would fail with the opaque "No objects to concatenate";
    # raise the same exception type with a message that names the real cause.
    raise ValueError(
        "No city dataset could be loaded. Check BARCELONA_PATH, PARIS_PATH "
        "and MILAN_PATH."
    )

# Concatenate into one Master DataFrame
# Note: It handles missing columns (e.g. if Milan is missing price data, those rows will be NaNs)
master_df = pd.concat(loaded_frames, ignore_index=True)

print(f"Total rows loaded: {len(master_df)}")
print(f"Cities included: {master_df['city'].unique()}")

## 2. Calculate Gentrification Gap ($G_i$)
 
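Formula: $G_i = \mathrm{Rank}(P_i) - \mathrm{Rank}(I_i)$, where $P_i$ is the median price per m² of neighborhood $i$, $I_i$ is its median household income, and $\mathrm{Rank}$ is the percentile rank.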
**Crucial:** The ranking must be done **per city** and **per year**. 
We compare a neighborhood to its peers in the same city at the same time.


In [None]:
# Helper used to rank values within groups
def percentile_rank(x):
    """Return the percentile rank of each value in `x`.

    Thin wrapper around `x.rank(pct=True)`; handling of ties and missing
    values is whatever pandas' `rank` does by default.
    """
    ranks = x.rank(pct=True)
    return ranks

# Each rank column is the percentile rank of one raw column, computed
# separately inside every (city, year) group.
rank_inputs = {
    'Pi_rank': 'median_price_per_m2',        # Price Rank (Pi)
    'Ii_rank': 'median_household_income',    # Income Rank (Ii)
}
for rank_col, source_col in rank_inputs.items():
    grouped_values = master_df.groupby(['city', 'year'])[source_col]
    master_df[rank_col] = grouped_values.transform(percentile_rank)

# The Gap is the price rank minus the income rank
master_df['Gentrification_Gap'] = master_df['Pi_rank'].sub(master_df['Ii_rank'])

# Report how many rows have a usable Gap, then discard the ones that do not
# (some cities might miss price data for certain years)
valid_gap_count = master_df['Gentrification_Gap'].notna().sum()
print(f"Rows with valid Gentrification Gap: {valid_gap_count}")
master_df.dropna(subset=['Gentrification_Gap'], inplace=True)


## 3. The Transformation: Create "Change Events"
 
Instead of lagging 1 year back, we look 1 step *forward* in the available data.
If rows are sorted by year within each neighborhood, the "next" row is that neighborhood's future state, however many years later it was observed.


In [None]:
# 1. Sort strictly by Neighborhood and Year.
# master_df stacks several cities and neighborhood_id is not guaranteed to be
# unique across them, so 'city' is part of the neighborhood key. Placing it
# after neighborhood_id keeps the row order unchanged whenever the ids are
# already globally unique.
master_df.sort_values(by=['neighborhood_id', 'city', 'year'], inplace=True)

# 2. Create 'Next' columns by shifting the data UPWARDS (-1) within each
# (city, neighborhood) group. This brings the data from Year T+n into the row
# for Year T, and never pulls a value from a different city's neighborhood.
neighborhood_key = ['city', 'neighborhood_id']
cols_to_fetch_from_future = ['year', 'Gentrification_Gap']

for col in cols_to_fetch_from_future:
    master_df[f'future_{col}'] = master_df.groupby(neighborhood_key)[col].shift(-1)

# 3. Drop rows where there is no future data (i.e., the most recent year for each neighborhood)
# We can't use the last year as a starting point because we don't know what happens next.
clean_df = master_df.dropna(subset=['future_year']).copy()

## 4. Calculate Targets and Time Delta


In [None]:
# delta_t: the time gap between the starting observation and the next one
gap_length = clean_df['future_year'].sub(clean_df['year'])
clean_df['delta_t'] = gap_length

# Target: the CHANGE in the Gap over that time gap (future value minus current value)
gap_change = clean_df['future_Gentrification_Gap'].sub(clean_df['Gentrification_Gap'])
clean_df['target_Gentrification_Gap_change'] = gap_change

# Sanity Check: how often does each time gap occur?
delta_t_counts = clean_df['delta_t'].value_counts().sort_index()
print("\n--- Distribution of Time Gaps (delta_t) ---")
print(delta_t_counts)
# You should see:
# 1.0 -> Mostly Barcelona (consecutive years) and some Paris (2021-2022)
# 2.0 -> Paris (2019-2021)
# 4.0 -> Paris (2015-2019)

## 5. Finalize Features (X)
 
The features for the model are the **Initial Conditions** (the state at 'year').
We add an 'initial_' prefix to their column names to make this explicit.

In [None]:
# List of features you want to keep as predictors
# These capture the "State" of the neighborhood before the change happened.
feature_cols = [
    'median_household_income',
    'median_price_per_m2',
    'pct_higher_education',
    'population_density',
    'pct_young_adults',
    'std_price_per_m2',       # Standard deviations are excellent features
    'std_household_income'
]

# Map every raw feature name to its 'initial_'-prefixed name
rename_map = {col: f'initial_{col}' for col in feature_cols}

# DataFrame.rename silently skips labels that are not present, which would
# only surface later as a KeyError on an 'initial_*' name when the final
# columns are selected. Fail here instead, naming the raw columns.
# A feature that already carries its 'initial_' name counts as present, so
# running this cell a second time is still harmless.
missing_features = [
    col for col, renamed in rename_map.items()
    if col not in clean_df.columns and renamed not in clean_df.columns
]
if missing_features:
    raise KeyError(f"Feature columns missing from the data: {missing_features}")

# Rename columns to 'initial_' prefix
clean_df.rename(columns=rename_map, inplace=True)

## 6. Save the Model-Ready Dataframe

In [None]:
# Rename 'year' to 'start_year' for clarity: it is the year the change starts from
clean_df.rename(columns={'year': 'start_year'}, inplace=True)

# Select final columns for the output file
final_columns = [
    'neighborhood_id',
    'city',
    'start_year',
    'delta_t',         # CRITICAL FEATURE
    'target_Gentrification_Gap_change', # TARGET
] + list(rename_map.values()) # The initial_ features

# Create final dataframe
model_df = clean_df[final_columns]

# Output info
print("\n--- Final Model DataFrame ---")
model_df.info()
print(model_df.head())

# Save. to_csv does not create missing directories, and the output folder is
# not the one the inputs were read from, so make sure it exists first.
Path(OUTPUT_PATH).parent.mkdir(parents=True, exist_ok=True)
model_df.to_csv(OUTPUT_PATH, index=False)
print(f"\nSaved processed data to {OUTPUT_PATH}")