#### Next Steps - Chicago Census Tract Wise Hotspot Analysis / SHAP-Based Parameter Importance Model Using Climatic Variables

In [None]:
import pandas as pd
import numpy as np
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

Note: the earlier script that generated the zip files was changed to exclude any entries outside the Chicago boundary (multipolygon).

In [None]:
# Read the Chicago census-tract point table, then report its dimensions and column names.
df = pd.read_csv("Chicago_CT_Point.csv")
print(df.shape, df.columns, sep="\n")

Now:
- Build a SHAP-based parameter importance assessment model.
- Use it to predict the mean hotspot result for a given day from that day's climate variables (from the meteo API).
- Plot the importance of the parameters.
- e.g. "06_JLY" is one of the target variables.

Now for the merging:

In [None]:
## Load my existing weather data (each day from June 15th to July 15th); it is merged with `df` further down.
# The CSV can be downloaded here -> https://drive.google.com/file/d/1_cwNNzQkhpV4pdPdvt5ezxpoOW8PoEdw/view?usp=drive_link
df_weather = pd.read_csv("per-census-agg-weather-data.csv")
df_weather

In [None]:
## Inspect the column dtypes of `df` to find a column that can serve as the merge identifier.
df.dtypes

In [None]:
# Preview the first rows of `df` to see what the candidate identifier columns look like.
df.head()

In [None]:
# Count missing GEOID10 values; GEOID10 is the left-hand key of the merge below.
df["GEOID10"].isna().sum()

In [None]:
# Number of distinct TRACTCE10 values (the alternative key tried in the commented-out merge).
df["TRACTCE10"].nunique()

In [None]:
# Number of distinct GEOID10 values, for comparison with the TRACTCE10 count.
df["GEOID10"].nunique()

In [None]:
# List the weather columns to locate the identifier column on the weather side of the merge.
df_weather.columns

In [None]:
# df = df.rename(columns={"GEOID10":"tract_id"})
# df

In [None]:
# merged = pd.merge(df, df_weather, left_on="TRACTCE10", right_on="census_tra", how="inner")
# merged.census_tra.nunique()

In [None]:
# Inner-join the tract table to the weather table: GEOID10 on the left matches
# tract_id on the right, so only tracts present in both tables are kept.
# NOTE(review): both key columns are assumed to share a dtype — confirm against df.dtypes.
merged = df.merge(
    df_weather,
    how="inner",
    left_on="GEOID10",
    right_on="tract_id",
)

# Number of distinct tracts that survived the join.
merged["tract_id"].nunique()

In [None]:
# Weather column names again — presumably as a reference for choosing the model features below.
df_weather.columns

In [None]:
# Predictor columns: the weather variables that form the feature matrix.
features = [
    "temperature_180m",
    "wind_direction_180m",
    "wind_speed_180m",
    "surface_pressure",
    "cloud_cover",
    "relative_humidity_2m",
    "precipitation",
    "visibility",
]

# Target columns: one per day, 15_JUN through 30_JUN followed by 01_JLY through 15_JLY.
# each day contains the average howMany (birds) hotspot z-score for the day for one census tract
target_vars = [
    f"{day:02d}_{month}"
    for month, days in (("JUN", range(15, 31)), ("JLY", range(1, 16)))
    for day in days
]

# Feature matrix: the columns named in `features`, selected from the merged frame.
# The same X is reused for every daily target in the training loop.
X = merged[features]

In [None]:
# Fit one random forest per daily target and save a SHAP summary plot for each.
#
# For every column in `target_vars` this cell:
#   1. splits the rows of `X` / the target column into train and test parts (80/20),
#   2. fits a RandomForestRegressor on the training part,
#   3. prints the held-out R^2 so the reader can judge whether the SHAP
#      attributions describe a model with any predictive skill,
#   4. computes SHAP values on the test part and writes the summary plot to
#      shap_plots/shap_summary_for_<target>.png.
SPLIT_SEED = 42   # seed for the train/test partition
MODEL_SEED = 30   # seed for the forest (30 for testing)
PLOT_DIR = "shap_plots"

# Loop-invariant: create the output folder once instead of on every iteration.
os.makedirs(PLOT_DIR, exist_ok=True)

for target in target_vars:
    print(f"Training and plotting for target: {target}")

    # scikit-learn rejects NaN in y, so keep only rows where this day has a value.
    # When nothing is missing this selects every row and the split is unchanged.
    has_target = merged[target].notna().to_numpy()
    if has_target.sum() < 2:
        # A train/test split needs at least one row on each side.
        print(f"Skipping {target}: fewer than 2 rows with a non-missing target")
        continue
    X_day = X.loc[has_target]
    y = merged.loc[has_target, target]

    X_train, X_test, y_train, y_test = train_test_split(
        X_day, y, test_size=0.2, random_state=SPLIT_SEED
    )
    model = RandomForestRegressor(n_estimators=100, random_state=MODEL_SEED, n_jobs=-1)
    model.fit(X_train, y_train)

    # Report fit quality on the held-out rows before interpreting the model.
    print(f"Held-out R^2 for {target}: {model.score(X_test, y_test):.3f}")

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # Open a fresh figure so the summary plot is drawn onto it, and keep the
    # handle so that exactly this figure is saved and closed.
    fig = plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)
    plt.title(f"shap summary plot for {target}, 2025")
    fig.tight_layout()

    path = f"{PLOT_DIR}/shap_summary_for_{target}.png"
    fig.savefig(path, dpi=150)
    plt.close(fig)  # release the figure; 31 open figures would pile up otherwise

    print(f"Saved plot to {path} (finished)")