#### Next Steps - Chicago Census Tract Wise Hotspot Analysis / SHAP-Based Parameter Importance Model Using Climatic Variables

In [2]:
import pandas as pd
import numpy as np
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

  from .autonotebook import tqdm as notebook_tqdm


Note: the earlier script that generated the zip files was changed to exclude any entries that fall outside the Chicago boundary (a multipolygon).  

In [3]:
# Load the census-tract hotspot table, then print its dimensions and column names.
hotspot_csv = "Chicago_CT_Point.csv"
df = pd.read_csv(hotspot_csv)
for summary in (df.shape, df.columns):
    print(summary)

(801, 42)
Index(['TRACTCE10', 'GEOID10', 'NAME10', 'NAMELSAD10', 'Long', 'Lat',
       'GEOID_Matc', 'State', 'County', 'Community', 'Communit_1', '15_JUN',
       '16_JUN', '17_JUN', '18_JUN', '19_JUN', '20_JUN', '21_JUN', '22_JUN',
       '23_JUN', '24_JUN', '25_JUN', '26_JUN', '27_JUN', '28_JUN', '29_JUN',
       '30_JUN', '01_JLY', '02_JLY', '03_JLY', '04_JLY', '05_JLY', '06_JLY',
       '07_JLY', '08_JLY', '09_JLY', '10_JLY', '11_JLY', '12_JLY', '13_JLY',
       '14_JLY', '15_JLY'],
      dtype='object')


Now: 
- Build a SHAP-based parameter-importance assessment model
- that predicts the mean hotspot result for a day using the climate variables (from the meteo API) for that day.
- Plot the importance of the parameters.
- Example: "06_JLY" is the target variable.

Now for the merging:

In [None]:
# Load my existing weather data for each day from June 15th to July 15th;
# this cell only reads the file, the merge with the hotspot table happens later.
# CSV download -> https://drive.google.com/file/d/1_cwNNzQkhpV4pdPdvt5ezxpoOW8PoEdw/view?usp=drive_link
weather_csv = "per-census-agg-weather-data.csv"
df_weather = pd.read_csv(weather_csv)
df_weather

In [5]:
# Inspect the dtype of every hotspot column to find one that can serve as the
# identifier (join key) when merging with the weather table.
df.dtypes

TRACTCE10     float64
GEOID10       float64
NAME10        float64
NAMELSAD10     object
Long          float64
Lat           float64
GEOID_Matc    float64
State          object
County         object
Community     float64
Communit_1     object
15_JUN        float64
16_JUN        float64
17_JUN        float64
18_JUN        float64
19_JUN        float64
20_JUN        float64
21_JUN        float64
22_JUN        float64
23_JUN        float64
24_JUN        float64
25_JUN        float64
26_JUN        float64
27_JUN        float64
28_JUN        float64
29_JUN        float64
30_JUN        float64
01_JLY        float64
02_JLY        float64
03_JLY        float64
04_JLY        float64
05_JLY        float64
06_JLY        float64
07_JLY        float64
08_JLY        float64
09_JLY        float64
10_JLY        float64
11_JLY        float64
12_JLY        float64
13_JLY        float64
14_JLY        float64
15_JLY        float64
dtype: object

In [6]:
# Preview the first five rows of the hotspot table.
df.head(n=5)

Unnamed: 0,TRACTCE10,GEOID10,NAME10,NAMELSAD10,Long,Lat,GEOID_Matc,State,County,Community,...,06_JLY,07_JLY,08_JLY,09_JLY,10_JLY,11_JLY,12_JLY,13_JLY,14_JLY,15_JLY
0,10100.0,17031010000.0,101.0,Census Tract 101,-87.669844,42.021262,17031010000.0,ILLINOIS,Cook,1.0,...,2.08279,0.030678,0.733111,0.196165,1.00551,1.92015,0.852281,1.44613,-1.62697,0.957384
1,10201.0,17031010000.0,102.01,Census Tract 102.01,-87.680149,42.016008,17031010000.0,ILLINOIS,Cook,1.0,...,2.12943,-0.356848,0.721191,0.187405,0.796464,1.91168,0.840111,1.46105,-1.61553,0.985709
2,10202.0,17031010000.0,102.02,Census Tract 102.02,-87.673322,42.01605,17031010000.0,ILLINOIS,Cook,1.0,...,2.21761,-0.069992,0.735649,0.214665,0.905013,1.9099,0.8487,1.47602,-1.55933,0.959192
3,10300.0,17031010000.0,103.0,Census Tract 103,-87.666535,42.015941,17031010000.0,ILLINOIS,Cook,1.0,...,2.20066,0.142566,0.746841,0.244895,0.855881,1.91138,0.857554,1.46493,-1.64321,0.938424
4,10400.0,17031010000.0,104.0,Census Tract 104,-87.657167,42.00544,17031010000.0,ILLINOIS,Cook,1.0,...,2.19104,0.586577,0.805455,0.269934,0.491964,1.89636,0.879911,1.49512,-1.77987,0.899077


In [7]:
# Count the missing values in the candidate join key GEOID10.
df["GEOID10"].isnull().sum()

np.int64(0)

In [8]:
# Number of distinct TRACTCE10 values (missing values are not counted).
df["TRACTCE10"].nunique(dropna=True)

798

In [9]:
# Number of distinct GEOID10 values (missing values are not counted).
df["GEOID10"].nunique(dropna=True)

727

In [10]:
# List the weather table's column names to look for a tract identifier to join on.
df_weather.columns

Index(['cleaned_time', 'tract_id', 'temperature_180m', 'wind_direction_180m',
       'wind_speed_180m', 'surface_pressure', 'cloud_cover',
       'relative_humidity_2m', 'precipitation', 'visibility', 'objectid',
       'census_tra', 'tract_fips', 'tract_cent', 'tract_ce_1', 'tract_ce_2',
       'tract_ce_3', 'tract_comm', 'tract_numa', 'tract_cens', 'perimeter',
       'data_admin', 'tract_crea', 'date_tract', 'time_tract', 'shape_area',
       'shape_len', 'geometry', 'latitude', 'longitude'],
      dtype='object')

In [11]:
# df = df.rename(columns={"GEOID10":"tract_id"})
# df

In [12]:
# merged = pd.merge(df, df_weather, left_on="TRACTCE10", right_on="census_tra", how="inner")
# merged.census_tra.nunique()

In [13]:
# Inner-join every hotspot row with the weather rows whose tract_id equals its GEOID10,
# then report how many distinct tracts survived the join.
# NOTE(review): GEOID10 is float64 and, in the recorded run, had 727 distinct values
# across 801 rows, so this key is not unique per hotspot row — confirm the join does
# not duplicate or mismatch tracts.
merged = df.merge(df_weather, how="inner", left_on="GEOID10", right_on="tract_id")
merged["tract_id"].nunique()

616

In [14]:
# Show the weather table's column names again; the feature list defined in the
# next cell is drawn from these columns.
df_weather.columns

Index(['cleaned_time', 'tract_id', 'temperature_180m', 'wind_direction_180m',
       'wind_speed_180m', 'surface_pressure', 'cloud_cover',
       'relative_humidity_2m', 'precipitation', 'visibility', 'objectid',
       'census_tra', 'tract_fips', 'tract_cent', 'tract_ce_1', 'tract_ce_2',
       'tract_ce_3', 'tract_comm', 'tract_numa', 'tract_cens', 'perimeter',
       'data_admin', 'tract_crea', 'date_tract', 'time_tract', 'shape_area',
       'shape_len', 'geometry', 'latitude', 'longitude'],
      dtype='object')

In [15]:
# Feature matrix definition: the weather measurements used as model inputs.
features = [
    'temperature_180m',
    'wind_direction_180m',
    'wind_speed_180m',
    'surface_pressure',
    'cloud_cover',
    'relative_humidity_2m',
    'precipitation',
    'visibility',
]

# One target column per day, '15_JUN' ... '30_JUN' followed by '01_JLY' ... '15_JLY'.
# Each day column contains the average howMany (birds) hotspot z-score for that day
# for one census tract.
june_days = [f"{day:02d}_JUN" for day in range(15, 31)]
july_days = [f"{day:02d}_JLY" for day in range(1, 16)]
target_vars = june_days + july_days

X = merged[features]

In [None]:
# Train one random-forest model per daily hotspot column and save a SHAP summary
# plot for each of them under shap_plots/.
#
# Each target column (e.g. "06_JLY") is paired only with the weather rows recorded
# on that same calendar day, so the model relates a day's hotspot score to that
# day's weather. Training every target on the weather rows of all days would pair
# one day's score with the weather of every other day.

# Month suffixes used by the target column names -> calendar month number.
MONTH_NUMBERS = {"JUN": 6, "JLY": 7}

# NOTE(review): assumes `cleaned_time` holds a date/timestamp that pandas can
# parse — confirm against the weather CSV. Unparseable values become NaT and
# therefore never match any target day.
weather_dates = pd.to_datetime(merged["cleaned_time"], errors="coerce")

# The output directory does not depend on the target, so create it once.
os.makedirs("shap_plots", exist_ok=True)

for target in target_vars:
    print(f"Training and plotting for target: {target}")

    # "06_JLY" -> day 6 of month 7; the year is ignored.
    day_text, month_text = target.split("_")
    same_day = (weather_dates.dt.month == MONTH_NUMBERS[month_text]) & (
        weather_dates.dt.day == int(day_text)
    )

    # Rows without a target value cannot be used for fitting (scikit-learn rejects
    # NaN in y). Missing feature values are left untouched.
    day_rows = merged.loc[same_day].dropna(subset=[target])
    if len(day_rows) < 5:
        # Too few rows to form both a train and a test split; report and move on.
        print(f"Skipping {target}: only {len(day_rows)} usable rows for that day")
        continue

    X_day = day_rows[features]
    y = day_rows[target]

    X_train, X_test, y_train, y_test = train_test_split(X_day, y, test_size=0.2, random_state=42)
    model = RandomForestRegressor(n_estimators=100, random_state=30, n_jobs=-1) # 30 for testing
    model.fit(X_train, y_train)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test)

    # summary_plot draws on the current figure; keep a handle so that exactly this
    # figure is closed at the end of the iteration.
    fig = plt.figure()
    shap.summary_plot(shap_values, X_test, show=False)
    plt.title(f"shap summary plot for {target}, 2025")
    plt.tight_layout()

    path = f"shap_plots/shap_summary_for_{target}.png"

    plt.savefig(path, dpi=150)
    plt.close(fig)

    print(f"Saved plot to {path} (finished)")

Training and plotting for target: 15_JUN
