#### Next Steps - Chicago Census Tract Wise Hotspot Analysis / SHAP-Based Parameter Importance Model Using Climatic Variables

In [1]:
import pandas as pd
import numpy as np
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import os

  from .autonotebook import tqdm as notebook_tqdm


Note: script from earlier that generated the zip files was changed to exclude any entries outside of the Chicago boundary (multipolygon).  

In [2]:
df = pd.read_csv("Chicago_CT_Point.csv")
print(df.shape)
print(df.columns)

(801, 42)
Index(['TRACTCE10', 'GEOID10', 'NAME10', 'NAMELSAD10', 'Long', 'Lat',
       'GEOID_Matc', 'State', 'County', 'Community', 'Communit_1', '15_JUN',
       '16_JUN', '17_JUN', '18_JUN', '19_JUN', '20_JUN', '21_JUN', '22_JUN',
       '23_JUN', '24_JUN', '25_JUN', '26_JUN', '27_JUN', '28_JUN', '29_JUN',
       '30_JUN', '01_JLY', '02_JLY', '03_JLY', '04_JLY', '05_JLY', '06_JLY',
       '07_JLY', '08_JLY', '09_JLY', '10_JLY', '11_JLY', '12_JLY', '13_JLY',
       '14_JLY', '15_JLY'],
      dtype='object')


Now: 
- SHAP Based parameter importance assesssment model 
- to predict mean hotspot result using climate variables (from the meteo api) for that day.
- plot the importance of the parameters.
- ex. "06_JLY" is the target variable

Now for the merging:

In [3]:
## merge my existing weather data for each day from june 15th to july 15th:

df_weather = pd.read_csv("per-census-agg-weather-data.csv")
df_weather

Unnamed: 0,cleaned_time,tract_id,temperature_180m,wind_direction_180m,wind_speed_180m,surface_pressure,cloud_cover,relative_humidity_2m,precipitation,visibility,...,countyfp10,tractce10,name10,namelsad10,commarea,commarea_n,notes,geometry,latitude,longitude
0,2025-06-15,17031010100,67.387500,66.208333,10.437500,996.566667,24.833333,77.583333,0.0,61871.172208,...,31,10100,101.00,Census Tract 101,1,1.0,,POLYGON ((-87.66368000002299 42.01939800001483...,42.021262,-87.669844
1,2025-06-15,17031010201,66.091667,60.500000,9.879167,996.204167,25.666667,79.625000,0.0,59752.296625,...,31,10201,102.01,Census Tract 102.01,1,1.0,,"POLYGON ((-87.6800950000417 42.01253799999502,...",42.016008,-87.680149
2,2025-06-15,17031010202,67.400000,66.208333,10.437500,996.666667,24.833333,77.583333,0.0,61871.172208,...,31,10202,102.02,Census Tract 102.02,1,1.0,,POLYGON ((-87.67335799998422 42.01937400001867...,42.016050,-87.673322
3,2025-06-15,17031010300,67.387500,66.208333,10.437500,996.566667,24.833333,77.583333,0.0,61871.172208,...,31,10300,103.00,Census Tract 103,1,1.0,,POLYGON ((-87.66505999995483 42.01280100002973...,42.015941,-87.666535
4,2025-06-15,17031010400,67.520833,66.208333,10.437500,997.795833,29.541667,84.250000,0.0,52589.130042,...,31,10400,104.00,Census Tract 104,1,1.0,,POLYGON ((-87.65080299995667 41.99848500000089...,42.005440,-87.657166
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24826,2025-07-15,17031843700,78.629167,170.750000,7.554167,994.012500,31.208333,73.666667,0.0,69813.539167,...,31,843700,8437.00,Census Tract 8437,5,5.0,,POLYGON ((-87.69666899996376 41.94854100002745...,41.944826,-87.690788
24827,2025-07-15,17031843800,78.558333,166.083333,8.750000,994.737500,29.833333,71.083333,0.0,76060.805708,...,31,843800,8438.00,Census Tract 8438,61,61.0,,"POLYGON ((-87.6451599999826 41.79430200000245,...",41.801657,-87.640476
24828,2025-07-15,17031843900,78.625000,166.083333,8.750000,995.441667,25.083333,73.708333,0.0,71221.567042,...,31,843900,8439.00,Census Tract 8439,42,42.0,Small area in CA 43,POLYGON ((-87.55868599996295 41.77379199996368...,41.776599,-87.576017
24829,2025-07-15,17031980000,78.279167,181.666667,7.125000,992.770833,51.666667,65.875000,0.0,85137.795458,...,31,980000,9800.00,Census Tract 9800,76,76.0,Partially outside City Boundary (O'Hare),POLYGON ((-87.92062799997296 42.00453199998842...,41.980265,-87.903893


In [4]:
## finding the column I can use as the identifier to merge: 
df.dtypes

TRACTCE10     float64
GEOID10       float64
NAME10        float64
NAMELSAD10     object
Long          float64
Lat           float64
GEOID_Matc    float64
State          object
County         object
Community     float64
Communit_1     object
15_JUN        float64
16_JUN        float64
17_JUN        float64
18_JUN        float64
19_JUN        float64
20_JUN        float64
21_JUN        float64
22_JUN        float64
23_JUN        float64
24_JUN        float64
25_JUN        float64
26_JUN        float64
27_JUN        float64
28_JUN        float64
29_JUN        float64
30_JUN        float64
01_JLY        float64
02_JLY        float64
03_JLY        float64
04_JLY        float64
05_JLY        float64
06_JLY        float64
07_JLY        float64
08_JLY        float64
09_JLY        float64
10_JLY        float64
11_JLY        float64
12_JLY        float64
13_JLY        float64
14_JLY        float64
15_JLY        float64
dtype: object

In [5]:
df.head()

Unnamed: 0,TRACTCE10,GEOID10,NAME10,NAMELSAD10,Long,Lat,GEOID_Matc,State,County,Community,...,06_JLY,07_JLY,08_JLY,09_JLY,10_JLY,11_JLY,12_JLY,13_JLY,14_JLY,15_JLY
0,10100.0,17031010000.0,101.0,Census Tract 101,-87.669844,42.021262,17031010000.0,ILLINOIS,Cook,1.0,...,2.08279,0.030678,0.733111,0.196165,1.00551,1.92015,0.852281,1.44613,-1.62697,0.957384
1,10201.0,17031010000.0,102.01,Census Tract 102.01,-87.680149,42.016008,17031010000.0,ILLINOIS,Cook,1.0,...,2.12943,-0.356848,0.721191,0.187405,0.796464,1.91168,0.840111,1.46105,-1.61553,0.985709
2,10202.0,17031010000.0,102.02,Census Tract 102.02,-87.673322,42.01605,17031010000.0,ILLINOIS,Cook,1.0,...,2.21761,-0.069992,0.735649,0.214665,0.905013,1.9099,0.8487,1.47602,-1.55933,0.959192
3,10300.0,17031010000.0,103.0,Census Tract 103,-87.666535,42.015941,17031010000.0,ILLINOIS,Cook,1.0,...,2.20066,0.142566,0.746841,0.244895,0.855881,1.91138,0.857554,1.46493,-1.64321,0.938424
4,10400.0,17031010000.0,104.0,Census Tract 104,-87.657167,42.00544,17031010000.0,ILLINOIS,Cook,1.0,...,2.19104,0.586577,0.805455,0.269934,0.491964,1.89636,0.879911,1.49512,-1.77987,0.899077


In [6]:
df["GEOID10"].isna().sum()

np.int64(0)

In [7]:
df["TRACTCE10"].nunique()

798

In [8]:
df["GEOID10"].nunique()

727

In [9]:
df_weather.columns

Index(['cleaned_time', 'tract_id', 'temperature_180m', 'wind_direction_180m',
       'wind_speed_180m', 'surface_pressure', 'cloud_cover',
       'relative_humidity_2m', 'precipitation', 'visibility', 'statefp10',
       'countyfp10', 'tractce10', 'name10', 'namelsad10', 'commarea',
       'commarea_n', 'notes', 'geometry', 'latitude', 'longitude'],
      dtype='object')

In [10]:
df_weather[['tract_id','countyfp10', 'tractce10', 'name10', 'namelsad10', 'commarea']].nunique()

tract_id      801
countyfp10      1
tractce10     801
name10        801
namelsad10    801
commarea       77
dtype: int64

In [11]:
df[['TRACTCE10', 'GEOID10', 'NAME10', 'NAMELSAD10','GEOID_Matc']].nunique()

TRACTCE10     798
GEOID10       727
NAME10        798
NAMELSAD10    798
GEOID_Matc    727
dtype: int64

In [12]:
merged = pd.merge(df, df_weather, left_on="NAMELSAD10", right_on="namelsad10", how="inner")

merged.namelsad10.nunique()

798

In [13]:
merged.shape

(24831, 63)

In [14]:
# defining the features matrix
features = ['temperature_180m', 'wind_direction_180m',
       'wind_speed_180m', 'surface_pressure', 'cloud_cover',
       'relative_humidity_2m', 'precipitation', 'visibility']

target_vars = ['15_JUN',
       '16_JUN', '17_JUN', '18_JUN', '19_JUN', '20_JUN', '21_JUN', '22_JUN',
       '23_JUN', '24_JUN', '25_JUN', '26_JUN', '27_JUN', '28_JUN', '29_JUN',
       '30_JUN', '01_JLY', '02_JLY', '03_JLY', '04_JLY', '05_JLY', '06_JLY',
       '07_JLY', '08_JLY', '09_JLY', '10_JLY', '11_JLY', '12_JLY', '13_JLY',
       '14_JLY', '15_JLY']
# each day contains the average howMany (birds) hotspot z-score for the day for one census tract 

X = merged[features]

In [None]:
for target in target_vars:
    print(f"Training and plotting for target: {target}")
    y = merged[target]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_test_sample = X_test.sample(n=500, random_state=30)

    model = RandomForestRegressor(n_estimators=100, random_state=30, n_jobs=-1) # 30 for testing
    model.fit(X_train, y_train)

    explainer = shap.TreeExplainer(model)
    shap_values = explainer.shap_values(X_test_sample)

    plt.figure()
    shap.summary_plot(shap_values, X_test_sample, show=False)
    plt.title(f"shap summary plot for {target}, 2025")
    plt.tight_layout()

    os.makedirs("shap_plots", exist_ok=True)
    path = f"shap_plots/shap_summary_for_{target}.png"
    
    plt.savefig(path, dpi=300)
    plt.close()

    print(f"Saved plot to {path} (finished)")

Training and plotting for target: 15_JUN
