#### Final part of the project - using the top six ranked climatic params to predict bird hotspot intensity at a tract-level for the next six days (after july 15th).


In [35]:
import pandas as pd
import geopandas as gpd
from pathlib import Path
from datetime import datetime
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

In [36]:
shap = pd.read_csv("all_days_shap_finished.csv")
# drop surface pressure and precipitation -> two lowest least-impactful params
shap = shap.rename(columns={"Unnamed: 0":"Day"})
shap = shap.drop(columns=['surface_pressure','precipitation'],axis=1)
print(f"top 6 most impactful features (according to shap ranking):\n{list(shap.columns[1:])}")

top 6 most impactful features (according to shap ranking):
['temperature_180m', 'wind_direction_180m', 'wind_speed_180m', 'cloud_cover', 'relative_humidity_2m', 'visibility']


In [37]:
intensity = pd.read_csv("Chicago_CT_Point.csv")
intensity.head()

Unnamed: 0,TRACTCE10,GEOID10,NAME10,NAMELSAD10,Long,Lat,GEOID_Matc,State,County,Community,...,06_JLY,07_JLY,08_JLY,09_JLY,10_JLY,11_JLY,12_JLY,13_JLY,14_JLY,15_JLY
0,10100.0,17031010000.0,101.0,Census Tract 101,-87.669844,42.021262,17031010000.0,ILLINOIS,Cook,1.0,...,2.08279,0.030678,0.733111,0.196165,1.00551,1.92015,0.852281,1.44613,-1.62697,0.957384
1,10201.0,17031010000.0,102.01,Census Tract 102.01,-87.680149,42.016008,17031010000.0,ILLINOIS,Cook,1.0,...,2.12943,-0.356848,0.721191,0.187405,0.796464,1.91168,0.840111,1.46105,-1.61553,0.985709
2,10202.0,17031010000.0,102.02,Census Tract 102.02,-87.673322,42.01605,17031010000.0,ILLINOIS,Cook,1.0,...,2.21761,-0.069992,0.735649,0.214665,0.905013,1.9099,0.8487,1.47602,-1.55933,0.959192
3,10300.0,17031010000.0,103.0,Census Tract 103,-87.666535,42.015941,17031010000.0,ILLINOIS,Cook,1.0,...,2.20066,0.142566,0.746841,0.244895,0.855881,1.91138,0.857554,1.46493,-1.64321,0.938424
4,10400.0,17031010000.0,104.0,Census Tract 104,-87.657167,42.00544,17031010000.0,ILLINOIS,Cook,1.0,...,2.19104,0.586577,0.805455,0.269934,0.491964,1.89636,0.879911,1.49512,-1.77987,0.899077


In [38]:
# THIS UPDATED WEATHER CSV CONTAINS DATA FROM 06-15 to 07-21 (end of prediction)

past_weather = pd.read_csv("per-census-agg-weather-data.csv")
six_day_weather = pd.read_csv("six-day-per-census-agg-weather-data.csv")

weather = pd.concat([past_weather, six_day_weather]) #combiningi vertically
weather.shape

(29637, 21)

In [39]:
merged = pd.merge(intensity, weather, left_on="NAMELSAD10",right_on="namelsad10", how="inner")
merged["NAMELSAD10"].nunique()

798

In [40]:
# this will ultimately define the features matrices but I need to drop a lot of columns
cols_to_keep = ['tract_id','cleaned_time','temperature_180m', 'wind_direction_180m', 'wind_speed_180m', 'cloud_cover', 'relative_humidity_2m', 'visibility','15_JUN',
       '16_JUN', '17_JUN', '18_JUN', '19_JUN', '20_JUN', '21_JUN', '22_JUN',
       '23_JUN', '24_JUN', '25_JUN', '26_JUN', '27_JUN', '28_JUN', '29_JUN',
       '30_JUN', '01_JLY', '02_JLY', '03_JLY', '04_JLY', '05_JLY', '06_JLY',
       '07_JLY', '08_JLY', '09_JLY', '10_JLY', '11_JLY', '12_JLY', '13_JLY',
       '14_JLY', '15_JLY']

getis_scores=['15_JUN',
       '16_JUN', '17_JUN', '18_JUN', '19_JUN', '20_JUN', '21_JUN', '22_JUN',
       '23_JUN', '24_JUN', '25_JUN', '26_JUN', '27_JUN', '28_JUN', '29_JUN',
       '30_JUN', '01_JLY', '02_JLY', '03_JLY', '04_JLY', '05_JLY', '06_JLY',
       '07_JLY', '08_JLY', '09_JLY', '10_JLY', '11_JLY', '12_JLY', '13_JLY',
       '14_JLY', '15_JLY']

In [41]:
## IMPORTANT STEP ###
# my merged df currrently has z-score values for dates greater than 7/15 which is misleading
# fix -> if date > 7/15 -> merged[col] = NaN 

filter_date = pd.to_datetime("2025-07-15")
merged["cleaned_time"] = pd.to_datetime(merged["cleaned_time"])

for col in getis_scores:
    merged.loc[merged["cleaned_time"] > filter_date, col] = np.nan

merged

Unnamed: 0,TRACTCE10,GEOID10,NAME10,NAMELSAD10,Long,Lat,GEOID_Matc,State,County,Community,...,countyfp10,tractce10,name10,namelsad10,commarea,commarea_n,notes,geometry,latitude,longitude
0,10100.0,1.703101e+10,101.0,Census Tract 101,-87.669844,42.021262,1.703101e+10,ILLINOIS,Cook,1.0,...,31,10100,101.0,Census Tract 101,1,1.0,,POLYGON ((-87.66368000002299 42.01939800001483...,42.021262,-87.669844
1,10100.0,1.703101e+10,101.0,Census Tract 101,-87.669844,42.021262,1.703101e+10,ILLINOIS,Cook,1.0,...,31,10100,101.0,Census Tract 101,1,1.0,,POLYGON ((-87.66368000002299 42.01939800001483...,42.021262,-87.669844
2,10100.0,1.703101e+10,101.0,Census Tract 101,-87.669844,42.021262,1.703101e+10,ILLINOIS,Cook,1.0,...,31,10100,101.0,Census Tract 101,1,1.0,,POLYGON ((-87.66368000002299 42.01939800001483...,42.021262,-87.669844
3,10100.0,1.703101e+10,101.0,Census Tract 101,-87.669844,42.021262,1.703101e+10,ILLINOIS,Cook,1.0,...,31,10100,101.0,Census Tract 101,1,1.0,,POLYGON ((-87.66368000002299 42.01939800001483...,42.021262,-87.669844
4,10100.0,1.703101e+10,101.0,Census Tract 101,-87.669844,42.021262,1.703101e+10,ILLINOIS,Cook,1.0,...,31,10100,101.0,Census Tract 101,1,1.0,,POLYGON ((-87.66368000002299 42.01939800001483...,42.021262,-87.669844
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29632,810400.0,1.703181e+10,8104.0,Census Tract 8104,-87.829438,41.987575,1.703181e+10,ILLINOIS,Cook,10.0,...,31,810400,8104.0,Census Tract 8104,10,10.0,Partially outside City Boundary,POLYGON ((-87.83657899999764 41.97976099998077...,41.987574,-87.829438
29633,810400.0,1.703181e+10,8104.0,Census Tract 8104,-87.829438,41.987575,1.703181e+10,ILLINOIS,Cook,10.0,...,31,810400,8104.0,Census Tract 8104,10,10.0,Partially outside City Boundary,POLYGON ((-87.83657899999764 41.97976099998077...,41.987574,-87.829438
29634,810400.0,1.703181e+10,8104.0,Census Tract 8104,-87.829438,41.987575,1.703181e+10,ILLINOIS,Cook,10.0,...,31,810400,8104.0,Census Tract 8104,10,10.0,Partially outside City Boundary,POLYGON ((-87.83657899999764 41.97976099998077...,41.987574,-87.829438
29635,810400.0,1.703181e+10,8104.0,Census Tract 8104,-87.829438,41.987575,1.703181e+10,ILLINOIS,Cook,10.0,...,31,810400,8104.0,Census Tract 8104,10,10.0,Partially outside City Boundary,POLYGON ((-87.83657899999764 41.97976099998077...,41.987574,-87.829438


In [42]:
zscore_long = pd.melt(
    merged[['tract_id', 'cleaned_time'] + getis_scores],
    id_vars=['tract_id', 'cleaned_time'],
    value_vars=getis_scores,
    var_name='date_str',
    value_name='z_score'
)

# get rid of any Nan rows (from 7/16-7/21)
zscore_long = zscore_long.dropna(subset=['z_score']).reset_index()

zscore_long

Unnamed: 0,index,tract_id,cleaned_time,date_str,z_score
0,0,17031010100,2025-06-15,15_JUN,2.866720
1,1,17031010100,2025-06-16,15_JUN,2.866720
2,2,17031010100,2025-06-17,15_JUN,2.866720
3,3,17031010100,2025-06-18,15_JUN,2.866720
4,4,17031010100,2025-06-19,15_JUN,2.866720
...,...,...,...,...,...
769756,918736,17031810400,2025-07-11,15_JLY,0.295905
769757,918737,17031810400,2025-07-12,15_JLY,0.295905
769758,918738,17031810400,2025-07-13,15_JLY,0.295905
769759,918739,17031810400,2025-07-14,15_JLY,0.295905


z_score is the target       
zscore_long contains-> every tract with each date -> each date -> weather & z-score for the day         
801 $\times$ 31 $\times$ 31 = 769761 rows

In [43]:
# Helper to convert to full datetime
def convert_zdate(d):
    d = d.replace("_JLY", "_JUL")  # Fix spelling
    return datetime.strptime(d, "%d_%b").replace(year=2025)

zscore_long['cleaned_time'] = zscore_long['date_str'].apply(convert_zdate)
zscore_long = zscore_long.drop(columns='date_str')

In [44]:
weather["cleaned_time"] = pd.to_datetime(weather["cleaned_time"])

In [45]:
zscore_long.columns # -> up to 7/15

Index(['index', 'tract_id', 'cleaned_time', 'z_score'], dtype='object')

In [46]:
weather.columns # -> 7/16 onward data

Index(['cleaned_time', 'tract_id', 'temperature_180m', 'wind_direction_180m',
       'wind_speed_180m', 'surface_pressure', 'cloud_cover',
       'relative_humidity_2m', 'precipitation', 'visibility', 'statefp10',
       'countyfp10', 'tractce10', 'name10', 'namelsad10', 'commarea',
       'commarea_n', 'notes', 'geometry', 'latitude', 'longitude'],
      dtype='object')

In [47]:
master_data.columns

Index(['cleaned_time', 'tract_id', 'temperature_180m', 'wind_direction_180m',
       'wind_speed_180m', 'surface_pressure', 'cloud_cover',
       'relative_humidity_2m', 'precipitation', 'visibility'],
      dtype='object')

In [48]:
master_data = pd.merge(weather, zscore_long, on=["tract_id","cleaned_time"], how="left")

## master data now has all weather data for 6/15 - 7/21 with z-score col being NaN for 7/16 onward
master_data = master_data[['cleaned_time', 'tract_id', 'temperature_180m', 'wind_direction_180m',
       'wind_speed_180m', 'surface_pressure', 'cloud_cover',
       'relative_humidity_2m', 'precipitation', 'visibility', 'z_score']]

In [49]:
master_data

Unnamed: 0,cleaned_time,tract_id,temperature_180m,wind_direction_180m,wind_speed_180m,surface_pressure,cloud_cover,relative_humidity_2m,precipitation,visibility,z_score
0,2025-06-15,17031010100,67.387500,66.208333,10.437500,996.566667,24.833333,77.583333,0.0,61871.172208,2.86672
1,2025-06-15,17031010100,67.387500,66.208333,10.437500,996.566667,24.833333,77.583333,0.0,61871.172208,2.86672
2,2025-06-15,17031010100,67.387500,66.208333,10.437500,996.566667,24.833333,77.583333,0.0,61871.172208,2.86672
3,2025-06-15,17031010100,67.387500,66.208333,10.437500,996.566667,24.833333,77.583333,0.0,61871.172208,2.86672
4,2025-06-15,17031010100,67.387500,66.208333,10.437500,996.566667,24.833333,77.583333,0.0,61871.172208,2.86672
...,...,...,...,...,...,...,...,...,...,...,...
774655,2025-07-21,17031843700,69.166667,74.083333,12.641667,994.816667,21.708333,79.833333,0.0,59424.212500,
774656,2025-07-21,17031843800,70.275000,74.416667,12.862500,995.458333,22.583333,75.875000,0.0,65411.744583,
774657,2025-07-21,17031843900,70.350000,74.416667,12.862500,996.166667,23.416667,75.875000,0.0,65329.724333,
774658,2025-07-21,17031980000,68.683333,77.125000,13.400000,993.554167,21.291667,73.416667,0.0,74611.768167,


<hr></hr>

In [None]:
# this six day bird data from july 16 to july 21 was acquired from bird_data.ipynb.
# establishing ground truth

# folder = Path("./six_day_data")
# shpfs = folder.glob("*.zip")

# ground_truth = pd.concat([gpd.read_file("zip://" + str(shp)) for shp in shpfs]).pipe(gpd.GeoDataFrame)
# ground_truth.shape

# WHEN I GET THE HOTSPOT INTENSITY FILE: 
# ground_truth = gpd.read_file(file)
# ground_truth

In [72]:
features = ['temperature_180m', 'wind_direction_180m', 'wind_speed_180m', 'cloud_cover', 'relative_humidity_2m', 'visibility']

In [75]:
clean_master_data = master_data[master_data['z_score'].notna()]

In [76]:
X_train = clean_master_data[features]
y_train = clean_master_data["z_score"]

In [77]:
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=30)
model.fit(X_train, y_train)

In [78]:
test_data = master_data[(master_data['cleaned_time'] >= '2025-07-16') & (master_data['cleaned_time'] <= '2025-07-21')]

In [79]:
# weather features for 7/16-7/21
X_test = test_data[features] 
X_test

Unnamed: 0,temperature_180m,wind_direction_180m,wind_speed_180m,cloud_cover,relative_humidity_2m,visibility
769854,76.458333,230.958333,17.570833,66.041667,81.583333,57264.326792
769855,76.712500,233.666667,17.750000,61.666667,83.708333,54106.518042
769856,76.470833,230.958333,17.570833,66.041667,81.583333,57264.326792
769857,76.458333,230.958333,17.570833,66.041667,81.583333,57264.326792
769858,76.587500,230.958333,17.570833,59.958333,85.291667,53505.030875
...,...,...,...,...,...,...
774655,69.166667,74.083333,12.641667,21.708333,79.833333,59424.212500
774656,70.275000,74.416667,12.862500,22.583333,75.875000,65411.744583
774657,70.350000,74.416667,12.862500,23.416667,75.875000,65329.724333
774658,68.683333,77.125000,13.400000,21.291667,73.416667,74611.768167


In [80]:
y_pred = model.predict(X_test)

In [81]:
y_pred

array([-0.51100252, -0.249645  , -0.51100252, ...,  0.49112235,
       -0.709029  ,  0.01297629])

In [None]:
# this is for when I have the six day z_score cols (repeat reshaping process from earlier)
# grnd_truth_zscore_long = 

# test_data = grnd_truth_zscore_long[(grnd_truth_zscore_long['cleaned_time'] >= '2025-07-16') & (grnd_truth_zscore_long['cleaned_time'] <= '2025-07-21')]
# y_test = test_data['bird_hotspot_intensity']

# r2 = r2_score(y_test, y_pred)
# rmse = mean_squared_error(y_test, y_pred, squared=False)