In [31]:
import pandas as pd

In [32]:
# Load the saved datasets
final_fire_data = pd.read_csv("final_fire_data.csv")
final_home_price = pd.read_csv("final_home_price.csv")


In [33]:
# Convert ZIP code to string and pad if needed
final_fire_data["ZIP_CODE"] = final_fire_data["ZIP_CODE"].astype(str).str.zfill(5)
final_home_price["ZipCode"] = final_home_price["ZipCode"].astype(str).str.zfill(5)

# Convert year to int (if needed)
final_fire_data["YEAR_"] = final_fire_data["YEAR_"].astype(int)
final_home_price["YEAR"] = final_home_price["YEAR"].astype(int)


In [34]:
final_fire_data.shape

(1917, 7)

In [35]:
merged_data = pd.merge(
    final_home_price,
    final_fire_data,
    left_on=["ZipCode", "YEAR"],
    right_on=["ZIP_CODE", "YEAR_"],
    how="left",
    indicator=True
)

merged_data["_merge"].value_counts()

_merge
left_only     8750
both          1380
right_only       0
Name: count, dtype: int64

In [36]:
# Those are the ZIPs that aren’t included in Zillow’s home price data.

unmatched_zips = final_fire_data.loc[
    ~final_fire_data["ZIP_CODE"].isin(final_home_price["ZipCode"].unique()),
    "ZIP_CODE"
].value_counts()

unmatched_zips.head # 77 zips aren't included in zillow dataset

# there's 196 (1917 - 1721) fire that did not match because zillow does not include 77 zip codes
# those 77 ZIPs’ rows in the fire data are dropped in the final merge result, because we're joining from Zillow’s ZIPs only (left join), and those 77 ZIPs don’t exist there.

<bound method NDFrame.head of ZIP_CODE
95568    8
95043    7
96108    7
93633    7
93563    7
        ..
95372    1
95317    1
93628    1
96033    1
95552    1
Name: count, Length: 100, dtype: int64>

In [37]:
# Create a flag column for fire exposure
# 0 means no
# 1 means yes
merged_data["FIRE_EXPOSED"] = (merged_data["_merge"] == "both").astype(int)

# Drop merge indicator and other useless columns
col_drop = ["_merge", "ZIP_CODE", "YEAR_", "NEXT_YEAR_PRICE"]
merged_data.drop(columns=col_drop, inplace=True)

merged_data.head()


Unnamed: 0,ZipCode,YEAR,RegionID,City,Metro,CountyName,HOME_PRICE,PRICE_CHANGE,PCT_PRICE_CHANGE (%),NUM_FIRES,TOTAL_ACRES_BURNED_IN_ZIP,AVG_FIRE_DURATION_DAYS,MAX_PCT_ZIP_BURNED,ANY_MAJOR_FIRE,FIRE_EXPOSED
0,90001,2012,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,173120.15095,40741.986132,23.533936,,,,,,0
1,90001,2013,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,213862.137082,36859.316468,17.235083,,,,,,0
2,90001,2014,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,250721.45355,4484.383699,1.788592,,,,,,0
3,90001,2015,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,255205.837249,25174.378994,9.864343,,,,,,0
4,90001,2016,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,280380.216244,51139.85709,18.239467,,,,,,0


In [None]:
################################ Create time series feature to helo improve our model ###############################

# Rolling average of price change over the last 2 years (excluding current): Smooths short-term volatility to identify trends

merged_data['ROLLING_2yr_PRICE_CHANGE'] = merged_data.groupby('ZipCode')['PCT_PRICE_CHANGE (%)'].shift(1).rolling(2).mean()

# % price change for the previous year for the same ZIP. It helps capture local momentum in housing price trends

merged_data['LAG_PRICE_CHANGE (%)'] = merged_data.groupby('ZipCode')['PCT_PRICE_CHANGE (%)'].shift(1)

# Max % of ZIP burned from the previous year. Fires may have a lagged effect on market prices — impact might show up the year after.

merged_data['PREV_MAX_PCT_ZIP_BURNED (%)'] = merged_data.groupby('ZipCode')['MAX_PCT_ZIP_BURNED'].shift(1)


merged_data


Unnamed: 0,ZipCode,YEAR,RegionID,City,Metro,CountyName,HOME_PRICE,PRICE_CHANGE,PCT_PRICE_CHANGE (%),NUM_FIRES,TOTAL_ACRES_BURNED_IN_ZIP,AVG_FIRE_DURATION_DAYS,MAX_PCT_ZIP_BURNED,ANY_MAJOR_FIRE,FIRE_EXPOSED,ROLLING_2yr_PRICE_CHANGE,LAG_PRICE_CHANGE (%),PREV_MAX_PCT_ZIP_BURNED (%)
0,90001,2012,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,173120.150950,40741.986132,23.533936,,,,,,0,,,
1,90001,2013,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,213862.137082,36859.316468,17.235083,,,,,,0,,23.533936,
2,90001,2014,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,250721.453550,4484.383699,1.788592,,,,,,0,20.384510,17.235083,
3,90001,2015,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,255205.837249,25174.378994,9.864343,,,,,,0,9.511837,1.788592,
4,90001,2016,95982,Florence-Graham,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,280380.216244,51139.857090,18.239467,,,,,,0,5.826467,9.864343,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10125,96161,2014,98672,Truckee,"Truckee-Grass Valley, CA",Nevada County,550710.293321,17911.600932,3.252454,,,,,,0,10.972873,8.034760,
10126,96161,2015,98672,Truckee,"Truckee-Grass Valley, CA",Nevada County,568621.894253,29804.249300,5.241488,,,,,,0,5.643607,3.252454,
10127,96161,2016,98672,Truckee,"Truckee-Grass Valley, CA",Nevada County,598426.143553,45546.851495,7.611107,,,,,,0,4.246971,5.241488,
10128,96161,2017,98672,Truckee,"Truckee-Grass Valley, CA",Nevada County,643972.995049,36393.255854,5.651364,,,,,,0,6.426297,7.611107,


In [39]:
# FINAL XGBOOST DATA
Xgboost_data = merged_data.copy()
