# Zillow Housing Value Index Data
The goal of this notebook is to retrieve and clean zillow housing value index (ZHVI) data. The base dataset can be found here (https://www.zillow.com/research/data/) with the Data Type: "ZHVI All Homes (SFR, Condo/Co-Op) Time Series, Smoothed, Seasonally Adjusted($)" and "Zip Code" for "Geography" 

Here is a detailed guide to what ZHVI means-- https://www.zillow.com/research/zhvi-user-guide/
But briefly, according to the website-- "ZHVI represents the “typical” home value for a region. It’s calculated as a weighted average of the middle third of homes in a given region. 

So, it reflects the typical value for homes in the 35th to 65th percentile range.

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
RAW_DATA = Path('raw_data/')
DATA_DIR = Path('data/')

In [None]:
zillow_df = pd.read_csv(RAW_DATA/ "Zillow.csv")
zillow_df.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,2024-07-31,2024-08-31,2024-09-30,2024-10-31,2024-11-30,2024-12-31,2025-01-31,2025-02-28,2025-03-31,2025-04-30
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,215066.971693,...,509249.221787,509666.472992,510374.009847,511525.06708,512261.83626,512690.533943,513124.726141,512795.438355,511868.678296,510037.820751
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,119562.997796,...,538532.783501,543089.612558,548455.892446,553256.895924,556317.26186,558134.169652,558711.571389,560720.245574,563115.473107,566554.408648
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,105807.537522,...,287724.899068,287539.422639,287347.320971,286922.090683,286118.233044,285374.424312,284751.623163,284115.439117,283266.937552,282289.307355
3,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,173775.027359,...,537983.848378,537241.140044,538367.583946,538076.439164,536637.327844,533981.510927,531493.203365,531361.977752,531016.967457,530904.03261
4,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,104735.755192,...,281213.908855,280934.997085,280676.299089,280157.795607,279545.476016,279198.88208,278951.034751,278540.286698,277663.952315,276602.561574


In [25]:
zillow_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26316 entries, 0 to 26315
Columns: 313 entries, RegionID to 2025-04-30
dtypes: float64(304), int64(3), object(6)
memory usage: 62.8+ MB


In [None]:
zillow_CA_df = zillow_df[zillow_df['State'] == 'CA']# Filter rows where the STATE column is 'CA'

zillow_CA_df.head()
# Save the filtered data to a new CSV file
# df_ca.to_csv("california_only.csv", index=False)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,2024-07-31,2024-08-31,2024-09-30,2024-10-31,2024-11-30,2024-12-31,2025-01-31,2025-02-28,2025-03-31,2025-04-30
9,95992,10,90011,zip,CA,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,122468.819722,...,619606.328113,623901.237628,630114.609171,635737.625228,640640.625467,644457.222856,643705.914227,638951.611711,629571.24412,622052.595559
12,96193,13,90650,zip,CA,CA,Norwalk,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,162650.585403,...,765207.924191,770500.141326,776689.718995,781487.168035,784865.05639,787343.70723,786610.153621,783285.394506,776680.006918,771499.616942
13,96361,14,91331,zip,CA,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,128518.226264,...,743231.422111,750232.765964,757811.080827,764126.468962,769532.128575,775179.782947,775682.123527,772493.777986,765187.549194,760378.780896
21,96025,22,90044,zip,CA,CA,Los Angeles,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,136453.783948,...,661624.594786,666417.004862,672842.370717,678200.799165,682472.969504,685391.398681,684214.456529,679904.818709,672211.897064,665171.11774
22,96817,23,92336,zip,CA,CA,Fontana,"Riverside-San Bernardino-Ontario, CA",San Bernardino County,166979.919657,...,716952.768684,719355.323294,721566.783423,723191.308516,725157.508674,726301.510539,726263.738599,725373.172761,722897.420792,720234.68818


In [None]:
non_date_cols = ['RegionID', 'State', 'CountyName','City']  # Adjust this list if your file has more

# Create date range from Jan 2015 to Apr 2025, formatted like 'YYYY-MM-DD'
date_cols = pd.date_range(start='2015-01-31', end='2025-04-30', freq='ME').strftime('%Y-%m-%d').tolist()

# Combine non-date columns and selected date columns
columns_to_keep = non_date_cols + date_cols

# Keep only those columns (skip missing ones to avoid KeyError)
existing_cols = [col for col in columns_to_keep if col in zillow_CA_df.columns]
zillow_CA_df_filtered = zillow_CA_df[existing_cols]
zillow_CA_df_filtered.head()

# Save the result to a new CSV
# zillow_CA_df_filtered.to_csv("ca_filtered_2015_2025.csv", index=False)

Unnamed: 0,RegionID,State,CountyName,City,2015-01-31,2015-02-28,2015-03-31,2015-04-30,2015-05-31,2015-06-30,...,2024-07-31,2024-08-31,2024-09-30,2024-10-31,2024-11-30,2024-12-31,2025-01-31,2025-02-28,2025-03-31,2025-04-30
9,95992,CA,Los Angeles County,Los Angeles,272797.74906,273731.41064,273037.591937,273652.184775,274956.914699,276678.954478,...,619606.328113,623901.237628,630114.609171,635737.625228,640640.625467,644457.222856,643705.914227,638951.611711,629571.24412,622052.595559
12,96193,CA,Los Angeles County,Norwalk,378354.070519,378582.823306,378376.678343,379111.58513,380911.787316,382967.799537,...,765207.924191,770500.141326,776689.718995,781487.168035,784865.05639,787343.70723,786610.153621,783285.394506,776680.006918,771499.616942
13,96361,CA,Los Angeles County,Los Angeles,336296.162543,337072.901053,336946.81908,337691.765656,339016.050421,340929.304834,...,743231.422111,750232.765964,757811.080827,764126.468962,769532.128575,775179.782947,775682.123527,772493.777986,765187.549194,760378.780896
21,96025,CA,Los Angeles County,Los Angeles,292682.463638,294284.588151,294559.510597,295333.652616,296319.065384,297981.934803,...,661624.594786,666417.004862,672842.370717,678200.799165,682472.969504,685391.398681,684214.456529,679904.818709,672211.897064,665171.11774
22,96817,CA,San Bernardino County,Fontana,363423.644238,364745.962739,365506.57403,366494.427219,368010.186795,369999.231631,...,716952.768684,719355.323294,721566.783423,723191.308516,725157.508674,726301.510539,726263.738599,725373.172761,722897.420792,720234.68818


In [None]:
#Save the result to a new CSV
zillow_CA_df_filtered.to_csv("/Users/abhaychaudhary/Desktop/ca_filtered_2015_2025.csv", index=False)