In [1]:
################################################################
# For documentation on the Zip_time_series.csv file, refer to  #
# https://www.kaggle.com/zillow/zecon#Zip_time_series.csv      #
################################################################

# Dependencies
import pandas as pd
import numpy as np
import datetime as dt

# Austin zipcodes used to filter the Zillow data (integers, ascending order)
atx_zip_codes = [
    78610, 78613, 78617, 78641, 78652, 78653, 78660, 78664, 78681,
    78701, 78702, 78703, 78704, 78705, 78712, 78717, 78719,
    78721, 78722, 78723, 78724, 78725, 78726, 78727, 78728, 78729,
    78730, 78731, 78732, 78733, 78734, 78735, 78736, 78737, 78738, 78739,
    78741, 78742, 78744, 78745, 78746, 78747, 78748, 78749,
    78750, 78751, 78752, 78753, 78754, 78756, 78757, 78758, 78759,
]



In [2]:
# Import file as dataframe and preview
# 'Zip_time_series.csv' is a relative path, so pandas resolves it against the
# kernel's current working directory -- keep the .csv next to this notebook.
df = pd.read_csv('Zip_time_series.csv')
# Last expression in the cell, so the first five rows render as the cell output
df.head()

Unnamed: 0,Date,RegionName,InventorySeasonallyAdjusted_AllHomes,InventoryRaw_AllHomes,MedianListingPricePerSqft_1Bedroom,MedianListingPricePerSqft_2Bedroom,MedianListingPricePerSqft_3Bedroom,MedianListingPricePerSqft_4Bedroom,MedianListingPricePerSqft_5BedroomOrMore,MedianListingPricePerSqft_AllHomes,...,ZHVI_BottomTier,ZHVI_CondoCoop,ZHVI_MiddleTier,ZHVI_SingleFamilyResidence,ZHVI_TopTier,ZRI_AllHomes,ZRI_AllHomesPlusMultifamily,ZriPerSqft_AllHomes,Zri_MultiFamilyResidenceRental,Zri_SingleFamilyResidenceRental
0,1996-04-30,1001,,,,,,,,,...,68700.0,67000.0,101900.0,107000.0,124800.0,,,,,
1,1996-04-30,1002,,,,,,,,,...,97000.0,81300.0,135200.0,145800.0,213600.0,,,,,
2,1996-04-30,1005,,,,,,,,,...,85400.0,,101200.0,101200.0,125700.0,,,,,
3,1996-04-30,1007,,,,,,,,,...,97900.0,87600.0,124000.0,125000.0,162100.0,,,,,
4,1996-04-30,1008,,,,,,,,,...,81100.0,,109300.0,109300.0,148100.0,,,,,


In [3]:
# Rename RegionName to zipcode because that's what it is
df = df.rename(columns={'RegionName': 'zipcode'})

# Filter out all non-Austin area zipcodes.
# isin() builds the boolean mask in one vectorized pass instead of scanning the
# Python list once per row. The explicit .copy() makes the result an independent
# frame, so the Date assignment below does not write into a slice of the full
# dataset (which triggers SettingWithCopyWarning).
df = df[df.zipcode.isin(atx_zip_codes)].copy()

# Convert Date column to pandas timestamp objects
df['Date'] = pd.to_datetime(df['Date'])

# Filter out all years before 2007 (.dt.year is the vectorized year accessor;
# rows with a missing date compare False and are dropped, same as before)
df = df[df.Date.dt.year >= 2007]

## Now we have all the original data but only for the desired years in Austin zip codes.
## Take note in case we want to use something from this data later.
# df.to_csv('atx_zillow_data_2007-2017.csv')

In [4]:
# Now, take out everything except year, zipcode, and Zillow Home Value Index for all homes.
# .assign() returns a new frame, so the year column is never written into the
# filtered slice produced by the previous cell (that raises SettingWithCopyWarning),
# and .dt.year extracts the year without a Python-level loop over every timestamp.
# NOTE: `mask` is a list of column names to keep, not a boolean mask.
mask = ['year', 'zipcode', 'ZHVI_AllHomes']
df = df.assign(year=df.Date.dt.year)[mask]

# Take note that some zip codes have no data here
# Others, the data only comes more recently.
# A bare expression in the middle of a cell is evaluated and then thrown away, so
# the count of missing-ZHVI rows per zipcode is kept in a variable instead;
# evaluate `missing_zhvi_counts` in its own cell to look at it.
missing_zhvi_counts = df[df.ZHVI_AllHomes.isna()].zipcode.value_counts()

# We drop what we have to (any row with a missing value in the three kept columns)
df = df.dropna()

# Group data by year, zipcode, find the mean ZHVI per year per zipcode.
# The result is a one-column frame indexed by (year, zipcode), rounded to 2 decimals.
df = df.groupby(['year', 'zipcode']).ZHVI_AllHomes.mean().round(2).to_frame()

# Preview
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ZHVI_AllHomes
year,zipcode,Unnamed: 2_level_1
2007,78610,184408.33
2007,78613,195816.67
2007,78617,129975.0
2007,78641,161408.33
2007,78664,147408.33


In [5]:
# Save to csv
# df is indexed by (year, zipcode) at this point and to_csv writes the index by
# default, so the file has three columns: year, zipcode, ZHVI_AllHomes.
# The relative path means the file lands in the kernel's current working directory.
df.to_csv('atx_mean_zhvi_2007-2017.csv')