In [166]:
# Initial imports
import json
import os
import zipfile
from pathlib import Path

import pandas as pd
import quandl
import requests
from dotenv import load_dotenv

%matplotlib inline

In [167]:
# Load variables from a .env file into the environment (QUANDL_API is read from it later)
load_dotenv()

True

In [168]:
# Register the Quandl API key, read from the QUANDL_API environment variable
quandl_api_key = os.getenv("QUANDL_API")
quandl.ApiConfig.api_key = quandl_api_key

## Indicators
The indicators table explains how the housing data has been categorized

In [169]:
# Download the Zillow indicators lookup table from the Quandl API
indicator_df = quandl.get_table("ZILLOW/INDICATORS")

In [170]:
# Preview up to the first 20 rows of the indicators dataframe
indicator_df.head(20)

Unnamed: 0_level_0,indicator_id,indicator,category
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,ZSFH,ZHVI Single-Family Homes Time Series ($),Home values
1,ZCON,ZHVI Condo/Co-op Time Series ($),Home values
2,ZATT,ZHVI All Homes- Top Tier Time Series ($),Home values
3,ZALL,"ZHVI All Homes (SFR, Condo/Co-op) Time Series ($)",Home values
4,ZABT,ZHVI All Homes- Bottom Tier Time Series ($),Home values
5,Z5BR,ZHVI 5+ Bedroom Time Series ($),Home values
6,Z4BR,ZHVI 4-Bedroom Time Series ($),Home values
7,Z3BR,ZHVI 3-Bedroom Time Series ($),Home values
8,Z2BR,ZHVI 2-Bedroom Time Series ($),Home values
9,Z1BR,ZHVI 1-Bedroom Time Series ($),Home values


## Regions
The regions table explains the different region types for the housing data

In [171]:
# Download the Zillow regions lookup table and preview its first rows.
# paginate=True is passed so the client keeps requesting result pages
# instead of stopping after the first one.
regions = quandl.get_table("ZILLOW/REGIONS", paginate=True)
regions.head()

Unnamed: 0_level_0,region_id,region_type,region
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,99999,zip,98847; WA; Wenatchee; Chelan County; Peshastin
1,99998,zip,98846; WA; Okanogan County; Pateros
2,99997,zip,98845; WA; Wenatchee; Douglas County; Palisades
3,99996,zip,98844; WA; Okanogan County; Oroville
4,99995,zip,98843; WA; Wenatchee; Douglas County; Orondo


## Data Table
The data table contains all the housing data by indicator and region.

The downloaded zip is extracted to a location outside of the repository due to its large size.

In [172]:
# Load the full ZILLOW/DATA table into `zdata`.
#
# The bulk export is large, so it is downloaded and extracted only when the
# extracted CSV is not already on disk; later runs just read the cached CSV.
# The files live outside of the repository because of their size.
ZDATA_DIR = Path("../../Repositories/local")
ZDATA_ZIP = ZDATA_DIR / "zdata.zip"
ZDATA_CSV = ZDATA_DIR / "zdata.csv"

if not ZDATA_CSV.exists():
    ZDATA_DIR.mkdir(parents=True, exist_ok=True)
    quandl.export_table("ZILLOW/DATA", filename=str(ZDATA_ZIP))
    with zipfile.ZipFile(ZDATA_ZIP, "r") as zip_ref:
        # The exported CSV carries a hash-suffixed name that differs between
        # exports, so it is looked up in the archive instead of hard-coded.
        csv_members = [name for name in zip_ref.namelist() if name.endswith(".csv")]
        if len(csv_members) != 1:
            raise RuntimeError(
                f"Expected exactly one CSV in {ZDATA_ZIP}, found: {csv_members}"
            )
        zip_ref.extract(csv_members[0], ZDATA_DIR)
    # Give the extracted file a stable name so later runs can find it
    os.replace(ZDATA_DIR / csv_members[0], ZDATA_CSV)

# Read extracted zip file into a dataframe
zdata = pd.read_csv(ZDATA_CSV)

In [173]:
# (rows, columns) of the full data table
zdata.shape

(127237469, 4)

In [174]:
# Count how many regions exist for each region type
regions["region_type"].value_counts()

zip       31172
city      27471
neigh     16711
county     2887
metro       915
state        51
Name: region_type, dtype: int64

## Extract metro codes to list

In [176]:
# Keep only the metro-level rows of the regions table, then peek at the last few
is_metro = regions["region_type"] == "metro"
metros = regions.loc[is_metro]
metros.tail()

Unnamed: 0_level_0,region_id,region_type,region
None,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
51628,394300,metro,"Ada, OK; OK"
51630,394299,metro,"Abilene, TX; TX"
51631,394298,metro,"Aberdeen, WA; WA"
51632,394297,metro,"Aberdeen, SD; SD"
78827,102001,metro,United States


In [177]:
# Cast every metro region_id to int so the codes can be matched against
# the region_id column when slicing the housing data
metro_codes = list(map(int, metros["region_id"]))
metro_codes[:5]

[753929, 753928, 753927, 753926, 753925]

## Extract home value codes to list

In [178]:
# Collect the indicator_id of every indicator whose category is "Home values".
# Columns are selected by name: integer keys such as row[2] on a label-indexed
# row only work through a deprecated positional fallback in pandas, and a
# boolean mask avoids iterating over the rows one at a time.
is_home_value = indicator_df["category"] == "Home values"
home_value_codes = indicator_df.loc[is_home_value, "indicator_id"].tolist()
home_value_codes

['ZSFH',
 'ZCON',
 'ZATT',
 'ZALL',
 'ZABT',
 'Z5BR',
 'Z4BR',
 'Z3BR',
 'Z2BR',
 'Z1BR']

In [179]:
# Drop the single-family, top-tier, all-homes and bottom-tier series so that
# only condos and the per-bedroom (1BR-5BR) series remain.
# Filtering by code instead of deleting by list position keeps this cell
# idempotent (re-running it removes nothing further) and independent of the
# order in which the indicators were returned.
EXCLUDED_HOME_VALUE_CODES = {"ZSFH", "ZATT", "ZALL", "ZABT"}
home_value_codes = [
    code for code in home_value_codes if code not in EXCLUDED_HOME_VALUE_CODES
]
home_value_codes

['ZCON', 'Z5BR', 'Z4BR', 'Z3BR', 'Z2BR', 'Z1BR']

## Home values used to create a slice of zdata

In [180]:
# Slice zdata down to the rows whose indicator is one of the selected
# home value codes (condos plus the 1BR-5BR series)
homes = zdata.loc[zdata["indicator_id"].isin(home_value_codes)]
homes

Unnamed: 0,indicator_id,region_id,date,value
11667,ZCON,9,2020-10-31,537729.0
11960,ZCON,54,2020-10-31,185435.0
12253,ZCON,43,2020-10-31,580421.0
12546,ZCON,14,2020-10-31,196766.0
12839,ZCON,21,2020-10-31,202002.0
...,...,...,...,...
127237464,Z4BR,49589,2020-02-29,103928.0
127237465,Z4BR,49589,2020-03-31,105018.0
127237466,Z4BR,49589,2020-04-30,105616.0
127237467,Z4BR,49589,2020-06-30,105040.0


## Metro codes used to slice homes data

In [181]:
# Keep only the home value rows that belong to a metro region
metro_homes = homes.loc[homes["region_id"].isin(metro_codes)]
metro_homes

Unnamed: 0,indicator_id,region_id,date,value
1553170,ZCON,102001,2020-06-30,272122.0
1553174,ZCON,394913,2020-06-30,498772.0
1553179,ZCON,753899,2020-06-30,547500.0
1553182,ZCON,394463,2020-06-30,206792.0
1553187,ZCON,394514,2020-06-30,184800.0
...,...,...,...,...
51837197,Z4BR,394743,2019-10-31,379724.0
51837198,Z4BR,394743,2019-11-30,380707.0
51837199,Z4BR,394743,2019-12-31,381119.0
51837200,Z4BR,394743,2020-01-31,381255.0


In [182]:
# Reset the index for the new dataframe before writing to csv.
# The slice still carries the row labels inherited from zdata; drop=True
# discards them and renumbers the rows from 0.
metro_homes = metro_homes.reset_index(drop=True)
metro_homes

Unnamed: 0,indicator_id,region_id,date,value
0,ZCON,102001,2020-06-30,272122.0
1,ZCON,394913,2020-06-30,498772.0
2,ZCON,753899,2020-06-30,547500.0
3,ZCON,394463,2020-06-30,206792.0
4,ZCON,394514,2020-06-30,184800.0
...,...,...,...,...
1297630,Z4BR,394743,2019-10-31,379724.0
1297631,Z4BR,394743,2019-11-30,380707.0
1297632,Z4BR,394743,2019-12-31,381119.0
1297633,Z4BR,394743,2020-01-31,381255.0


In [186]:
# Write the metro home values to the Resources folder (index column omitted),
# then display the frame that was saved
metro_homes_csv = './Resources/metro_homes.csv'
metro_homes.to_csv(metro_homes_csv, index=False)
metro_homes

Unnamed: 0,indicator_id,region_id,date,value
0,ZCON,102001,2020-06-30,272122.0
1,ZCON,394913,2020-06-30,498772.0
2,ZCON,753899,2020-06-30,547500.0
3,ZCON,394463,2020-06-30,206792.0
4,ZCON,394514,2020-06-30,184800.0
...,...,...,...,...
1297630,Z4BR,394743,2019-10-31,379724.0
1297631,Z4BR,394743,2019-11-30,380707.0
1297632,Z4BR,394743,2019-12-31,381119.0
1297633,Z4BR,394743,2020-01-31,381255.0
