In [1]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import json
import matplotlib.pyplot as plt
import plotly.express as px
import imageio
import warnings
warnings.filterwarnings('ignore')


Bad key "text.kerning_factor" on line 4 in
C:\Users\akgoel\.conda\envs\pyvizenv\lib\site-packages\matplotlib\mpl-data\stylelib\_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


In [2]:
# Load the cleaned Zillow home-value CSV (Houston, San Antonio, Austin, Dallas).
zillow_path = Path("../data/clean_data/zillow_data_houston_sa_austin_dallas.csv")
# NOTE(review): placeholder that is never used in the cells shown here; the
# salary CSV is read from a hard-coded string further down. Confirm whether it
# can be removed or should point at the salary file.
salary_path = Path()
# `infer_datetime_format` is deprecated in pandas 2.x and `parse_dates=True`
# only targets the index, so neither is passed; the "date" column is converted
# explicitly on the next line instead.
zillow_df = pd.read_csv(zillow_path)

# Convert "date" to datetime64 so the `.dt` accessor can be used on it later.
zillow_df["date"] = pd.to_datetime(zillow_df["date"])
zillow_df.head()

Unnamed: 0,date,region,avg_value,zipcode,city
0,1996-01-31,75201; TX; Dallas-Fort Worth-Arlington; Dallas...,412559.0,75201,Dallas
1,1996-02-29,75201; TX; Dallas-Fort Worth-Arlington; Dallas...,413722.0,75201,Dallas
2,1996-03-31,75201; TX; Dallas-Fort Worth-Arlington; Dallas...,416215.0,75201,Dallas
3,1996-04-30,75201; TX; Dallas-Fort Worth-Arlington; Dallas...,422319.0,75201,Dallas
4,1996-05-31,75201; TX; Dallas-Fort Worth-Arlington; Dallas...,428461.0,75201,Dallas


In [3]:
# Load the IRS salary CSV and derive the average adjusted gross income (AGI)
# per tax return for every (YEAR, ZIP CODE) row.
# The file has no date columns, so the deprecated `infer_datetime_format` and
# the no-op `parse_dates=True` arguments are not passed.
salary_df = pd.read_csv("../data/clean_data/cnc_sal_data.csv")
# NOTE(review): the * 1000 presumably converts AGI reported in thousands of
# dollars into dollars - confirm against the IRS data dictionary.
salary_df["Avg_AGI"] = salary_df["AGI"] / salary_df["NO. OF RETURNS"] * 1000
# A zero "NO. OF RETURNS" yields +/-inf, which dropna() alone would keep;
# convert those to NaN first so such rows are discarded with the other gaps.
salary_df = salary_df.replace([np.inf, -np.inf], np.nan).dropna()
salary_df.head()

Unnamed: 0,YEAR,ZIP CODE,NO. OF RETURNS,AGI,Avg_AGI
0,2007,73301,1798,7230,4021.134594
1,2007,73949,43,1915,44534.883721
2,2007,75001,7128,693056,97230.078563
3,2007,75002,26669,2002247,75077.693202
4,2007,75006,21953,1053748,48000.182207


In [4]:
# Drop the raw inputs of Avg_AGI; only YEAR, ZIP CODE and Avg_AGI are used below.
# errors="ignore" keeps this cell re-runnable: it overwrites `salary_df`, so a
# second execution would otherwise raise KeyError for the already-removed columns.
salary_df = salary_df.drop(columns=["AGI", "NO. OF RETURNS"], errors="ignore")
salary_df.head()

Unnamed: 0,YEAR,ZIP CODE,Avg_AGI
0,2007,73301,4021.134594
1,2007,73949,44534.883721
2,2007,75001,97230.078563
3,2007,75002,75077.693202
4,2007,75006,48000.182207


In [5]:
# Average the Zillow values per zip code, city and calendar year, then rename
# the key columns so they match the salary data (YEAR / ZIP CODE).
# numeric_only=True is required on pandas >= 2.0, where averaging a frame that
# still holds the string column "region" raises TypeError instead of silently
# dropping it; it also leaves the original datetime "date" column out of the
# result so it cannot collide with the year key on reset_index().
zillow_year_df = (
    zillow_df
    .groupby(["zipcode", "city", zillow_df["date"].dt.year])
    .mean(numeric_only=True)
    .reset_index()
    .rename(columns={"date": "YEAR", "zipcode": "ZIP CODE", "city": "CITY"})
)
zillow_year_df.head()

Unnamed: 0,ZIP CODE,CITY,YEAR,avg_value
0,75201,Dallas,1996,425752.833333
1,75201,Dallas,1997,434680.833333
2,75201,Dallas,1998,432947.333333
3,75201,Dallas,1999,436265.25
4,75201,Dallas,2000,436422.583333


In [6]:
# Join yearly home values with yearly average AGI on (YEAR, ZIP CODE); the
# default inner join keeps only the pairs present in both sources.
combined_df = zillow_year_df.merge(salary_df, on=['YEAR', 'ZIP CODE'])
# Opportunity Ratio = average AGI per return divided by average home value.
combined_df = combined_df.assign(
    OP_RATIO=combined_df['Avg_AGI'] / combined_df["avg_value"]
)
combined_df.head()

Unnamed: 0,ZIP CODE,CITY,YEAR,avg_value,Avg_AGI,OP_RATIO
0,75201,Dallas,2007,590771.5,402407.690759,0.681156
1,75201,Dallas,2008,610801.75,271227.125119,0.444051
2,75201,Dallas,2009,571372.75,190436.672968,0.333297
3,75201,Dallas,2010,547731.166667,192694.852396,0.351806
4,75201,Dallas,2011,521611.583333,192270.074877,0.368608


In [7]:
# Rows for Houston zip codes in 2018 only.
is_houston = combined_df["CITY"] == "Houston"
is_2018 = combined_df["YEAR"] == 2018
houston_2018_df = combined_df[is_houston & is_2018]
houston_2018_df.head()

Unnamed: 0,ZIP CODE,CITY,YEAR,avg_value,Avg_AGI,OP_RATIO
911,77002,Houston,2018,244422.7,273011.260504,1.116964
923,77003,Houston,2018,304152.7,73518.214286,0.241715
935,77004,Houston,2018,308489.8,85244.487578,0.276328
947,77005,Houston,2018,1167376.0,420741.833031,0.360417
959,77006,Houston,2018,473962.0,145385.266458,0.306745


In [8]:
# Extremes of the 2018 Houston Opportunity Ratio; houston_min is used as the
# lower bound of the colour scale in the choropleth cells below.
houston_op_ratio = houston_2018_df["OP_RATIO"]
houston_max = houston_op_ratio.max()
houston_min = houston_op_ratio.min()

In [9]:
# Load the Texas zip-code GeoJSON used as the choropleth geometry.
# JSON interchange files are UTF-8; without an explicit encoding open() falls
# back to the platform locale (e.g. cp1252 on Windows), which can mis-decode
# or fail on non-ASCII characters.
with open('../data/tx_texas_zip_codes_geo.min.json', encoding='utf-8') as f:
    tx_zip = json.load(f)

In [15]:
# Choropleth of the 2018 Houston Opportunity Ratio, one polygon per zip code.
# Rows are matched to GeoJSON features through the ZCTA5CE10 property.
choropleth_options = dict(
    geojson=tx_zip,
    locations='ZIP CODE',
    color='OP_RATIO',
    featureidkey="properties.ZCTA5CE10",
    color_continuous_scale="Viridis",
    # Lower bound is the 2018 minimum; the upper bound is fixed at 0.7.
    range_color=(houston_min,0.7),
    scope="usa",
    labels={'OP_RATIO':'Opportunity Ratio'},
    title="Opportunity Ratio for Houston in 2018",
)
fig = px.choropleth(houston_2018_df, **choropleth_options)

# Zoom the map to the plotted zip codes and hide the base map.
fig.update_geos(fitbounds="locations", visible=False)
# Ending the cell on type(fig) makes the notebook show the type instead of
# rendering the whole figure into the cell output.
type(fig)

plotly.graph_objs._figure.Figure

In [16]:
# Export the 2018 Houston figure as a static PNG via kaleido.
# write_image does not create missing directories, so make sure the target
# folder exists before writing (a fresh checkout may not have it).
Path("images").mkdir(parents=True, exist_ok=True)
fig.write_image("images/fig1.png", engine="kaleido")

In [25]:
# Render one Houston choropleth PNG per year (2007-2018) into images/ZALL.
# write_image does not create missing directories, so create the folder first.
Path("images/ZALL").mkdir(parents=True, exist_ok=True)

for year in range(2007,2019):
    houston_df = combined_df[(combined_df["CITY"] == "Houston") & (combined_df["YEAR"]==year)]
    # The colour range is shared across years (lower bound = 2018 minimum,
    # upper bound fixed at 0.7) so the frames use the same scale.
    fig = px.choropleth(houston_df, geojson=tx_zip, locations='ZIP CODE', color='OP_RATIO',
                    featureidkey="properties.ZCTA5CE10",
                    color_continuous_scale="Viridis",
                    range_color=(houston_min,0.7),
                    scope="usa",
                    labels={'OP_RATIO':'Opportunity Ratio'},
                    title = f"Opportunity Ratio for Houston in {year}"
                    )
    fig.update_geos(fitbounds="locations", visible=False)
    # The former bare `type(fig)` statement was removed: expressions inside a
    # loop body are never echoed by the notebook, so it had no effect.
    fig.write_image(f"images/ZALL/houston_{year}.png", engine="kaleido")
    print(year)

2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [26]:
import imageio

In [27]:
# Assemble the yearly Houston PNGs into an animated GIF (one frame per second).
# The frames are written to images/ZALL/ by the export loop, so they must be
# read from there; the previous 'images/houston_{year}.png' path pointed at a
# different folder than the one the PNGs were saved to.
images = [
    imageio.imread(f'images/ZALL/houston_{year}.png')
    for year in range(2007,2019)
]
imageio.mimsave('images/ZALL/houston.gif', images, fps=1)

In [28]:
def _save_city_choropleth(city_df, city_title, file_stem, year):
    """Build and save the Opportunity Ratio choropleth for one city and year.

    Args:
        city_df: rows of `combined_df` for the city (or metro area) and year.
        city_title: name shown in the figure title, e.g. "San Antonio".
        file_stem: file-name prefix; the PNG is written to
            images/ZALL/<file_stem>_<year>.png.
        year: calendar year shown in the title and used in the file name.

    Returns:
        The plotly figure that was written to disk.
    """
    city_fig = px.choropleth(city_df, geojson=tx_zip, locations='ZIP CODE', color='OP_RATIO',
                    featureidkey="properties.ZCTA5CE10",
                    color_continuous_scale="Viridis",
                    # Fixed range so every city and year shares one colour scale.
                    range_color=(0.2,0.7),
                    scope="usa",
                    labels={'OP_RATIO':'Opportunity Ratio'},
                    title = f"Opportunity Ratio for {city_title} in {year}"
                    )
    # Zoom to the plotted zip codes and hide the base map.
    city_fig.update_geos(fitbounds="locations", visible=False)
    city_fig.write_image(f"images/ZALL/{file_stem}_{year}.png", engine="kaleido")
    return city_fig


# write_image does not create missing directories, so create the folder first.
Path("images/ZALL").mkdir(parents=True, exist_ok=True)

for year in range(2007,2019):
    in_year = combined_df["YEAR"] == year
    austin_df = combined_df[(combined_df["CITY"] == "Austin") & in_year]
    sa_df = combined_df[(combined_df["CITY"] == "San Antonio") & in_year]
    # Dallas and Fort Worth are plotted together as one metro area.
    dfw_df = combined_df[combined_df["CITY"].isin(["Dallas", "Fort Worth"]) & in_year]

    # The three previously copy-pasted figure blocks now share one helper; the
    # bare `type(...)` statements were dropped because expressions inside a
    # loop body are never echoed by the notebook.
    austin_fig = _save_city_choropleth(austin_df, "Austin", "austin", year)
    sa_fig = _save_city_choropleth(sa_df, "San Antonio", "sa", year)
    dfw_fig = _save_city_choropleth(dfw_df, "Dallas-Fort Worth", "dfw", year)

    print(year)

2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018


In [30]:
# Assemble the yearly PNGs of each city into an animated GIF (one frame per second).
# The frames are written to images/ZALL/ by the export loop, so they must be
# read from there; the previous 'images/<city>_{year}.png' paths pointed at a
# different folder than the one the PNGs were saved to.
austin_images=[]
sa_images=[]
dfw_images=[]
for year in range(2007,2019):
    austin_images.append(imageio.imread(f'images/ZALL/austin_{year}.png'))
    sa_images.append(imageio.imread(f'images/ZALL/sa_{year}.png'))
    dfw_images.append(imageio.imread(f'images/ZALL/dfw_{year}.png'))
imageio.mimsave('images/ZALL/austin.gif', austin_images, fps=1)
imageio.mimsave('images/ZALL/sa.gif', sa_images, fps=1)
imageio.mimsave('images/ZALL/dfw.gif', dfw_images, fps=1)