# DEPENDENCIES AND SETUP

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
from scipy.stats import linregress
from pprint import pprint



# Hide warning messages in notebook
import warnings
warnings.filterwarnings('ignore')



# LOAD RESEACHED DATA

In [3]:
# File to Load 
zillow = "Research_data/Zip_Residential_Zillow_.csv"
_
# Read the Zillow csv with Zip Codes and Medians price per Sq foot. (from 1996 to 2019)
zillow_df = pd.read_csv(zillow, encoding="utf-8")
zillow_df = zillow_df.fillna(0)
zillow_df.head()

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,...,2018-11,2018-12,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,200.0,200.0,201.0,...,1316,1304,1291,1289,1288,1275,1261,1256,1247,1240
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,156.0,157.0,157.0,...,478,479,481,483,488,493,493,488,481,477
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,359.0,359.0,359.0,...,1582,1571,1557,1542,1522,1500,1488,1487,1478,1469
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,67.0,68.0,68.0,...,113,114,114,114,114,114,114,114,113,112
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,199.0,200.0,201.0,...,525,527,529,532,534,534,531,523,515,509


In [4]:
# Format the Data Frame and check the NJ state for consistency
zillow_df = zillow_df.rename(columns ={"RegionName" : "zip_code"})
zillow_df["zip_code"] = zillow_df.zip_code.map("{:05}".format)
checknj = zillow_df[(zillow_df["State"] == "NJ")]

checknj

Unnamed: 0,RegionID,zip_code,City,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,...,2018-11,2018-12,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08
63,60545,07030,Hoboken,NJ,New York-Newark-Jersey City,Hudson County,64,158.0,158.0,158.0,...,728,727,727,727,727,726,723,720,718,718
79,61148,08701,Lakewood,NJ,New York-Newark-Jersey City,Ocean County,80,60.0,60.0,60.0,...,164,165,165,165,165,165,165,166,166,166
172,60639,07302,Jersey City,NJ,New York-Newark-Jersey City,Hudson County,173,113.0,114.0,114.0,...,801,800,795,790,788,785,780,776,776,775
186,61169,08753,Toms River,NJ,New York-Newark-Jersey City,Ocean County,187,74.0,74.0,74.0,...,165,165,165,165,165,163,163,164,165,165
212,60518,07002,Bayonne,NJ,New York-Newark-Jersey City,Hudson County,213,76.0,76.0,76.0,...,203,203,204,204,205,208,209,209,209,209
238,60599,07093,West New York,NJ,New York-Newark-Jersey City,Hudson County,239,82.0,81.0,81.0,...,288,291,293,293,294,296,298,299,300,302
282,61200,08831,Monroe,NJ,New York-Newark-Jersey City,Middlesex County,283,95.0,95.0,94.0,...,187,188,190,191,191,191,191,189,189,188
334,60594,07087,Union City,NJ,New York-Newark-Jersey City,Hudson County,335,60.0,60.0,60.0,...,266,268,269,269,269,265,260,259,260,261
541,60560,07047,North Bergen,NJ,New York-Newark-Jersey City,Hudson County,542,73.0,73.0,74.0,...,237,239,240,241,243,243,242,242,243,245
545,61175,08759,Manchester,NJ,New York-Newark-Jersey City,Ocean County,546,65.0,65.0,65.0,...,116,116,116,116,116,115,114,114,114,114


In [5]:
# getting column names

columns = list(zillow_df.columns.values)
print(len(columns))
columns

288


['RegionID',
 'zip_code',
 'City',
 'State',
 'Metro',
 'CountyName',
 'SizeRank',
 '1996-04',
 '1996-05',
 '1996-06',
 '1996-07',
 '1996-08',
 '1996-09',
 '1996-10',
 '1996-11',
 '1996-12',
 '1997-01',
 '1997-02',
 '1997-03',
 '1997-04',
 '1997-05',
 '1997-06',
 '1997-07',
 '1997-08',
 '1997-09',
 '1997-10',
 '1997-11',
 '1997-12',
 '1998-01',
 '1998-02',
 '1998-03',
 '1998-04',
 '1998-05',
 '1998-06',
 '1998-07',
 '1998-08',
 '1998-09',
 '1998-10',
 '1998-11',
 '1998-12',
 '1999-01',
 '1999-02',
 '1999-03',
 '1999-04',
 '1999-05',
 '1999-06',
 '1999-07',
 '1999-08',
 '1999-09',
 '1999-10',
 '1999-11',
 '1999-12',
 '2000-01',
 '2000-02',
 '2000-03',
 '2000-04',
 '2000-05',
 '2000-06',
 '2000-07',
 '2000-08',
 '2000-09',
 '2000-10',
 '2000-11',
 '2000-12',
 '2001-01',
 '2001-02',
 '2001-03',
 '2001-04',
 '2001-05',
 '2001-06',
 '2001-07',
 '2001-08',
 '2001-09',
 '2001-10',
 '2001-11',
 '2001-12',
 '2002-01',
 '2002-02',
 '2002-03',
 '2002-04',
 '2002-05',
 '2002-06',
 '2002-07',
 '200

In [6]:
# removing unnecessary columns names
columns.remove('RegionID')
columns.remove('zip_code')
columns.remove('City')
columns.remove('State')
columns.remove('Metro')
columns.remove('CountyName')
columns.remove('SizeRank')
columns

['1996-04',
 '1996-05',
 '1996-06',
 '1996-07',
 '1996-08',
 '1996-09',
 '1996-10',
 '1996-11',
 '1996-12',
 '1997-01',
 '1997-02',
 '1997-03',
 '1997-04',
 '1997-05',
 '1997-06',
 '1997-07',
 '1997-08',
 '1997-09',
 '1997-10',
 '1997-11',
 '1997-12',
 '1998-01',
 '1998-02',
 '1998-03',
 '1998-04',
 '1998-05',
 '1998-06',
 '1998-07',
 '1998-08',
 '1998-09',
 '1998-10',
 '1998-11',
 '1998-12',
 '1999-01',
 '1999-02',
 '1999-03',
 '1999-04',
 '1999-05',
 '1999-06',
 '1999-07',
 '1999-08',
 '1999-09',
 '1999-10',
 '1999-11',
 '1999-12',
 '2000-01',
 '2000-02',
 '2000-03',
 '2000-04',
 '2000-05',
 '2000-06',
 '2000-07',
 '2000-08',
 '2000-09',
 '2000-10',
 '2000-11',
 '2000-12',
 '2001-01',
 '2001-02',
 '2001-03',
 '2001-04',
 '2001-05',
 '2001-06',
 '2001-07',
 '2001-08',
 '2001-09',
 '2001-10',
 '2001-11',
 '2001-12',
 '2002-01',
 '2002-02',
 '2002-03',
 '2002-04',
 '2002-05',
 '2002-06',
 '2002-07',
 '2002-08',
 '2002-09',
 '2002-10',
 '2002-11',
 '2002-12',
 '2003-01',
 '2003-02',
 '20

# Analyzing columns
#before sub-prime crisis: columns 1 to 120
#after sub-prime crisis: columns  162 to 281
#sub-prime crisis: columns 121 to 161

# Iterations through rows and columns
#turning Median Price Per Square Foot (value from Zillow dataset) into an Index based value
#this was done to remove the difference in SQ Foot prices per zip code and focus only in the price variation (%)

#we created a dictionary and stored the zip codes as key values. then we calculated the indexes in tuples.

In [7]:
# Iterations through rows and columns

y = {}
for i, r in zillow_df.iloc[1:].iterrows():
    y[zillow_df.iloc[i,1]] = []

    for j in range (7, 288):
        if j == 7:
            vcalc = 100
            y[zillow_df.iloc[i,1]].append(vcalc)
        else:
            vcalc = ((zillow_df.iloc[i,j]/zillow_df.iloc[i,j-1]))*100
            y[zillow_df.iloc[i,1]].append(vcalc)

y       

{'60657': [100,
  100.64102564102564,
  100.0,
  100.63694267515923,
  100.0,
  100.0,
  100.63291139240506,
  100.0,
  100.62893081761007,
  100.0,
  100.62500000000001,
  100.0,
  100.62111801242236,
  100.0,
  100.61728395061729,
  100.0,
  101.22699386503066,
  100.60606060606061,
  100.60240963855422,
  101.19760479041918,
  101.18343195266273,
  101.75438596491229,
  101.14942528735634,
  101.13636363636364,
  101.68539325842696,
  101.10497237569061,
  101.63934426229508,
  101.61290322580645,
  101.58730158730158,
  101.5625,
  101.53846153846153,
  101.01010101010101,
  101.49999999999999,
  101.9704433497537,
  101.44927536231884,
  101.42857142857142,
  101.40845070422534,
  101.38888888888889,
  101.36986301369863,
  101.35135135135135,
  100.8888888888889,
  101.32158590308372,
  101.30434782608695,
  101.28755364806867,
  101.27118644067797,
  101.67364016736403,
  101.23456790123457,
  101.21951219512195,
  101.20481927710843,
  101.19047619047619,
  101.56862745098039,


# Changing the Dictionary to Data Frame

In [8]:
percent_df = pd.DataFrame.from_dict(y, orient="index", columns = columns)
percent_df.head()

Unnamed: 0,1996-04,1996-05,1996-06,1996-07,1996-08,1996-09,1996-10,1996-11,1996-12,1997-01,...,2018-11,2018-12,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08
60657,100,100.641026,100.0,100.636943,100.0,100.0,100.632911,100.0,100.628931,100.0,...,100.0,100.209205,100.417537,100.4158,101.035197,101.02459,100.0,98.985801,98.565574,99.168399
10023,100,100.0,100.0,100.0,100.278552,100.0,100.277778,100.0,100.554017,100.275482,...,99.309479,99.304678,99.108848,99.036609,98.702983,98.554534,99.2,99.932796,99.394755,99.391069
77494,100,101.492537,100.0,98.529412,100.0,100.0,98.507463,98.484848,100.0,98.461538,...,100.0,100.884956,100.0,100.0,100.0,100.0,100.0,100.0,99.122807,99.115044
60614,100,100.502513,100.5,100.0,100.0,100.0,100.0,99.502488,100.0,100.0,...,100.19084,100.380952,100.379507,100.567108,100.37594,100.0,99.438202,98.493409,98.470363,98.834951
77449,100,100.0,100.0,100.0,100.0,102.040816,100.0,100.0,100.0,100.0,...,100.0,100.0,101.075269,101.06383,100.0,98.947368,100.0,101.06383,101.052632,101.041667


In [9]:
#cleaning up the DF
percent_df = percent_df.fillna(100)
percent_df = percent_df.replace(np.inf, np.nan).dropna(how="all")
percent_df = percent_df.replace(0, 1)
percent_df.head()

Unnamed: 0,1996-04,1996-05,1996-06,1996-07,1996-08,1996-09,1996-10,1996-11,1996-12,1997-01,...,2018-11,2018-12,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08
60657,100,100.641026,100.0,100.636943,100.0,100.0,100.632911,100.0,100.628931,100.0,...,100.0,100.209205,100.417537,100.4158,101.035197,101.02459,100.0,98.985801,98.565574,99.168399
10023,100,100.0,100.0,100.0,100.278552,100.0,100.277778,100.0,100.554017,100.275482,...,99.309479,99.304678,99.108848,99.036609,98.702983,98.554534,99.2,99.932796,99.394755,99.391069
77494,100,101.492537,100.0,98.529412,100.0,100.0,98.507463,98.484848,100.0,98.461538,...,100.0,100.884956,100.0,100.0,100.0,100.0,100.0,100.0,99.122807,99.115044
60614,100,100.502513,100.5,100.0,100.0,100.0,100.0,99.502488,100.0,100.0,...,100.19084,100.380952,100.379507,100.567108,100.37594,100.0,99.438202,98.493409,98.470363,98.834951
77449,100,100.0,100.0,100.0,100.0,102.040816,100.0,100.0,100.0,100.0,...,100.0,100.0,101.075269,101.06383,100.0,98.947368,100.0,101.06383,101.052632,101.041667


In [20]:
# creating a new DF to manipulate numbers
percent_df2 = percent_df

# Drop 1st column, reset index and rename 1st new column 
# percent_df2 = percent_df2.drop(columns=['1996-04'])
percent_df2 = percent_df2.reset_index()
percent_df2 = percent_df2.rename(columns ={"index" : "zip_code"})
percent_df2.head()

Unnamed: 0,zip_code,1996-04,1996-05,1996-06,1996-07,1996-08,1996-09,1996-10,1996-11,1996-12,...,2018-11,2018-12,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08
0,60657,1,1.00641,1.0,1.006369,1.0,1.0,1.006329,1.0,1.006289,...,1.0,1.002092,1.004175,1.004158,1.010352,1.010246,1.0,0.989858,0.985656,0.991684
1,10023,1,1.0,1.0,1.0,1.002786,1.0,1.002778,1.0,1.00554,...,0.993095,0.993047,0.991088,0.990366,0.98703,0.985545,0.992,0.999328,0.993948,0.993911
2,77494,1,1.014925,1.0,0.985294,1.0,1.0,0.985075,0.984848,1.0,...,1.0,1.00885,1.0,1.0,1.0,1.0,1.0,1.0,0.991228,0.99115
3,60614,1,1.005025,1.005,1.0,1.0,1.0,1.0,0.995025,1.0,...,1.001908,1.00381,1.003795,1.005671,1.003759,1.0,0.994382,0.984934,0.984704,0.98835
4,77449,1,1.0,1.0,1.0,1.0,1.020408,1.0,1.0,1.0,...,1.0,1.0,1.010753,1.010638,1.0,0.989474,1.0,1.010638,1.010526,1.010417


# Calculate the product of all values in date columns (per row / zip code)

In [18]:
# # Iterations through rows and columns
# Row Product >> Multipling all  values in "date" columns (by row). to get the total variation over time.

z = {}
for i, r in percent_df2.iterrows():
    z[percent_df2.iloc[i,0]] = []
    vcalc2 = 1
    for q in range (1, 282):
        vcalc2 = vcalc2 * (percent_df2.iloc[i,q])
    vcalc2 = vcalc2*100
    z[percent_df2.iloc[i,0]].append(vcalc2)

z   

{'60657': [305.7692307692313],
 '10023': [409.19220055710264],
 '77494': [167.16417910447723],
 '60614': [255.77889447236157],
 '77449': [197.9591836734694],
 '79936': [170.37037037037013],
 '77084': [189.79591836734687],
 '10002': [482.9629629629627],
 '10467': [332.5581395348832],
 '11226': [621.0526315789468],
 '60640': [352.13675213675225],
 '94109': [431.7164179104471],
 '10016': [417.8694158075606],
 '78660': [189.8550724637676],
 '37013': [211.11111111111128],
 '32162': [285.4838709677422],
 '11235': [436.0294117647057],
 '11375': [453.2894736842102],
 '60647': [355.3191489361701],
 '90250': [427.1186440677967],
 '37211': [261.29032258064524],
 '10029': [390.41666666666737],
 '10009': [490.2985074626865],
 '77573': [196.5517241379309],
 '60618': [300.00000000000006],
 '78130': [202.8985507246373],
 '77584': [203.70370370370364],
 '10011': [569.7183098591543],
 '10128': [430.9352517985613],
 '28269': [169.11764705882374],
 '20002': [693.5064935064912],
 '78613': [181.818181818181

In [25]:
# Generating a Data Frame from the iteration above
final_df = pd.DataFrame.from_dict(z, orient="index")
final_df.head()

Unnamed: 0,0
60657,305.769231
10023,409.192201
77494,167.164179
60614,255.778894
77449,197.959184


In [22]:
# Sort the values
final_df = final_df.sort_values(by =[0], ascending=False)
#final_df = final_df.replace(np.inf, np.nan).dropna(how="all")

# Store the top and lower 10 market price variations
Greatest10 = final_df.head(10)
Lowest10 = final_df.tail(10)

print(Greatest10)
print(Lowest10)

                 0
19146  1533.333333
11101  1050.000000
80449   927.272727
02128   874.000000
11238   863.025210
11221   849.295775
11976   839.669421
11249   829.054054
57033   808.333333
90291   804.571429
               0
29340  66.666667
66860  66.666667
62549  66.433566
35098  64.492754
22503  63.483146
67455  63.461538
29015  62.745098
35062  57.823129
62801  53.571429
31520  44.554455


In [10]:
# Save the index 
percent_df.to_csv("Output_files/new_output_alldates.csv", encoding='utf-8')