In [7]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import requests
import time
import glob
from scipy.stats import linregress
from datetime import datetime
from itertools import islice

# Import API key
from config import CSMITH

In [9]:
fl_census_data = pd.read_csv('Resources/ACSST5Y2019.S1901_data_with_overlays_2021-05-24T165952.csv', header=1)
fl_codes = []

for i,j in fl_census_data.iterrows():
    fl_codes.append(j['Geographic Area Name'][6:11])


split_codes = [iter(fl_codes)] * 123
    
zip_codes = list(zip(*split_codes))

# zip_codes[0]

In [11]:
# count = 0
# for code in zip_codes[2]:
#     count += 1

# print(count)
# zip_codes[7][83:]
# fl_codes_int = []
# for code in fl_codes:
#     fl_codes_int.append(int(code))


fl_zipcodes = pd.Series(fl_codes, name='zipcode')
fl_zipcodes

0      32003
1      32008
2      32009
3      32011
4      32024
       ...  
979    34990
980    34994
981    34996
982    34997
983     Stat
Name: zipcode, Length: 984, dtype: object

In [104]:
# Read in historical mortgage rate
mortgage_data = pd.read_csv('Resources/morgage_rates.csv')
year_mortgage = []
month_mortgage = []
mortgage_rate = []

# Loop to limit data for 2019 & 2020
for i,j in mortgage_data.iterrows():
    if j['DATE']>='2019%' and j['DATE']<'2021%':
        year_mortgage.append(pd.to_datetime(j['DATE']).year)
        month_mortgage.append(pd.to_datetime(j['DATE']).month)
        mortgage_rate.append(j['MORTGAGE30US'])


# Create mortgage rate DF grouped by year and month to get the mean
mortgage_rates = pd.DataFrame({"year":year_mortgage, "month":month_mortgage, 
                        "mortgage_rate":mortgage_rate}).groupby(['year', 'month'])

# Export csv of calculated monthly rates
mortgage_rates.mean().to_csv('Resources/monthly_morgage_rates.csv')

In [72]:
# Establish API URL
property_url = "https://api.gateway.attomdata.com/propertyapi/v1.0.0/"

sales_trend = "salestrend/snapshot?geoid=ZI"

sales_trend_dates = "&interval=monthly&startyear=2019&endyear=2020&startmonth=january&endmonth=december"

headers = {'Accept': 'application/json', 'apikey': CSMITH}


# Establish variable for information desired
year = []
month = []
zip_code_sales = []
total_sales = []
average_price = []
median_price = []
sales_no_data = []

for code in sales_missing_zips:
    # Make API call
    sales_response = requests.post( f'{property_url}{sales_trend}{code}{sales_trend_dates}',
                                    headers = headers)
    
    if sales_response.status_code != 200:
        sales_no_data.append(sales_response.url)
    else:
        sales = sales_response.json()

        # Loop through API data and append to variables
        for i in sales['salestrends']:
            year.append(pd.to_datetime(i['daterange']['start']).year)
            month.append(pd.to_datetime(i['daterange']['start']).month)
            zip_code_sales.append(i['location']['geoID'][2:7])
            total_sales.append(i['SalesTrend']['homesalecount'])
            average_price.append(i['SalesTrend']['avgsaleprice'])
            median_price.append(i['SalesTrend']['medsaleprice'])
            
        # Set delay based on API call limitations
        time.sleep(7)

# Create new DF with data
sales_data = pd.DataFrame({"zipcode": zip_code_sales, "year":year, "month":month, 
                        "total_sales":total_sales, "avg_sale_price":average_price,
                        "med_sale_price":median_price})

sales_data

Unnamed: 0,zipcode,year,month,total_sales,avg_sale_price,med_sale_price
0,34945,2019,1,2,287500.0,287500.0
1,34945,2019,3,1,620000.0,620000.0
2,34945,2019,4,3,222330.0,228000.0
3,34945,2019,5,2,281500.0,281500.0
4,34945,2019,6,1,201000.0,201000.0
...,...,...,...,...,...,...
519,34997,2020,8,123,288527.0,251000.0
520,34997,2020,9,115,306618.0,256000.0
521,34997,2020,10,139,324712.0,285000.0
522,34997,2020,11,132,315426.0,259750.0


In [5]:
# Create new DF with data
sales_data_1 = pd.DataFrame({"zipcode": zip_code_sales, "year":year, "month":month, 
                        "total_sales":total_sales, "avg_sale_price":average_price,
                        "med_sale_price":median_price})


In [6]:

sales_data_1

Unnamed: 0,zipcode,year,month,total_sales,avg_sale_price,med_sale_price
0,34291,2019,1,10,231310.0,206000.0
1,34291,2019,2,13,221992.0,240000.0
2,34291,2019,3,21,266510.0,242000.0
3,34291,2019,4,22,267242.0,233000.0
4,34291,2019,5,31,232868.0,196500.0
...,...,...,...,...,...,...
1925,34747,2020,8,194,315594.0,272750.0
1926,34747,2020,9,177,307848.0,295000.0
1927,34747,2020,10,211,324094.0,282000.0
1928,34747,2020,11,184,384162.0,305000.0


In [12]:
sales_data_2 = pd.concat([sales_data_1, sales_data])

sales_data_2

Unnamed: 0,zipcode,year,month,total_sales,avg_sale_price,med_sale_price
0,34291,2019,1,10,231310.0,206000.0
1,34291,2019,2,13,221992.0,240000.0
2,34291,2019,3,21,266510.0,242000.0
3,34291,2019,4,22,267242.0,233000.0
4,34291,2019,5,31,232868.0,196500.0
...,...,...,...,...,...,...
912,34997,2020,8,123,288527.0,251000.0
913,34997,2020,9,115,306618.0,256000.0
914,34997,2020,10,139,324712.0,285000.0
915,34997,2020,11,132,315426.0,259750.0


In [103]:
rates = pd.read_csv('Resources/monthly_morgage_rates.csv')

final_sales = pd.merge(sales_data, rates, on=['year','month'])

final_sales

Unnamed: 0,zipcode,year,month,total_sales,avg_sale_price,med_sale_price,mortgage_rate
0,34945,2019,1,2,287500.0,287500.0,4.464
1,34946,2019,1,6,97683.0,80000.0,4.464
2,34947,2019,1,8,117275.0,81000.0,4.464
3,34949,2019,1,27,276930.0,240000.0,4.464
4,34950,2019,1,24,84782.0,72500.0,4.464
...,...,...,...,...,...,...,...
519,34987,2019,2,14,259786.0,255000.0,4.370
520,34990,2019,2,41,349615.0,329000.0,4.370
521,34994,2019,2,26,208607.0,172500.0,4.370
522,34996,2019,2,29,508266.0,245000.0,4.370


In [104]:
final_sales.to_csv('Resources/Sales_Area_Data/Sales/sales_missing.csv')

In [109]:
sales_path = r'Resources/Sales_Area_Data/Sales'
sales_files = glob.glob(sales_path + "/*.csv")

sales_df = []

for filename in sales_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    sales_df.append(df)

sales_19_20 = pd.concat(sales_df, axis=0, ignore_index=True)

sales_19_20 = sales_19_20.drop(columns='Unnamed: 0')

sales_19_20.to_csv('Resources/Sales_Area_Data/Sales/final_sales_data_19_20.csv')

sales_19_20

Unnamed: 0,zipcode,year,month,total_sales,avg_sale_price,med_sale_price,mortgage_rate
0,34291,2019,1,10,231310.0,206000.0,4.464
1,34292,2019,1,34,283729.0,272500.0,4.464
2,34293,2019,1,91,238282.0,217000.0,4.464
3,34420,2019,1,29,126858.0,115100.0,4.464
4,34428,2019,1,17,165129.0,147200.0,4.464
...,...,...,...,...,...,...,...
22345,33559,2020,11,26,277000.0,260750.0,2.765
22346,33563,2020,11,43,148678.0,157000.0,2.765
22347,33565,2020,11,21,213188.0,186000.0,2.765
22348,33566,2020,11,32,254442.0,262500.0,2.765


In [114]:
# tes = pd.concat(['sales_call_1.csv', 'sales_call_2.csv', 'sales_call_3.csv',
#                 'sales_call_4.csv', 'sales_call_5.csv', 'sales_call_6.csv',
#                 'sales_call_7.csv', 'sales_call_8.csv', 'sales_missing.csv' ], axis=0,
#                 ignore_index=True)
from os import listdir

filepaths = [f for f in listdir("./Resources/Sales_Area_Data/Sales") if f.endswith('.csv')]
df = pd.concat(map(pd.read_csv, filepaths))


# df = pd.concat(map(pd.read_csv, ['sales_call_1.csv', 'sales_call_2.csv', 'sales_call_3.csv',
#                 'sales_call_4.csv', 'sales_call_5.csv', 'sales_call_6.csv',
#                 'sales_call_7.csv', 'sales_call_8.csv', 'sales_missing.csv']))

df

FileNotFoundError: [Errno 2] File final_sales_data_19_20.csv does not exist: 'final_sales_data_19_20.csv'

In [100]:
sales_zip_group = sales_19_20.groupby('zipcode')
sales_zip_group.mean()

Unnamed: 0_level_0,year,month,total_sales,avg_sale_price,med_sale_price,mortgage_rate
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
32003,2019.500000,6.500000,46.958333,305006.916667,282449.041667,3.523708
32008,2019.521739,6.652174,4.956522,141085.565217,133581.086957,3.491478
32009,2019.523810,6.333333,3.428571,197787.571429,195767.523810,3.525976
32011,2019.500000,6.500000,19.708333,187515.166667,183814.583333,3.523708
32024,2019.500000,6.500000,22.750000,166671.166667,146029.166667,3.523708
...,...,...,...,...,...,...
34987,2019.500000,6.500000,33.958333,282638.708333,273252.208333,3.523708
34990,2019.500000,6.500000,74.708333,400646.791667,367229.166667,3.523708
34994,2019.500000,6.500000,38.583333,250662.375000,167126.208333,3.523708
34996,2019.500000,6.500000,42.291667,532194.583333,300856.250000,3.523708


In [101]:
sales_zipcodes_grouped = sales_zip_group['zipcode'].mean().tolist()

sales_zipcodes = pd.Series(sales_zipcodes_grouped)

missing_sales = pd.DataFrame({'Sales':sales_zipcodes, 'FL':fl_zipcodes})

missing_sales = missing_sales[missing_sales.isna().any(axis=1)]

sales_missing_zips = missing_sales['FL'].tolist()

sales_missing_zips

['34945',
 '34946',
 '34947',
 '34949',
 '34950',
 '34951',
 '34952',
 '34953',
 '34956',
 '34957',
 '34972',
 '34974',
 '34981',
 '34982',
 '34983',
 '34984',
 '34986',
 '34987',
 '34990',
 '34994',
 '34996',
 '34997',
 ' Stat']

In [13]:
# Establish API URL
community_url = "https://api.gateway.attomdata.com/communityapi/v2.0.0"

area_url = "/area/full?AreaId=ZI"

headers = {'Accept': 'application/json', 'apikey': CSMITH}

# Establish variable for information desired
zip_code_area = []
prop_tax = []
owner_occupied = []
renter_occupied = []
total_vacant =[]
total_dwellings = []
home_age = []
studio_rent = []
one_bed_rent = []
two_bed_rent = []
three_bed_rent = []
four_bed_rent = []
fte_employed = []
unemployed = []
avg_income = []
proj_income = []
expense_index = []
avg_commute = []
crime_index = []
area_no_data = []

for code in area_missing_zips:
    
    # Make API call
    area_response = requests.post( f'{community_url}{area_url}{code}', headers = headers)
    if area_response.status_code != 200:
        area_no_data.append(area_response.url)
    else:
        area = area_response.json()
        for i in area:
            zip_code_area.append(area['response']['inputparameter']['AreaId'][2:7])
        for i in area['response']['result']['package']['item']:
            prop_tax.append(i['avg_prop_tax'])
            owner_occupied.append(i['dwlowned'])
            renter_occupied.append(i['dwlrent'])
            total_vacant.append(i['dwlvacnt'])
            total_dwellings.append(i['dwltotal'])
            home_age.append(i['houmedage'])
            studio_rent.append(i['studio_county'])
            one_bed_rent.append(i['one_bed_county'])
            two_bed_rent.append(i['two_bed_county'])
            three_bed_rent.append(i['three_bed_county'])
            four_bed_rent.append(i['four_bed_county'])
            fte_employed.append(i['emptotal'])
            unemployed.append(i['empunemp'])
            avg_income.append(i['inccyavehh'])
            proj_income.append(i['incavehhpy_5'])
            expense_index.append(i['idxexptotal'])
            avg_commute.append(i['trwave'])
            crime_index.append(i['crmcytotc'])
            


    # Set delay based on API call limitations
    time.sleep(7)

area_data = pd.DataFrame({'zipcode': zip_code_area, 'property_tax': prop_tax,
                        'owner_occupied': owner_occupied, 'renter_occupied': renter_occupied,
                        'total_vacant': total_vacant, 'total_dwellings': total_dwellings,
                        'studio_rent': studio_rent, 'one_bed_rent': one_bed_rent,
                        'two_bed_rent': two_bed_rent, 'three_bed_rent': three_bed_rent,
                        'four_bed_rent': four_bed_rent, 'fte_employed': fte_employed,
                        'unemployed':unemployed, 'average_income':avg_income,
                        'projected_income':proj_income, 'expense_index':expense_index,
                        'average_commute':avg_commute, 'crime_index':crime_index})
area_data

Unnamed: 0,zipcode,property_tax,owner_occupied,renter_occupied,total_vacant,total_dwellings,studio_rent,one_bed_rent,two_bed_rent,three_bed_rent,four_bed_rent,fte_employed,unemployed,average_income,projected_income,expense_index,average_commute,crime_index
0,34756,3105,1124,215,180,1519,1055,1140,1321,1713,2057,393,450,95870,108969,99,30,106
1,34758,2215,9223,3519,2886,15628,1055,1140,1321,1713,2057,4534,3694,59330,66780,118,33,387
2,34759,2296,10659,3504,3780,17943,794,799,1023,1375,1764,2919,2303,57938,64290,103,37,203
3,34760,2564,229,118,36,382,1055,1140,1321,1713,2057,309,247,85790,100638,102,28,1068
4,34761,2815,11977,4278,1329,17583,1055,1140,1321,1713,2057,16249,4448,91575,107005,110,28,492
5,34762,2257,269,54,101,424,1055,1140,1321,1713,2057,386,68,58093,65772,81,27,386
6,34769,1862,6583,3440,1954,11977,1055,1140,1321,1713,2057,7513,4002,63687,72027,90,29,275
7,34771,3104,6601,2088,1451,10140,1055,1140,1321,1713,2057,2617,2134,86684,97295,103,32,304
8,34772,2814,8169,2896,1768,12833,1055,1140,1321,1713,2057,3195,2883,77883,87617,110,30,208
9,34773,3744,1441,500,572,2513,1055,1140,1321,1713,2057,734,541,83314,93835,110,38,500


In [9]:
area_data_1 = pd.DataFrame({'zipcode': zip_code_area, 'property_tax': prop_tax,
                        'owner_occupied': owner_occupied, 'renter_occupied': renter_occupied,
                        'total_vacant': total_vacant, 'total_dwellings': total_dwellings,
                        'studio_rent': studio_rent, 'one_bed_rent': one_bed_rent,
                        'two_bed_rent': two_bed_rent, 'three_bed_rent': three_bed_rent,
                        'four_bed_rent': four_bed_rent, 'fte_employed': fte_employed,
                        'unemployed':unemployed, 'average_income':avg_income,
                        'projected_income':proj_income, 'expense_index':expense_index,
                        'average_commute':avg_commute, 'crime_index':crime_index})
area_data_1

Unnamed: 0,zipcode,property_tax,owner_occupied,renter_occupied,total_vacant,total_dwellings,studio_rent,one_bed_rent,two_bed_rent,three_bed_rent,four_bed_rent,fte_employed,unemployed,average_income,projected_income,expense_index,average_commute,crime_index
0,33569,3310,8393,2363,1207,11962,989,1040,1271,1651,2028,9928,1552,87173,100534,106,27,233
1,33570,3140,6985,3007,2195,12187,989,1040,1271,1651,2028,4464,962,71461,82748,104,26,375
2,33572,4998,6853,2463,1429,10746,989,1040,1271,1651,2028,4188,605,111121,127182,98,30,271
3,33573,2551,12562,3892,2853,19307,989,1040,1271,1651,2028,4670,419,64187,74338,65,18,224
4,33576,2032,1857,308,318,2483,989,1040,1271,1651,2028,1831,205,87684,98954,81,26,151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98,33843,1162,3001,901,1585,5488,794,799,1023,1375,1764,1788,594,56561,62578,91,27,194
99,33844,2048,11259,4914,5705,21878,794,799,1023,1375,1764,8307,2008,55173,61635,94,29,275
100,33847,1093,54,17,11,82,794,799,1023,1375,1764,38,15,58266,64424,101,32,413
101,33848,1640,296,99,285,680,1055,1140,1321,1713,2057,1003,191,63436,71345,91,25,431


In [16]:
area_data = pd.concat([area_data_1, area_data_2])
area_data

Unnamed: 0,zipcode,property_tax,owner_occupied,renter_occupied,total_vacant,total_dwellings,studio_rent,one_bed_rent,two_bed_rent,three_bed_rent,four_bed_rent,fte_employed,unemployed,average_income,projected_income,expense_index,average_commute,crime_index
0,33569,3310,8393,2363,1207,11962,989,1040,1271,1651,2028,9928,1552,87173,100534,106,27,233
1,33570,3140,6985,3007,2195,12187,989,1040,1271,1651,2028,4464,962,71461,82748,104,26,375
2,33572,4998,6853,2463,1429,10746,989,1040,1271,1651,2028,4188,605,111121,127182,98,30,271
3,33573,2551,12562,3892,2853,19307,989,1040,1271,1651,2028,4670,419,64187,74338,65,18,224
4,33576,2032,1857,308,318,2483,989,1040,1271,1651,2028,1831,205,87684,98954,81,26,151
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14,33873,1240,2934,1576,695,5206,565,569,750,972,1168,4364,534,56587,64385,110,26,275
15,33875,1481,4357,788,896,6041,575,661,871,1100,1183,963,165,65706,72490,84,21,97
16,33876,1310,1821,471,713,3004,575,661,871,1100,1183,669,64,52153,57508,80,22,154
17,33877,852,76,44,26,147,794,799,1023,1375,1764,123,47,44884,49628,72,23,102


In [15]:
area_data.to_csv('Resources/Sales_Area_Data/area_call_missing.csv')

In [19]:
area_path = r'Resources/Sales_Area_Data/Area'
area_files = glob.glob(area_path + "/*.csv")

area_df = []

for filename in area_files:
    df = pd.read_csv(filename, index_col=None, header=0)
    area_df.append(df)


area_19 = pd.concat(area_df, axis=0, ignore_index=True).sort_values(by=['zipcode'])

area_19.to_csv('Resources/Sales_Area_Data/Area/final_area_data.csv')

In [18]:
area_zipcodes = area_19['zipcode'].astype(str)

missing_area = pd.DataFrame({'Area':area_zipcodes, 'FL':fl_zipcodes})

missing_area = missing_area[missing_area.isna().any(axis=1)]

area_missing_zips = missing_area['FL'].tolist()

area_missing_zips

[' Stat']