In [34]:
# Required Modules
import pandas as pd
import numpy as np
import sqlite3
from sqlite3 import Error
import os

# Identify database path
db_path = os.path.abspath('./sqlite.db')

# Connect to SQLite Database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [35]:
# FL Zip Codes from Census data for reference
fl_census_path = os.path.abspath('Resources/ACSST5Y2019.S1901_data_with_overlays_2021-05-24T165952.csv')
fl_census_df = pd.read_csv(fl_census_path, header=1)
fl_codes = []

for i,j in fl_census_df.iterrows():
    fl_codes.append(j['Geographic Area Name'][6:11])

fl_zipcodes = pd.DataFrame(fl_codes)
fl_zipcodes = fl_zipcodes.rename(columns={0:'zipcode'})

# Drop table if it exist
cursor.execute("DROP TABLE IF EXISTS zipcode")

# Create table
fl_zipcodes.to_sql('zipcode', conn, if_exists='replace', index=False)
fl_zipcodes['zipcode'].nunique()

984

In [36]:
# Sales data
sales_path = os.path.abspath('Resources/Sales_Area_Data/Sales/final_sales_data.csv')
sales_data_df = pd.read_csv(sales_path, header=0)
sales_data_df = sales_data_df[['zipcode','year','month','total_sales','avg_sale_price','med_sale_price','mortgage_rate']]

# Drop table if it exist
cursor.execute("DROP TABLE IF EXISTS sales")

# Create table
sales_data_df.to_sql('sales', conn, if_exists='replace', index=False)
sales_data_df['zipcode'].nunique()

962

In [37]:
# Sales data
area_path = os.path.abspath('Resources/Sales_Area_Data/Area/final_area_data.csv')
area_data_df = pd.read_csv(area_path, header=0)
area_data_df = area_data_df[['zipcode','property_tax','owner_occupied','renter_occupied','total_vacant','total_dwellings','studio_rent','one_bed_rent','two_bed_rent','three_bed_rent','four_bed_rent','fte_employed','unemployed','average_income','projected_income','expense_index','average_commute','crime_index']]
area_data_df.head()

# Drop table if it exist
cursor.execute("DROP TABLE IF EXISTS area")

# Create table
area_data_df.to_sql('area', conn, if_exists='replace', index=False)
area_data_df['zipcode'].nunique()

949

In [38]:
df = pd.read_sql('''SELECT      zip.zipcode AS Zip_Code, 
                                date((cast(y.year as text)||"-"||substr("0"||cast(m.month as text),-2,2)||"-01")) AS Date,
                                y.year AS Year, 
                                m.month AS Month,
                                ifnull(s.med_sale_price,0) AS Sale_Price,
                                ifnull(s.mortgage_rate,0) AS Interest_Rate,
                                ifnull(a.property_tax,0) AS Property_Tax,
                                ifnull(a.two_bed_rent,0) AS Rent_Price,
                                ifnull(inc.B19013001,0) AS Household_Income,
                                ifnull(a.two_bed_rent/(inc.B19013001/12),0) AS Rent_Affordability,
                                ifnull(s.total_sales,0) AS Total_Sales,
                                ifnull(fha.fha,0) AS FHA_Count,
                                ifnull(mob.mobility_rate,0) AS Mobility_Rate, 
                                ifnull(g.percent_total_points,0) AS School_Rating,
                                ifnull(g.letter_grade,"Unknown") AS School_Grade,
                                ifnull(a.owner_occupied,0) AS Owner_Occupied,
                                ifnull(a.renter_occupied,0) AS Renter_Occupied,
                                ifnull(a.total_vacant,0) AS Total_Vacant,
                                ifnull(a.total_dwellings,0) AS Total_Dwellings,
                                ifnull(a.fte_employed,0) AS FTE_Employed,
                                ifnull(a.unemployed,0) AS Unemployed,
                                ifnull(a.expense_index,0) AS Expense_Index,
                                ifnull(a.average_commute,0) AS Average_Commute,
                                ifnull(a.crime_index,0) AS Crime_Index
                        FROM zipcode AS zip
                        CROSS JOIN (SELECT 2019 AS year UNION SELECT 2020 AS year) AS y
                        CROSS JOIN (SELECT 1 AS month UNION SELECT 2 AS month UNION SELECT 3 AS month
                                UNION SELECT 4 AS month UNION SELECT 5 AS month UNION SELECT 6 AS month
                                UNION SELECT 7 AS month UNION SELECT 8 AS month UNION SELECT 9 AS month
                                UNION SELECT 10 AS month UNION SELECT 11 AS month UNION SELECT 12 AS month) AS m
                        LEFT JOIN fha_loans AS fha ON zip.zipcode = fha.zipcode AND y.year = fha.year AND m.month = fha.month
                        LEFT JOIN mobility_slim AS mob ON zip.zipcode = mob.name
                        LEFT JOIN household_income AS inc ON zip.zipcode = inc.name
                        LEFT JOIN grades AS g ON zip.zipcode = g.zip
                        LEFT JOIN sales AS s ON zip.zipcode = s.zipcode AND y.year = s.year AND m.month = s.month
                        LEFT JOIN area AS a ON zip.zipcode = a.zipcode
                        WHERE s.med_sale_price IS NOT NULL
                        AND s.mortgage_rate IS NOT NULL
                        AND y.year IS NOT NULL
                        AND m.month IS NOT NULL
                ''', conn)
df.head()

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,School_Grade,Owner_Occupied,Renter_Occupied,Total_Vacant,Total_Dwellings,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index
0,32003,2019-01-01,2019,1,244950.0,4.46,3778,1113,94154.0,0.14,...,Unknown,8834,2186,704,11724,10671,1069,104,35,28
1,32003,2019-02-01,2019,2,270000.0,4.37,3778,1113,94154.0,0.14,...,Unknown,8834,2186,704,11724,10671,1069,104,35,28
2,32003,2019-03-01,2019,3,275000.0,4.26,3778,1113,94154.0,0.14,...,Unknown,8834,2186,704,11724,10671,1069,104,35,28
3,32003,2019-04-01,2019,4,264500.0,4.14,3778,1113,94154.0,0.14,...,Unknown,8834,2186,704,11724,10671,1069,104,35,28
4,32003,2019-05-01,2019,5,281000.0,4.07,3778,1113,94154.0,0.14,...,Unknown,8834,2186,704,11724,10671,1069,104,35,28


In [42]:
# Calculate Home Affordability

# Loan Payment calc source: https://medium.com/personal-finance-analytics/mortgage-calculator-python-code-94d976d25a27
# df = df.append({'Zip_Code':32792,'Year':2011,'Month':10,'Sale_Price':178750,'Interest_Rate':2.875}, ignore_index=True) # PMT should = 593
df['Loan_Amount'] = df['Sale_Price']*.80 # Assuming 20% down payment
df['Loan_Term'] = float(30*12) # Assuming 30 year loan
df['Loan_R'] = 1+(df['Interest_Rate']/100/12)
df['Loan_Payment'] = df['Loan_Amount']*(df['Loan_R']**df['Loan_Term'])*(1-df['Loan_R'])/(1-df['Loan_R']**df['Loan_Term'])
# df['Home_Affordability'] = (df['Loan_Payment']+(df['Property_Tax']/12))/(df['Household_Income']/12)
df['Home_Affordability'] = (df['Loan_Payment']*12)/df['Household_Income']
df.tail()

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
22618,34997,2020-10-01,2020,10,285000.0,2.83,3067,1211,58789.0,0.25,...,15421,1267,91,24,113,228000.0,360.0,1.0,940.97,0.19
22619,34997,2020-11-01,2020,11,259750.0,2.77,3067,1211,58789.0,0.25,...,15421,1267,91,24,113,207800.0,360.0,1.0,849.98,0.17
22620,34997,2020-11-01,2020,11,259750.0,2.77,3067,1211,58789.0,0.25,...,15421,1267,91,24,113,207800.0,360.0,1.0,849.98,0.17
22621,34997,2020-12-01,2020,12,297000.0,2.68,3067,1211,58789.0,0.25,...,15421,1267,91,24,113,237600.0,360.0,1.0,961.69,0.2
22622,34997,2020-12-01,2020,12,297000.0,2.68,3067,1211,58789.0,0.25,...,15421,1267,91,24,113,237600.0,360.0,1.0,961.69,0.2


In [26]:
data = pd.read_csv('Resources/Final_Data.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,0,32003,2019-01-01,2019,1,244950.0,4.46,3778,1113,94154.0,...,10671,1069,104,35,28,195960.0,360.0,1.0,988.71,0.01
1,1,32003,2019-02-01,2019,2,270000.0,4.37,3778,1113,94154.0,...,10671,1069,104,35,28,216000.0,360.0,1.0,1077.82,0.01
2,2,32003,2019-03-01,2019,3,275000.0,4.26,3778,1113,94154.0,...,10671,1069,104,35,28,220000.0,360.0,1.0,1084.2,0.01
3,3,32003,2019-04-01,2019,4,264500.0,4.14,3778,1113,94154.0,...,10671,1069,104,35,28,211600.0,360.0,1.0,1027.67,0.01
4,4,32003,2019-05-01,2019,5,281000.0,4.07,3778,1113,94154.0,...,10671,1069,104,35,28,224800.0,360.0,1.0,1082.58,0.01


In [22]:
ml_data.dtypes

Zip_Code                int64
Date                   object
Year                    int64
Month                   int64
Sale_Price            float64
Interest_Rate         float64
Property_Tax            int64
Rent_Price              int64
Household_Income      float64
Rent_Affordability    float64
Total_Sales             int64
FHA_Count               int64
Mobility_Rate         float64
School_Rating           int64
School_Grade           object
Owner_Occupied          int64
Renter_Occupied         int64
Total_Vacant            int64
Total_Dwellings         int64
FTE_Employed            int64
Unemployed              int64
Expense_Index           int64
Average_Commute         int64
Crime_Index             int64
Loan_Amount           float64
Loan_Term             float64
Loan_R                float64
Loan_Payment          float64
Home_Affordability    float64
dtype: object

In [33]:
ml_data = data[['Zip_Code', 'Household_Income', 'Year', 'Month', 'Total_Sales', 'FHA_Count', 
'Home_Affordability', 'Rent_Affordability', 'Loan_Payment', 'Property_Tax', 'Sale_Price' ]]
ml_data

Unnamed: 0,Zip_Code,Household_Income,Year,Month,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Loan_Payment,Property_Tax,Sale_Price
0,32003,94154.00,2019,1,32,7,0.01,0.14,988.71,3778,244950.00
1,32003,94154.00,2019,2,41,2,0.01,0.14,1077.82,3778,270000.00
2,32003,94154.00,2019,3,21,7,0.01,0.14,1084.20,3778,275000.00
3,32003,94154.00,2019,4,42,3,0.01,0.14,1027.67,3778,264500.00
4,32003,94154.00,2019,5,66,8,0.01,0.14,1082.58,3778,281000.00
...,...,...,...,...,...,...,...,...,...,...,...
22618,34997,58789.00,2020,10,139,17,0.02,0.25,940.97,3067,285000.00
22619,34997,58789.00,2020,11,132,11,0.02,0.25,849.98,3067,259750.00
22620,34997,58789.00,2020,11,132,11,0.02,0.25,849.98,3067,259750.00
22621,34997,58789.00,2020,12,142,16,0.02,0.25,961.69,3067,297000.00


In [10]:
ml_data['zipcode'] = ml_data['zipcode'].astype(int)
ml_data.dtypes

zipcode                 int64
year                    int64
month                   int64
home_affordability      int64
rent_affordability      int64
total_sales             int64
fha                     int64
avg_sale_price        float64
dtype: object

In [59]:
drop_zips = ['33132', '33606', '32190', '34140', '32461', '33154']
test = ml_data[ml_data['zipcode'].isin(drop_zips)]
drop_index = test.index.tolist()
drop_index

[1427, 3101, 3102, 7613, 7614, 7615, 7801, 7802, 11081, 14359]

In [60]:
clean_data = ml_data.drop(drop_index)
len(clean_data)

18371

In [11]:
import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [13]:
y_value = ml_data['avg_sale_price'].values

x_values = ml_data.drop(columns='avg_sale_price')

feature_names = x_values.columns

x_values
y_value

array([289290., 286173., 301620., ..., 315426., 350031., 350031.])

In [14]:

X_train, X_test, y_train, y_test = train_test_split(x_values, y_value, random_state=42)

In [15]:
print("Training set - Features: ", X_train.shape, "Target: ", y_train.shape)
print("Test set - Features: ", X_test.shape, "Target: ",y_test.shape)

Training set - Features:  (13785, 7) Target:  (13785,)
Test set - Features:  (4596, 7) Target:  (4596,)


In [16]:
import lazypredict
from lazypredict.Supervised import LazyRegressor

In [17]:
reg = LazyRegressor(verbose=0, ignore_warnings=False, custom_metric=None)
reg_models, reg_predictions = reg.fit(X_train, X_test, y_train, y_test)
print(reg_models)

100%|██████████| 42/42 [01:07<00:00,  1.62s/it]                               Adjusted R-Squared  R-Squared        RMSE  \
Model                                                                      
XGBRegressor                                 0.65       0.65    85967.56   
RandomForestRegressor                        0.59       0.59    93008.05   
BaggingRegressor                             0.54       0.54    98971.56   
LGBMRegressor                                0.52       0.52   100409.50   
HistGradientBoostingRegressor                0.52       0.52   100870.47   
ExtraTreesRegressor                          0.48       0.48   104896.20   
GradientBoostingRegressor                    0.45       0.45   108197.67   
DecisionTreeRegressor                        0.26       0.26   124958.17   
KNeighborsRegressor                          0.24       0.24   126908.50   
ExtraTreeRegressor                           0.15       0.15   134017.82   
PoissonRegressor                         

In [18]:
from lazypredict.Supervised import LazyClassifier

clf = LazyClassifier(verbose=0,ignore_warnings=False)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models

  0%|          | 0/29 [00:20<?, ?it/s]


KeyboardInterrupt: 

In [4]:
# monthly mortgage payments
# price = float(ml_data['avg_sale_price'])
price = float(220000)
#Estimated 20% down
down_payment = float(0)
loan_amount = price*(1-down_payment/100)
#Estimated 30 year loan
loan_type = float(30)
loan_term = int(12*loan_type)
# interest_rate = float(ml_data['rate'])
interest_rate = float(4)
# Interest rate calculations
R = 1 + (interest_rate)/(12*100)
# Monthly Payment
X = loan_amount*(R**loan_term)*(1-R)/(1-R**loan_term)
F = X + (4000/12)
print(X)

1050.3136500240225
