In [1]:
# Required Modules
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier

In [3]:
# Load the prepared dataset from the project-relative Resources folder
data = pd.read_csv('Resources/Final_Data.csv')

# Preview the first five rows to sanity-check the columns that were read
data.head()

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,2019,1,244950.0,4.464,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,195960.0,360.0,1.00372,988.713278,0.126012
1,32003,2019-02-01,2019,2,270000.0,4.37,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,216000.0,360.0,1.003642,1077.819242,0.137369
2,32003,2019-03-01,2019,3,275000.0,4.265,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,220000.0,360.0,1.003554,1084.200547,0.138182
3,32003,2019-04-01,2019,4,264500.0,4.1425,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,211600.0,360.0,1.003452,1027.671397,0.130978
4,32003,2019-05-01,2019,5,281000.0,4.072,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,224800.0,360.0,1.003393,1082.581718,0.137976


In [4]:
# Report the (rows, columns) size of the loaded frame
data.shape

(24382, 29)

## Set aside 2021 data

In [6]:
# Hold out the 2021 rows, ordered by ZIP code and then chronologically
# (ascending order is the sort_values default for every key)
is_2021 = data['Year'] == 2021
ml_data_21 = data[is_2021].sort_values(by=['Zip_Code', 'Year', 'Month'])

ml_data_21.head(100)

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
24,32003,2021-01-01,2021,1,299900.0,2.7350,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,239920.0,360.0,1.002279,977.547067,0.124589
25,32003,2021-02-01,2021,2,333800.0,2.8100,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,267040.0,360.0,1.002342,1098.672679,0.140027
26,32003,2021-03-01,2021,3,339950.0,3.0825,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,271960.0,360.0,1.002569,1158.730395,0.147681
50,32008,2021-01-01,2021,1,132000.0,2.7350,1120,800,33663.0,0.285180,...,1154,93,82,34,118,105600.0,360.0,1.002279,430.264131,0.153378
51,32008,2021-02-01,2021,2,53200.0,2.8100,1120,800,33663.0,0.285180,...,1154,93,82,34,118,42560.0,360.0,1.002342,175.103015,0.062420
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
885,32091,2021-01-01,2021,1,162000.0,2.7350,1092,743,42036.0,0.212104,...,7036,400,85,26,247,129600.0,360.0,1.002279,528.051433,0.150743
886,32091,2021-02-01,2021,2,145000.0,2.8100,1092,743,42036.0,0.212104,...,7036,400,85,26,247,116000.0,360.0,1.002342,477.254459,0.136242
887,32091,2021-03-01,2021,3,134000.0,3.0825,1092,743,42036.0,0.212104,...,7036,400,85,26,247,107200.0,360.0,1.002569,456.743265,0.130386
912,32092,2021-01-01,2021,1,330400.0,2.7350,3637,1113,90055.0,0.148309,...,7603,1510,107,30,209,264320.0,360.0,1.002279,1076.964157,0.143508


## Create train/test dataframe

In [8]:
# Columns carried into the modelling frame: ZIP/date keys, the candidate
# features, and Sale_Price (the lag/lead price columns are derived from it later)
selected_columns = [
    'Zip_Code', 'Year', 'Month',
    'Mobility_Rate', 'Expense_Index', 'Crime_Index',
    'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count',
    'Home_Affordability', 'Rent_Affordability',
    'Sale_Price',
]
ml_data = data[selected_columns]

ml_data.shape

(24382, 13)

In [9]:
# Drop 2021 data
# Filter with a boolean mask and take an explicit copy instead of dropping rows
# in place: `ml_data` was sliced out of `data`, and an in-place drop on such a
# slice triggers pandas' SettingWithCopyWarning (hidden in this notebook by the
# global warnings filter). The surviving rows keep their original index labels.
ml_data = ml_data.loc[ml_data['Year'] != 2021].copy()

ml_data.shape

(21764, 13)

In [13]:
# Create copy to try three-month training data
# sort_values returns a new frame, so `ml_data` itself is left untouched.
ml_data_3mo = ml_data.sort_values(by=['Zip_Code', 'Year', 'Month'])

# Shift the 2020 month numbers by 12 so that Month no longer restarts at 1
# when the year changes (2020 rows become 13-24).
ml_data_3mo.loc[ml_data_3mo.Year == 2020, "Month"] += 12

# Preview only: asking for every row (the old hard-coded row count) dumps the
# whole frame into the notebook output.
ml_data_3mo.head()

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24371,34997,2020,22,0.143471,91,113,2976,21551,139,17,0.192070,0.247189,285000.0
24372,34997,2020,23,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
24373,34997,2020,23,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
24374,34997,2020,24,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0


In [14]:
# Add the previous month's price and the prices 1, 2 and 3 rows ahead,
# computed separately within each ZIP code so values never cross ZIP boundaries.
# NOTE(review): shift() moves by rows, not calendar months - this lines up with
# months only if every ZIP has exactly one row per month; confirm for this data.
price_by_zip = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price']
ml_data_3mo = ml_data_3mo.assign(
    Last_Month_Price=price_by_zip.shift(1),
    Future_Month_Price=price_by_zip.shift(-1),
    Future_2Month_Price=price_by_zip.shift(-2),
    Future_3Month_Price=price_by_zip.shift(-3),
)

ml_data_3mo.head()

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Future_Month_Price,Future_2Month_Price,Future_3Month_Price
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0,,270000.0,275000.0,264500.0
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0,244950.0,275000.0,264500.0,281000.0
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0,270000.0,264500.0,281000.0,275000.0
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,281000.0,275000.0,282500.0
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,275000.0,282500.0,269577.0


In [None]:
# Use only future 3 month data
# Keep ZIP and Month?  (currently: ZIP and Year are dropped, Month is kept)
# errors="ignore" keeps this cell idempotent: it reassigns `ml_data_3mo`, so a
# second run would otherwise raise KeyError on the already-removed columns.
ml_data_3mo = ml_data_3mo.drop(
    columns=['Zip_Code', 'Year', 'Future_Month_Price', 'Future_2Month_Price'],
    errors="ignore",
)

# Preview only: a 3000-row head floods the notebook output.
ml_data_3mo.head()

In [None]:
# Treat +/-infinity as missing, then discard every row that has any missing
# value (this includes rows whose shifted price columns are NaN)
ml_data_3mo = ml_data_3mo.replace([np.inf, -np.inf], np.nan).dropna()

In [None]:
# Split the frame into the target vector (price three rows ahead) and the
# feature matrix made of every remaining column
target_column = 'Future_3Month_Price'
y = ml_data_3mo[target_column].to_numpy()
X = ml_data_3mo.drop(target_column, axis=1)

# Column labels of the feature matrix, kept for later reference
feature_names = X.columns

X

In [None]:
# Shuffled 75/25 train/test split with a fixed seed (these are the
# train_test_split defaults, spelled out for the reader)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=42
)

In [None]:
# Sanity check: the test features and test targets must have the same length
print(len(X_test), len(y_test), sep="\n")

In [None]:
# Random forest regressor: 1000 trees, all CPU cores, fixed seed for repeatability
rfr_params = {"n_estimators": 1000, "n_jobs": -1, "random_state": 42}
rfr_model = RandomForestRegressor(**rfr_params)
rfr_model.fit(X_train, y_train)

In [None]:
# R2 (coefficient of determination) on the training and the held-out split
train_r2 = rfr_model.score(X_train, y_train)
test_r2 = rfr_model.score(X_test, y_test)
print(f"Training Data Score: {train_r2}")
print(f"Testing Data Score: {test_r2}")

In [None]:
# Build the 2021 rows to predict on.
# Filtering `ml_data_3mo` on Year == 2021 cannot work: the 2021 rows were removed
# from `ml_data`, and the 'Year' column was dropped from `ml_data_3mo`, in
# earlier cells - the lookup raises KeyError. Rebuild the rows from the full
# `data` frame instead, with the same feature columns the model was trained on.
data_sorted = data.sort_values(by=['Zip_Code', 'Year', 'Month'])
last_month_price = data_sorted.groupby('Zip_Code')['Sale_Price'].shift()

predict_21 = (
    data_sorted
    .assign(Last_Month_Price=last_month_price)
    .loc[data_sorted['Year'] == 2021]
)
# Continue the running month index used for training (2020 rows were shifted
# by 12, so 2021 rows are shifted by 24).
predict_21 = predict_21.assign(Month=predict_21['Month'] + 24)
# Keep only the training feature columns, in training order, and apply the same
# inf/NaN clean-up as the training frame.
predict_21 = (
    predict_21[list(feature_names)]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

predict_21

In [None]:
# Evaluate the regressor on the held-out split.
# The previous cell content built a confusion matrix from an undefined
# `predict_test`; a confusion matrix is a classification metric and rejects the
# continuous sale-price target. Predict with the fitted model and report
# regression error metrics instead.
predict_test = rfr_model.predict(X_test)

mae = metrics.mean_absolute_error(y_test, predict_test)
# Square root of the MSE gives the error in the same units as the target
rmse = np.sqrt(metrics.mean_squared_error(y_test, predict_test))

print(f"Mean Absolute Error (MAE): {mae:,.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.2f}")

In [None]:
# `predict_rf` was never defined, and accuracy_score only accepts class labels,
# not continuous targets. Predict on the held-out split and report R2, the
# goodness-of-fit score for a regressor.
predict_rf = rfr_model.predict(X_test)
print("R2 score:", metrics.r2_score(y_test, predict_rf))