In [1]:
# Required Modules
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error

In [3]:
# Read in CSV data
data = pd.read_csv('Resources/Final_Data.csv')

data.head()

Unnamed: 0,Zip_Code,Date,Date_Code,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,201901,2019,1,244950.0,4.464,3778,1113,94154.0,...,10671,1069,104,35,28,195960.0,360.0,1.00372,988.713278,0.126012
1,32003,2019-02-01,201902,2019,2,270000.0,4.37,3778,1113,94154.0,...,10671,1069,104,35,28,216000.0,360.0,1.003642,1077.819242,0.137369
2,32003,2019-03-01,201903,2019,3,275000.0,4.265,3778,1113,94154.0,...,10671,1069,104,35,28,220000.0,360.0,1.003554,1084.200547,0.138182
3,32003,2019-04-01,201904,2019,4,264500.0,4.1425,3778,1113,94154.0,...,10671,1069,104,35,28,211600.0,360.0,1.003452,1027.671397,0.130978
4,32003,2019-05-01,201905,2019,5,281000.0,4.072,3778,1113,94154.0,...,10671,1069,104,35,28,224800.0,360.0,1.003393,1082.581718,0.137976


In [4]:
data.shape

(23510, 30)

## Set aside 2021 data

In [5]:
ml_data_21 = data.loc[data.Year == 2021]
ml_data_21 = ml_data_21[['Zip_Code', 'Year', 'Month', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 
                         'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 
                         'Rent_Affordability', 'Sale_Price']]
ml_data_21 = ml_data_21.sort_values(by = ['Zip_Code', 'Year', 'Month'], ascending = [True, True, True])

In [6]:
# Add columns for previous months' sales prices
ml_data_21['Last_Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift()
ml_data_21['Last_Month_Diff'] = ml_data_21.groupby(['Zip_Code'])['Last_Month_Price'].diff()

ml_data_21['Last_2Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift(2)
ml_data_21['Last_2Month_Diff'] = ml_data_21.groupby(['Zip_Code'])['Last_2Month_Price'].diff()

ml_data_21['Last_3Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift(3)
ml_data_21['Last_3Month_Diff'] = ml_data_21.groupby(['Zip_Code'])['Last_3Month_Price'].diff()

# Drop columns not needed for X values
ml_data_21 = ml_data_21.drop(columns=['Zip_Code', 'Year', 'Month'])

# ml_data_21.replace([np.inf, -np.inf], np.nan, inplace=True)
# ml_data_21.dropna(inplace=True)

ml_data_21.head(20000)

Unnamed: 0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Price,Last_3Month_Diff
24,0.182327,104,28,704,11724,39,6,0.124589,0.141853,299900.0,,,,,,
25,0.182327,104,28,704,11724,30,8,0.140027,0.141853,333800.0,299900.0,,,,,
26,0.182327,104,28,704,11724,70,4,0.147681,0.141853,339950.0,333800.0,33900.0,299900.0,,,
50,0.067037,82,118,964,3247,6,0,0.153378,0.285180,132000.0,,,,,,
51,0.067037,82,118,964,3247,3,1,0.062420,0.285180,53200.0,132000.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23481,0.141276,78,140,2780,9249,57,1,0.225364,0.209918,395000.0,345000.0,,,,,
23482,0.141276,78,140,2780,9249,14,0,0.275304,0.209918,465950.0,395000.0,50000.0,345000.0,,,
23507,0.143471,91,113,2976,21551,81,10,0.202930,0.247189,305000.0,,,,,,
23508,0.143471,91,113,2976,21551,116,7,0.201553,0.247189,300000.0,305000.0,,,,,


## 2019/2020 Data

In [7]:
# Select features for X
ml_data = data[['Zip_Code', 'Year', 'Month', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 'Rent_Affordability', 'Sale_Price' ]]

ml_data.shape

(23510, 13)

In [8]:
# Drop 2021 data
# index_names = ml_data[ml_data['Year'] == 2021 ].index
# ml_data.drop(index_names, inplace = True)

# ml_data.shape

In [9]:
# Create copy to try three-month training data
ml_data_3mo = ml_data.sort_values(by = ['Zip_Code', 'Year', 'Month'], ascending = [True, True, True])
ml_data_3mo.loc[ml_data_3mo.Year == 2020, "Month"] += 12
ml_data_3mo.loc[ml_data_3mo.Year == 2021, "Month"] += 24

ml_data_3mo.head(22000)

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
21995,34685,2021,26,0.139297,96,268,746,8332,32,1,0.150293,0.184243,315000.0
21996,34685,2021,27,0.139297,96,268,746,8332,33,0,0.185287,0.184243,375000.0
21997,34688,2019,1,0.152133,102,160,449,3389,13,1,0.232598,0.180844,405000.0
21998,34688,2019,2,0.152133,102,160,449,3389,15,1,0.182325,0.180844,321000.0


In [10]:
# Add columns for previous months' sales prices
ml_data_3mo['Last_Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift()
ml_data_3mo['Last_Month_Diff'] = ml_data_3mo.groupby(['Zip_Code'])['Last_Month_Price'].diff()

ml_data_3mo['Last_2Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(2)
ml_data_3mo['Last_2Month_Diff'] = ml_data_3mo.groupby(['Zip_Code'])['Last_2Month_Price'].diff()

ml_data_3mo['Last_3Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(3)
ml_data_3mo['Last_3Month_Diff'] = ml_data_3mo.groupby(['Zip_Code'])['Last_3Month_Price'].diff()

ml_data_3mo.head()

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Price,Last_3Month_Diff
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0,,,,,,
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0,244950.0,,,,,
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0,270000.0,25050.0,244950.0,,,
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,5000.0,270000.0,25050.0,244950.0,
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0


In [11]:
# Drop columns not needed for X values
# Keeping Month this time
ml_data_3mo = ml_data_3mo.drop(columns=['Zip_Code', 'Year', 'Month'])

ml_data_3mo.shape

(23510, 16)

In [12]:
ml_data_3mo.replace([np.inf, -np.inf], np.nan, inplace=True)
ml_data_3mo.dropna(inplace=True)

ml_data_3mo.shape

(19896, 16)

In [13]:
ml_data_3mo.head()

Unnamed: 0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Price,Last_3Month_Diff
4,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
5,0.182327,104,28,704,11724,43,3,0.13069,0.141853,275000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0
6,0.182327,104,28,704,11724,90,7,0.133641,0.141853,282500.0,275000.0,-6000.0,281000.0,16500.0,264500.0,-10500.0
7,0.182327,104,28,704,11724,64,5,0.125212,0.141853,269577.0,282500.0,7500.0,275000.0,-6000.0,281000.0,16500.0
8,0.182327,104,28,704,11724,50,3,0.13266,0.141853,286000.0,269577.0,-12923.0,282500.0,7500.0,275000.0,-6000.0


## Train/test

In [14]:
# Set X and y values
y = ml_data_3mo['Last_3Month_Price'].values

X = ml_data_3mo.drop(columns='Last_3Month_Price')

feature_names = X.columns

X

Unnamed: 0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Diff
4,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,-10500.0,275000.0,5000.0,25050.0
5,0.182327,104,28,704,11724,43,3,0.130690,0.141853,275000.0,281000.0,16500.0,264500.0,-10500.0,5000.0
6,0.182327,104,28,704,11724,90,7,0.133641,0.141853,282500.0,275000.0,-6000.0,281000.0,16500.0,-10500.0
7,0.182327,104,28,704,11724,64,5,0.125212,0.141853,269577.0,282500.0,7500.0,275000.0,-6000.0,16500.0
8,0.182327,104,28,704,11724,50,3,0.132660,0.141853,286000.0,269577.0,-12923.0,282500.0,7500.0,-6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23505,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0,285000.0,29000.0,256000.0,5000.0,-4000.0
23506,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0,259750.0,-25250.0,285000.0,29000.0,5000.0
23507,0.143471,91,113,2976,21551,81,10,0.202930,0.247189,305000.0,297000.0,37250.0,259750.0,-25250.0,29000.0
23508,0.143471,91,113,2976,21551,116,7,0.201553,0.247189,300000.0,305000.0,8000.0,297000.0,37250.0,-25250.0


In [15]:
# Do test-train-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [16]:
print(len(X_test))
print(len(y_test))

4974
4974


In [17]:
# Fit Random Forest Regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [18]:
# Get R2 score
print(f"Training Data Score: {rfr_model.score(X_train, y_train)}")
print(f"Testing Data Score: {rfr_model.score(X_test, y_test)}")

Training Data Score: 0.9925392163341774
Testing Data Score: 0.976635657973184


## Prediction

In [19]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [20]:
rf_pred = rfr_model.predict(X_test)

error = rmsle(y_test, rf_pred)
    
print('Mean Error = %.5f' % np.mean(error))

Mean Error = 0.04911


In [21]:
output = pd.DataFrame({'Predicted':rf_pred, 'Actual': y_test})
output

Unnamed: 0,Predicted,Actual
0,386609.700,382500.0
1,168674.175,169913.0
2,175304.980,175000.0
3,254050.886,255000.0
4,94048.819,96500.0
...,...,...
4969,181297.641,180000.0
4970,204364.500,205000.0
4971,157378.405,158000.0
4972,338255.311,337500.0


In [22]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': [100, 500, 1000,]}
grid = GridSearchCV(rfr_model, param_grid, verbose=3)
grid

GridSearchCV(estimator=RandomForestRegressor(n_estimators=1000, n_jobs=-1,
                                             random_state=42),
             param_grid={'n_estimators': [100, 500, 1000]}, verbose=3)

In [None]:
r2 = r2_score(yts, y_pred)
print(f'The R2 score for predicting sale prices of January, 2021 is {r2}.')