In [1]:
# Required Modules
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [3]:
# Read in CSV data
data = pd.read_csv('Resources/Final_Data.csv')

data.head()

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,2019,1,244950.0,4.464,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,195960.0,360.0,1.00372,988.713278,0.126012
1,32003,2019-02-01,2019,2,270000.0,4.37,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,216000.0,360.0,1.003642,1077.819242,0.137369
2,32003,2019-03-01,2019,3,275000.0,4.265,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,220000.0,360.0,1.003554,1084.200547,0.138182
3,32003,2019-04-01,2019,4,264500.0,4.1425,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,211600.0,360.0,1.003452,1027.671397,0.130978
4,32003,2019-05-01,2019,5,281000.0,4.072,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,224800.0,360.0,1.003393,1082.581718,0.137976


In [4]:
data.shape

(24382, 29)

In [5]:
data_32003 = data.loc[data.Zip_Code == 32003]
data_32003 = data_32003[['Zip_Code', 'Year', 'Month', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 'Rent_Affordability', 'Sale_Price']]

## Set aside 2021 data

In [6]:
ml_data_21 = data.loc[data.Year == 2021]
ml_data_21 = ml_data_21[['Zip_Code', 'Year', 'Month', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 'Rent_Affordability', 'Sale_Price' ]]
ml_data_21 = ml_data_21.sort_values(by = ['Zip_Code', 'Year', 'Month'], ascending = [True, True, True])

In [7]:
# Add columns for previous months' sales prices
ml_data_21['Last_Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift()
ml_data_21['Last_Month_Diff'] = ml_data_21.groupby(['Zip_Code'])['Last_Month_Price'].diff()

ml_data_21['Last_2Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift(2)
ml_data_21['Last_2Month_Diff'] = ml_data_21.groupby(['Zip_Code'])['Last_2Month_Price'].diff()

ml_data_21['Last_3Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift(3)
ml_data_21['Last_3Month_Diff'] = ml_data_21.groupby(['Zip_Code'])['Last_3Month_Price'].diff()

# Drop columns not needed for X values
ml_data_21 = ml_data_21.drop(columns=['Zip_Code', 'Year', 'Month'])

ml_data_21.replace([np.inf, -np.inf], np.nan, inplace=True)
ml_data_21.dropna(inplace=True)

ml_data_21.head()

Unnamed: 0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Price,Last_3Month_Diff
22615,0.063615,99,106,180,1519,7,1,0.245128,0.21116,449900.0,362250.0,0.0,362250.0,34750.0,327500.0,0.0
22616,0.063615,99,106,180,1519,7,1,0.245128,0.21116,449900.0,449900.0,87650.0,362250.0,0.0,362250.0,34750.0
22669,0.13045,118,387,2886,15628,30,32,0.191097,0.296244,250000.0,227100.0,0.0,227100.0,7200.0,219900.0,0.0
22670,0.13045,118,387,2886,15628,30,32,0.191097,0.296244,250000.0,250000.0,22900.0,227100.0,0.0,227100.0,7200.0
22723,0.13304,103,203,3780,17943,67,67,0.177318,0.241901,220000.0,215000.0,0.0,215000.0,13800.0,201200.0,0.0


## 2019/2020 Data

In [8]:
# Select features for X
ml_data = data[['Zip_Code', 'Year', 'Month', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 'Rent_Affordability', 'Sale_Price' ]]

ml_data.shape

(24382, 13)

In [9]:
# Drop 2021 data
index_names = ml_data[ml_data['Year'] == 2021 ].index
ml_data.drop(index_names, inplace = True)

ml_data.shape

(21764, 13)

In [10]:
# Create copy to try three-month training data
ml_data_3mo = ml_data.sort_values(by = ['Zip_Code', 'Year', 'Month'], ascending = [True, True, True])
ml_data_3mo.loc[ml_data_3mo.Year == 2020, "Month"] += 12

ml_data_3mo.head(21764)

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24371,34997,2020,22,0.143471,91,113,2976,21551,139,17,0.192070,0.247189,285000.0
24372,34997,2020,23,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
24373,34997,2020,23,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
24374,34997,2020,24,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0


In [11]:
# Add columns for previous months' sales prices
ml_data_3mo['Last_Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift()
ml_data_3mo['Last_Month_Diff'] = ml_data_3mo.groupby(['Zip_Code'])['Last_Month_Price'].diff()

ml_data_3mo['Last_2Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(2)
ml_data_3mo['Last_2Month_Diff'] = ml_data_3mo.groupby(['Zip_Code'])['Last_2Month_Price'].diff()

ml_data_3mo['Last_3Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(3)
ml_data_3mo['Last_3Month_Diff'] = ml_data_3mo.groupby(['Zip_Code'])['Last_3Month_Price'].diff()

ml_data_3mo.head()

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Price,Last_3Month_Diff
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0,,,,,,
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0,244950.0,,,,,
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0,270000.0,25050.0,244950.0,,,
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,5000.0,270000.0,25050.0,244950.0,
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0


In [12]:
# Drop columns not needed for X values
# Keeping Month this time
ml_data_3mo = ml_data_3mo.drop(columns=['Zip_Code', 'Year', 'Month'])

ml_data_3mo.shape

(21764, 16)

In [13]:
ml_data_3mo.replace([np.inf, -np.inf], np.nan, inplace=True)
ml_data_3mo.dropna(inplace=True)

ml_data_3mo.shape

(18151, 16)

In [14]:
ml_data_3mo.head()

Unnamed: 0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Price,Last_3Month_Diff
4,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
5,0.182327,104,28,704,11724,43,3,0.13069,0.141853,275000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0
6,0.182327,104,28,704,11724,90,7,0.133641,0.141853,282500.0,275000.0,-6000.0,281000.0,16500.0,264500.0,-10500.0
7,0.182327,104,28,704,11724,64,5,0.125212,0.141853,269577.0,282500.0,7500.0,275000.0,-6000.0,281000.0,16500.0
8,0.182327,104,28,704,11724,50,3,0.13266,0.141853,286000.0,269577.0,-12923.0,282500.0,7500.0,275000.0,-6000.0


## Train/test

In [15]:
# Set X and y values
y = ml_data_3mo['FHA_Count'].values

X = ml_data_3mo.drop(columns='FHA_Count')

feature_names = X.columns

X

Unnamed: 0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_Month_Diff,Last_2Month_Price,Last_2Month_Diff,Last_3Month_Price,Last_3Month_Diff
4,0.182327,104,28,704,11724,66,0.137976,0.141853,281000.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
5,0.182327,104,28,704,11724,43,0.130690,0.141853,275000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0
6,0.182327,104,28,704,11724,90,0.133641,0.141853,282500.0,275000.0,-6000.0,281000.0,16500.0,264500.0,-10500.0
7,0.182327,104,28,704,11724,64,0.125212,0.141853,269577.0,282500.0,7500.0,275000.0,-6000.0,281000.0,16500.0
8,0.182327,104,28,704,11724,50,0.132660,0.141853,286000.0,269577.0,-12923.0,282500.0,7500.0,275000.0,-6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24371,0.143471,91,113,2976,21551,139,0.192070,0.247189,285000.0,285000.0,29000.0,256000.0,0.0,256000.0,5000.0
24372,0.143471,91,113,2976,21551,132,0.173497,0.247189,259750.0,285000.0,0.0,285000.0,29000.0,256000.0,0.0
24373,0.143471,91,113,2976,21551,132,0.173497,0.247189,259750.0,259750.0,-25250.0,285000.0,0.0,285000.0,29000.0
24374,0.143471,91,113,2976,21551,142,0.196301,0.247189,297000.0,259750.0,0.0,259750.0,-25250.0,285000.0,0.0


In [16]:
# Do test-train-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [17]:
print(len(X_test))
print(len(y_test))

4538
4538


In [18]:
# Fit Random Forest Regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [19]:
# Get R2 score
print(f"Training Data Score: {rfr_model.score(X_train, y_train)}")
print(f"Testing Data Score: {rfr_model.score(X_test, y_test)}")

Training Data Score: 0.9834896400378529
Testing Data Score: 0.8597241792156507


## Prediction

In [20]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

In [21]:
rf_pred = rf_regr.predict(X_test)

error = rmsle(y_test, rf_pred)
    
print('Mean Error = %.5f' % np.mean(error))

NameError: name 'rf_regr' is not defined

In [None]:
output = pd.DataFrame({'Predicted':rf_pred, 'Actual': y_test})
output

In [None]:
r2 = r2_score(yts, y_pred)
print(f'The R2 score for predicting sale prices of January, 2021 is {r2}.')