In [1]:
# Required Modules
import pandas as pd
import numpy as np

In [2]:
import warnings
warnings.filterwarnings("ignore")

from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMClassifier

In [3]:
# Read in CSV data
data = pd.read_csv('Resources/Final_Data.csv')

data.head()

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,2019,1,244950.0,4.464,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,195960.0,360.0,1.00372,988.713278,0.126012
1,32003,2019-02-01,2019,2,270000.0,4.37,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,216000.0,360.0,1.003642,1077.819242,0.137369
2,32003,2019-03-01,2019,3,275000.0,4.265,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,220000.0,360.0,1.003554,1084.200547,0.138182
3,32003,2019-04-01,2019,4,264500.0,4.1425,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,211600.0,360.0,1.003452,1027.671397,0.130978
4,32003,2019-05-01,2019,5,281000.0,4.072,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,224800.0,360.0,1.003393,1082.581718,0.137976


In [4]:
data.shape

(24382, 29)

## Set aside 2021 data

In [5]:
ml_data_21 = data.loc[data.Year == 2021]
ml_data_21 = ml_data_21[['Zip_Code', 'Year', 'Month', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 'Rent_Affordability', 'Sale_Price' ]]
ml_data_21 = ml_data_21.sort_values(by = ['Zip_Code', 'Year', 'Month'], ascending = [True, True, True])

In [6]:
# Add columns for future months sales prices
ml_data_21['Last_Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift()
ml_data_21['Future_3Month_Price'] = ml_data_21.groupby(['Zip_Code'])['Sale_Price'].shift(-3)

# Drop columns not needed for X values
ml_data_21 = ml_data_21.drop(columns=['Zip_Code', 'Year'])

ml_data_21.replace([np.inf, -np.inf], np.nan, inplace=True)
ml_data_21.dropna(inplace=True)

ml_data_21.head()

Unnamed: 0,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Future_3Month_Price
22612,1,0.063615,99,106,180,1519,10,0,0.17064,0.21116,327500.0,327500.0,449900.0
22613,2,0.063615,99,106,180,1519,6,1,0.19059,0.21116,362250.0,327500.0,449900.0
22666,1,0.13045,118,387,2886,15628,47,36,0.160743,0.296244,219900.0,219900.0,250000.0
22667,2,0.13045,118,387,2886,15628,71,26,0.167628,0.296244,227100.0,219900.0,250000.0
22720,1,0.13304,103,203,3780,17943,80,58,0.155078,0.241901,201200.0,201200.0,220000.0


## Create train/test dataframe

In [7]:
# Select features for X
ml_data = data[['Zip_Code', 'Year', 'Month', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 'Rent_Affordability', 'Sale_Price' ]]

ml_data.shape

(24382, 13)

In [8]:
# Drop 2021 data
index_names = ml_data[ml_data['Year'] == 2021 ].index
ml_data.drop(index_names, inplace = True)

ml_data.shape

(21764, 13)

In [9]:
# Create copy to try three-month training data
ml_data_3mo = ml_data.sort_values(by = ['Zip_Code', 'Year', 'Month'], ascending = [True, True, True])
ml_data_3mo.loc[ml_data_3mo.Year == 2020, "Month"] += 12

ml_data_3mo.head(21764)

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
24371,34997,2020,22,0.143471,91,113,2976,21551,139,17,0.192070,0.247189,285000.0
24372,34997,2020,23,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
24373,34997,2020,23,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
24374,34997,2020,24,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0


In [10]:
# Add columns for future months sales prices
ml_data_3mo['Last_Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift()
ml_data_3mo['Future_3Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(-3)

ml_data_3mo.head()

Unnamed: 0,Zip_Code,Year,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Future_3Month_Price
0,32003,2019,1,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0,,264500.0
1,32003,2019,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0,244950.0,281000.0
2,32003,2019,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0,270000.0,275000.0
3,32003,2019,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,282500.0
4,32003,2019,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,269577.0


In [11]:
# Drop columns not needed for X values
# Keeping Month this time
ml_data_3mo = ml_data_3mo.drop(columns=['Zip_Code', 'Year'])

ml_data_3mo.shape

(21764, 13)

In [12]:
ml_data_3mo.replace([np.inf, -np.inf], np.nan, inplace=True)
ml_data_3mo.dropna(inplace=True)

ml_data_3mo.shape

(18151, 13)

## Do train/test

In [13]:
# Set X and y values
y = ml_data_3mo['Future_3Month_Price'].values

X = ml_data_3mo.drop(columns='Future_3Month_Price')

feature_names = X.columns

X

Unnamed: 0,Month,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price
1,2,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0,244950.0
2,3,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0,270000.0
3,4,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0
4,5,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0
5,6,0.182327,104,28,704,11724,43,3,0.130690,0.141853,275000.0,281000.0
...,...,...,...,...,...,...,...,...,...,...,...,...
24368,21,0.143471,91,113,2976,21551,115,18,0.173776,0.247189,256000.0,251000.0
24369,21,0.143471,91,113,2976,21551,115,18,0.173776,0.247189,256000.0,256000.0
24370,22,0.143471,91,113,2976,21551,139,17,0.192070,0.247189,285000.0,256000.0
24371,22,0.143471,91,113,2976,21551,139,17,0.192070,0.247189,285000.0,285000.0


In [14]:
# Do test-train-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [15]:
print(len(X_test))
print(len(y_test))

4538
4538


In [16]:
# Fit Random Forest Regressor model
rfr_model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
rfr_model.fit(X_train, y_train)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [17]:
# Get R2 score
print(f"Training Data Score: {rfr_model.score(X_train, y_train)}")
print(f"Testing Data Score: {rfr_model.score(X_test, y_test)}")

Training Data Score: 0.973406732589503
Testing Data Score: 0.8211468001945991


## Predict

In [27]:
# Use predict on 2021 data
y_21 = ml_data_21['Future_3Month_Price'].values
X_21 = ml_data_21.drop(columns='Future_3Month_Price')

y_pred = rfr_model.predict(X_21)
y_pred

array([244778.956, 367056.576, 222426.174, 230367.852, 208538.361,
       219732.926, 195181.062, 234079.69 , 197915.23 , 205666.481,
       274187.4  , 329340.49 , 262535.271, 269392.733, 298584.264,
       278284.96 , 237608.409, 337279.431, 463425.476, 470874.388,
       319579.255, 351735.158, 244555.752, 221983.049, 484455.607,
       371038.515,  98326.367, 210563.638, 265186.588, 271738.805,
       148622.46 , 124140.044, 398347.277, 389014.45 , 288495.23 ,
       375162.057, 311402.6  , 311111.473])