In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from math import sqrt

from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv('Resources/Clean_Data/Final_Data.csv')
data.head()

Unnamed: 0,Zip_Code,Date,Date_Code,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,201901,2019,1,244950.0,4.464,3778,1113,94154.0,...,10671,1069,104,35,28,195960.0,360.0,1.00372,988.713278,0.126012
1,32003,2019-02-01,201902,2019,2,270000.0,4.37,3778,1113,94154.0,...,10671,1069,104,35,28,216000.0,360.0,1.003642,1077.819242,0.137369
2,32003,2019-03-01,201903,2019,3,275000.0,4.265,3778,1113,94154.0,...,10671,1069,104,35,28,220000.0,360.0,1.003554,1084.200547,0.138182
3,32003,2019-04-01,201904,2019,4,264500.0,4.1425,3778,1113,94154.0,...,10671,1069,104,35,28,211600.0,360.0,1.003452,1027.671397,0.130978
4,32003,2019-05-01,201905,2019,5,281000.0,4.072,3778,1113,94154.0,...,10671,1069,104,35,28,224800.0,360.0,1.003393,1082.581718,0.137976


In [3]:
ml_data = data[['Zip_Code', 'Date_Code', 'Mobility_Rate', 'Expense_Index', 'Crime_Index' , 'Total_Vacant', 'Total_Dwellings', 'Total_Sales', 'FHA_Count', 'Home_Affordability', 'Rent_Affordability', 'Sale_Price' ]]
ml_data.head()

Unnamed: 0,Zip_Code,Date_Code,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
0,32003,201901,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
1,32003,201902,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
2,32003,201903,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
3,32003,201904,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
4,32003,201905,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0


In [4]:
rf_data = ml_data.sort_values(by = ['Zip_Code','Date_Code'], ascending = [True, True])
rf_data['Zip_Code'] = rf_data['Zip_Code'].astype(str)
rf_data = rf_data.set_index('Zip_Code')


rf_data

Unnamed: 0_level_0,Date_Code,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
32003,201901,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0
32003,201902,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0
32003,201903,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0
32003,201904,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0
32003,201905,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0
...,...,...,...,...,...,...,...,...,...,...,...
34997,202011,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0
34997,202012,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0
34997,202101,0.143471,91,113,2976,21551,81,10,0.202930,0.247189,305000.0
34997,202102,0.143471,91,113,2976,21551,116,7,0.201553,0.247189,300000.0


In [5]:
rf_data['Last_Month_Price'] = rf_data.groupby(['Zip_Code'])['Sale_Price'].shift()
rf_data['Last_2Month_Price'] = rf_data.groupby(['Zip_Code'])['Sale_Price'].shift(2)
rf_data['Last_3Month_Price'] = rf_data.groupby(['Zip_Code'])['Sale_Price'].shift(3)
# ml_data_3mo['Future_Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(-1)
# ml_data_3mo['Future_2Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(-2)
# ml_data_3mo['Future_3Month_Price'] = ml_data_3mo.groupby(['Zip_Code'])['Sale_Price'].shift(-3)


price_difference_1 = []
price_difference_2 = []
price_difference_3 = []


for column in rf_data[['Sale_Price', 'Last_Month_Price', 'Last_2Month_Price',
                    'Last_3Month_Price']]:
   # Select column contents by column name using [] operator
    current_sales = rf_data['Sale_Price'].values
    last_sales = rf_data['Last_Month_Price'].values
    last_2mo_sale = rf_data['Last_2Month_Price'].values
    last_3mo_sale = rf_data['Last_3Month_Price'].values
    price_difference_1.append(current_sales-last_sales)
    price_difference_2.append(last_sales-last_2mo_sale)
    price_difference_3.append(last_2mo_sale-last_3mo_sale)
    

rf_data['Last_Month_Diff'] = price_difference_1[0]
rf_data['Last_2Month_Diff'] = price_difference_2[0]
rf_data['Last_3Month_Diff'] = price_difference_3[0]

rf_data

Unnamed: 0_level_0,Date_Code,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
32003,201901,0.182327,104,28,704,11724,32,7,0.126012,0.141853,244950.0,,,,,,
32003,201902,0.182327,104,28,704,11724,41,2,0.137369,0.141853,270000.0,244950.0,,,25050.0,,
32003,201903,0.182327,104,28,704,11724,21,7,0.138182,0.141853,275000.0,270000.0,244950.0,,5000.0,25050.0,
32003,201904,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,270000.0,244950.0,-10500.0,5000.0,25050.0
32003,201905,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,275000.0,270000.0,16500.0,-10500.0,5000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34997,202011,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0,285000.0,256000.0,251000.0,-25250.0,29000.0,5000.0
34997,202012,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0,259750.0,285000.0,256000.0,37250.0,-25250.0,29000.0
34997,202101,0.143471,91,113,2976,21551,81,10,0.202930,0.247189,305000.0,297000.0,259750.0,285000.0,8000.0,37250.0,-25250.0
34997,202102,0.143471,91,113,2976,21551,116,7,0.201553,0.247189,300000.0,305000.0,297000.0,259750.0,-5000.0,8000.0,37250.0


In [6]:
rf_data.replace([np.inf, -np.inf], np.nan, inplace=True)
rf_data.dropna(inplace=True)
rf_data.head()

Unnamed: 0_level_0,Date_Code,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
32003,201904,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,270000.0,244950.0,-10500.0,5000.0,25050.0
32003,201905,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,275000.0,270000.0,16500.0,-10500.0,5000.0
32003,201906,0.182327,104,28,704,11724,43,3,0.13069,0.141853,275000.0,281000.0,264500.0,275000.0,-6000.0,16500.0,-10500.0
32003,201907,0.182327,104,28,704,11724,90,7,0.133641,0.141853,282500.0,275000.0,281000.0,264500.0,7500.0,-6000.0,16500.0
32003,201908,0.182327,104,28,704,11724,64,5,0.125212,0.141853,269577.0,282500.0,275000.0,281000.0,-12923.0,7500.0,-6000.0


In [7]:
rf_data['Date_Code'] = rf_data['Date_Code'].astype(str)
rf_data

Unnamed: 0_level_0,Date_Code,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
32003,201904,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,270000.0,244950.0,-10500.0,5000.0,25050.0
32003,201905,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,275000.0,270000.0,16500.0,-10500.0,5000.0
32003,201906,0.182327,104,28,704,11724,43,3,0.130690,0.141853,275000.0,281000.0,264500.0,275000.0,-6000.0,16500.0,-10500.0
32003,201907,0.182327,104,28,704,11724,90,7,0.133641,0.141853,282500.0,275000.0,281000.0,264500.0,7500.0,-6000.0,16500.0
32003,201908,0.182327,104,28,704,11724,64,5,0.125212,0.141853,269577.0,282500.0,275000.0,281000.0,-12923.0,7500.0,-6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34997,202011,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0,285000.0,256000.0,251000.0,-25250.0,29000.0,5000.0
34997,202012,0.143471,91,113,2976,21551,142,16,0.196301,0.247189,297000.0,259750.0,285000.0,256000.0,37250.0,-25250.0,29000.0
34997,202101,0.143471,91,113,2976,21551,81,10,0.202930,0.247189,305000.0,297000.0,259750.0,285000.0,8000.0,37250.0,-25250.0
34997,202102,0.143471,91,113,2976,21551,116,7,0.201553,0.247189,300000.0,305000.0,297000.0,259750.0,-5000.0,8000.0,37250.0


In [8]:
test = rf_data[rf_data['Date_Code'].str.contains('2021')].drop(columns=['Date_Code'])
test

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
32003,0.182327,104,28,704,11724,39,6,0.124589,0.141853,299900.0,294000.0,336000.0,330000.0,5900.0,-42000.0,6000.0
32003,0.182327,104,28,704,11724,30,8,0.140027,0.141853,333800.0,299900.0,294000.0,336000.0,33900.0,5900.0,-42000.0
32003,0.182327,104,28,704,11724,70,4,0.147681,0.141853,339950.0,333800.0,299900.0,294000.0,6150.0,33900.0,5900.0
32008,0.067037,82,118,964,3247,6,0,0.153378,0.285180,132000.0,169500.0,72500.0,107000.0,-37500.0,97000.0,-34500.0
32008,0.067037,82,118,964,3247,3,1,0.062420,0.285180,53200.0,132000.0,169500.0,72500.0,-78800.0,-37500.0,97000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34996,0.141276,78,140,2780,9249,57,1,0.225364,0.209918,395000.0,345000.0,390000.0,400000.0,50000.0,-45000.0,-10000.0
34996,0.141276,78,140,2780,9249,14,0,0.275304,0.209918,465950.0,395000.0,345000.0,390000.0,70950.0,50000.0,-45000.0
34997,0.143471,91,113,2976,21551,81,10,0.202930,0.247189,305000.0,297000.0,259750.0,285000.0,8000.0,37250.0,-25250.0
34997,0.143471,91,113,2976,21551,116,7,0.201553,0.247189,300000.0,305000.0,297000.0,259750.0,-5000.0,8000.0,37250.0


In [9]:
sale_y_test = test['Sale_Price'].values

sale_X_test = test.drop(columns='Sale_Price')

sale_test_names = sale_X_test.columns

sale_X_test.head()

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
32003,0.182327,104,28,704,11724,39,6,0.124589,0.141853,294000.0,336000.0,330000.0,5900.0,-42000.0,6000.0
32003,0.182327,104,28,704,11724,30,8,0.140027,0.141853,299900.0,294000.0,336000.0,33900.0,5900.0,-42000.0
32003,0.182327,104,28,704,11724,70,4,0.147681,0.141853,333800.0,299900.0,294000.0,6150.0,33900.0,5900.0
32008,0.067037,82,118,964,3247,6,0,0.153378,0.28518,169500.0,72500.0,107000.0,-37500.0,97000.0,-34500.0
32008,0.067037,82,118,964,3247,3,1,0.06242,0.28518,132000.0,169500.0,72500.0,-78800.0,-37500.0,97000.0


In [10]:
training = rf_data[~rf_data['Date_Code'].str.contains('2021')].drop(columns=['Date_Code'])
training

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
32003,0.182327,104,28,704,11724,42,3,0.130978,0.141853,264500.0,275000.0,270000.0,244950.0,-10500.0,5000.0,25050.0
32003,0.182327,104,28,704,11724,66,8,0.137976,0.141853,281000.0,264500.0,275000.0,270000.0,16500.0,-10500.0,5000.0
32003,0.182327,104,28,704,11724,43,3,0.130690,0.141853,275000.0,281000.0,264500.0,275000.0,-6000.0,16500.0,-10500.0
32003,0.182327,104,28,704,11724,90,7,0.133641,0.141853,282500.0,275000.0,281000.0,264500.0,7500.0,-6000.0,16500.0
32003,0.182327,104,28,704,11724,64,5,0.125212,0.141853,269577.0,282500.0,275000.0,281000.0,-12923.0,7500.0,-6000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
34997,0.143471,91,113,2976,21551,123,18,0.171370,0.247189,251000.0,255000.0,259900.0,240000.0,-4000.0,-4900.0,19900.0
34997,0.143471,91,113,2976,21551,115,18,0.173776,0.247189,256000.0,251000.0,255000.0,259900.0,5000.0,-4000.0,-4900.0
34997,0.143471,91,113,2976,21551,139,17,0.192070,0.247189,285000.0,256000.0,251000.0,255000.0,29000.0,5000.0,-4000.0
34997,0.143471,91,113,2976,21551,132,11,0.173497,0.247189,259750.0,285000.0,256000.0,251000.0,-25250.0,29000.0,5000.0


In [11]:
sale_y_train= training['Sale_Price'].values

sale_X_train = training.drop(columns='Sale_Price')

sale_train_names = sale_X_train.columns

sale_X_train.head()

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,FHA_Count,Home_Affordability,Rent_Affordability,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
32003,0.182327,104,28,704,11724,42,3,0.130978,0.141853,275000.0,270000.0,244950.0,-10500.0,5000.0,25050.0
32003,0.182327,104,28,704,11724,66,8,0.137976,0.141853,264500.0,275000.0,270000.0,16500.0,-10500.0,5000.0
32003,0.182327,104,28,704,11724,43,3,0.13069,0.141853,281000.0,264500.0,275000.0,-6000.0,16500.0,-10500.0
32003,0.182327,104,28,704,11724,90,7,0.133641,0.141853,275000.0,281000.0,264500.0,7500.0,-6000.0,16500.0
32003,0.182327,104,28,704,11724,64,5,0.125212,0.141853,282500.0,275000.0,281000.0,-12923.0,7500.0,-6000.0


In [12]:
#Random Forest Regressor
rf_regr = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
rf_regr.fit(sale_X_train, sale_y_train)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [13]:
print(f"Training Data Score: {rf_regr.score(sale_X_train, sale_y_train)}")
print(f"Testing Data Score: {rf_regr.score(sale_X_test, sale_y_test)}")

Training Data Score: 0.9981845826685722
Testing Data Score: 0.978534981036313


In [14]:
rf_pred = rf_regr.predict(sale_X_test)

print('Mean Error = %.5f' % np.sqrt(mean_squared_log_error(sale_y_test, rf_pred)))

Mean Error = 0.07004


In [15]:
output = pd.DataFrame({'Predicted':rf_pred, 'Actual': sale_y_test})
output

Unnamed: 0,Predicted,Actual
0,300813.433,299900.0
1,333409.450,333800.0
2,339040.001,339950.0
3,133453.210,132000.0
4,65845.209,53200.0
...,...,...
2537,395403.237,395000.0
2538,464766.475,465950.0
2539,305500.998,305000.0
2540,300436.306,300000.0


In [16]:
# test['Predicted_Sale_Price'] = rf_pred
# test

## FHA Data

In [17]:
fha_y_test = test['FHA_Count'].values

fha_X_test = test.drop(columns='FHA_Count')

fha_test_names = fha_X_test.columns

fha_X_test.head()

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
32003,0.182327,104,28,704,11724,39,0.124589,0.141853,299900.0,294000.0,336000.0,330000.0,5900.0,-42000.0,6000.0
32003,0.182327,104,28,704,11724,30,0.140027,0.141853,333800.0,299900.0,294000.0,336000.0,33900.0,5900.0,-42000.0
32003,0.182327,104,28,704,11724,70,0.147681,0.141853,339950.0,333800.0,299900.0,294000.0,6150.0,33900.0,5900.0
32008,0.067037,82,118,964,3247,6,0.153378,0.28518,132000.0,169500.0,72500.0,107000.0,-37500.0,97000.0,-34500.0
32008,0.067037,82,118,964,3247,3,0.06242,0.28518,53200.0,132000.0,169500.0,72500.0,-78800.0,-37500.0,97000.0


In [18]:
fha_y_train= training['FHA_Count'].values

fha_X_train = training.drop(columns='FHA_Count')

fha_train_names = fha_X_train.columns

fha_X_train.head()

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,Total_Sales,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
32003,0.182327,104,28,704,11724,42,0.130978,0.141853,264500.0,275000.0,270000.0,244950.0,-10500.0,5000.0,25050.0
32003,0.182327,104,28,704,11724,66,0.137976,0.141853,281000.0,264500.0,275000.0,270000.0,16500.0,-10500.0,5000.0
32003,0.182327,104,28,704,11724,43,0.13069,0.141853,275000.0,281000.0,264500.0,275000.0,-6000.0,16500.0,-10500.0
32003,0.182327,104,28,704,11724,90,0.133641,0.141853,282500.0,275000.0,281000.0,264500.0,7500.0,-6000.0,16500.0
32003,0.182327,104,28,704,11724,64,0.125212,0.141853,269577.0,282500.0,275000.0,281000.0,-12923.0,7500.0,-6000.0


In [19]:
rf_regr_2 = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
rf_regr_2.fit(fha_X_train, fha_y_train)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [20]:
print(f"Training Data Score: {rf_regr_2.score(fha_X_train, fha_y_train)}")
print(f"Testing Data Score: {rf_regr_2.score(fha_X_test, fha_y_test)}")

Training Data Score: 0.9816220497006619
Testing Data Score: 0.8574538790391151


In [21]:
rf_pred_fha = rf_regr_2.predict(fha_X_test)

print('Mean Error = %.5f' % np.sqrt(mean_squared_log_error(fha_y_test, rf_pred_fha)))

Mean Error = 0.47695


## Total Sales Predict

In [22]:
ts_y_test = test['Total_Sales'].values

ts_X_test = test.drop(columns='Total_Sales')

ts_test_names = ts_X_test.columns

ts_X_test.head()

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
32003,0.182327,104,28,704,11724,6,0.124589,0.141853,299900.0,294000.0,336000.0,330000.0,5900.0,-42000.0,6000.0
32003,0.182327,104,28,704,11724,8,0.140027,0.141853,333800.0,299900.0,294000.0,336000.0,33900.0,5900.0,-42000.0
32003,0.182327,104,28,704,11724,4,0.147681,0.141853,339950.0,333800.0,299900.0,294000.0,6150.0,33900.0,5900.0
32008,0.067037,82,118,964,3247,0,0.153378,0.28518,132000.0,169500.0,72500.0,107000.0,-37500.0,97000.0,-34500.0
32008,0.067037,82,118,964,3247,1,0.06242,0.28518,53200.0,132000.0,169500.0,72500.0,-78800.0,-37500.0,97000.0


In [23]:
ts_y_train= training['Total_Sales'].values

ts_X_train = training.drop(columns='Total_Sales')

ts_train_names = ts_X_train.columns

ts_X_train.head()

Unnamed: 0_level_0,Mobility_Rate,Expense_Index,Crime_Index,Total_Vacant,Total_Dwellings,FHA_Count,Home_Affordability,Rent_Affordability,Sale_Price,Last_Month_Price,Last_2Month_Price,Last_3Month_Price,Last_Month_Diff,Last_2Month_Diff,Last_3Month_Diff
Zip_Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
32003,0.182327,104,28,704,11724,3,0.130978,0.141853,264500.0,275000.0,270000.0,244950.0,-10500.0,5000.0,25050.0
32003,0.182327,104,28,704,11724,8,0.137976,0.141853,281000.0,264500.0,275000.0,270000.0,16500.0,-10500.0,5000.0
32003,0.182327,104,28,704,11724,3,0.13069,0.141853,275000.0,281000.0,264500.0,275000.0,-6000.0,16500.0,-10500.0
32003,0.182327,104,28,704,11724,7,0.133641,0.141853,282500.0,275000.0,281000.0,264500.0,7500.0,-6000.0,16500.0
32003,0.182327,104,28,704,11724,5,0.125212,0.141853,269577.0,282500.0,275000.0,281000.0,-12923.0,7500.0,-6000.0


In [24]:
rf_regr_3 = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
rf_regr_3.fit(ts_X_train, ts_y_train)

RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)

In [25]:
print(f"Training Data Score: {rf_regr_3.score(ts_X_train, ts_y_train)}")
print(f"Testing Data Score: {rf_regr_3.score(ts_X_test, ts_y_test)}")

Training Data Score: 0.9812066037303133
Testing Data Score: 0.6837658160118194


In [27]:
rf_pred_ts = rf_regr_3.predict(ts_X_test)

print('Mean Error = %.5f' % np.sqrt(mean_squared_log_error(ts_y_test, rf_pred_ts)))

Mean Error = 0.48731
