In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [2]:
# Read csv file
data = pd.read_csv('./Resources/Final_Data.csv')

data.head(20000)

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,2019,1,244950.0,4.4640,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,195960.0,360.0,1.003720,988.713278,0.126012
1,32003,2019-02-01,2019,2,270000.0,4.3700,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,216000.0,360.0,1.003642,1077.819242,0.137369
2,32003,2019-03-01,2019,3,275000.0,4.2650,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,220000.0,360.0,1.003554,1084.200547,0.138182
3,32003,2019-04-01,2019,4,264500.0,4.1425,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,211600.0,360.0,1.003452,1027.671397,0.130978
4,32003,2019-05-01,2019,5,281000.0,4.0720,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,224800.0,360.0,1.003393,1082.581718,0.137976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,34744,2020-06-01,2020,6,252000.0,3.1625,2568,1321,52191.0,0.303731,...,10564,6384,106,26,326,201600.0,360.0,1.002635,867.723674,0.199511
19996,34744,2020-07-01,2020,7,251000.0,3.0160,2568,1321,52191.0,0.303731,...,10564,6384,106,26,326,200800.0,360.0,1.002513,848.314626,0.195048
19997,34744,2020-08-01,2020,8,266500.0,2.9350,2568,1321,52191.0,0.303731,...,10564,6384,106,26,326,213200.0,360.0,1.002446,891.403118,0.204956
19998,34744,2020-09-01,2020,9,242050.0,2.8900,2568,1321,52191.0,0.303731,...,10564,6384,106,26,326,193640.0,360.0,1.002408,804.951197,0.185078


In [3]:
data.loc[data.Year == 2020, "Month"] += 12

In [4]:
data = data.groupby(['Zip_Code','Month'])['Sale_Price'].agg(['sum']).reset_index()
data = data.pivot_table(values='sum', columns='Month', index='Zip_Code')
data.columns.name = None
data = data.reset_index()

data.head()

Unnamed: 0,Zip_Code,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
0,32003,244950.0,270000.0,275000.0,264500.0,281000.0,275000.0,282500.0,269577.0,286000.0,...,299500.0,300000.0,265000.0,278950.0,305000.0,307000.0,295000.0,330000.0,336000.0,294000.0
1,32008,86000.0,106250.0,,121500.0,74150.0,138750.0,92500.0,150000.0,100500.0,...,77500.0,85000.0,118000.0,123500.0,144750.0,293300.0,126215.0,107000.0,72500.0,169500.0
2,32009,275000.0,95000.0,115000.0,,74868.0,185000.0,240000.0,,47100.0,...,227500.0,233000.0,225000.0,85500.0,181500.0,244000.0,250000.0,237000.0,,175000.0
3,32011,270000.0,150000.0,134000.0,180000.0,232500.0,88400.0,220000.0,145500.0,172500.0,...,121500.0,312450.0,165000.0,222000.0,194000.0,170000.0,202000.0,223250.0,219450.0,225000.0
4,32024,162500.0,90000.0,81950.0,156450.0,165850.0,139300.0,145000.0,90000.0,104000.0,...,145850.0,134000.0,155000.0,139900.0,190000.0,220000.0,182500.0,139950.0,191500.0,118950.0


In [5]:
melt = data.melt(id_vars='Zip_Code', var_name='Month', value_name='Sale_Price')

melt['Zip_Code'] = melt['Zip_Code']
melt['Month'] = melt['Month']

melt = melt.sort_values(['Month', 'Zip_Code'])
melt.head(20000)

Unnamed: 0,Zip_Code,Month,Sale_Price
0,32003,1,244950.0
1,32008,1,86000.0
2,32009,1,275000.0
3,32011,1,270000.0
4,32024,1,162500.0
...,...,...,...
19995,34994,22,171450.0
19996,34996,22,670000.0
19997,34997,22,570000.0
19998,32003,23,336000.0


In [6]:
index = melt.index
number_of_rows = len(index)
 
print(number_of_rows)

21816


In [7]:
melt2 = melt.copy()
melt2['Last_Month_Sales'] = melt2.groupby(['Zip_Code'])['Sale_Price'].shift()
melt2['Last_Month_Diff'] = melt2.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt2 = melt2.dropna()
melt2.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff
1818,32003,3,275000.0,270000.0,25050.0
1820,32009,3,115000.0,95000.0,-180000.0
1821,32011,3,134000.0,150000.0,-120000.0
1822,32024,3,81950.0,90000.0,-72500.0
1823,32025,3,87000.0,95500.0,-12750.0


In [8]:
index = melt2.index
number_of_rows = len(index)
 
print(number_of_rows)

18711


In [9]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

### Establishing Baseline

In [10]:
mean_error = []
for month in range(17,19):
    train = melt2[melt2['Month'] < month]
    val = melt2[melt2['Month'] == month]

    p = val['Last_Month_Sales'].values

    error = rmsle(val['Sale_Price'].values, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)

print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.28449
Month 18 - Error 0.28322
Mean Error = 0.28386


### Training

In [11]:
mean_error = []
for month in range(17,19):
    train = melt2[melt2['Month'] < month].copy()
    val = melt2[melt2['Month'] == month].copy()
    
    # train['Month'] = train['Month'] % 12 + 1
    
    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.26321
Month 18 - Error 0.24062
Mean Error = 0.25192


In [12]:
melt3 = melt.copy()
melt3['Last_Month_Sales'] = melt3.groupby(['Zip_Code'])['Sale_Price'].shift()
melt3['Last_Month_Diff'] = melt3.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt3['Last-1_Month_Sales'] = melt3.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt3['Last-1_Month_Diff'] = melt3.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt3 = melt3.dropna()
melt3.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff
2727,32003,4,264500.0,275000.0,5000.0,270000.0,25050.0
2730,32011,4,180000.0,134000.0,-16000.0,150000.0,-120000.0
2731,32024,4,156450.0,81950.0,-8050.0,90000.0,-72500.0
2732,32025,4,105500.0,87000.0,-8500.0,95500.0,-12750.0
2733,32033,4,121500.0,255000.0,29000.0,226000.0,36000.0


In [13]:
mean_error = []
for month in range(17,19):
    train = melt3[melt3['Month'] < month]
    val = melt3[melt3['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.24361
Month 18 - Error 0.23621
Mean Error = 0.23991


In [14]:
melt4 = melt.copy()
melt4['Last_Month_Sales'] = melt4.groupby(['Zip_Code'])['Sale_Price'].shift()
melt4['Last_Month_Diff'] = melt4.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt4['Last-1_Month_Sales'] = melt4.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt4['Last-1_Month_Diff'] = melt4.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt4['Last-2_Month_Sales'] = melt4.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt4['Last-2_Month_Diff'] = melt4.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt4 = melt4.dropna()
melt4.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff
3636,32003,5,281000.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
3639,32011,5,232500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
3640,32024,5,165850.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
3641,32025,5,125000.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
3642,32033,5,155250.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [15]:
mean_error = []
for month in range(17,19):
    train = melt4[melt4['Month'] < month]
    val = melt4[melt4['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.22984
Month 18 - Error 0.21586
Mean Error = 0.22285


In [16]:
melt5 = melt.copy()
melt5['Last_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift()
melt5['Last_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt5['Last-1_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt5['Last-1_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt5['Last-2_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt5['Last-2_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt5['Last-3_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift(4)
melt5['Last-3_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last-3_Month_Sales'].diff()
melt5 = melt5.dropna()
melt5.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff
4545,32003,6,275000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
4548,32011,6,88400.0,232500.0,52500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
4549,32024,6,139300.0,165850.0,9400.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
4550,32025,6,129000.0,125000.0,19500.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
4551,32033,6,177000.0,155250.0,33750.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [17]:
mean_error = []
for month in range(17,19):
    train = melt5[melt5['Month'] < month]
    val = melt5[melt5['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.22250
Month 18 - Error 0.21228
Mean Error = 0.21739


In [18]:
melt6 = melt.copy()
melt6['Last_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift()
melt6['Last_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt6['Last-1_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt6['Last-1_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt6['Last-2_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt6['Last-2_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt6['Last-3_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(4)
melt6['Last-3_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-3_Month_Sales'].diff()
melt6['Last-4_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(5)
melt6['Last-4_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-4_Month_Sales'].diff()

melt6 = melt6.dropna()
melt6.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff,Last-4_Month_Sales,Last-4_Month_Diff
5454,32003,7,282500.0,275000.0,-6000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
5457,32011,7,220000.0,88400.0,-144100.0,232500.0,52500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
5458,32024,7,145000.0,139300.0,-26550.0,165850.0,9400.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
5459,32025,7,120000.0,129000.0,4000.0,125000.0,19500.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
5460,32033,7,242000.0,177000.0,21750.0,155250.0,33750.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [19]:
mean_error = []
for month in range(17,19):
    train = melt6[melt6['Month'] < month]
    val = melt6[melt6['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.21916
Month 18 - Error 0.21015
Mean Error = 0.21466


In [20]:
melt7 = melt.copy()
melt7['Last_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift()
melt7['Last_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt7['Last-1_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt7['Last-1_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt7['Last-2_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt7['Last-2_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt7['Last-3_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(4)
melt7['Last-3_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-3_Month_Sales'].diff()
melt7['Last-4_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(5)
melt7['Last-4_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-4_Month_Sales'].diff()
melt7['Last-5_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(6)
melt7['Last-5_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-5_Month_Sales'].diff()

melt7 = melt7.dropna()
melt7.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff,Last-4_Month_Sales,Last-4_Month_Diff,Last-5_Month_Sales,Last-5_Month_Diff
6363,32003,8,269577.0,282500.0,7500.0,275000.0,-6000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
6366,32011,8,145500.0,220000.0,131600.0,88400.0,-144100.0,232500.0,52500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
6367,32024,8,90000.0,145000.0,5700.0,139300.0,-26550.0,165850.0,9400.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
6368,32025,8,149000.0,120000.0,-9000.0,129000.0,4000.0,125000.0,19500.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
6369,32033,8,225000.0,242000.0,65000.0,177000.0,21750.0,155250.0,33750.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [21]:
mean_error = []
for month in range(17,19):
    train = melt7[melt7['Month'] < month]
    val = melt7[melt7['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.21256
Month 18 - Error 0.20878
Mean Error = 0.21067


### Validation

In [22]:
val = melt7[melt7['Month'] == 19]

xts = val.drop(['Sale_Price'], axis=1)

Y_pred = model.predict(xts)

In [23]:
Y_pred[0]

277843.948

In [24]:
melt7.loc[15453]

Zip_Code               32003
Month                     18
Sale_Price            278950
Last_Month_Sales      265000
Last_Month_Diff       -35000
Last-1_Month_Sales    300000
Last-1_Month_Diff        500
Last-2_Month_Sales    299500
Last-2_Month_Diff      54500
Last-3_Month_Sales    245000
Last-3_Month_Diff      -2300
Last-4_Month_Sales    247300
Last-4_Month_Diff     -39200
Last-5_Month_Sales    286500
Last-5_Month_Diff       3500
Name: 15453, dtype: object

In [25]:
yts[0]

278950.0

In [26]:
val = melt7[melt7['Month'] < 19]

xts = val.drop(['Sale_Price'], axis=1)

Y_pred = model.predict(xts)

In [33]:
xts = xts.astype(float)
xts.dtypes

Zip_Code              float64
Month                 float64
Last_Month_Sales      float64
Last_Month_Diff       float64
Last-1_Month_Sales    float64
Last-1_Month_Diff     float64
Last-2_Month_Sales    float64
Last-2_Month_Diff     float64
Last-3_Month_Sales    float64
Last-3_Month_Diff     float64
Last-4_Month_Sales    float64
Last-4_Month_Diff     float64
Last-5_Month_Sales    float64
Last-5_Month_Diff     float64
dtype: object

In [39]:
# Visualizing results of test
X_grid = np.arange(min(xts), max(xts), 500)
X_grid = X_grid.reshape((len(X_grid), 1))

# Scatter plot of original data
plt.scatter(xts, yts, color='green')

# Plot of predicted data
plt.plot(X_grid, model.predict(X_grid), color='red')
plt.title('Random Forest Regression')
plt.xlabel('Month')
plt.ylabel('Sale Price')

plt.show()

KeyError: 'Sales_Price'