In [298]:
# Import dependencies
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [299]:
# Read csv file
data = pd.read_csv('./Resources/Final_Data.csv')

# Keep only rows from 2019 for training; drop 2020 and 2021 data
index_names = data[(data['Year'] == 2020) & (data['Month'] == 7) | (data['Year'] == 2020) & (data['Month'] == 8)
                  | (data['Year'] == 2020) & (data['Month'] == 9) | (data['Year'] == 2020) & (data['Month'] == 10) | 
                  (data['Year'] == 2020) & (data['Month'] == 11) | (data['Year'] == 2020) & (data['Month'] == 12)].index

data.drop(index_names, inplace = True)

data = data.sort_values(by=['Year', 'Month'], axis=0)
  
data.head(15000)

Unnamed: 0,Zip_Code,Date,Year,Month,Sale_Price,Interest_Rate,Property_Tax,Rent_Price,Household_Income,Rent_Affordability,...,FTE_Employed,Unemployed,Expense_Index,Average_Commute,Crime_Index,Loan_Amount,Loan_Term,Loan_R,Loan_Payment,Home_Affordability
0,32003,2019-01-01,2019,1,244950.0,4.4640,3778,1113,94154.0,0.141853,...,10671,1069,104,35,28,195960.0,360.0,1.003720,988.713278,0.126012
24,32008,2019-01-01,2019,1,86000.0,4.4640,1120,800,33663.0,0.285180,...,1154,93,82,34,118,68800.0,360.0,1.003720,347.129381,0.123743
47,32009,2019-01-01,2019,1,275000.0,4.4640,1493,1113,68929.0,0.193765,...,214,84,104,39,143,220000.0,360.0,1.003720,1110.006742,0.193243
68,32011,2019-01-01,2019,1,270000.0,4.4640,1495,1113,63772.0,0.209434,...,2053,506,96,30,123,216000.0,360.0,1.003720,1089.824801,0.205073
92,32024,2019-01-01,2019,1,162500.0,4.4640,1374,869,56830.0,0.183495,...,2185,661,90,26,275,130000.0,360.0,1.003720,655.913075,0.138500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11910,33461,2020-05-01,2020,5,179000.0,3.2325,2049,1468,44107.0,0.399392,...,12267,2160,101,25,118,143200.0,360.0,1.002694,621.840894,0.169182
11934,33462,2020-05-01,2020,5,285000.0,3.2325,3494,1468,53428.0,0.329715,...,16007,2009,95,24,181,228000.0,360.0,1.002694,990.081871,0.222374
11958,33463,2020-05-01,2020,5,236500.0,3.2325,2859,1468,54417.0,0.323722,...,11858,3829,117,28,173,189200.0,360.0,1.002694,821.594254,0.181177
11982,33467,2020-05-01,2020,5,310000.0,3.2325,3433,1468,71821.0,0.245276,...,10184,2053,100,29,106,248000.0,360.0,1.002694,1076.931158,0.179936


In [300]:
data.loc[data.Year == 2020, "Month"] += 12

In [301]:
data.describe

<bound method NDFrame.describe of        Zip_Code        Date  Year  Month  Sale_Price  Interest_Rate  \
0         32003  2019-01-01  2019      1    244950.0         4.4640   
24        32008  2019-01-01  2019      1     86000.0         4.4640   
47        32009  2019-01-01  2019      1    275000.0         4.4640   
68        32011  2019-01-01  2019      1    270000.0         4.4640   
92        32024  2019-01-01  2019      1    162500.0         4.4640   
...         ...         ...   ...    ...         ...            ...   
21661     34994  2020-06-01  2020     18    139550.0         3.1625   
21702     34996  2020-06-01  2020     18    363000.0         3.1625   
21703     34996  2020-06-01  2020     18    363000.0         3.1625   
21750     34997  2020-06-01  2020     18    259900.0         3.1625   
21751     34997  2020-06-01  2020     18    259900.0         3.1625   

       Property_Tax  Rent_Price  Household_Income  Rent_Affordability  ...  \
0              3778        1113    

In [302]:
data = data.groupby(['Zip_Code','Month'])['Sale_Price'].agg(['sum']).reset_index()
data = data.pivot_table(values='sum', columns='Month', index='Zip_Code')
data.columns.name = None
data = data.reset_index()

data.head()

Unnamed: 0,Zip_Code,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18
0,32003,244950.0,270000.0,275000.0,264500.0,281000.0,275000.0,282500.0,269577.0,286000.0,258000.0,283000.0,286500.0,247300.0,245000.0,299500.0,300000.0,265000.0,278950.0
1,32008,86000.0,106250.0,,121500.0,74150.0,138750.0,92500.0,150000.0,100500.0,132000.0,126000.0,194250.0,277000.0,156200.0,77500.0,85000.0,118000.0,123500.0
2,32009,275000.0,95000.0,115000.0,,74868.0,185000.0,240000.0,,47100.0,242000.0,180000.0,352500.0,174150.0,272000.0,227500.0,233000.0,225000.0,85500.0
3,32011,270000.0,150000.0,134000.0,180000.0,232500.0,88400.0,220000.0,145500.0,172500.0,160000.0,134000.0,142000.0,121000.0,207000.0,121500.0,312450.0,165000.0,222000.0
4,32024,162500.0,90000.0,81950.0,156450.0,165850.0,139300.0,145000.0,90000.0,104000.0,160500.0,185000.0,139000.0,132500.0,135000.0,145850.0,134000.0,155000.0,139900.0


In [295]:
data.dtypes

Zip_Code      int64
1           float64
2           float64
3           float64
4           float64
5           float64
6           float64
7           float64
8           float64
9           float64
10          float64
11          float64
12          float64
13          float64
14          float64
15          float64
16          float64
17          float64
18          float64
dtype: object

In [303]:
melt = data.melt(id_vars='Zip_Code', var_name='Month', value_name='Sale_Price')

melt['Zip_Code'] = melt['Zip_Code']
melt['Month'] = melt['Month']

melt = melt.sort_values(['Month', 'Zip_Code'])
melt.head(15000)

Unnamed: 0,Zip_Code,Month,Sale_Price
0,32003,1,244950.0
1,32008,1,86000.0
2,32009,1,275000.0
3,32011,1,270000.0
4,32024,1,162500.0
...,...,...,...
14995,33183,17,214900.0
14996,33184,17,280000.0
14997,33185,17,367000.0
14998,33186,17,305000.0


In [305]:
index = melt.index
number_of_rows = len(index)
 
print(number_of_rows)

16362


In [306]:
melt2 = melt.copy()
melt2['Last_Month_Sales'] = melt2.groupby(['Zip_Code'])['Sale_Price'].shift()
melt2['Last_Month_Diff'] = melt2.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt2 = melt2.dropna()
melt2.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff
1818,32003,3,275000.0,270000.0,25050.0
1820,32009,3,115000.0,95000.0,-180000.0
1821,32011,3,134000.0,150000.0,-120000.0
1822,32024,3,81950.0,90000.0,-72500.0
1823,32025,3,87000.0,95500.0,-12750.0


In [307]:
index = melt2.index
number_of_rows = len(index)
 
print(number_of_rows)

13581


In [308]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

### Establishing Baseline

In [310]:
mean_error = []
for month in range(17,19):
    train = melt2[melt2['Month'] < month]
    val = melt2[melt2['Month'] == month]

    p = val['Last_Month_Sales'].values

    error = rmsle(val['Sale_Price'].values, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)

print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.28449
Month 18 - Error 0.28322
Mean Error = 0.28386


### Training

In [311]:
mean_error = []
for month in range(17,19):
    train = melt2[melt2['Month'] < month]
    val = melt2[melt2['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.26321
Month 18 - Error 0.24062
Mean Error = 0.25192


In [313]:
melt3 = melt.copy()
melt3['Last_Month_Sales'] = melt3.groupby(['Zip_Code'])['Sale_Price'].shift()
melt3['Last_Month_Diff'] = melt3.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt3['Last-1_Month_Sales'] = melt3.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt3['Last-1_Month_Diff'] = melt3.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt3 = melt3.dropna()
melt3.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff
2727,32003,4,264500.0,275000.0,5000.0,270000.0,25050.0
2730,32011,4,180000.0,134000.0,-16000.0,150000.0,-120000.0
2731,32024,4,156450.0,81950.0,-8050.0,90000.0,-72500.0
2732,32025,4,105500.0,87000.0,-8500.0,95500.0,-12750.0
2733,32033,4,121500.0,255000.0,29000.0,226000.0,36000.0


In [314]:
mean_error = []
for month in range(17,19):
    train = melt3[melt3['Month'] < month]
    val = melt3[melt3['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.24361
Month 18 - Error 0.23621
Mean Error = 0.23991


In [315]:
melt4 = melt.copy()
melt4['Last_Month_Sales'] = melt4.groupby(['Zip_Code'])['Sale_Price'].shift()
melt4['Last_Month_Diff'] = melt4.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt4['Last-1_Month_Sales'] = melt4.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt4['Last-1_Month_Diff'] = melt4.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt4['Last-2_Month_Sales'] = melt4.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt4['Last-2_Month_Diff'] = melt4.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt4 = melt4.dropna()
melt4.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff
3636,32003,5,281000.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
3639,32011,5,232500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
3640,32024,5,165850.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
3641,32025,5,125000.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
3642,32033,5,155250.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [317]:
mean_error = []
for month in range(17,19):
    train = melt4[melt4['Month'] < month]
    val = melt4[melt4['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.22984
Month 18 - Error 0.21586
Mean Error = 0.22285


In [318]:
melt5 = melt.copy()
melt5['Last_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift()
melt5['Last_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt5['Last-1_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt5['Last-1_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt5['Last-2_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt5['Last-2_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt5['Last-3_Month_Sales'] = melt5.groupby(['Zip_Code'])['Sale_Price'].shift(4)
melt5['Last-3_Month_Diff'] = melt5.groupby(['Zip_Code'])['Last-3_Month_Sales'].diff()
melt5 = melt5.dropna()
melt5.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff
4545,32003,6,275000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
4548,32011,6,88400.0,232500.0,52500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
4549,32024,6,139300.0,165850.0,9400.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
4550,32025,6,129000.0,125000.0,19500.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
4551,32033,6,177000.0,155250.0,33750.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [319]:
mean_error = []
for month in range(17,19):
    train = melt5[melt5['Month'] < month]
    val = melt5[melt5['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.22250
Month 18 - Error 0.21228
Mean Error = 0.21739


In [320]:
melt6 = melt.copy()
melt6['Last_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift()
melt6['Last_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt6['Last-1_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt6['Last-1_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt6['Last-2_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt6['Last-2_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt6['Last-3_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(4)
melt6['Last-3_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-3_Month_Sales'].diff()
melt6['Last-4_Month_Sales'] = melt6.groupby(['Zip_Code'])['Sale_Price'].shift(5)
melt6['Last-4_Month_Diff'] = melt6.groupby(['Zip_Code'])['Last-4_Month_Sales'].diff()

melt6 = melt6.dropna()
melt6.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff,Last-4_Month_Sales,Last-4_Month_Diff
5454,32003,7,282500.0,275000.0,-6000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
5457,32011,7,220000.0,88400.0,-144100.0,232500.0,52500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
5458,32024,7,145000.0,139300.0,-26550.0,165850.0,9400.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
5459,32025,7,120000.0,129000.0,4000.0,125000.0,19500.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
5460,32033,7,242000.0,177000.0,21750.0,155250.0,33750.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [321]:
mean_error = []
for month in range(17,19):
    train = melt6[melt6['Month'] < month]
    val = melt6[melt6['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.21916
Month 18 - Error 0.21015
Mean Error = 0.21466


In [324]:
melt7 = melt.copy()
melt7['Last_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift()
melt7['Last_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last_Month_Sales'].diff()
melt7['Last-1_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(2)
melt7['Last-1_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-1_Month_Sales'].diff()
melt7['Last-2_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(3)
melt7['Last-2_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-2_Month_Sales'].diff()
melt7['Last-3_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(4)
melt7['Last-3_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-3_Month_Sales'].diff()
melt7['Last-4_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(5)
melt7['Last-4_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-4_Month_Sales'].diff()
melt7['Last-5_Month_Sales'] = melt7.groupby(['Zip_Code'])['Sale_Price'].shift(6)
melt7['Last-5_Month_Diff'] = melt7.groupby(['Zip_Code'])['Last-5_Month_Sales'].diff()

melt7 = melt7.dropna()
melt7.head()

Unnamed: 0,Zip_Code,Month,Sale_Price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff,Last-4_Month_Sales,Last-4_Month_Diff,Last-5_Month_Sales,Last-5_Month_Diff
6363,32003,8,269577.0,282500.0,7500.0,275000.0,-6000.0,281000.0,16500.0,264500.0,-10500.0,275000.0,5000.0,270000.0,25050.0
6366,32011,8,145500.0,220000.0,131600.0,88400.0,-144100.0,232500.0,52500.0,180000.0,46000.0,134000.0,-16000.0,150000.0,-120000.0
6367,32024,8,90000.0,145000.0,5700.0,139300.0,-26550.0,165850.0,9400.0,156450.0,74500.0,81950.0,-8050.0,90000.0,-72500.0
6368,32025,8,149000.0,120000.0,-9000.0,129000.0,4000.0,125000.0,19500.0,105500.0,18500.0,87000.0,-8500.0,95500.0,-12750.0
6369,32033,8,225000.0,242000.0,65000.0,177000.0,21750.0,155250.0,33750.0,121500.0,-133500.0,255000.0,29000.0,226000.0,36000.0


In [325]:
mean_error = []
for month in range(17,19):
    train = melt7[melt7['Month'] < month]
    val = melt7[melt7['Month'] == month]

    xtr, xts = train.drop(['Sale_Price'], axis=1), val.drop(['Sale_Price'], axis=1)
    ytr, yts = train['Sale_Price'].values, val['Sale_Price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 17 - Error 0.21256
Month 18 - Error 0.20878
Mean Error = 0.21067


### Validation