In [151]:
# Import dependencies
import pandas as pd
import numpy as np
%matplotlib inline

from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor

In [152]:
# Read csv file
data = pd.read_csv('./Resources/Final_Data.csv')

# Keep only rows from 2019 for training; drop 2020 and 2021 data
index_names = data[data['year'] != 2019 | 2020].index
data.drop(index_names, inplace = True)

data = data.sort_values(by=['year', 'month'], axis=0)
  
data.head()

FileNotFoundError: [Errno 2] File ./Resources/Final_Data.csv does not exist: './Resources/Final_Data.csv'

In [103]:
data = data.groupby(['zipcode','month'])['avg_sale_price'].agg(['sum']).reset_index()
data = data.pivot_table(values='sum', columns='month', index='zipcode')
data.columns.name = None
data = data.reset_index()

data.head()

Unnamed: 0,zipcode,1,2,3,4,5,6,7,8,9,10,11,12
0,32003,289290.0,286173.0,301620.0,263475.0,293816.0,298865.0,295053.0,280793.0,299109.0,293246.0,307094.0,312718.0
1,32008,94875.0,106250.0,,132475.0,76825.0,153000.0,131322.0,168143.0,126875.0,104500.0,139929.0,178917.0
2,32009,275000.0,99000.0,112000.0,,113994.0,150220.0,227400.0,,150333.0,201667.0,177115.0,352500.0
3,32011,219310.0,152160.0,160508.0,171022.0,210804.0,144637.0,220224.0,170269.0,214582.0,193573.0,146888.0,150112.0
4,32024,252829.0,114341.0,87738.0,157358.0,147920.0,137455.0,157163.0,108509.0,145418.0,170477.0,192769.0,150936.0


In [104]:
data.dtypes

zipcode      int64
1          float64
2          float64
3          float64
4          float64
5          float64
6          float64
7          float64
8          float64
9          float64
10         float64
11         float64
12         float64
dtype: object

In [105]:
melt = data.melt(id_vars='zipcode', var_name='month', value_name='avg_sale_price')

melt['zipcode'] = melt['zipcode']
melt['month'] = melt['month']

melt = melt.sort_values(['month', 'zipcode'])
melt.head(3000)

Unnamed: 0,zipcode,month,avg_sale_price
0,32003,1,289290.0
1,32008,1,94875.0
2,32009,1,275000.0
3,32011,1,219310.0
4,32024,1,252829.0
...,...,...,...
2995,32343,4,136365.0
2996,32344,4,146219.0
2997,32346,4,214350.0
2998,32347,4,87338.0


In [106]:
melt.dtypes

zipcode             int64
month              object
avg_sale_price    float64
dtype: object

In [107]:
melt.astype({'month': 'float64'}).dtypes


zipcode             int64
month             float64
avg_sale_price    float64
dtype: object

In [108]:
index = melt.index
number_of_rows = len(index)
 
print(number_of_rows)

11412


In [109]:
melt2 = melt.copy()
melt2['last_month_sales'] = melt2.groupby(['zipcode'])['avg_sale_price'].shift()
melt2['last_month_diff'] = melt2.groupby(['zipcode'])['last_month_sales'].diff()
melt2 = melt2.dropna()
melt2.head()

Unnamed: 0,zipcode,month,avg_sale_price,last_month_sales,last_month_diff
1902,32003,3,301620.0,286173.0,-3117.0
1904,32009,3,112000.0,99000.0,-176000.0
1905,32011,3,160508.0,152160.0,-67150.0
1906,32024,3,87738.0,114341.0,-138488.0
1907,32025,3,100786.0,118032.0,16522.0


In [110]:
index = melt2.index
number_of_rows = len(index)
 
print(number_of_rows)

8762


In [111]:
def rmsle(ytrue, ypred):
    return np.sqrt(mean_squared_log_error(ytrue, ypred))

### Establishing Baseline

In [112]:
mean_error = []
for month in range(11,13):
    train = melt2[melt2['month'] < month]
    val = melt2[melt2['month'] == month]

    p = val['last_month_sales'].values

    error = rmsle(val['avg_sale_price'].values, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)

print('Mean Error = %.5f' % np.mean(mean_error))

Month 11 - Error 0.26051
Month 12 - Error 0.28529
Mean Error = 0.27290


In [113]:
melt2.dtypes

zipcode               int64
month                object
avg_sale_price      float64
last_month_sales    float64
last_month_diff     float64
dtype: object

### Training

In [114]:
mean_error = []
for month in range(11,13):
    train = melt2[melt2['month'] < month]
    val = melt2[melt2['month'] == month]

    xtr, xts = train.drop(['avg_sale_price'], axis=1), val.drop(['avg_sale_price'], axis=1)
    ytr, yts = train['avg_sale_price'].values, val['avg_sale_price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 11 - Error 0.24223
Month 12 - Error 0.24292
Mean Error = 0.24258


In [115]:
melt3 = melt.copy()
melt3['Last_Month_Sales'] = melt3.groupby(['zipcode'])['avg_sale_price'].shift()
melt3['Last_Month_Diff'] = melt3.groupby(['zipcode'])['Last_Month_Sales'].diff()
melt3['Last-1_Month_Sales'] = melt3.groupby(['zipcode'])['avg_sale_price'].shift(2)
melt3['Last-1_Month_Diff'] = melt3.groupby(['zipcode'])['Last-1_Month_Sales'].diff()
melt3 = melt3.dropna()
melt3.head()

Unnamed: 0,zipcode,month,avg_sale_price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff
2853,32003,4,263475.0,301620.0,15447.0,286173.0,-3117.0
2856,32011,4,171022.0,160508.0,8348.0,152160.0,-67150.0
2857,32024,4,157358.0,87738.0,-26603.0,114341.0,-138488.0
2858,32025,4,110079.0,100786.0,-17246.0,118032.0,16522.0
2859,32033,4,121500.0,243000.0,-5794.0,248794.0,69419.0


In [116]:
mean_error = []
for month in range(11,13):
    train = melt3[melt3['month'] < month]
    val = melt3[melt3['month'] == month]

    xtr, xts = train.drop(['avg_sale_price'], axis=1), val.drop(['avg_sale_price'], axis=1)
    ytr, yts = train['avg_sale_price'].values, val['avg_sale_price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 11 - Error 0.23398
Month 12 - Error 0.21673
Mean Error = 0.22536


In [117]:
melt4 = melt.copy()
melt4['Last_Month_Sales'] = melt4.groupby(['zipcode'])['avg_sale_price'].shift()
melt4['Last_Month_Diff'] = melt4.groupby(['zipcode'])['Last_Month_Sales'].diff()
melt4['Last-1_Month_Sales'] = melt4.groupby(['zipcode'])['avg_sale_price'].shift(2)
melt4['Last-1_Month_Diff'] = melt4.groupby(['zipcode'])['Last-1_Month_Sales'].diff()
melt4['Last-2_Month_Sales'] = melt4.groupby(['zipcode'])['avg_sale_price'].shift(3)
melt4['Last-2_Month_Diff'] = melt4.groupby(['zipcode'])['Last-2_Month_Sales'].diff()
melt4 = melt4.dropna()
melt4.head()

Unnamed: 0,zipcode,month,avg_sale_price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff
3804,32003,5,293816.0,263475.0,-38145.0,301620.0,15447.0,286173.0,-3117.0
3807,32011,5,210804.0,171022.0,10514.0,160508.0,8348.0,152160.0,-67150.0
3808,32024,5,147920.0,157358.0,69620.0,87738.0,-26603.0,114341.0,-138488.0
3809,32025,5,156524.0,110079.0,9293.0,100786.0,-17246.0,118032.0,16522.0
3810,32033,5,156700.0,121500.0,-121500.0,243000.0,-5794.0,248794.0,69419.0


In [118]:
mean_error = []
for month in range(11,13):
    train = melt4[melt4['month'] < month]
    val = melt4[melt4['month'] == month]

    xtr, xts = train.drop(['avg_sale_price'], axis=1), val.drop(['avg_sale_price'], axis=1)
    ytr, yts = train['avg_sale_price'].values, val['avg_sale_price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 11 - Error 0.21861
Month 12 - Error 0.21076
Mean Error = 0.21468


In [119]:
melt5 = melt.copy()
melt5['Last_Month_Sales'] = melt5.groupby(['zipcode'])['avg_sale_price'].shift()
melt5['Last_Month_Diff'] = melt5.groupby(['zipcode'])['Last_Month_Sales'].diff()
melt5['Last-1_Month_Sales'] = melt5.groupby(['zipcode'])['avg_sale_price'].shift(2)
melt5['Last-1_Month_Diff'] = melt5.groupby(['zipcode'])['Last-1_Month_Sales'].diff()
melt5['Last-2_Month_Sales'] = melt5.groupby(['zipcode'])['avg_sale_price'].shift(3)
melt5['Last-2_Month_Diff'] = melt5.groupby(['zipcode'])['Last-2_Month_Sales'].diff()
melt5['Last-3_Month_Sales'] = melt5.groupby(['zipcode'])['avg_sale_price'].shift(4)
melt5['Last-3_Month_Diff'] = melt5.groupby(['zipcode'])['Last-3_Month_Sales'].diff()
melt5 = melt5.dropna()
melt5.head()

Unnamed: 0,zipcode,month,avg_sale_price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff
4755,32003,6,298865.0,293816.0,30341.0,263475.0,-38145.0,301620.0,15447.0,286173.0,-3117.0
4758,32011,6,144637.0,210804.0,39782.0,171022.0,10514.0,160508.0,8348.0,152160.0,-67150.0
4759,32024,6,137455.0,147920.0,-9438.0,157358.0,69620.0,87738.0,-26603.0,114341.0,-138488.0
4760,32025,6,139078.0,156524.0,46445.0,110079.0,9293.0,100786.0,-17246.0,118032.0,16522.0
4761,32033,6,180849.0,156700.0,35200.0,121500.0,-121500.0,243000.0,-5794.0,248794.0,69419.0


In [120]:
mean_error = []
for month in range(11,13):
    train = melt5[melt5['month'] < month]
    val = melt5[melt5['month'] == month]

    xtr, xts = train.drop(['avg_sale_price'], axis=1), val.drop(['avg_sale_price'], axis=1)
    ytr, yts = train['avg_sale_price'].values, val['avg_sale_price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 11 - Error 0.21694
Month 12 - Error 0.20501
Mean Error = 0.21097


In [121]:
melt6 = melt.copy()
melt6['Last_Month_Sales'] = melt6.groupby(['zipcode'])['avg_sale_price'].shift()
melt6['Last_Month_Diff'] = melt6.groupby(['zipcode'])['Last_Month_Sales'].diff()
melt6['Last-1_Month_Sales'] = melt6.groupby(['zipcode'])['avg_sale_price'].shift(2)
melt6['Last-1_Month_Diff'] = melt6.groupby(['zipcode'])['Last-1_Month_Sales'].diff()
melt6['Last-2_Month_Sales'] = melt6.groupby(['zipcode'])['avg_sale_price'].shift(3)
melt6['Last-2_Month_Diff'] = melt6.groupby(['zipcode'])['Last-2_Month_Sales'].diff()
melt6['Last-3_Month_Sales'] = melt6.groupby(['zipcode'])['avg_sale_price'].shift(4)
melt6['Last-3_Month_Diff'] = melt6.groupby(['zipcode'])['Last-3_Month_Sales'].diff()
melt6['Last-4_Month_Sales'] = melt6.groupby(['zipcode'])['avg_sale_price'].shift(5)
melt6['Last-4_Month_Diff'] = melt6.groupby(['zipcode'])['Last-4_Month_Sales'].diff()

melt6 = melt6.dropna()
melt6.head()

Unnamed: 0,zipcode,month,avg_sale_price,Last_Month_Sales,Last_Month_Diff,Last-1_Month_Sales,Last-1_Month_Diff,Last-2_Month_Sales,Last-2_Month_Diff,Last-3_Month_Sales,Last-3_Month_Diff,Last-4_Month_Sales,Last-4_Month_Diff
5706,32003,7,295053.0,298865.0,5049.0,293816.0,30341.0,263475.0,-38145.0,301620.0,15447.0,286173.0,-3117.0
5709,32011,7,220224.0,144637.0,-66167.0,210804.0,39782.0,171022.0,10514.0,160508.0,8348.0,152160.0,-67150.0
5710,32024,7,157163.0,137455.0,-10465.0,147920.0,-9438.0,157358.0,69620.0,87738.0,-26603.0,114341.0,-138488.0
5711,32025,7,128616.0,139078.0,-17446.0,156524.0,46445.0,110079.0,9293.0,100786.0,-17246.0,118032.0,16522.0
5712,32033,7,241143.0,180849.0,24149.0,156700.0,35200.0,121500.0,-121500.0,243000.0,-5794.0,248794.0,69419.0


In [122]:
mean_error = []
for month in range(11,13):
    train = melt6[melt6['month'] < month]
    val = melt6[melt6['month'] == month]

    xtr, xts = train.drop(['avg_sale_price'], axis=1), val.drop(['avg_sale_price'], axis=1)
    ytr, yts = train['avg_sale_price'].values, val['avg_sale_price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, ytr)

    p = model.predict(xts)

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 11 - Error 0.21520
Month 12 - Error 0.20217
Mean Error = 0.20869


### Changing the fit

In [123]:
mean_error = []
for month in range(11,13):
    train = melt6[melt6['month'] < month]
    val = melt6[melt6['month'] == month]

    xtr, xts = train.drop(['avg_sale_price'], axis=1), val.drop(['avg_sale_price'], axis=1)
    ytr, yts = train['avg_sale_price'].values, val['avg_sale_price'].values

    model = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    model.fit(xtr, np.log1p(ytr))

    p = np.expm1(model.predict(xts))

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

Month 11 - Error 0.20669
Month 12 - Error 0.20384
Mean Error = 0.20526


In [150]:
val.loc[:, 'Prediction'] = np.round(p)
val.plot.scatter(x='Prediction', y='avg_sale_price', figsize=(15,10), title='Prediction vs Sales', 
                 ylim=(0,11), xlim=(0,11))

ValueError: Must have equal len keys and value when setting with an iterable

### Try again later...

In [135]:
melt6.astype({'month': 'float'}).dtypes

zipcode                 int64
month                 float64
avg_sale_price        float64
Last_Month_Sales      float64
Last_Month_Diff       float64
Last-1_Month_Sales    float64
Last-1_Month_Diff     float64
Last-2_Month_Sales    float64
Last-2_Month_Diff     float64
Last-3_Month_Sales    float64
Last-3_Month_Diff     float64
Last-4_Month_Sales    float64
Last-4_Month_Diff     float64
dtype: object

### Gradient Boosted Trees

In [136]:
mean_error = []
for month in range(11,13):
    train = melt6[melt6['month'] < month]
    val = melt6[melt6['month'] == month]

    xtr, xts = train.drop(['avg_sale_price'], axis=1), val.drop(['avg_sale_price'], axis=1)
    ytr, yts = train['avg_sale_price'].values, val['avg_sale_price'].values

    model = LGBMRegressor(n_estimators=1000, learning_rate=0.01)
    model.fit(xtr, np.log1p(ytr))

    p = np.expm1(model.predict(xts))

    error = rmsle(yts, p)
    print('Month %d - Error %.5f' % (month, error))
    mean_error.append(error)
print('Mean Error = %.5f' % np.mean(mean_error))

ValueError: DataFrame.dtypes for data must be int, float or bool.
Did not expect the data types in the following fields: month