# **Walmart Sales Forecasting**

#### **Load Libraries and Data:**

In [None]:
#Import all libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDRegressor, RidgeCV, ElasticNet, LinearRegression, Ridge, Lasso, TweedieRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from datetime import date
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import datetime
import holidays
import warnings

#Turn off warnings for cleaner output
warnings.filterwarnings('ignore')

In [None]:
#Mount Google Drive at /content/drive (Colab only) so the CSV files under Shareddrives can be read
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#Load features.csv from the shared drive into a data frame, then show its shape and first rows
features = pd.read_csv('/content/drive/Shareddrives/Artificial Intelligence/Data/features.csv')
print(features.shape)
features.head()

(8190, 12)


Unnamed: 0,Store,Date,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday
0,1,2010-02-05,42.31,2.572,,,,,,211.096358,8.106,False
1,1,2010-02-12,38.51,2.548,,,,,,211.24217,8.106,True
2,1,2010-02-19,39.93,2.514,,,,,,211.289143,8.106,False
3,1,2010-02-26,46.63,2.561,,,,,,211.319643,8.106,False
4,1,2010-03-05,46.5,2.625,,,,,,211.350143,8.106,False


In [None]:
#Load stores.csv from the shared drive into a data frame, then show its shape and first rows
stores = pd.read_csv('/content/drive/Shareddrives/Artificial Intelligence/Data/stores.csv')
print(stores.shape)
stores.head()

(45, 3)


Unnamed: 0,Store,Type,Size
0,1,A,151315
1,2,A,202307
2,3,B,37392
3,4,A,205863
4,5,B,34875


In [None]:
#Load test.csv (rows to predict; it has no Weekly_Sales column) into a data frame, then show its shape and first rows
test = pd.read_csv('/content/drive/Shareddrives/Artificial Intelligence/Data/test.csv')
print(test.shape)
test.head()

(115064, 4)


Unnamed: 0,Store,Dept,Date,IsHoliday
0,1,1,2012-11-02,False
1,1,1,2012-11-09,False
2,1,1,2012-11-16,False
3,1,1,2012-11-23,True
4,1,1,2012-11-30,False


In [None]:
#Load train.csv (includes the Weekly_Sales target) into a data frame, then show its shape and first rows
train = pd.read_csv('/content/drive/Shareddrives/Artificial Intelligence/Data/train.csv')
print(train.shape)
train.head()

(421570, 5)


Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday
0,1,1,2010-02-05,24924.5,False
1,1,1,2010-02-12,46039.49,True
2,1,1,2010-02-19,41595.55,False
3,1,1,2010-02-26,19403.54,False
4,1,1,2010-03-05,21827.9,False


#### **Training Preprocessing:**

In [None]:
#Derive numeric calendar features from the Date strings. The Date column itself is kept
#because the merge and the holiday lookup further down match on it.
train_dates = pd.to_datetime(train['Date'])          # parse once and reuse for every derived column
train['Year'] = train_dates.dt.year
train['Month'] = train_dates.dt.month
train['Week'] = train_dates.dt.isocalendar().week    # ISO week number
train['Day'] = train_dates.dt.day

#Encode the boolean holiday flag as 0/1
train['IsHoliday'] = train['IsHoliday'].astype(int)

train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Year,Month,Week,Day
0,1,1,2010-02-05,24924.5,0,2010,2,5,5
1,1,1,2010-02-12,46039.49,1,2010,2,6,12
2,1,1,2010-02-19,41595.55,0,2010,2,7,19
3,1,1,2010-02-26,19403.54,0,2010,2,8,26
4,1,1,2010-03-05,21827.9,0,2010,3,9,5


In [None]:
#Attach the weekly features to the store metadata, then join the combined table onto the training rows.
#Note that `stores` is overwritten with the merged table and is reused as-is for the test merge.
stores = pd.merge(stores, features, how='left', on='Store')
train = pd.merge(train, stores, how='left', on=['Store', 'Date', 'IsHoliday'])

In [None]:
#Tag rows whose Date is one of the listed holiday weeks with two new columns:
#HolidayType (numeric code) and Holiday (name of the holiday)
dates = [
    (['2010-02-12', '2011-02-11', '2012-02-10', '2013-02-08'], 'Super_Bowl'),
    (['2010-09-10', '2011-09-09', '2012-09-07', '2013-09-06'], 'Labor_Day'),
    (['2010-11-26', '2011-11-25', '2012-11-23', '2013-11-29'], 'Thanksgiving'),
    (['2010-12-31', '2011-12-30', '2012-12-28', '2013-12-27'], 'Christmas'),
]

#Defaults for every row that is not on a listed holiday
train['HolidayType'] = 0
train['Holiday'] = 'N/A'

#HolidayType codes: 0 - Nothing, 1 - Super Bowl, 2 - Labor Day, 3 - Thanksgiving, 4 - Christmas
for holiday_code, (holiday_dates, holiday_name) in enumerate(dates, start=1):
    on_holiday = train['Date'].isin(holiday_dates)    # build the row mask once and use it for both columns
    train.loc[on_holiday, 'HolidayType'] = holiday_code
    train.loc[on_holiday, 'Holiday'] = holiday_name




In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) of the merged train dataframe
train.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,Year,Month,Week,Day,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,HolidayType
count,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,421570.0,150681.0,111248.0,137091.0,134967.0,151432.0,421570.0,421570.0,421570.0
mean,22.200546,44.260317,15981.258123,2010.968591,6.44951,25.826762,15.673131,136727.915739,60.090059,3.361027,7246.420196,3334.628621,1439.421384,3383.168256,4628.975079,171.201947,7.960289,0.161961
std,12.785297,30.492054,22711.183519,0.796876,3.243217,14.151887,8.753549,60980.583328,18.447931,0.458515,8291.221345,9475.357325,9623.07829,6292.384031,5962.887455,39.159276,1.863296,0.65714
min,1.0,1.0,-4988.94,2010.0,1.0,1.0,1.0,34875.0,-2.06,2.472,0.27,-265.76,-29.1,0.22,135.16,126.064,3.879,0.0
25%,11.0,18.0,2079.65,2010.0,4.0,14.0,8.0,93638.0,46.68,2.933,2240.27,41.6,5.08,504.22,1878.44,132.022667,6.891,0.0
50%,22.0,37.0,7612.03,2011.0,6.0,26.0,16.0,140167.0,62.09,3.452,5347.45,192.0,24.6,1481.31,3359.45,182.31878,7.866,0.0
75%,33.0,74.0,20205.8525,2012.0,9.0,38.0,23.0,202505.0,74.28,3.738,9210.9,1926.94,103.99,3595.04,5563.8,212.416993,8.572,0.0
max,45.0,99.0,693099.36,2012.0,12.0,52.0,31.0,219622.0,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,227.232807,14.313,4.0


In [None]:
#Count the null values in each column of the train dataframe
train.isnull().sum()

Store                0
Dept                 0
Date                 0
Weekly_Sales         0
IsHoliday            0
Year                 0
Month                0
Week                 0
Day                  0
Type                 0
Size                 0
Temperature          0
Fuel_Price           0
MarkDown1       270889
MarkDown2       310322
MarkDown3       284479
MarkDown4       286603
MarkDown5       270138
CPI                  0
Unemployment         0
HolidayType          0
Holiday              0
dtype: int64

In [None]:
#Keep only rows with non-negative weekly sales (negative sales are treated as impossible),
#then replace the remaining null values with 0
train = train.loc[train['Weekly_Sales'] >= 0].fillna(0)
train.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Year,Month,Week,Day,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,HolidayType,Holiday
0,1,1,2010-02-05,24924.5,0,2010,2,5,5,A,151315,42.31,2.572,0.0,0.0,0.0,0.0,0.0,211.096358,8.106,0,
1,1,1,2010-02-12,46039.49,1,2010,2,6,12,A,151315,38.51,2.548,0.0,0.0,0.0,0.0,0.0,211.24217,8.106,1,Super_Bowl
2,1,1,2010-02-19,41595.55,0,2010,2,7,19,A,151315,39.93,2.514,0.0,0.0,0.0,0.0,0.0,211.289143,8.106,0,
3,1,1,2010-02-26,19403.54,0,2010,2,8,26,A,151315,46.63,2.561,0.0,0.0,0.0,0.0,0.0,211.319643,8.106,0,
4,1,1,2010-03-05,21827.9,0,2010,3,9,5,A,151315,46.5,2.625,0.0,0.0,0.0,0.0,0.0,211.350143,8.106,0,


In [None]:
#Summary statistics of the train dataframe after dropping negative sales and filling nulls
train.describe()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Year,Month,Week,Day,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,HolidayType
count,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0,420285.0
mean,22.195477,44.242771,16030.329773,0.07034,2010.968443,6.449709,25.827729,15.673919,136749.569176,60.090474,3.360888,2590.187246,878.803239,468.771234,1083.462694,1662.706138,171.212152,7.960077,0.161838
std,12.787213,30.507197,22728.500149,0.25572,0.796893,3.243394,14.152442,8.752825,60992.688568,18.44826,0.458523,6053.225499,5076.525234,5533.593112,3895.801513,4205.946641,39.16228,1.863873,0.656741
min,1.0,1.0,0.0,0.0,2010.0,1.0,1.0,1.0,34875.0,-2.06,2.472,0.0,-265.76,-29.1,0.0,0.0,126.064,3.879,0.0
25%,11.0,18.0,2117.56,0.0,2010.0,4.0,14.0,8.0,93638.0,46.68,2.933,0.0,0.0,0.0,0.0,0.0,132.022667,6.891,0.0
50%,22.0,37.0,7659.09,0.0,2011.0,6.0,26.0,16.0,140167.0,62.09,3.452,0.0,0.0,0.0,0.0,0.0,182.350989,7.866,0.0
75%,33.0,74.0,20268.38,0.0,2012.0,9.0,38.0,23.0,202505.0,74.28,3.738,2801.5,2.4,4.54,425.29,2168.04,212.445487,8.567,0.0
max,45.0,99.0,693099.36,1.0,2012.0,12.0,52.0,31.0,219622.0,100.14,4.468,88646.76,104519.54,141630.61,67474.85,108519.28,227.232807,14.313,4.0


In [None]:
#Confirm that no null values remain in the train dataframe
train.isnull().sum()

Store           0
Dept            0
Date            0
Weekly_Sales    0
IsHoliday       0
Year            0
Month           0
Week            0
Day             0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
HolidayType     0
Holiday         0
dtype: int64

In [None]:
#Place the model features (X) and the target (y) into datasets for data splitting
feature_columns = ['Store', 'Dept', 'Size', 'Month', 'Type', 'Year', 'Week', 'Day', 'IsHoliday', 'CPI',
                   'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
X = train[feature_columns].copy()    # explicit copy: the Type assignment below must not be a chained assignment on train
y = train['Weekly_Sales']

#Change the store type column from letters to integer codes. The category list is taken from
#the stores table (sorted), so a given letter always maps to the same code no matter which
#rows happen to be present in the frame being encoded.
type_categories = sorted(stores['Type'].unique())
X['Type'] = pd.Categorical(X['Type'], categories=type_categories).codes

print(X.head())
print("\n\n")
print(y.head())

   Store  Dept    Size  Month  ...  MarkDown2  MarkDown3  MarkDown4  MarkDown5
0      1     1  151315      2  ...        0.0        0.0        0.0        0.0
1      1     1  151315      2  ...        0.0        0.0        0.0        0.0
2      1     1  151315      2  ...        0.0        0.0        0.0        0.0
3      1     1  151315      2  ...        0.0        0.0        0.0        0.0
4      1     1  151315      3  ...        0.0        0.0        0.0        0.0

[5 rows x 15 columns]



0    24924.50
1    46039.49
2    41595.55
3    19403.54
4    21827.90
Name: Weekly_Sales, dtype: float64


In [None]:
#Split into 50% training and 50% remainder, then halve the remainder into validation and test sets.
#A fixed random_state makes the split (and every score computed from it) reproducible between runs.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, train_size=0.5, random_state=42)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem, y_rem, test_size=0.5, random_state=42)

#Normalize the data: the scaler is fitted on the training split only and then applied to the
#other splits, so no validation/test statistics leak into it.
#Each scaled array is named after the split it was computed from.
scaler = StandardScaler()
training = scaler.fit_transform(X_train)
validation = scaler.transform(X_valid)
testing = scaler.transform(X_test)

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)
X_train.head()

(210142, 15)
(105071, 15)
(105072, 15)


Unnamed: 0,Store,Dept,Size,Month,Type,Year,Week,Day,IsHoliday,CPI,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
247123,26,2,152513,8,0,2011,32,12,0,136.144129,0.0,0.0,0.0,0.0,0.0
350494,37,52,39910,4,2,2011,17,29,0,214.358097,0.0,0.0,0.0,0.0,0.0
117157,12,94,112238,10,1,2012,42,19,0,131.149968,1810.87,0.0,8.48,640.64,2250.86
139467,15,10,123737,4,1,2011,13,1,0,134.068258,0.0,0.0,0.0,0.0,0.0
137387,14,93,200898,10,0,2010,43,29,0,182.901117,0.0,0.0,0.0,0.0,0.0


#### **Testing Preprocessing:**

In [None]:
#Derive numeric calendar features from the Date strings. The Date column itself is kept
#because the merge and the holiday lookup further down match on it.
test_dates = pd.to_datetime(test['Date'])           # parse once and reuse for every derived column
test['Year'] = test_dates.dt.year
test['Month'] = test_dates.dt.month
test['Week'] = test_dates.dt.isocalendar().week     # ISO week number
test['Day'] = test_dates.dt.day

#Encode the boolean holiday flag as 0/1
test['IsHoliday'] = test['IsHoliday'].astype(int)

test.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Year,Month,Week,Day
0,1,1,2012-11-02,0,2012,11,44,2
1,1,1,2012-11-09,0,2012,11,45,9
2,1,1,2012-11-16,0,2012,11,46,16
3,1,1,2012-11-23,1,2012,11,47,23
4,1,1,2012-11-30,0,2012,11,48,30


In [None]:
#Join the store table onto the test rows. `stores` already carries the weekly features columns
#because it was merged with `features` during the training preprocessing.
test = pd.merge(test, stores, how='left', on=['Store', 'Date', 'IsHoliday'])

In [None]:
#Tag test rows whose Date is one of the holiday weeks in `dates` with two new columns:
#HolidayType (numeric code) and Holiday (name of the holiday)
test['HolidayType'] = 0
test['Holiday'] = 'N/A'

#HolidayType codes: 0 - Nothing, 1 - Super Bowl, 2 - Labor Day, 3 - Thanksgiving, 4 - Christmas
for holiday_code, (holiday_dates, holiday_name) in enumerate(dates, start=1):
    on_holiday = test['Date'].isin(holiday_dates)    # build the row mask once and use it for both columns
    test.loc[on_holiday, 'HolidayType'] = holiday_code
    test.loc[on_holiday, 'Holiday'] = holiday_name

In [None]:
#Summary statistics (count, mean, std, min, quartiles, max) of the merged test dataframe
test.describe()

Unnamed: 0,Store,Dept,Year,Month,Week,Day,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,HolidayType
count,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,114915.0,86437.0,105235.0,102176.0,115064.0,76902.0,76902.0,115064.0
mean,22.238207,44.339524,2012.767182,5.729855,23.050963,15.583536,136497.688921,53.941804,3.581546,7689.216439,3734.051729,2403.088666,3356.219071,3922.681189,176.961347,6.868733,0.207224
std,12.80993,30.65641,0.422629,3.590331,15.750407,8.8449,61106.926438,18.724153,0.239442,10698.760716,8323.495014,13767.939313,7570.501545,19445.150745,41.239967,1.583427,0.794411
min,1.0,1.0,2012.0,1.0,1.0,1.0,34875.0,-7.29,2.872,-2781.45,-35.74,-179.26,0.22,-185.17,131.236226,3.684,0.0
25%,11.0,18.0,2013.0,3.0,10.0,8.0,93638.0,39.82,3.431,1966.46,180.35,15.1,155.46,1309.3,138.402033,5.771,0.0
50%,22.0,37.0,2013.0,5.0,20.0,15.0,140167.0,54.47,3.606,4842.29,742.59,78.26,840.94,2390.43,192.304445,6.806,0.0
75%,33.0,74.0,2013.0,7.0,30.0,23.0,202505.0,67.35,3.766,9439.14,2735.67,272.58,3096.92,4227.27,223.244532,8.036,0.0
max,45.0,99.0,2013.0,12.0,52.0,31.0,219622.0,101.95,4.125,103184.98,71074.17,149483.31,65344.64,771448.1,228.976456,10.199,4.0


In [None]:
#Count the null values in each column of the test dataframe
test.isnull().sum()

Store               0
Dept                0
Date                0
IsHoliday           0
Year                0
Month               0
Week                0
Day                 0
Type                0
Size                0
Temperature         0
Fuel_Price          0
MarkDown1         149
MarkDown2       28627
MarkDown3        9829
MarkDown4       12888
MarkDown5           0
CPI             38162
Unemployment    38162
HolidayType         0
Holiday             0
dtype: int64

In [None]:
#CPI and Unemployment are missing for part of the test rows. Filling them with 0 would feed the
#model values far outside anything seen in training (train CPI never drops below ~126), so carry
#each store's most recent known value forward in date order instead.
for column in ['CPI', 'Unemployment']:
    # Date strings are YYYY-MM-DD, so sorting them sorts chronologically; the result is
    # re-aligned to test by index on assignment
    test[column] = test.sort_values('Date').groupby('Store')[column].ffill()

#Replace the remaining null values (markdown columns, and anything with no earlier value to carry forward) with 0
test = test.fillna(0)

test.head()

Unnamed: 0,Store,Dept,Date,IsHoliday,Year,Month,Week,Day,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,HolidayType,Holiday
0,1,1,2012-11-02,0,2012,11,44,2,A,151315,55.32,3.386,6766.44,5147.7,50.82,3639.9,2737.42,223.462779,6.573,0,
1,1,1,2012-11-09,0,2012,11,45,9,A,151315,61.24,3.314,11421.32,3370.89,40.28,4646.79,6154.16,223.481307,6.573,0,
2,1,1,2012-11-16,0,2012,11,46,16,A,151315,52.92,3.252,9696.28,292.1,103.78,1133.15,6612.69,223.512911,6.573,0,
3,1,1,2012-11-23,1,2012,11,47,23,A,151315,56.23,3.211,883.59,4.17,74910.32,209.91,303.32,223.561947,6.573,3,Thanksgiving
4,1,1,2012-11-30,0,2012,11,48,30,A,151315,52.34,3.207,2460.03,0.0,3838.35,150.57,6966.34,223.610984,6.573,0,


In [None]:
#Summary statistics of the test dataframe after filling nulls
test.describe()

Unnamed: 0,Store,Dept,IsHoliday,Year,Month,Week,Day,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,HolidayType
count,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0,115064.0
mean,22.238207,44.339524,0.077592,2012.767182,5.729855,23.050963,15.583536,136497.688921,53.941804,3.581546,7679.25943,2805.049619,2197.811964,2980.298267,3922.681189,118.270541,4.590657,0.207224
std,12.80993,30.65641,0.267529,0.422629,3.590331,15.750407,8.8449,61106.926438,18.724153,0.239442,10695.406505,7392.556347,13183.888087,7212.030921,19445.150745,89.878361,3.483338,0.794411
min,1.0,1.0,0.0,2012.0,1.0,1.0,1.0,34875.0,-7.29,2.872,-2781.45,-35.74,-179.26,0.0,-185.17,0.0,0.0,0.0
25%,11.0,18.0,0.0,2013.0,3.0,10.0,8.0,93638.0,39.82,3.431,1963.55,0.0,7.77,70.37,1309.3,0.0,0.0,0.0
50%,22.0,37.0,0.0,2013.0,5.0,20.0,15.0,140167.0,54.47,3.606,4828.72,316.88,60.0,600.58,2390.43,138.402033,5.771,0.0
75%,33.0,74.0,0.0,2013.0,7.0,30.0,23.0,202505.0,67.35,3.766,9427.41,1575.85,244.2,2627.85,4227.27,201.21223,7.293,0.0
max,45.0,99.0,1.0,2013.0,12.0,52.0,31.0,219622.0,101.95,4.125,103184.98,71074.17,149483.31,65344.64,771448.1,228.976456,10.199,4.0


In [None]:
#Confirm that no null values remain in the test dataframe
test.isnull().sum()

Store           0
Dept            0
Date            0
IsHoliday       0
Year            0
Month           0
Week            0
Day             0
Type            0
Size            0
Temperature     0
Fuel_Price      0
MarkDown1       0
MarkDown2       0
MarkDown3       0
MarkDown4       0
MarkDown5       0
CPI             0
Unemployment    0
HolidayType     0
Holiday         0
dtype: int64

In [None]:
#Place the model features into the prediction dataset (same columns, same order as the training features)
feature_columns = ['Store', 'Dept', 'Size', 'Month', 'Type', 'Year', 'Week', 'Day', 'IsHoliday', 'CPI',
                   'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5']
test_data = test[feature_columns].copy()    # explicit copy: the Type assignment below must not be a chained assignment on test

#Change the store type column from letters to integer codes. The category list is taken from
#the stores table (sorted), so a given letter always maps to the same code no matter which
#rows happen to be present in the frame being encoded.
type_categories = sorted(stores['Type'].unique())
test_data['Type'] = pd.Categorical(test_data['Type'], categories=type_categories).codes


test_data.head()

Unnamed: 0,Store,Dept,Size,Month,Type,Year,Week,Day,IsHoliday,CPI,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5
0,1,1,151315,11,0,2012,44,2,0,223.462779,6766.44,5147.7,50.82,3639.9,2737.42
1,1,1,151315,11,0,2012,45,9,0,223.481307,11421.32,3370.89,40.28,4646.79,6154.16
2,1,1,151315,11,0,2012,46,16,0,223.512911,9696.28,292.1,103.78,1133.15,6612.69
3,1,1,151315,11,0,2012,47,23,1,223.561947,883.59,4.17,74910.32,209.91,303.32
4,1,1,151315,11,0,2012,48,30,0,223.610984,2460.03,0.0,3838.35,150.57,6966.34


#### **Determine Best Model:**

---



In [None]:
#Weighted Mean Absolute Error
def WMAE(dataset, real, predicted):
    """Return the weighted mean absolute error, rounded to 2 decimals.

    Rows whose ``IsHoliday`` value is truthy are weighted 5, all other rows 1.

    dataset   -- data frame with an ``IsHoliday`` column, one row per prediction
    real      -- actual target values
    predicted -- predicted target values
    """
    holiday_weights = dataset['IsHoliday'].map(lambda flag: 1 if not flag else 5)
    absolute_errors = np.abs(real - predicted)
    weighted_error_total = np.sum(holiday_weights * absolute_errors)
    weight_total = np.sum(holiday_weights)
    return np.round(weighted_error_total / weight_total, 2)

#Populate the candidate models to investigate, all with default hyperparameters
model_list = {
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'TweedieRegressor': TweedieRegressor(),
    'RidgeCV': RidgeCV(),
    'ElasticNet': ElasticNet(),
    'HistGradientBoostingRegressor': HistGradientBoostingRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor(),
    'RandomForest': RandomForestRegressor(),
    'DescisionTree': DecisionTreeRegressor(),
}

#Fit every candidate on the training split and score it on the validation split.
#Model selection uses the validation data rather than the held-out test split, so the
#test split is not consumed by choosing a model.
for model_name, model in model_list.items():
    model.fit(X_train, y_train)
    predicted_val = model.predict(X_valid)
    print('WMAE score of ',model_name,' : ',WMAE(X_valid,y_valid,predicted_val))
    print(60*'-')

WMAE score of  LinearRegression  :  14751.46
------------------------------------------------------------
WMAE score of  Ridge  :  14751.46
------------------------------------------------------------
WMAE score of  Lasso  :  14750.46
------------------------------------------------------------
WMAE score of  TweedieRegressor  :  14743.05
------------------------------------------------------------
WMAE score of  RidgeCV  :  14752.34
------------------------------------------------------------
WMAE score of  ElasticNet  :  14740.0
------------------------------------------------------------
WMAE score of  HistGradientBoostingRegressor  :  4258.03
------------------------------------------------------------
WMAE score of  GradientBoostingRegressor  :  7128.62
------------------------------------------------------------
WMAE score of  RandomForest  :  1628.88
------------------------------------------------------------
WMAE score of  DescisionTree  :  2117.44
----------------------------

#### **Determining Best Hyperparameters:**

In [None]:
#Try every combination of n_estimators and max_depth for RandomForestRegressor (best model from
#previous step) and print its WMAE. Scores are computed on the validation split rather than the
#held-out test split, so the test split is not consumed by tuning.
estimators = [80,100,120,140]
max_depth = [27,30,32]

for n in estimators:
    for m in max_depth:
        model = RandomForestRegressor(n_estimators=n, max_depth=m)
        model.fit(X_train, y_train)
        print('n_estimators =', n, 'max_depth =', m)
        print('WMAE score:',WMAE(X_valid,y_valid,model.predict(X_valid)))

n_estimators = 80 max_depth = 27
WMAE score: 1626.7
n_estimators = 80 max_depth = 30
WMAE score: 1628.94
n_estimators = 80 max_depth = 32
WMAE score: 1634.82
n_estimators = 100 max_depth = 27
WMAE score: 1629.99
n_estimators = 100 max_depth = 30
WMAE score: 1621.5
n_estimators = 100 max_depth = 32
WMAE score: 1632.52
n_estimators = 120 max_depth = 27
WMAE score: 1625.71
n_estimators = 120 max_depth = 30
WMAE score: 1626.23
n_estimators = 120 max_depth = 32
WMAE score: 1621.2
n_estimators = 140 max_depth = 27
WMAE score: 1624.13
n_estimators = 140 max_depth = 30
WMAE score: 1624.75
n_estimators = 140 max_depth = 32
WMAE score: 1623.14


#### **Fitting Chosen Model and Predicting Weekly Sales:**

In [None]:
#Train the final random forest on all cleaned training rows with the hyperparameters chosen in the
#previous step, then predict weekly sales for the prepared test rows
random_forest = RandomForestRegressor(max_depth=32, n_estimators=120).fit(X, train['Weekly_Sales'])
pred = random_forest.predict(test_data)

#### **Writing Predicted Weekly Sales to CSV:**

In [None]:
#Format the predictions into a dataframe and save it as a csv file in the google drive folder.
#Each ID is Store_Dept_Date with no trailing separator.
#NOTE(review): if this file is meant for a Kaggle upload, confirm the expected header spelling ("Id" vs "ID").
row_ids = test['Store'].astype(str) + '_' + test['Dept'].astype(str) + '_' + test['Date'].astype(str)
output = pd.DataFrame({"ID": row_ids, "Weekly_Sales": pred})
output.to_csv('/content/drive/Shareddrives/Artificial Intelligence/Weekly Sales Prediction.csv', index=False)

output

Unnamed: 0,ID,Weekly_Sales
0,1_1_2012-11-02_,35980.886583
1,1_1_2012-11-09_,22315.532250
2,1_1_2012-11-16_,19267.986917
3,1_1_2012-11-23_,20979.267917
4,1_1_2012-11-30_,25509.087333
...,...,...
115059,45_98_2013-06-28_,707.771417
115060,45_98_2013-07-05_,801.661500
115061,45_98_2013-07-12_,771.990000
115062,45_98_2013-07-19_,764.186250
