In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt
from sklearn import linear_model

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

In [2]:
url = 'https://media.githubusercontent.com/media/EricYangg/4AI3-Walmart-Forecast/main/Processed%20Dataset/x_train.csv'
x_train = pd.read_csv(url)

url = 'https://media.githubusercontent.com/media/EricYangg/4AI3-Walmart-Forecast/main/Processed%20Dataset/x_test.csv'
x_test = pd.read_csv(url)

url = 'https://media.githubusercontent.com/media/EricYangg/4AI3-Walmart-Forecast/main/Processed%20Dataset/y_train.csv'
y_train = pd.read_csv(url)

url = 'https://media.githubusercontent.com/media/EricYangg/4AI3-Walmart-Forecast/main/Processed%20Dataset/y_test.csv'
y_test = pd.read_csv(url)


In [3]:
x_train

Unnamed: 0,Store,Dept,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,MDAverage,MDSum,MDMin,MDMax
0,1.392183,1.270486,-1.288570,-0.275106,-0.036322,-1.158147,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,-0.989963,-1.355282,-0.271794,-1.358972,-0.427943,-0.084662
1,-1.188910,-0.926810,-1.361242,-0.275106,-0.731251,-1.275919,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,0.466451,0.538139,-0.271794,-1.358972,-0.427943,-0.084662
2,-1.188910,0.352213,1.400316,-0.275106,0.244469,0.455762,-0.252011,-0.037627,-0.084038,0.046557,0.081929,0.683805,-0.047383,-0.049038,-0.245190,-0.252011,0.081929
3,0.140744,-1.221970,-0.125808,-0.275106,-0.086734,1.853757,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,-0.934874,0.135089,-0.271794,-1.358972,-0.427943,-0.084662
4,0.062529,-1.156379,-1.724605,-0.275106,-2.430631,-1.249747,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,-1.013147,-1.110017,-0.271794,-1.358972,-0.427943,-0.084662
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
330172,0.375389,-0.369287,1.424540,-0.275106,0.820144,0.998821,6.068185,-0.134782,-0.073029,6.259835,0.356177,-0.740830,0.149580,2.495277,12.476386,-0.134782,6.259835
330173,1.392183,-0.598856,-0.610292,-0.275106,-1.946565,-0.460240,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,-0.981996,-1.434711,-0.271794,-1.358972,-0.427943,-0.084662
330174,-0.641405,0.516190,1.109625,-0.275106,-0.264532,1.286708,3.053010,-0.173069,-0.061488,1.131793,0.261853,0.503592,0.325612,0.842420,4.212100,-0.173069,3.053010
330175,-0.484975,-1.123583,-0.464947,-0.275106,-2.517904,-0.763393,-0.427943,-0.173069,-0.084662,-0.278117,-0.395181,0.525052,-0.722532,-0.271794,-1.358972,-0.427943,-0.084662


In [4]:
reg = XGBRegressor(max_depth = 5, n_estimators = 150).fit(x_train,y_train)

In [5]:
reg.score(x_train, y_train)

0.9202453633765982

In [6]:
reg.score(x_test, y_test)

0.917340761802

In [7]:
y_pred = reg.predict(x_test)

In [8]:
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Mean squared error: 8.16
Coefficient of determination: 0.92


In [9]:
params = { 'max_depth': [3,6,10],
           'learning_rate': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000],
           'colsample_bytree': [0.3, 0.7]}

In [10]:
xgbr = XGBRegressor(seed = 20)

In [110]:
clf = GridSearchCV(estimator=xgbr, 
                   param_grid=params,
                   scoring='neg_mean_squared_error', 
                   verbose=2,
                   cv=3,
                   n_jobs=3
                   )

In [111]:
clf.fit(x_train, y_train)
print("Best parameters:", clf.best_params_)
print("Lowest RMSE: ", (-clf.best_score_)**(1/2.0))

Fitting 3 folds for each of 54 candidates, totalling 162 fits
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.8s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.8s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=100; total time=   4.8s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=  23.3s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=  23.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=500; total time=  23.4s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=1000; total time=  57.6s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=1000; total time=  58.0s
[CV] END colsample_bytree=0.3, learning_rate=0.01, max_depth=3, n_estimators=1000; total time=  58.2s
[CV] END colsample_bytree=

In [152]:
best_xgb = XGBRegressor(colsample_bytree=0.7, learning_rate=0.1, max_depth=10, n_estimators=1000).fit(x_train,y_train)

In [153]:
best_xgb.score(x_train, y_train)

0.9928892898346682

In [154]:
best_xgb.score(x_test, y_test)

0.9839574800631623

In [155]:
y_pred = best_xgb.predict(x_test)

In [156]:
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % r2_score(y_test, y_pred))

Mean squared error: 1.58
Coefficient of determination: 0.98


Prediction

In [151]:
#provided submission test dataset
url = 'https://media.githubusercontent.com/media/EricYangg/4AI3-Walmart-Forecast/main/Datasets/test.csv'
test = pd.read_csv(url)

#test dataset after preprocessing
url = 'https://media.githubusercontent.com/media/EricYangg/4AI3-Walmart-Forecast/main/Processed%20Dataset/Submission%20Test.csv'
submission_test = pd.read_csv(url)

Final Submission Format:

Id,Weekly_Sales

1_1_2012-11-02,0

1_1_2012-11-09,0

1_1_2012-11-16,0

In [118]:
Id = test['Store'].astype('string') + '_' + test['Dept'].astype('string') + '_' + test['Date'].astype('string')
Id

0           1_1_2012-11-02
1           1_1_2012-11-09
2           1_1_2012-11-16
3           1_1_2012-11-23
4           1_1_2012-11-30
                ...       
115059    45_98_2013-06-28
115060    45_98_2013-07-05
115061    45_98_2013-07-12
115062    45_98_2013-07-19
115063    45_98_2013-07-26
Length: 115064, dtype: string

In [157]:
submission_test

Unnamed: 0,Store,Dept,Date,IsHoliday,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,MDAverage,MDSum,MDMin,MDMax
0,-1.66,-1.41,-1.68,-0.29,0.07,-0.82,-0.09,0.32,-0.16,0.09,-0.06,1.17,0.57,0.02,0.10,-0.16,0.32
1,-1.66,-1.38,-1.68,-0.29,0.07,-0.82,-0.09,0.32,-0.16,0.09,-0.06,1.17,0.57,0.02,0.10,-0.16,0.32
2,-1.66,-1.35,-1.68,-0.29,0.07,-0.82,-0.09,0.32,-0.16,0.09,-0.06,1.17,0.57,0.02,0.10,-0.16,0.32
3,-1.66,-1.32,-1.68,-0.29,0.07,-0.82,-0.09,0.32,-0.16,0.09,-0.06,1.17,0.57,0.02,0.10,-0.16,0.32
4,-1.66,-1.28,-1.68,-0.29,0.07,-0.82,-0.09,0.32,-0.16,0.09,-0.06,1.17,0.57,0.02,0.10,-0.16,0.32
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115059,1.78,1.59,1.69,-0.29,1.18,0.93,-0.70,-0.26,-0.17,-0.41,-0.11,-1.32,-1.32,-0.33,-1.65,-0.70,-0.11
115060,1.78,1.62,1.69,-0.29,1.18,0.93,-0.70,-0.26,-0.17,-0.41,-0.11,-1.32,-1.32,-0.33,-1.65,-0.70,-0.11
115061,1.78,1.65,1.69,-0.29,1.18,0.93,-0.70,-0.26,-0.17,-0.41,-0.11,-1.32,-1.32,-0.33,-1.65,-0.70,-0.11
115062,1.78,1.72,1.69,-0.29,1.18,0.93,-0.70,-0.26,-0.17,-0.41,-0.11,-1.32,-1.32,-0.33,-1.65,-0.70,-0.11


In [158]:
y_pred = best_xgb.predict(submission_test)
y_pred

array([27.455963 , 33.92854  , 20.87351  , ..., 32.761765 , 18.676714 ,
        7.9342856], dtype=float32)

In [159]:
weekly_sales = y_pred**3
weekly_sales

array([20697.127 , 39056.695 ,  9094.66  , ..., 35164.29  ,  6514.8047,
         499.4862], dtype=float32)

In [160]:
weekly_sales[0]

20697.127

In [161]:
sub = {'Id':Id, 'Weekly_Sales':weekly_sales}
xgb_submission = pd.DataFrame(data=sub)

xgb_submission['Weekly_Sales'] = round(xgb_submission['Weekly_Sales'], 2)

xgb_submission

Unnamed: 0,Id,Weekly_Sales
0,1_1_2012-11-02,20697.13
1,1_1_2012-11-09,39056.70
2,1_1_2012-11-16,9094.66
3,1_1_2012-11-23,29936.73
4,1_1_2012-11-30,29278.66
...,...,...
115059,45_98_2013-06-28,4478.15
115060,45_98_2013-07-05,12207.09
115061,45_98_2013-07-12,35164.29
115062,45_98_2013-07-19,6514.80


In [150]:
xgb_submission.to_csv('Submission Files/XGBoost Best.csv', float_format='%.2f',index=False)