# Step #01: Importing Packages

In [1]:
# 3rd party packages
import numpy as np # linear algebra
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn import metrics

# For visualisations
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')
sns.set()
%matplotlib inline
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go

import warnings
warnings.filterwarnings('ignore')

# pd.options.display.max_rows=2000

In [2]:
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE

# Step #02: Fetching Data

In [38]:
df = pd.read_csv('train.csv')

In [39]:
# previewing the first 30 rows of the dataframe
df.head(30)

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
5,2013-01-06,1,1,12
6,2013-01-07,1,1,10
7,2013-01-08,1,1,9
8,2013-01-09,1,1,12
9,2013-01-10,1,1,9


In [23]:
# minimum date
df.date.min()

'2013-01-01'

In [24]:
# maximum date
df.date.max()

'2017-12-31'

In [25]:
test = pd.read_csv('test.csv')

In [26]:
test.shape

(45000, 4)

In [27]:
test.tail()

Unnamed: 0,id,date,store,item
44995,44995,2018-03-27,10,50
44996,44996,2018-03-28,10,50
44997,44997,2018-03-29,10,50
44998,44998,2018-03-30,10,50
44999,44999,2018-03-31,10,50


In [28]:
test.date.min()

'2018-01-01'

In [29]:
test.date.max()

'2018-03-31'

# Step #03: Cleaning Data

In [30]:
# Order the rows chronologically by the "date" column (ascending is the default)
df = df.sort_values('date')

In [31]:
# Renumber the rows 0..n-1 after the sort; drop=True discards the old index
# instead of keeping it as an extra "index" column.
df = df.reset_index(drop=True)
df.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-01,7,12,26
2,2013-01-01,7,46,27
3,2013-01-01,8,12,54
4,2013-01-01,9,12,35


#  Step #04: Visualising Data

In [32]:
# Line chart of sales against date for the first 100000 rows
fig = go.Figure(
    data=[
        go.Scatter(
            x=df['date'].iloc[:100000],
            y=df['sales'].iloc[:100000],
            mode='lines',
            name='lines',
        )
    ]
)
fig.show()

# Step #05: Transformation

In [42]:
df['date'] = pd.to_datetime(df['date'])

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 4 columns):
date     913000 non-null datetime64[ns]
store    913000 non-null int64
item     913000 non-null int64
sales    913000 non-null int64
dtypes: datetime64[ns](1), int64(3)
memory usage: 27.9 MB


In [60]:
# Rows dated January 2013
temp = df.loc[df['date'].dt.year.eq(2013) & df['date'].dt.month.eq(1)]
temp

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
5,2013-01-06,1,1,12
6,2013-01-07,1,1,10
7,2013-01-08,1,1,9
8,2013-01-09,1,1,12
9,2013-01-10,1,1,9


In [52]:
# Build the date-indexed view under its own name rather than mutating `temp`
# in place: `temp` is a filtered slice of `df`, and the groupby cell that
# follows still needs `date` as a regular column (pd.Grouper(key='date')).
temp_by_date = temp.set_index('date')
print(temp_by_date.index)
print(temp_by_date)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10', '2013-01-11', '2013-01-12',
               '2013-01-13', '2013-01-14', '2013-01-15', '2013-01-16',
               '2013-01-17', '2013-01-18', '2013-01-19', '2013-01-20',
               '2013-01-21', '2013-01-22', '2013-01-23', '2013-01-24',
               '2013-01-25', '2013-01-26', '2013-01-27', '2013-01-28',
               '2013-01-29', '2013-01-30', '2013-01-31'],
              dtype='datetime64[ns]', name='date', freq=None)
            store  item  sales
date                          
2013-01-01      1     1     13
2013-01-02      1     1     11
2013-01-03      1     1     14
2013-01-04      1     1     13
2013-01-05      1     1     10
2013-01-06      1     1     12
2013-01-07      1     1     10
2013-01-08      1     1      9
2013-01-09      1     1     12
2013-01-10      1     1      9
2013-01-11 

In [61]:
# Sum the numeric columns per store, item and calendar day, then flatten the
# group keys back into ordinary columns.
temp = (
    temp
    .groupby(['store', 'item', pd.Grouper(key='date', freq='D')])
    .sum()
    .reset_index()
)

In [59]:
df[(df.date.dt.year==2013)&(df.date.dt.month==1)]

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10
5,2013-01-06,1,1,12
6,2013-01-07,1,1,10
7,2013-01-08,1,1,9
8,2013-01-09,1,1,12
9,2013-01-10,1,1,9


In [62]:
temp

Unnamed: 0,store,item,date,sales
0,1,1,2013-01-01,13
1,1,1,2013-01-02,11
2,1,1,2013-01-03,14
3,1,1,2013-01-04,13
4,1,1,2013-01-05,10
5,1,1,2013-01-06,12
6,1,1,2013-01-07,10
7,1,1,2013-01-08,9
8,1,1,2013-01-09,12
9,1,1,2013-01-10,9


* __Converting date column to year, month, day of week__

In [None]:
# Derive the calendar features month, year and day of week from the date column
df['date'] = pd.to_datetime(df['date'])
df = df.assign(
    month=df['date'].dt.month,
    year=df['date'].dt.year,
    dayofweek=df['date'].dt.dayofweek,
)

In [None]:
# verfiying changes
df.head()

* __Creating columns by taking rolling averages at different window sizes__

In [None]:
# Rolling average over a window of 6, computed separately for every
# (store, item) series so a window never mixes sales of unrelated series.
# shift(1) keeps the row's own sales (the prediction target) out of its feature,
# so the value is the mean of the 6 preceding observations of that series.
# Assumes df is still sorted by date (see the cleaning step), so each group is
# in chronological order.
df['rolling_average_6'] = (
    df.groupby(['store', 'item'])['sales']
      .transform(lambda s: s.shift(1).rolling(window=6).mean())
)
df.head()

In [None]:
# Rolling average over a window of 12, computed separately for every
# (store, item) series so a window never mixes sales of unrelated series.
# shift(1) keeps the row's own sales (the prediction target) out of its feature,
# so the value is the mean of the 12 preceding observations of that series.
# Assumes df is still sorted by date (see the cleaning step), so each group is
# in chronological order.
df['rolling_average_12'] = (
    df.groupby(['store', 'item'])['sales']
      .transform(lambda s: s.shift(1).rolling(window=12).mean())
)
df.head()

In [None]:
# Rolling average over a window of 24, computed separately for every
# (store, item) series so a window never mixes sales of unrelated series.
# shift(1) keeps the row's own sales (the prediction target) out of its feature,
# so the value is the mean of the 24 preceding observations of that series.
# Assumes df is still sorted by date (see the cleaning step), so each group is
# in chronological order.
df['rolling_average_24'] = (
    df.groupby(['store', 'item'])['sales']
      .transform(lambda s: s.shift(1).rolling(window=24).mean())
)
df.head()

* __Creating columns daily_average and monthly_average__

In [None]:
# daily_avg:   mean sales of each (item, store) pair per day of week
# monthly_avg: mean sales of each (item, store) pair per calendar month
# NOTE(review): both means are taken over every row of df, including the years
# that later cells hold out for testing -- confirm this is acceptable.
df = df.assign(
    daily_avg=df.groupby(['item', 'store', 'dayofweek'])['sales'].transform('mean'),
    monthly_avg=df.groupby(['item', 'store', 'month'])['sales'].transform('mean'),
)
df.head()

* __Creating a column and naming it as 'seasons'__

In [None]:
# Map each month to a season code:
#   1 -> May-Aug, 2 -> Nov-Feb, 3 -> Sep-Oct, 4 -> Mar-Apr
col = 'month'
conditions = [
    df[col].isin([5, 6, 7, 8]),
    df[col].isin([1, 2, 11, 12]),
    df[col].isin([9, 10]),
    df[col].isin([3, 4]),
]
choices = [1, 2, 3, 4]
df["seasons"] = np.select(conditions, choices, default=np.nan)
# every month 1..12 is covered above, so no NaN is left to break the int cast
df["seasons"] = df["seasons"].astype(int)
df.head()

# Step #06: Modeling Random Forest

## Considering season: Summer

### __In this section we consider only the summer-season data, split so that the 2013 summer data is used for training and the 2014 summer data for testing__

### Predicting overall sales

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of every year
data = df.loc[df['seasons'].eq(1)]

In [None]:
# splitting dataset: years up to 2016 for training, 2017 held out for testing
# NOTE(review): the section heading above describes a 2013-train / 2014-test
# split, which is not what this cell does -- confirm which one is intended.
training = data.loc[data['year'].le(2016)]
testing = data.loc[data['year'].eq(2017)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
 # Perform Grid-Search
# Tune tree depth and forest size with 5-fold CV, scored by (negated) MSE.
# The searched forest is seeded so the selected parameters are reproducible.
gsc = GridSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_grid={'max_depth': range(3, 7), 'n_estimators': (10, 50, 100, 1000)},
    cv=5,
    scoring='neg_mean_squared_error',
    verbose=0,
    n_jobs=-1,
)
grid_result = gsc.fit(x_train, y_train)
best_params = grid_result.best_params_
# random_state=0 seeds the forest; verbose=0 keeps fitting silent
rfr = RandomForestRegressor(max_depth=best_params["max_depth"],
                            n_estimators=best_params["n_estimators"],
                            random_state=0, verbose=0)
# Perform K-Fold CV: (negated) MAE of the tuned forest on the training data
scores = cross_val_score(rfr, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')

In [None]:
scores

In [None]:
# RFE: repeatedly drop the weakest feature (step=1) until 4 remain.
# n_features_to_select is passed by keyword because scikit-learn >= 1.0 no
# longer accepts it as a positional argument.
selector = RFE(rfr, n_features_to_select=4, step=1)
selector = selector.fit(x_train, y_train)

In [None]:
x_train.columns

In [None]:
selector.support_ 

In [None]:
selector.estimator_.feature_importances_

In [None]:
# Remove, in place, every column that RFE did not select
# (selector.support_ is a boolean mask over x_train's columns).
x_train.drop(columns=x_train.columns[~selector.support_], inplace=True)

In [None]:
x_train.columns

In [None]:
# making predictions out of the model
# Fit the tuned forest on the selected training features and predict the
# held-out rows. Running cross_val_predict on x_test/y_test would instead train
# on folds of the test set and never use the training data.
col = x_train.columns
rfr.fit(x_train[col], y_train)
y_pred = rfr.predict(x_test[col])
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
testing[['sales','y_pred']]

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific item

__We are going to pick the most frequently sold item in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of item 1, up to and including 2014
data = df.loc[df['year'].le(2014) & df['seasons'].eq(1) & df['item'].eq(1)]

In [None]:
# splitting dataset: 2013 rows for training, 2014 rows held out for testing
training = data.loc[data['year'].eq(2013)]
testing = data.loc[data['year'].eq(2014)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Setting model parameters and training the model on data
def XGBmodel(x_train,x_test,y_train,y_test):
    """Train an XGBoost regressor on (x_train, y_train) with early stopping.

    Boosting runs for at most 500 rounds and stops once the MAE on
    (x_test, y_test) has not improved for 20 consecutive rounds.

    NOTE(review): the early-stopping set is the same data the notebook later
    reports its metrics on, so those metrics are optimistic; consider a
    separate validation split.
    NOTE(review): `xgb` is not imported in any visible cell; `import xgboost
    as xgb` has to run before this function is called.

    Returns:
        The booster returned by xgb.train.
    """
    matrix_train = xgb.DMatrix(x_train, label=y_train)
    matrix_test = xgb.DMatrix(x_test, label=y_test)
    model = xgb.train(
        # 'reg:squarederror' is the current name of the deprecated 'reg:linear' objective
        params={'objective': 'reg:squarederror', 'eval_metric': 'mae'},
        dtrain=matrix_train,
        num_boost_round=500,
        early_stopping_rounds=20,
        evals=[(matrix_test, 'test')],
    )
    return model

model=XGBmodel(x_train,x_test,y_train,y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific store

__We are going to pick the most frequently occurring store in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of store 1, up to and including 2014
data = df.loc[df['year'].le(2014) & df['seasons'].eq(1) & df['store'].eq(1)]

In [None]:
# splitting dataset: 2013 rows for training, 2014 rows held out for testing
training = data.loc[data['year'].eq(2013)]
testing = data.loc[data['year'].eq(2014)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific item and specific store

__We are going to pick the most frequently occurring store in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of item 1 in store 1, up to and including 2014
data = df.loc[df['year'].le(2014) & df['seasons'].eq(1) & df['store'].eq(1) & df['item'].eq(1)]

In [None]:
# splitting dataset: 2013 rows for training, 2014 rows held out for testing
training = data.loc[data['year'].eq(2013)]
testing = data.loc[data['year'].eq(2014)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### __In this section we consider only the summer-season data, split so that the 2013 and 2014 summer data is used for training and the 2015 summer data for testing__

### Predicting overall sales

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1), up to and including 2015
data = df.loc[df['year'].le(2015) & df['seasons'].eq(1)]

In [None]:
# splitting dataset: rows up to 2014 for training, 2015 rows held out for testing
training = data.loc[data['year'].le(2014)]
testing = data.loc[data['year'].eq(2015)]

In [None]:
# setting x_train, x_test, y_train, y_test
# Features: every column except the date, the target (sales) and the season
# flag. `seasons` is dropped as in the other sections: after the seasons == 1
# filter it is a constant column and carries no information.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
y_train = training['sales']
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific item

__We are going to pick the most frequently sold item in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of item 1, up to and including 2015
data = df.loc[df['year'].le(2015) & df['seasons'].eq(1) & df['item'].eq(1)]

In [None]:
# splitting dataset: rows up to 2014 for training, 2015 rows held out for testing
training = data.loc[data['year'].le(2014)]
testing = data.loc[data['year'].eq(2015)]

In [None]:
# setting x_train, x_test, y_train, y_test
# Features: every column except the date, the target (sales) and the season
# flag. `seasons` is dropped as in the other sections: after the seasons == 1
# filter it is a constant column and carries no information.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
y_train = training['sales']
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific store

__We are going to pick the most frequently occurring store in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of store 1, up to and including 2015
data = df.loc[df['year'].le(2015) & df['seasons'].eq(1) & df['store'].eq(1)]

In [None]:
# splitting dataset: rows up to 2014 for training, 2015 rows held out for testing
training = data.loc[data['year'].le(2014)]
testing = data.loc[data['year'].eq(2015)]

In [None]:
# setting x_train, x_test, y_train, y_test
# Features: every column except the date, the target (sales) and the season
# flag. `seasons` is dropped as in the other sections: after the seasons == 1
# filter it is a constant column and carries no information.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
y_train = training['sales']
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific item and specific store

__We are going to pick the most frequently occurring store in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of item 1 in store 1, up to and including 2015
data = df.loc[df['year'].le(2015) & df['seasons'].eq(1) & df['store'].eq(1) & df['item'].eq(1)]

In [None]:
# splitting dataset: rows up to 2014 for training, 2015 rows held out for testing
training = data.loc[data['year'].le(2014)]
testing = data.loc[data['year'].eq(2015)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### __In this section we consider only the summer-season data, split so that the 2013, 2014, 2015 and 2016 summer data is used for training and the 2017 summer data for testing__

### Predicting overall sales

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of every year
data = df.loc[df['seasons'].eq(1)]

In [None]:
# splitting dataset: rows up to 2016 for training, 2017 rows held out for testing
training = data.loc[data['year'].le(2016)]
testing = data.loc[data['year'].eq(2017)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific item

__We are going to pick the most frequently sold item in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of item 1, all years
data = df.loc[df['seasons'].eq(1) & df['item'].eq(1)]

In [None]:
# splitting dataset: rows up to 2016 for training, 2017 rows held out for testing
training = data.loc[data['year'].le(2016)]
testing = data.loc[data['year'].eq(2017)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific store

__We are going to pick the most frequently occurring store in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of store 1, all years
data = df.loc[df['seasons'].eq(1) & df['store'].eq(1)]

In [None]:
# splitting dataset: rows up to 2016 for training, 2017 rows held out for testing
training = data.loc[data['year'].le(2016)]
testing = data.loc[data['year'].eq(2017)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()

### Predicting sales on specific item and specific store

__We are going to pick the most frequently occurring store in our data__

* __Splitting dataset__

In [None]:
# Summer-season rows (seasons == 1) of item 1 in store 1, all years
data = df.loc[df['seasons'].eq(1) & df['store'].eq(1) & df['item'].eq(1)]

In [None]:
# splitting dataset: rows up to 2016 for training, 2017 rows held out for testing
training = data.loc[data['year'].le(2016)]
testing = data.loc[data['year'].eq(2017)]

In [None]:
# Features: every column except the date, the target (sales) and the season
# flag, which is constant after the seasons == 1 filter.
x_train = training.drop(columns=['date', 'sales', 'seasons'])
x_test = testing.drop(columns=['date', 'sales', 'seasons'])
# Target: the sales column
y_train = training['sales']
y_test = testing['sales']

In [None]:
# Training the model on this section's split. XGBmodel is already defined in
# the first XGBoost section of the notebook, so the identical definition is not
# repeated here.
model = XGBmodel(x_train, x_test, y_train, y_test)

In [None]:
# making predictions out of the model, using the trees up to and including the
# best early-stopping round. `iteration_range` replaces `ntree_limit` /
# `best_ntree_limit`, which XGBoost 2.0 removed.
y_pred = model.predict(xgb.DMatrix(x_test), iteration_range=(0, model.best_iteration + 1))
# saving predictions in the testing dataframe; assign() returns a new frame, so
# the filtered slice is not written to in place
testing = testing.assign(y_pred=y_pred)

In [None]:
# Report MAE, MSE and RMSE of the predictions against the actual sales
print("MAE Score: ", metrics.mean_absolute_error(testing['sales'], testing['y_pred']), sep="")
print("MSE Score: ", metrics.mean_squared_error(testing['sales'], testing['y_pred']), sep="")
print("RMSE Score: ", np.sqrt(metrics.mean_squared_error(testing['sales'], testing['y_pred'])), sep="")

In [None]:
# Overlay the actual and the predicted sales of the held-out rows on one line chart
fig = go.Figure(
    data=[
        go.Scatter(x=testing['date'], y=testing['sales'], mode='lines', name='original sales'),
        go.Scatter(x=testing['date'], y=testing['y_pred'], mode='lines', name='predicted sales'),
    ]
)
fig.show()