# Import Libraries

In [77]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
import statsmodels.api as sm

# Import Files

In [54]:
test = pd.read_csv("data/test.csv")
train = pd.read_csv("data/train.csv")
sample = pd.read_csv("data/sample_submission.csv")

# Functions

In [3]:
# to calculate score
def smape(a, f):
    return 1/len(a) * np.sum(2 * np.abs(f-a) / (np.abs(a) + np.abs(f))*100)

In [4]:
# make to csv
def export_me(model, testz, name):
    result = pd.DataFrame({"id": range(1,len(testz)+1), "sales": model.predict(testz)})
    result.to_csv(name+".csv", index=False)
    return None

# Explore Data

In [5]:
train.describe()

Unnamed: 0,Item,Sales
count,821000.0,821000.0
mean,25.5,51.225683
std,14.430878,28.208462
min,1.0,0.0
25%,13.0,29.0
50%,25.5,46.0
75%,38.0,68.0
max,50.0,214.0


In [6]:
train.dtypes

Date     object
Store    object
Item      int64
Sales     int64
dtype: object

In [7]:
train.columns

Index(['Date', 'Store', 'Item', 'Sales'], dtype='object')

In [8]:
test.columns

Index(['Date', 'Store', 'Item'], dtype='object')

In [9]:
print(len(train))
print(len(test))

821000
92000


In [10]:
print(train.Date.min(), train.Date.max())
print(test.Date.min(), test.Date.max())

1-Apr-17 9-Sep-20
1-Aug-21 9-Sep-21


In [11]:
print(train.Store.unique())
print(test.Store.unique())

['KMart' 'Target' 'Coles' 'Woolies' 'BigW' 'Aldi' 'Asian SuperMart'
 'Big Savers' 'Costco' 'SupaIGA']
['KMart' 'Target' 'Coles' 'Woolies' 'BigW' 'Aldi' 'Asian SuperMart'
 'Big Savers' 'Costco' 'SupaIGA']


In [12]:
datetime.strptime(train["Date"][0], "%d-%b-%y")

datetime.datetime(2017, 1, 1, 0, 0)

# Preprocessing

Make Date into separate features

In [55]:
train["Date"] = pd.to_datetime(train["Date"], format="%d-%b-%y")
train["Year"] = train["Date"].dt.year
train["Month"] = train["Date"].dt.month
train["Day"] = train["Date"].dt.day
train["Store"] = LabelEncoder().fit(train["Store"]).transform(train["Store"])

# Trial 1

If we uncategorize it it would make a lower SMAPE score

In [None]:
#train1["Item"] = train["Item"].astype("category")
#train1["Store"] = train["Store"].astype("category")

In [45]:
# split feature and target
features = train[["Store", "Item", "Year", "Month", "Day"]]
target = train["Sales"]

In [46]:
# Split the data 

X_train,X_test,y_train,y_test = train_test_split(features,target,test_size = 0.2)

In [47]:
# Try different Algorithms 

model = LinearRegression().fit(X_train,y_train)

In [48]:
predictions = model.predict(X_test)

In [49]:
smape(y_test, predictions)

44.21461435193209

In [74]:
model1 = sm.OLS(y_train, X_train).fit()
predictions = model1.predict(X_test)

In [75]:
smape(y_test, predictions)

45.329908976417684

In [28]:
test["Date"] = pd.to_datetime(test["Date"], format="%d-%b-%y")
test["Year"] = test["Date"].dt.year
test["Month"] = test["Date"].dt.month
test["Day"] = test["Date"].dt.day
#test["Item"] = test["Item"].astype("category")
test["Store"] = LabelEncoder().fit(test["Store"]).transform(test["Store"])
#test["Store"] = test["Store"].astype("category")
testz = test[["Store", "Item", "Year", "Month", "Day"]]

# Trial 2

Change linear regression

Choose columns: Store, Item, Year

In [59]:
round(train.corr(),3)

Unnamed: 0,Store,Item,Sales,Year,Month,Day
Store,1.0,-0.0,0.207,0.0,-0.0,-0.0
Item,-0.0,1.0,-0.056,0.0,-0.0,0.0
Sales,0.207,-0.056,1.0,0.161,0.081,0.003
Year,0.0,0.0,0.161,1.0,-0.165,-0.002
Month,-0.0,-0.0,0.081,-0.165,1.0,0.011
Day,-0.0,0.0,0.003,-0.002,0.011,1.0


In [69]:
# split feature and target
features = train[["Store", "Year"]]
target = train["Sales"]

# Split the data
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size = 0.2)

# Try different Algorithms 
model = LinearRegression().fit(X_train,y_train)

predictions = model.predict(X_test)

In [70]:
smape(y_test, predictions)

44.82655673081572

In [21]:
train

Unnamed: 0,Date,Store,Item,Sales,Year,Month,Day
0,2017-01-01,6,1,13,2017,1,1
1,2017-01-02,6,1,11,2017,1,2
2,2017-01-03,6,1,14,2017,1,3
3,2017-01-04,6,1,13,2017,1,4
4,2017-01-05,6,1,10,2017,1,5
...,...,...,...,...,...,...,...
820995,2021-06-26,7,50,82,2021,6,26
820996,2021-06-27,7,50,83,2021,6,27
820997,2021-06-28,7,50,91,2021,6,28
820998,2021-06-29,7,50,122,2021,6,29


# Trial 3
Bayesian Ridge

In [83]:
train3 = train
train3["Item"] = train["Item"].astype("category")
train3["Store"] = train["Store"].astype("category")
train3["Month"] = train["Month"].astype("category")
train3["Day"] = train["Day"].astype("category")

In [92]:
# split feature and target
features = train3[["Store", "Item", "Year", "Month", "Day"]]
target = train3["Sales"]

# Split the data
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size = 0.2)

model3 = linear_model.BayesianRidge().fit(X_train,y_train)

prediction3 = np.round(model3.predict(X_test))

In [93]:
smape(y_test, prediction3)

44.13412921949413

# Trial 4

Bro its multiple linear regression bro

In [98]:
# split feature and target
features = train[["Store", "Item", "Year", "Month", "Day"]]
target = train["Sales"]

# Split the data
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size = 0.2)

# make model
X_trainz = sm.add_constant(X_train)
model4 = sm.OLS(y_train, X_trainz).fit()

X_testz = sm.add_constant(X_test)
prediction4 = model4.predict(X_testz)

In [100]:
model4.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.084
Model:,OLS,Adj. R-squared:,0.084
Method:,Least Squares,F-statistic:,12010.0
Date:,"Sun, 03 Oct 2021",Prob (F-statistic):,0.0
Time:,09:26:11,Log-Likelihood:,-3096700.0
No. Observations:,656800,AIC:,6193000.0
Df Residuals:,656794,BIC:,6194000.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-7745.1551,51.927,-149.156,0.000,-7846.930,-7643.381
Store,2.0268,0.012,174.655,0.000,2.004,2.050
Item,-0.1093,0.002,-47.336,0.000,-0.114,-0.105
Year,3.8559,0.026,149.937,0.000,3.805,3.906
Month,0.9190,0.010,93.412,0.000,0.900,0.938
Day,0.0105,0.004,2.768,0.006,0.003,0.018

0,1,2,3
Omnibus:,55234.472,Durbin-Watson:,2.002
Prob(Omnibus):,0.0,Jarque-Bera (JB):,70377.437
Skew:,0.768,Prob(JB):,0.0
Kurtosis:,3.458,Cond. No.,3150000.0


In [99]:
smape(y_test, prediction4)

44.18558187742178

# Trial 5

In [101]:
# split feature and target
features = train[["Store", "Item", "Year", "Month", "Day"]]
target = train["Sales"]

from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# Applying scaler() to all the columns except the 'yes-no' and 'dummy' variables
features = scaler.fit_transform(features)

# Split the data
X_train,X_test,y_train,y_test = train_test_split(features,target,test_size = 0.2)

# make model
X_trainz = sm.add_constant(X_train)
model5 = sm.OLS(y_train, X_trainz).fit()

X_testz = sm.add_constant(X_test)
prediction5 = model5.predict(X_testz)

In [102]:
smape(y_test, prediction5)

44.217504268111625

# Trial

Try to separate each group

In [None]:
store_list = train.Store.unique()
train2 = {}
for u in store_list:
    train2[u] = train[train["Store"] == u]

In [None]:
train2["KMart"]