## Import libraries

In [84]:
import numpy as np
import pandas as pd

## Path to data

In [85]:
# All input files live under a single data directory; each path is that
# directory joined to the file name with a forward slash.
PATH = 'dataset'
PATH_TO_train_data = '/'.join((PATH, 'yds_train2018.csv'))
PATH_TO_test_data = '/'.join((PATH, 'yds_test2018.csv'))
PATH_TO_promotional_expense = '/'.join((PATH, 'promotional_expense.csv'))
PATH_TO_holidays = '/'.join((PATH, 'holidays.xlsx'))

In [86]:
# Load the raw training rows from the CSV at PATH_TO_train_data.
train_data = pd.read_csv(PATH_TO_train_data)

In [87]:
# Peek at the raw rows to see which columns the file provides.
train_data.head()

Unnamed: 0,S_No,Year,Month,Week,Merchant_ID,Product_ID,Country,Sales
0,1,2013,1,1,ar00001,1,Argentina,157500.0
1,2,2013,1,1,ar00003,1,Argentina,39375.0
2,3,2013,1,1,ar00004,1,Argentina,15750.0
3,4,2013,1,1,ar00007,1,Argentina,47250.0
4,5,2013,1,1,ar00008,1,Argentina,283500.0


## Drop Merchant_ID and S_No as they are not required.

In [88]:
# Merchant_ID and S_No are not used as features; rebind the name to the trimmed frame.
train_data = train_data.drop(columns=['Merchant_ID', 'S_No'])

In [89]:
# Check the frame size after dropping the two columns.
train_data.shape

(79072, 6)

In [90]:
# Confirm Merchant_ID and S_No no longer appear.
train_data.head()

Unnamed: 0,Year,Month,Week,Product_ID,Country,Sales
0,2013,1,1,1,Argentina,157500.0
1,2013,1,1,1,Argentina,39375.0
2,2013,1,1,1,Argentina,15750.0
3,2013,1,1,1,Argentina,47250.0
4,2013,1,1,1,Argentina,283500.0


## Group by (Year, Month, Product_ID and Country) and add Sales for each group.

In [91]:
# Collapse the data to one Sales total per (Year, Month, Product_ID, Country).
# Week is not a grouping key, so it does not survive the aggregation.
group_keys = ['Year', 'Month', 'Product_ID', 'Country']
sales_totals = train_data.groupby(group_keys)['Sales'].sum()
# Turn the group keys back into ordinary columns.
train_data = sales_totals.reset_index()

In [92]:
# Row count after aggregation: one row per (Year, Month, Product_ID, Country) group.
train_data.shape

(388, 5)

In [93]:
# Inspect the aggregated Sales totals.
train_data.head()

Unnamed: 0,Year,Month,Product_ID,Country,Sales
0,2013,1,1,Argentina,34346025.0
1,2013,1,1,Columbia,3947356.31
2,2013,1,2,Argentina,2751851.48
3,2013,1,2,Belgium,314850.13
4,2013,1,2,Columbia,2885137.06


## Read Expense data and rename Product_ID

In [94]:
# Load the promotional expense table and rename Product_Type to Product_ID so
# its key column has the same name as in the sales data.
promotional_expense_data = (
    pd.read_csv(PATH_TO_promotional_expense)
    .rename(columns={'Product_Type': 'Product_ID'})
)
promotional_expense_data.head()

Unnamed: 0,Year,Month,Country,Product_ID,Expense_Price
0,2013,1,Argentina,1,14749.307
1,2013,1,Argentina,2,1329.374
2,2013,1,Belgium,2,249.59
3,2013,1,Columbia,1,1893.122
4,2013,1,Columbia,2,1436.726


## Merge the train data with the promotional expense data: the two are very highly correlated, so expense is a useful predictor of sales.

In [95]:
# Attach Expense_Price to each aggregated sales row. The join is an inner join
# (the pandas default, spelled out here), so sales rows without a matching
# expense record are dropped.
# NOTE(review): the recorded shapes go from 388 rows before this step to 336
# after it, while the test-set merge uses how='left' -- confirm this is intended.
merge_keys = ['Year', 'Month', 'Country', 'Product_ID']
train_sales_and_expense_data = train_data.merge(
    promotional_expense_data,
    how='inner',
    on=merge_keys,
)

In [96]:
# Verify that Expense_Price was attached to the sales rows.
train_sales_and_expense_data.head()

Unnamed: 0,Year,Month,Product_ID,Country,Sales,Expense_Price
0,2013,1,1,Argentina,34346025.0,14749.307
1,2013,1,1,Columbia,3947356.31,1893.122
2,2013,1,2,Argentina,2751851.48,1329.374
3,2013,1,2,Belgium,314850.13,249.59
4,2013,1,2,Columbia,2885137.06,1436.726


In [97]:
# Correlation between Sales and Expense_Price on the merged training rows
# (Series.corr computes Pearson correlation unless told otherwise).
train_sales_and_expense_data['Sales'].corr(train_sales_and_expense_data['Expense_Price'])

0.9761801393114081

## Convert Country data into categorical values/one hot representation.

In [98]:
# Earlier alternative, kept for reference: encode Country as a single integer column.
# train_sales_and_expense_data['Country_num'] = train_sales_and_expense_data.Country.map({'Argentina': 0, 'Belgium': 1, 'Columbia': 2, 'Denmark': 3, 'England': 4, 'Finland':5})
# Current approach: replace Country with one indicator column per country value
# present in this frame (Country_<name>).
train_sales_and_expense_data = pd.get_dummies(train_sales_and_expense_data, columns=['Country'])

In [99]:
# train_sales_and_expense_data.drop(columns='Country', inplace=True)

In [100]:
# Confirm Country was replaced by the Country_* indicator columns.
train_sales_and_expense_data.head()

Unnamed: 0,Year,Month,Product_ID,Sales,Expense_Price,Country_Argentina,Country_Belgium,Country_Columbia,Country_Denmark,Country_England,Country_Finland
0,2013,1,1,34346025.0,14749.307,1,0,0,0,0,0
1,2013,1,1,3947356.31,1893.122,0,0,1,0,0,0
2,2013,1,2,2751851.48,1329.374,1,0,0,0,0,0
3,2013,1,2,314850.13,249.59,0,1,0,0,0,0
4,2013,1,2,2885137.06,1436.726,0,0,1,0,0,0


In [101]:
# Feature matrix: every column of the merged frame except the Sales target.
X_train = train_sales_and_expense_data.drop(columns=['Sales'])

In [102]:
# Preview the feature columns that the model will be trained on.
X_train.head()

Unnamed: 0,Year,Month,Product_ID,Expense_Price,Country_Argentina,Country_Belgium,Country_Columbia,Country_Denmark,Country_England,Country_Finland
0,2013,1,1,14749.307,1,0,0,0,0,0
1,2013,1,1,1893.122,0,0,1,0,0,0
2,2013,1,2,1329.374,1,0,0,0,0,0
3,2013,1,2,249.59,0,1,0,0,0,0
4,2013,1,2,1436.726,0,0,1,0,0,0


In [103]:
# Number of training rows and feature columns.
X_train.shape

(336, 10)

In [104]:
# Regression target: the Sales column of the merged training frame.
y_train = train_sales_and_expense_data['Sales']

In [105]:
# Preview the target values.
y_train.head()

0    34346025.00
1     3947356.31
2     2751851.48
3      314850.13
4     2885137.06
Name: Sales, dtype: float64

In [106]:
# The target should have the same number of rows as X_train.
y_train.shape

(336,)

## Modelling
### Tried different algorithms on the train data.

In [107]:
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
import xgboost

In [108]:
# svr = SVR(kernel='linear', C=1e3)
# model = svr.fit(X_train, y_train)

In [109]:
# lreg = LinearRegression(normalize=True)
# model = lreg.fit(X_train, y_train)

In [110]:
# ridgeReg = Ridge(alpha=1, normalize=True)
# model = ridgeReg.fit(X_train, y_train)

In [111]:
# lassoReg = Lasso(alpha=0.01, normalize=True)
# model = lassoReg.fit(X_train,y_train)

In [112]:
# ENreg = ElasticNet(alpha=1, l1_ratio=0.5, normalize=False)
# model = ENreg.fit(X_train,y_train)

In [113]:
# dtReg = DecisionTreeRegressor()
# model = dtReg.fit(X_train, y_train)

In [114]:
# rfReg = RandomForestRegressor()
# model = rfReg.fit(X_train, y_train)

In [141]:
# Hyperparameters for the gradient-boosted tree regressor that was kept after
# the other candidate models were tried.
# NOTE(review): subsample < 1 samples rows per tree, so results depend on the
# seed; the recorded estimator repr shows random_state=0 -- confirm the
# installed xgboost version still defaults to a fixed seed.
xgb_params = {
    'n_estimators': 1500,
    'learning_rate': 0.05,
    'gamma': 0,
    'subsample': 0.75,
    'colsample_bytree': 1,
    'max_depth': 15,
}
xgb = xgboost.XGBRegressor(**xgb_params)
# Keep the object returned by fit() under the name `model`, which later cells use.
model = xgb.fit(X_train, y_train)

In [142]:
# svr = SVR(kernel='rbf', C=1, gamma=0.1)
# model = svr.fit(X_train, y_train)

# svr = SVR(kernel='poly', C=1e3, degree=2)
# model = svr.fit(X_train, y_train)

In [143]:
# Show the fitted estimator and its full parameter set, including defaults.
model

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.05, max_delta_step=0,
       max_depth=15, min_child_weight=1, missing=None, n_estimators=1500,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.75)

In [144]:
# Predict on the same rows the model was fitted on.
# NOTE(review): this is an in-sample check only; the scores computed from it do
# not measure how the model performs on unseen data.
y_train_pred = model.predict(X_train)

In [145]:
# Put actual and predicted training Sales side by side, keeping y_train's index.
compare = y_train.to_frame(name='y_train').assign(y_train_pred=y_train_pred)
compare.head()

Unnamed: 0,y_train,y_train_pred
0,34346025.0,34346050.0
1,3947356.31,3947356.0
2,2751851.48,2751856.0
3,314850.13,314849.8
4,2885137.06,2885133.0


## Find correlation between y_train and predicted values on train data.

In [146]:
# Correlation between actual and predicted Sales on the training rows.
compare['y_train'].corr(compare['y_train_pred'])

0.9999999999994662

In [147]:
# Absolute error for each training row; count() reports how many of those
# errors are non-missing.
l = (compare['y_train_pred'] - compare['y_train']).abs()
l.count()

336

## Find SMAPE score for y_train and predicted value.

In [150]:
# Symmetric mean absolute percentage error (SMAPE) of the training predictions
# on a 0-100 scale: mean(|actual - predicted| / (|actual| + |predicted|)) * 100.
num = (compare['y_train'] - compare['y_train_pred']).abs()
# Use the magnitude of BOTH terms. Previously only the prediction was wrapped
# in abs(), so a negative actual value could shrink or zero the denominator.
# For non-negative actuals the result is unchanged.
den = compare['y_train'].abs() + compare['y_train_pred'].abs()
count = num.count()
# A 0/0 term (actual and prediction both zero) is NaN; sum() skips it, so it
# contributes zero error while still being counted in `count`.
out = (num / den).sum()
SMAPE = (out / count) * 100
SMAPE

0.0004228589229385793

## Preprocess the test data.
### Merge expense data with test data.
### Convert Country column into categorical values/one hot representation.

In [151]:
# Re-display the expense table that is joined to the test rows in the next cell.
promotional_expense_data.head()

Unnamed: 0,Year,Month,Country,Product_ID,Expense_Price
0,2013,1,Argentina,1,14749.307
1,2013,1,Argentina,2,1329.374
2,2013,1,Belgium,2,249.59
3,2013,1,Columbia,1,1893.122
4,2013,1,Columbia,2,1436.726


In [152]:
# Build the test feature matrix with the same steps used for the training data.
test_data = pd.read_csv(PATH_TO_test_data)
# S_No and Sales are not model features (Sales is the column to be predicted).
test_data = test_data.drop(columns=['S_No', 'Sales'])
# Left join so every test row is kept even when it has no expense record.
test_sales_and_expense_data = pd.merge(test_data, promotional_expense_data, on=['Year', 'Month', 'Country', 'Product_ID'], how='left')

# One-hot encode Country the same way as for the training frame.
test_sales_and_expense_data = pd.get_dummies(test_sales_and_expense_data, columns=['Country'])
# get_dummies only creates indicator columns for countries present in THIS
# frame, so the test columns can differ from the training columns in set or
# order. Align to X_train: a country missing from the test data becomes an
# all-zero column, and any column the model was not trained on is dropped.
X_test = test_sales_and_expense_data.reindex(columns=X_train.columns, fill_value=0)
# Test rows without a matching expense record get Expense_Price = 0.
# NOTE(review): the training merge drops such rows instead -- confirm that 0 is
# the intended stand-in here.
X_test = X_test.fillna(0)
X_test.head()

Unnamed: 0,Year,Month,Product_ID,Expense_Price,Country_Argentina,Country_Belgium,Country_Columbia,Country_Denmark,Country_England,Country_Finland
0,2016,4,1,8214.875,1,0,0,0,0,0
1,2016,5,1,10777.878,1,0,0,0,0,0
2,2016,6,1,10320.673,1,0,0,0,0,0
3,2016,7,1,7377.587,1,0,0,0,0,0
4,2016,8,1,9805.705,1,0,0,0,0,0


In [153]:
# The column count should match X_train so the model accepts this matrix.
X_test.shape

(105, 10)

In [154]:
# Predict Sales for the test rows with the fitted model.
y_predict = model.predict(X_test)

In [155]:
# One prediction per test row is expected.
y_predict.shape

(105,)

In [156]:
# Display the predicted Sales values (this prints the whole array).
y_predict

array([1.02699320e+07, 3.15437240e+07, 1.40111720e+07, 1.05898560e+07,
       1.06330220e+07, 1.06619240e+07, 9.66662200e+06, 1.05930380e+07,
       8.82332800e+06, 1.63152820e+07, 1.12143240e+07, 1.01285160e+07,
       7.22022200e+06, 7.58411850e+06, 7.72333050e+06, 7.42005600e+06,
       7.75597250e+06, 7.65424050e+06, 7.70600000e+06, 7.87909250e+06,
       7.65140050e+06, 7.90708650e+06, 7.53311150e+06, 6.79388600e+06,
       1.89772078e+05, 2.23759797e+05, 2.09742016e+05, 1.28947025e+06,
       1.14047975e+06, 1.26919975e+06, 1.19119438e+06, 1.19522800e+06,
       1.26486225e+06, 1.60720975e+06, 1.13721412e+06, 1.64975862e+06,
       1.29998900e+06, 1.09861650e+06, 8.63458062e+05, 3.52254300e+06,
       3.14916450e+06, 3.71887225e+06, 3.00576650e+06, 2.82699100e+06,
       3.44674100e+06, 3.03174100e+06, 3.53805075e+06, 3.42009375e+06,
       2.88689750e+06, 3.04058975e+06, 3.83980750e+06, 6.95220850e+06,
       7.38611200e+06, 7.42518350e+06, 7.43334950e+06, 6.67500400e+06,
      

In [157]:
# Wrap the predictions in a one-column frame named Sales.
y_predict_df = pd.DataFrame({'Sales': y_predict})

In [158]:
# Look at the last few predictions.
y_predict_df.tail()

Unnamed: 0,Sales
100,36974376.0
101,29801832.0
102,20147500.0
103,28193508.0
104,19724594.0


In [159]:
# Re-read the test file as the template for the submission. S_No is set aside
# so it can be re-attached as the leading column once Sales is filled in.
output_df = pd.read_csv(PATH_TO_test_data)
s_no = output_df['S_No']
output_df = output_df.drop(columns=['S_No'])

In [160]:
# The Sales column is still to be filled in at this point.
output_df.head()

Unnamed: 0,Year,Month,Product_ID,Country,Sales
0,2016,4,1,Argentina,
1,2016,5,1,Argentina,
2,2016,6,1,Argentina,
3,2016,7,1,Argentina,
4,2016,8,1,Argentina,


In [161]:
# Fill Sales with the magnitude of each prediction, matched on the row index.
# NOTE(review): abs() flips the sign of any negative prediction rather than
# clipping it to zero -- confirm that this is the intended handling.
output_df['Sales'] = y_predict_df['Sales'].abs()

In [162]:
# Confirm Sales now holds the predicted values.
output_df.head()

Unnamed: 0,Year,Month,Product_ID,Country,Sales
0,2016,4,1,Argentina,10269932.0
1,2016,5,1,Argentina,31543724.0
2,2016,6,1,Argentina,14011172.0
3,2016,7,1,Argentina,10589856.0
4,2016,8,1,Argentina,10633022.0


In [163]:
# Re-attach S_No as the first column by concatenating side by side (axis=1).
output_df = pd.concat([s_no, output_df], axis=1)

## Save data in a csv file.

In [164]:
# Write the submission file; index=False leaves the DataFrame index out of the CSV.
output_df.to_csv('yds_submission2018.csv', index=False)

In [165]:
# Final submission size: one row per test row.
output_df.shape

(105, 6)

In [166]:
# Final check of the submission layout.
output_df.head()

Unnamed: 0,S_No,Year,Month,Product_ID,Country,Sales
0,79073,2016,4,1,Argentina,10269932.0
1,79074,2016,5,1,Argentina,31543724.0
2,79075,2016,6,1,Argentina,14011172.0
3,79076,2016,7,1,Argentina,10589856.0
4,79077,2016,8,1,Argentina,10633022.0
