# Assignment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
#from sklearn.feature_selection import RFE
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold

The data is provided in 4 different CSVs as below:

stores.csv
This file contains anonymized information about the 45 stores, indicating the type and size of store.

train.csv
This is the historical training data, which covers to 2010-02-05 to 2012-11-01. Within this file you will find the following fields:
•   Store - the store number
•   Dept - the department number
•   Date - the week
•   Weekly_Sales -  sales for the given department in the given store
•   IsHoliday - whether the week is a special holiday week

test.csv
This file is identical to train.csv, except we have withheld the weekly sales. You must predict the sales for each triplet of store, department, and date in this file.

features.csv
This file contains additional data related to the store, department, and regional activity for the given dates. It contains the following fields:
•   Store - the store number
•   Date - the week
•   Temperature - average temperature in the region
•   Fuel Price - cost of fuel in the region
•   MarkDown1-5 - anonymized data related to promotional markdowns that the Retail chain is running. Mark Down data is only         available after Nov 2011, and is not available for all stores all the time. Any missing value is marked with an NA.
•   CPI - the consumer price index
•   Unemployment - the unemployment rate
•   IsHoliday - whether the week is a special holiday week

In [2]:
dataset = pd.read_csv("train.csv",sep=',')
features = pd.read_csv("features.csv",sep=',')
#features.drop(['IsHoliday'])
stores = pd.read_csv("stores.csv", sep=',')
dataset = dataset.merge(stores, how='left').merge(features, how='left')
dataset['IsHoliday'] = dataset.IsHoliday.map({False:0, True:1})
dataset.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
0,1,1,2010-02-05,24924.5,0,A,151315,42.31,2.572,,,,,,211.096358,8.106
1,1,1,2010-02-12,46039.49,1,A,151315,38.51,2.548,,,,,,211.24217,8.106
2,1,1,2010-02-19,41595.55,0,A,151315,39.93,2.514,,,,,,211.289143,8.106
3,1,1,2010-02-26,19403.54,0,A,151315,46.63,2.561,,,,,,211.319643,8.106
4,1,1,2010-03-05,21827.9,0,A,151315,46.5,2.625,,,,,,211.350143,8.106


In [3]:
corr = dataset.corr()

In [4]:
corr

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment
Store,1.0,0.024004,-0.085195,-0.000548,-0.182881,-0.050097,0.06529,-0.119588,-0.035173,-0.031556,-0.009941,-0.026634,-0.211088,0.208552
Dept,0.024004,1.0,0.148032,0.000916,-0.002966,0.004437,0.003572,-0.002426,0.00029,0.001784,0.004257,0.000109,-0.007477,0.007837
Weekly_Sales,-0.085195,0.148032,1.0,0.012774,0.243828,-0.002312,-0.00012,0.085251,0.02413,0.060385,0.045414,0.090362,-0.020921,-0.025864
IsHoliday,-0.000548,0.000916,0.012774,1.0,0.000593,-0.155949,-0.078281,-0.035586,0.334818,0.42796,-0.000562,-0.053719,-0.001944,0.01046
Size,-0.182881,-0.002966,0.243828,0.000593,1.0,-0.058313,0.003361,0.345673,0.108827,0.048913,0.168196,0.304575,-0.003314,-0.068238
Temperature,-0.050097,0.004437,-0.002312,-0.155949,-0.058313,1.0,0.143859,-0.040594,-0.323927,-0.09688,-0.063947,-0.017544,0.182112,0.09673
Fuel_Price,0.06529,0.003572,-0.00012,-0.078281,0.003361,0.143859,1.0,0.061371,-0.220895,-0.102092,-0.044986,-0.128065,-0.16421,-0.033853
MarkDown1,-0.119588,-0.002426,0.085251,-0.035586,0.345673,-0.040594,0.061371,1.0,0.024486,-0.108115,0.819238,0.160257,-0.055558,0.050285
MarkDown2,-0.035173,0.00029,0.02413,0.334818,0.108827,-0.323927,-0.220895,0.024486,1.0,-0.050108,-0.007768,-0.00744,-0.039534,0.02094
MarkDown3,-0.031556,0.001784,0.060385,0.42796,0.048913,-0.09688,-0.102092,-0.108115,-0.050108,1.0,-0.071095,-0.026467,-0.02359,0.012818


# Removing Some of the featuers

In [5]:
dataset[['MarkDown1','MarkDown2','MarkDown3','MarkDown4', 'MarkDown5']] = dataset[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']].fillna(0)
dataset['Month'] = pd.to_datetime(dataset['Date']).dt.month
dataset = dataset.drop(columns=["Date", "CPI", "Fuel_Price", 'Unemployment', 'MarkDown2', 'Temperature'])
dataset = pd.get_dummies(dataset, columns=['Type'])
dataset.head()

Unnamed: 0,Store,Dept,Weekly_Sales,IsHoliday,Size,MarkDown1,MarkDown3,MarkDown4,MarkDown5,Month,Type_A,Type_B,Type_C
0,1,1,24924.5,0,151315,0.0,0.0,0.0,0.0,2,1,0,0
1,1,1,46039.49,1,151315,0.0,0.0,0.0,0.0,2,1,0,0
2,1,1,41595.55,0,151315,0.0,0.0,0.0,0.0,2,1,0,0
3,1,1,19403.54,0,151315,0.0,0.0,0.0,0.0,2,1,0,0
4,1,1,21827.9,0,151315,0.0,0.0,0.0,0.0,3,1,0,0


# Applying Different Models

After applying different algorithum like Support Vector Machine, Random Forest, Extra Tree Regressor. The average absolute error in extra tree regressor in minimum, so i have selected this algorithum for this particular dataset. I also try to apply multivariante time series analysis but not able to apply it, may be on this type of dataset multivariante time series analysis will give more accurate result. 

In [6]:
def extraTreesRegressor():
    clf = ExtraTreesRegressor(n_estimators=100,max_features='auto', verbose=1, n_jobs=1)
    return clf

def predict_(m, test_x):
    return pd.Series(m.predict(test_x))

def model_():
#     return knn()
    return extraTreesRegressor()
#     return svm()
#     return nn()
#     return randomForestRegressor()    

def train_(train_x, train_y):
    m = model_()
    m.fit(train_x, train_y)
    return m

def train_and_predict(train_x, train_y, test_x):
    m = train_(train_x, train_y)
    return predict_(m, test_x), m

In [7]:
def calculate_error(test_y, predicted):
    return mean_absolute_error(test_y, predicted)

In [8]:
kf = KFold(n_splits=5)
splited = []
# dataset2 = dataset.copy()
for name, group in dataset.groupby(["Store", "Dept"]):
    group = group.reset_index(drop=True)
    trains_x = []
    trains_y = []
    tests_x = []
    tests_y = []
    if group.shape[0] <= 5:
        f = np.array(range(5))
        np.random.shuffle(f)
        group['fold'] = f[:group.shape[0]]
        continue
    fold = 0
    for train_index, test_index in kf.split(group):
        group.loc[test_index, 'fold'] = fold
        fold += 1
    splited.append(group)

splited = pd.concat(splited).reset_index(drop=True)

In [9]:
best_model = None
error_cv = 0
best_error = np.iinfo(np.int32).max
for fold in range(5):
    dataset_train = splited.loc[splited['fold'] != fold]
    dataset_test = splited.loc[splited['fold'] == fold]
    train_y = dataset_train['Weekly_Sales']
    train_x = dataset_train.drop(columns=['Weekly_Sales', 'fold'])
    test_y = dataset_test['Weekly_Sales']
    test_x = dataset_test.drop(columns=['Weekly_Sales', 'fold'])
    print(dataset_train.shape, dataset_test.shape)
    predicted, model = train_and_predict(train_x, train_y, test_x)
    error = calculate_error(test_y, predicted)
    error_cv += error
    print(fold, error)
    if error < best_error:
        print('Search Best Model')
        best_error = error
        best_model = model
error_cv /= 5

(335722, 14) (85552, 14)


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.7min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.8s finished


0 1948.7944659283678
Search Best Model
(335849, 14) (85425, 14)


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.9s finished


1 2691.244089258572
(335970, 14) (85304, 14)


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.6min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.8s finished


2 1874.7826091577383
Search Best Model
(338733, 14) (82541, 14)


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.7s finished


3 2222.058928501031
(338822, 14) (82452, 14)


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:  1.3min finished
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.7s finished


4 1826.8769827298604
Search Best Model


In [10]:
dataset_test = pd.read_csv("test.csv", names=['Store','Dept','Date','isHoliday'],sep=',', header=0)
features = pd.read_csv("features.csv",sep=',', header=0,
                       names=['Store','Date','Temperature','Fuel_Price','MarkDown1','MarkDown2','MarkDown3','MarkDown4',
                              'MarkDown5','CPI','Unemployment','IsHoliday']).drop(columns=['IsHoliday'])
stores = pd.read_csv("stores.csv", names=['Store','Type','Size'],sep=',', header=0)
dataset_test = dataset_test.merge(stores, how='left').merge(features, how='left')

In [11]:
dataset_test = pd.get_dummies(dataset_test, columns=["Type"])
dataset_test[['MarkDown1','MarkDown2','MarkDown3','MarkDown4', 'MarkDown5']] = dataset_test[['MarkDown1','MarkDown2','MarkDown3','MarkDown4','MarkDown5']].fillna(0)
dataset_test = dataset_test.fillna(0)
column_date = dataset_test['Date']
dataset_test['Month'] = pd.to_datetime(dataset_test['Date']).dt.month
dataset_test = dataset_test.drop(columns=["Date", "CPI", "Fuel_Price", 'Unemployment', 'MarkDown2', 'Temperature'])
dataset_test.head()

Unnamed: 0,Store,Dept,isHoliday,Size,MarkDown1,MarkDown3,MarkDown4,MarkDown5,Type_A,Type_B,Type_C,Month
0,1,1,False,151315,6766.44,50.82,3639.9,2737.42,1,0,0,11
1,1,1,False,151315,11421.32,40.28,4646.79,6154.16,1,0,0,11
2,1,1,False,151315,9696.28,103.78,1133.15,6612.69,1,0,0,11
3,1,1,True,151315,883.59,74910.32,209.91,303.32,1,0,0,11
4,1,1,False,151315,2460.03,3838.35,150.57,6966.34,1,0,0,11


In [12]:
error_cv   # Average mean absolute error for model extraTreeRegressor

2112.751415115114

In [13]:
predicted_test = best_model.predict(dataset_test)

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    1.8s finished


In [14]:
dataset_test['weeklySales'] = predicted_test
dataset_test = dataset_test.rename(columns={'weeklySales': 'Weekly_Sales'})

In [15]:
dataset_test

Unnamed: 0,Store,Dept,isHoliday,Size,MarkDown1,MarkDown3,MarkDown4,MarkDown5,Type_A,Type_B,Type_C,Month,Weekly_Sales
0,1,1,False,151315,6766.44,50.82,3639.90,2737.42,1,0,0,11,17984.688600
1,1,1,False,151315,11421.32,40.28,4646.79,6154.16,1,0,0,11,22218.156100
2,1,1,False,151315,9696.28,103.78,1133.15,6612.69,1,0,0,11,18574.437100
3,1,1,True,151315,883.59,74910.32,209.91,303.32,1,0,0,11,27950.368775
4,1,1,False,151315,2460.03,3838.35,150.57,6966.34,1,0,0,11,17688.999775
5,1,1,False,151315,6343.16,270.00,2928.90,10147.90,1,0,0,12,19183.998500
6,1,1,False,151315,3504.83,73.26,1636.80,2779.60,1,0,0,12,16018.843500
7,1,1,False,151315,8231.71,274.00,358.15,2834.02,1,0,0,12,18799.806500
8,1,1,True,151315,12659.55,174.78,74.46,1208.86,1,0,0,12,27325.532900
9,1,1,False,151315,1214.08,15.01,72.36,3940.02,1,0,0,1,18368.524650


In [16]:
output = dataset_test[['Store','Dept','isHoliday','Weekly_Sales']]

In [17]:
output.head()

Unnamed: 0,Store,Dept,isHoliday,Weekly_Sales
0,1,1,False,17984.6886
1,1,1,False,22218.1561
2,1,1,False,18574.4371
3,1,1,True,27950.368775
4,1,1,False,17688.999775


In [20]:
output.to_csv('output_final.csv')

Conclusion: It is clear from the data that most appropriate algorithm for this type of dataset in Multivariate Time Series Analysis but I am not able to apply this algorithm and so by applying different algorithms like Support Vector Machine, Random Forest, Extra Tree Regressor I try to predict the weekly forecast and absolute error is minimum in Extra Tree Regressor, so I have selected this model.