In [2]:
from sklearn import model_selection, metrics
from sklearn import (svm, 
                     neighbors, 
                     ensemble, 
                    linear_model, 
                    naive_bayes, 
                    tree, 
                    discriminant_analysis)
import numpy as np

In [3]:
import pandas as pd

In [4]:
import time

In [8]:
# Load the preprocessed training and test tables from the local data directory.
train_csv_path = './data/preprocessed_train_data.csv'
test_csv_path = './data/preprocessed_test_data.csv'
train_data = pd.read_csv(train_csv_path)
test_data = pd.read_csv(test_csv_path)

In [9]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 53 columns):
Unnamed: 0                           913000 non-null int64
date                                 913000 non-null object
store                                913000 non-null int64
item                                 913000 non-null int64
sales                                913000 non-null int64
year                                 913000 non-null int64
month                                913000 non-null int64
day                                  913000 non-null int64
weekday                              913000 non-null int64
day_of_year                          913000 non-null int64
week_of_year                         913000 non-null int64
isWeekend                            913000 non-null int64
isHoliday                            913000 non-null int64
store_sales_sum                      913000 non-null int64
store_sales_median                   913000 non-null int64
sto

# Baseline Models

In [10]:
# Chance-level reference: one over the number of distinct sales values.
n_distinct_sales = len(train_data['sales'].unique())
print("Baseline Model:", 1 / n_distinct_sales)

Baseline Model: 0.004694835680751174


In [18]:
import math 
def SMAPE1(y_true, y_pred):
    """Symmetric mean absolute percentage error, in percent (0 to 200).

    Each sample contributes ``|p - t| / ((|t| + |p|) / 2)``; the terms are
    averaged and multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values, paired by position (a pandas index is
        ignored). Both must hold the same number of elements.

    Returns
    -------
    float
        Mean of the per-sample terms times 100.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    actual = np.ravel(np.asarray(y_true, dtype=float))
    forecast = np.ravel(np.asarray(y_pred, dtype=float))

    # zip() used to truncate silently to the shorter input while the mean was
    # still taken over len(y_true), which produced a wrong score.
    if actual.size != forecast.size:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{actual.size} and {forecast.size}"
        )
    if actual.size == 0:
        raise ValueError("SMAPE1 is undefined for empty input")

    abs_error = np.abs(forecast - actual)
    scale = (np.abs(actual) + np.abs(forecast)) / 2
    # The scale is zero only when both values are zero, in which case the
    # error is zero as well; the small floor turns 0/0 into a clean 0 term.
    scale = np.where(scale == 0, 0.0001, scale)

    return float(np.mean(abs_error / scale) * 100)

def SMAPE2(y_true, y_pred):
    """Symmetric mean absolute percentage error scaled to 0-100 percent.

    Each sample contributes ``|p - t| / (|t| + |p|)`` (no halving of the
    denominator); the terms are averaged and multiplied by 100.

    Parameters
    ----------
    y_true, y_pred : array-like of numbers
        Actual and predicted values, paired by position (a pandas index is
        ignored). Both must hold the same number of elements.

    Returns
    -------
    float
        Mean of the per-sample terms times 100.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    actual = np.ravel(np.asarray(y_true, dtype=float))
    forecast = np.ravel(np.asarray(y_pred, dtype=float))

    # zip() used to truncate silently to the shorter input while the mean was
    # still taken over len(y_true), which produced a wrong score.
    if actual.size != forecast.size:
        raise ValueError(
            "y_true and y_pred must have the same length, got "
            f"{actual.size} and {forecast.size}"
        )
    if actual.size == 0:
        raise ValueError("SMAPE2 is undefined for empty input")

    abs_error = np.abs(forecast - actual)
    scale = np.abs(actual) + np.abs(forecast)
    # The scale is zero only when both values are zero, in which case the
    # error is zero as well; the small floor turns 0/0 into a clean 0 term.
    scale = np.where(scale == 0, 0.0001, scale)

    return float(np.mean(abs_error / scale) * 100)

def SMAPE3(y_true, y_pred):
    """Aggregate error ratio: total absolute error over total (actual + predicted).

    The numerator and denominator are each summed over all pairs before
    dividing, and the result is returned as a plain ratio (not multiplied
    by 100). A zero denominator is replaced by 0.000001.
    """
    abs_error_total = 0
    magnitude_total = 0
    for actual, forecast in zip(y_true, y_pred):
        abs_error_total += math.fabs(forecast - actual)
        magnitude_total += (actual + forecast)

    # Tiny stand-in denominator so an all-zero total does not divide by zero.
    denominator = magnitude_total if magnitude_total != 0 else 0.000001
    return abs_error_total / denominator

# Train Test Split

In [21]:
# Plain alias (no copy): the cells below read from `sample_data`, which here
# is the full training frame.
sample_data = train_data

In [22]:
# Columns that must not be fed to the models: 'Unnamed: 0' (presumably a saved
# row index -- confirm), the object-typed 'date' column, and the target itself.
# They are located by name rather than by the hard-coded positions [0, 1, 4],
# so the feature list stays correct if the column order ever changes.
target = ['sales']
non_feature_columns = ['Unnamed: 0', 'date'] + target
excluded_positions = [
    position
    for position, column in enumerate(train_data.columns)
    if column in non_feature_columns
]
features = np.delete(train_data.columns, excluded_positions)

In [23]:
# Random hold-out split with a fixed seed (split size left at the
# train_test_split default).
split_parts = model_selection.train_test_split(
    sample_data[features],
    sample_data[target],
    random_state=0,
)
train_X, test_X = split_parts[0], split_parts[1]
# The target was selected with a list, so each half is a one-column frame;
# pull out the 'sales' Series for the estimators.
train_y = split_parts[2]['sales']
test_y = split_parts[3]['sales']

In [24]:
# Regressors to benchmark, in the order they will be fitted.
MLA = [
    # Tree ensembles
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    ensemble.ExtraTreesRegressor(),

    # Support-vector regressors
    svm.LinearSVR(),
    svm.SVR(),
    svm.NuSVR(),

    # Single decision tree
    tree.DecisionTreeRegressor(),
]

In [25]:
def ml_training(MLA):
    """Fit every estimator in ``MLA`` and tabulate its fit time and scores.

    Reads the notebook-level ``train_X``, ``train_y``, ``test_X`` and
    ``test_y`` and scores each estimator on both splits with its own
    ``score`` method and with ``SMAPE1``.

    Parameters
    ----------
    MLA : iterable of estimators
        Each must provide ``fit``, ``score``, ``predict`` and ``get_params``.

    Returns
    -------
    pandas.DataFrame
        One row per estimator with the columns Name, Train_Score, Test_Score,
        SAMPE_Train, SAMPE_Test, Time (seconds spent in ``fit``) and
        Parameters, sorted by SAMPE_Test in descending order (largest error
        first).
    """
    # Column spellings ('SAMPE_*') are kept as-is because they are part of
    # the returned frame's interface.
    columns = ['Name', 'Train_Score', 'Test_Score', 'SAMPE_Train', 'SAMPE_Test', 'Time', 'Parameters']

    # Collect plain records and build the frame once instead of growing a
    # DataFrame row by row with .loc.
    records = []
    for alg in MLA:
        alg_name = alg.__class__.__name__
        print("starting:", alg_name)

        start_time = time.time()
        alg.fit(train_X, train_y)
        time_taken = time.time() - start_time

        records.append({
            'Name': alg_name,
            'Train_Score': alg.score(train_X, train_y),
            'Test_Score': alg.score(test_X, test_y),
            # Arguments follow SMAPE1's (y_true, y_pred) signature.
            'SAMPE_Train': SMAPE1(train_y, alg.predict(train_X)),
            'SAMPE_Test': SMAPE1(test_y, alg.predict(test_X)),
            'Time': time_taken,
            'Parameters': alg.get_params(),
        })

    results = pd.DataFrame(records, columns=columns)
    results = results.sort_values(by=['SAMPE_Test'], ascending=False)
    print('done')
    return results

In [None]:
# Fits every estimator in MLA in sequence. NOTE(review): the captured output
# for this cell ends at "starting: SVR" with no "done" line.
MLA_Compare = ml_training(MLA)

starting: RandomForestRegressor
starting: GradientBoostingRegressor
starting: ExtraTreesRegressor
starting: LinearSVR
starting: SVR


In [None]:
MLA_Compare

In [26]:
# Refit a single decision tree as the final model. random_state is pinned so
# that re-running the notebook rebuilds the same tree and the same scores.
best_clf = tree.DecisionTreeRegressor(random_state=0)
best_clf.fit(train_X, train_y)

# NOTE: these two numbers are computed on the whole training file, which
# contains both the rows the tree was fitted on and the held-out rows.
print(best_clf.score(train_data[features], train_data['sales']))
# Arguments follow SMAPE1's (y_true, y_pred) signature.
print(SMAPE1(train_data['sales'], best_clf.predict(train_data[features])))

0.9759100293146691
3.805219587529634


In [29]:
# Predict the hold-out split once and reuse the result for every metric
# instead of re-running predict() for each line. Arguments follow the
# (y_true, y_pred) signature of the SMAPE helpers.
test_predictions = best_clf.predict(test_X)
print(best_clf.score(test_X, test_y))
print(SMAPE1(test_y, test_predictions))
print(SMAPE2(test_y, test_predictions))
print(SMAPE3(test_y, test_predictions))

0.9035019489120373
15.220878350118408
7.610439175059204
0.06389892800151672


# Submission

In [63]:
# Round to the nearest integer before casting: a bare astype(int) truncates
# toward zero, which pushes every fractional prediction down.
predictions = np.rint(best_clf.predict(test_data[features])).astype(int)
# Submission ids are simply the row positions 0..n-1.
pandas_id = list(range(len(predictions)))

In [64]:
# Sanity check: show the predictions and confirm there is one id per prediction.
print(predictions)
print(f'{len(pandas_id)} {len(predictions)}')

[ 5 19 19 ... 30 16 41]
45000 45000


In [65]:
# Peek at the sample submission to see which columns the output file needs.
pd.read_csv('./data/sample_submission.csv.zip').head()

Unnamed: 0,id,sales
0,0,52
1,1,52
2,2,52
3,3,52
4,4,52


In [66]:
# Assemble the two submission columns: row id and predicted sales.
submission_columns = {'id': pandas_id, 'sales': predictions}
submission_dataframe = pd.DataFrame(submission_columns)

In [67]:
submission_dataframe.tail()

Unnamed: 0,id,sales
44995,44995,26
44996,44996,24
44997,44997,30
44998,44998,16
44999,44999,41


In [68]:
# Write the submission without the DataFrame index column.
# NOTE(review): the output name is spelled 'sumbission.csv'; left unchanged in
# case anything outside this notebook expects this exact path.
submission_dataframe.to_csv('./data/sumbission.csv', index=False)