In [22]:
from sklearn import model_selection, metrics
from sklearn import (svm, 
                     neighbors, 
                     ensemble, 
                    linear_model, 
                    naive_bayes, 
                    tree, 
                    discriminant_analysis)

In [2]:
import pandas as pd

In [3]:
import time

In [4]:
data = pd.read_csv('./data/preprocessed_train_data.csv')

In [5]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 913000 entries, 0 to 912999
Data columns (total 13 columns):
Unnamed: 0       913000 non-null int64
date             913000 non-null object
store            913000 non-null int64
item             913000 non-null int64
sales            913000 non-null int64
Datetime_date    913000 non-null object
year             913000 non-null int64
month            913000 non-null int64
day              913000 non-null int64
weekday          913000 non-null int64
isWeekend        913000 non-null int64
week             913000 non-null int64
isHoliday        913000 non-null int64
dtypes: int64(11), object(2)
memory usage: 90.6+ MB
None


# Baseline Models

In [6]:
print("Baseline Model:", 1 / len(data['sales'].unique().tolist()))

Baseline Model: 0.004694835680751174


# Train Test Split

In [7]:
sample_data = data.sample(20000)

In [8]:
target = ['sales']
features = ['store', 'item', 'year', 'month', 'week', 'isHoliday', 'day', 'isWeekend', 'weekday']

In [9]:
train_X, test_X, train_y, test_y = model_selection.train_test_split(sample_data[features], sample_data[target], random_state=0)
train_y = train_y['sales']
test_y = test_y['sales']

In [10]:
# List of MLA Algorithms
MLA = [
    # ensemble
    ensemble.AdaBoostClassifier(),
    ensemble.AdaBoostRegressor(),
    ensemble.RandomForestClassifier(), 
    ensemble.RandomForestRegressor(),
    ensemble.BaggingClassifier(),
    ensemble.GradientBoostingRegressor(),
    ensemble.ExtraTreesRegressor(),
    
    #Nearest Neighbor
    neighbors.KNeighborsClassifier(),
    
    # svm
    svm.LinearSVR(), 
    svm.SVR(), 
    svm.NuSVR(),
    
    
    # tree
    tree.DecisionTreeClassifier(),
    tree.DecisionTreeRegressor(),    
]

In [11]:
def ml_training(MLA):
    pd_dataframe = pd.DataFrame(columns=['Name', 'Train_Score', 'Test_Score', 'Time', 'Parameters'])
    row_number = 0
    for alg in MLA:
        alg_name = alg.__class__.__name__
        print("starting:", alg_name)

        start_time = time.time()
        alg.fit(train_X, train_y)
        end_time = time.time()
        time_taken = end_time - start_time

        train_score = alg.score(train_X, train_y)
        test_score = alg.score(test_X, test_y)

        # add to pandas dataframe
        pd_dataframe.loc[row_number] = [alg_name, train_score, test_score, time_taken, alg.get_params()]
        row_number+=1
        
    pd_dataframe.sort_values(by=['Test_Score'], ascending=False, inplace=True)
    print('done')
    return pd_dataframe

In [12]:
MLA_Compare = ml_training(MLA)

starting: AdaBoostClassifier
starting: AdaBoostRegressor
starting: RandomForestClassifier
starting: RandomForestRegressor
starting: BaggingClassifier
starting: GradientBoostingRegressor
starting: ExtraTreesRegressor
starting: KNeighborsClassifier
starting: LinearSVR
starting: SVR
starting: NuSVR
starting: DecisionTreeClassifier
starting: DecisionTreeRegressor
done


In [13]:
MLA_Compare

Unnamed: 0,Name,Train_Score,Test_Score,Time,Parameters
3,RandomForestRegressor,0.969011,0.83901,0.515686,"{'bootstrap': True, 'criterion': 'mse', 'max_d..."
5,GradientBoostingRegressor,0.707088,0.695412,0.680557,"{'alpha': 0.9, 'criterion': 'friedman_mse', 'i..."
12,DecisionTreeRegressor,1.0,0.692646,0.093944,"{'criterion': 'mse', 'max_depth': None, 'max_f..."
6,ExtraTreesRegressor,1.0,0.473312,0.381781,"{'bootstrap': False, 'criterion': 'mse', 'max_..."
1,AdaBoostRegressor,0.238006,0.242033,0.600468,"{'base_estimator': None, 'learning_rate': 1.0,..."
10,NuSVR,0.082793,0.036546,20.567249,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd..."
4,BaggingClassifier,0.9892,0.0288,1.794973,"{'base_estimator': None, 'bootstrap': True, 'b..."
11,DecisionTreeClassifier,1.0,0.0278,0.241843,"{'class_weight': None, 'criterion': 'gini', 'm..."
9,SVR,0.076367,0.018246,17.001092,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd..."
2,RandomForestClassifier,0.989133,0.018,1.016999,"{'bootstrap': True, 'class_weight': None, 'cri..."


In [28]:
print(ensemble.BaggingClassifier().fit(train_X, train_y).score(data[features], data['sales']))
prediction = ensemble.BaggingClassifier().fit(train_X, train_y).predict_proba(data[features])

0.04347426067907996


In [29]:
print(prediction)
print(metrics.log_loss(data['sales'], prediction))

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


ValueError: y_true and y_pred contain different number of classes 213, 169. Please provide the true labels explicitly through the labels argument. Classes found in y_true: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 214 231]