In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
from sklearn import (svm, 
                     neighbors, 
                     ensemble, 
                    linear_model, 
                    naive_bayes, 
                    tree, 
                    discriminant_analysis)

import datetime
import time

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline 

import seaborn as sns

In [3]:
# Load the raw training set; read_csv's default compression inference handles the .zip suffix.
train_csv_path = './data/train.csv.zip'
train_data = pd.read_csv(train_csv_path)

In [32]:
# Show the last five rows to check the column layout and the final date.
print(train_data.tail(n=5))

              date  store  item  sales
912995  2017-12-27     10    50     63
912996  2017-12-28     10    50     59
912997  2017-12-29     10    50     74
912998  2017-12-30     10    50     62
912999  2017-12-31     10    50     82


In [5]:
# Work on an independent copy so the raw frame stays untouched (copy() is deep by default).
data = train_data.copy()

In [6]:
# Parse the 'YYYY-MM-DD' strings in one vectorised call instead of a per-row
# strptime lambda; the resulting column is datetime64 either way.
data['Datetime_date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')

In [7]:
# Split the parsed date into calendar parts with the vectorised .dt accessor
# rather than one Python lambda call per row.
parsed_dates = data['Datetime_date'].dt
data['year'] = parsed_dates.year
data['month'] = parsed_dates.month
data['day'] = parsed_dates.day

In [8]:
# Names of the calendar-part columns derived from the parsed date.
date_cols = ['year', 'month', 'day']
date_preview = data.loc[:, date_cols].head()
print(date_preview)

   year  month  day
0  2013      1    1
1  2013      1    2
2  2013      1    3
3  2013      1    4
4  2013      1    5


In [9]:
# Distinct store ids in order of first appearance.
# there are 10 stores
store_ids = data['store'].unique().tolist()
print(store_ids)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [10]:
# Distinct item ids in order of first appearance.
# there are 50 items
item_ids = data['item'].unique().tolist()
print(item_ids)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]


In [11]:
# Every distinct daily sales count that occurs, in order of first appearance.
sales_values = data['sales'].unique().tolist()
print(sales_values)

[13, 11, 14, 10, 12, 9, 7, 5, 16, 18, 15, 8, 6, 21, 20, 17, 22, 19, 24, 23, 26, 25, 27, 38, 34, 29, 31, 30, 4, 36, 28, 33, 32, 37, 35, 43, 40, 41, 39, 42, 50, 45, 44, 47, 53, 49, 46, 48, 51, 58, 54, 57, 55, 52, 3, 2, 1, 59, 56, 60, 63, 69, 64, 67, 65, 71, 61, 73, 62, 82, 78, 68, 74, 70, 87, 66, 77, 88, 76, 75, 102, 72, 92, 86, 79, 85, 81, 90, 84, 80, 103, 97, 96, 95, 89, 104, 94, 100, 91, 83, 106, 101, 98, 115, 93, 111, 119, 99, 108, 110, 120, 105, 126, 109, 114, 113, 112, 121, 107, 117, 118, 139, 124, 131, 123, 138, 134, 127, 136, 116, 125, 122, 128, 150, 129, 135, 137, 132, 133, 145, 130, 144, 0, 148, 141, 140, 152, 147, 169, 156, 159, 153, 142, 157, 155, 163, 143, 154, 165, 146, 160, 158, 151, 164, 171, 161, 177, 162, 175, 181, 168, 167, 149, 174, 170, 176, 178, 166, 173, 187, 182, 189, 179, 172, 204, 180, 190, 191, 210, 184, 183, 186, 185, 199, 196, 194, 197, 207, 209, 195, 198, 231, 205, 192, 200, 193, 188, 208, 201, 214, 206, 203, 202]


In [13]:
# Day of week (Monday=0 ... Sunday=6), taken from the vectorised .dt accessor
# instead of calling .weekday() once per row.
data['weekday'] = data['Datetime_date'].dt.weekday

In [15]:
# 1 for Saturday (5) / Sunday (6), 0 otherwise; vectorised membership test
# replaces the per-row lambda.
data['isWeekend'] = data['weekday'].isin([5, 6]).astype(int)

In [46]:
def _iso_week(stamp):
    """Return the week-number field (index 1) of ``stamp.isocalendar()``."""
    return stamp.isocalendar()[1]


# NOTE(review): ISO weeks can belong to a neighbouring ISO year around New Year,
# so 'week' paired with the calendar 'year' column may look odd there - confirm
# that is acceptable for the model.
data['week'] = data['Datetime_date'].map(_iso_week)

In [48]:
# Distinct week numbers present in the data, as a NumPy array.
week_values = data['week'].unique()
print(week_values)

[ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48
 49 50 51 52 53]


In [None]:
# Persist the engineered features. index=False keeps the default RangeIndex out
# of the file so it does not come back as an 'Unnamed: 0' column on reload.
data.to_csv('./data/train_features.csv.zip', index=False)

# Machine Learning

In [17]:
# Chance level if each distinct sales value were guessed with equal probability.
distinct_sales = data['sales'].unique()
print("Baseline Model:", 1 / distinct_sales.size)

Baseline Model: 0.004694835680751174


In [51]:
# Column to predict, kept as a one-element list so that indexing with it
# selects a DataFrame rather than a Series.
target = ['sales']

# Model inputs: calendar parts, store/item identifiers, then the derived
# day-of-week fields.
features = [
    'year', 'month', 'day',
    'store', 'item',
    'isWeekend', 'weekday', 'week',
]

In [52]:
# Draw a 30,000-row subsample to keep model fitting fast. The seed makes the
# subsample (and therefore every score below) reproducible across reruns.
small_data = data.sample(30000, random_state=0)

In [53]:
# Split the subsample into train/test parts with a fixed seed. The target is
# passed as the 'sales' Series directly, so the y parts come back as Series.
split_parts = model_selection.train_test_split(
    small_data[features],
    small_data['sales'],
    random_state=0,
)
train_X, test_X, train_y, test_y = split_parts

In [54]:
# Candidate estimators, grouped by family. The final order matters: gird_params
# further down is paired with this list by position.
# NOTE(review): classifiers and regressors are mixed; their .score() methods
# presumably report different metrics, so rows may not be directly comparable - confirm.
ensemble_models = [
    ensemble.AdaBoostClassifier(),
    ensemble.AdaBoostRegressor(),
    ensemble.RandomForestClassifier(),
    ensemble.RandomForestRegressor(),
    ensemble.BaggingClassifier(),
    ensemble.GradientBoostingRegressor(),
    ensemble.ExtraTreesRegressor(),
]
neighbor_models = [
    neighbors.KNeighborsClassifier(),
]
svm_models = [
    svm.LinearSVR(),
    svm.SVR(),
    svm.NuSVR(),
]
tree_models = [
    tree.DecisionTreeClassifier(),
    tree.DecisionTreeRegressor(),
]
MLA = ensemble_models + neighbor_models + svm_models + tree_models

# Empty results table; one row per estimator is filled in by the comparison loop.
compare_columns = ['Name', 'Train_Score', 'Test_Score', 'Time', 'Parameters']
MLA_Compare = pd.DataFrame(columns=compare_columns)

In [55]:
# Fit every candidate once and record its fit time plus train/test scores.
# Rows are collected in a plain list and turned into a frame in one step, so the
# score and time columns get a numeric dtype instead of the object dtype that
# row-by-row .loc assignment into an empty frame produces. The finally clause
# keeps whatever has finished if the run is interrupted.
compare_rows = []
try:
    for alg in MLA:
        alg_name = alg.__class__.__name__
        print("starting:", alg_name)

        start = time.perf_counter()
        alg.fit(train_X, train_y)
        time_taken = time.perf_counter() - start  # fit time only; scoring is excluded

        compare_rows.append({
            'Name': alg_name,
            'Train_Score': alg.score(train_X, train_y),
            'Test_Score': alg.score(test_X, test_y),
            'Time': time_taken,
            'Parameters': alg.get_params(),
        })
finally:
    # Row labels are 0..n-1 in MLA order, as before.
    MLA_Compare = pd.DataFrame(
        compare_rows,
        columns=['Name', 'Train_Score', 'Test_Score', 'Time', 'Parameters'],
    )

starting: AdaBoostClassifier
starting: AdaBoostRegressor
starting: RandomForestClassifier
starting: RandomForestRegressor
starting: BaggingClassifier
starting: GradientBoostingRegressor
starting: ExtraTreesRegressor
starting: KNeighborsClassifier
starting: LinearSVR
starting: SVR
starting: NuSVR
starting: DecisionTreeClassifier
starting: DecisionTreeRegressor


In [56]:
# Rank the candidates best-first by held-out score, then display the table.
MLA_Compare = MLA_Compare.sort_values(by='Test_Score', ascending=False)
MLA_Compare

Unnamed: 0,Name,Train_Score,Test_Score,Time,Parameters
3,RandomForestRegressor,0.971467,0.844138,0.817533,"{'bootstrap': True, 'criterion': 'mse', 'max_d..."
12,DecisionTreeRegressor,1.0,0.705199,0.150925,"{'criterion': 'mse', 'max_depth': None, 'max_f..."
5,GradientBoostingRegressor,0.700732,0.689434,1.06937,"{'alpha': 0.9, 'criterion': 'friedman_mse', 'i..."
6,ExtraTreesRegressor,1.0,0.525502,0.61367,"{'bootstrap': False, 'criterion': 'mse', 'max_..."
1,AdaBoostRegressor,0.24843,0.234735,1.136352,"{'base_estimator': None, 'learning_rate': 1.0,..."
10,NuSVR,0.08924,0.048483,58.470457,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd..."
9,SVR,0.087243,0.034824,47.298573,"{'C': 1.0, 'cache_size': 200, 'coef0': 0.0, 'd..."
4,BaggingClassifier,0.988889,0.032667,2.429596,"{'base_estimator': None, 'bootstrap': True, 'b..."
11,DecisionTreeClassifier,1.0,0.030933,0.507725,"{'class_weight': None, 'criterion': 'gini', 'm..."
2,RandomForestClassifier,0.987156,0.017067,1.239274,"{'bootstrap': True, 'class_weight': None, 'cri..."


In [39]:
# Persist the comparison table (row labels are written too, as to_csv does by default).
compare_csv_path = './scores/mla_algorithms_trainning_20k.csv'
MLA_Compare.to_csv(compare_csv_path)

In [42]:
# Search space for the random forest regressor tuned in the next cell.
# Other options tried here and left disabled: 'loss', 'learning_rate',
# a larger 'n_estimators' range (100/200/300), 'max_depth',
# 'min_samples_split' and 'min_weight_fraction_leaf'.
param_grid = dict(
    n_estimators=[50, 55, 60],
    criterion=['mse', 'mae'],
)

In [43]:
# Exhaustive search over param_grid for a random forest regressor.
# random_state pins the forest so reruns are comparable; n_jobs=-1 spreads the
# cross-validated fits over all available cores, since this search is slow.
tune_model = model_selection.GridSearchCV(
    ensemble.RandomForestRegressor(random_state=0),
    param_grid=param_grid,
    n_jobs=-1,
)
print("starting:", tune_model.__class__.__name__)
tune_model.fit(train_X, train_y)
print('finished')

starting: GridSearchCV


KeyboardInterrupt: 

In [183]:
# Report the tuned search's score on the training split, then on the held-out split.
for split_X, split_y in ((train_X, train_y), (test_X, test_y)):
    print(tune_model.score(split_X, split_y))
# print(tune_model.score(data[features], data[target]['sales']))


0.8338084879771797
0.8231509811658636


In [184]:
# One parameter grid per estimator, in exactly the same order as MLA (the two
# lists are paired by position). An empty dict means "defaults only".
# Fix: the two AdaBoost grids listed 35 twice where 45 was intended, which
# repeated one grid point and skipped another.
gird_params = [
#     AdaBoostClassifier
    {
        'n_estimators':[30, 35, 40, 45, 50, 55, 60], 
        'learning_rate': [0.1, 0.01, 0.001], 
    }, 
#     AdaBoostRegressor
    {
        'n_estimators':[30, 35, 40, 45, 50, 55, 60], 
        'learning_rate':[0.1, 0.01, 0.001],   
        'loss':['linear', 'square', 'exponential']
    },
#     RandomForestClassifier
    {
        'n_estimators':[1, 5, 10, 15, 20, 25],
        'criterion':['gini', 'entropy']
    },
#     RandomForestRegressor
    {
        'n_estimators':[30, 35, 40, 45, 50, 55, 60], 
        'criterion':['mse', 'mae'],
    },
#     BaggingClassifier
    {
        'n_estimators':[30, 35, 40, 45, 50, 55, 60],         
    },
#     GradientBoostingRegressor
    {
        'loss':['ls','lad','huber', 'quantile'],
        'learning_rate':[0.1, 0.01, 0.001], 
        'n_estimators':[100,200,300],
        'max_depth':[1,2,3,4],
        'min_samples_split':[0.1, 0.25, 0.75, 1.0], #(0, 1)
        'min_weight_fraction_leaf':[0,0.25,0.5],  # [0, 0.5]  
    },
#     ExtraTreesRegressor
    {
        'n_estimators':[30, 35, 40, 45, 50, 55, 60], 
        'criterion':['mse', 'mae'],        
    },
#     KNeighborsClassifier
    {
        'n_neighbors':[5,6,7,8,9,10,11,12],
        'weights':['uniform', 'distance'],
        'leaf_size':[20,25,30,35,40],        
    },
#     LinearSVR
    {
        'loss':['epsilon_insensitive', 'squared_epsilon_insensitive']        
    },
#     SVR
    {
        'kernel':['rbf', 'sigmoid', 'linear', 'poly']
        
    },
#     NuSVR
    {
        
    },
#     DecisionTreeClassifier
    {
        
    },
#     DecisionTreeRegressor
    {
        
    },
]

In [185]:
# Empty results table for the grid-search runs; same layout as the comparison table.
hyperparameter_columns = ['Name', 'Train_Score', 'Test_Score', 'Time', 'Parameters']
MLA_Hyperparameter = pd.DataFrame(columns=hyperparameter_columns)

In [186]:
# Grid-search each candidate with the grid at the same position in gird_params
# and log the fitted search's scores. Rows are written one at a time so an
# interrupted run keeps whatever has already finished.
# zip() would silently drop unmatched entries, so check the pairing up front.
assert len(MLA) == len(gird_params), "MLA and gird_params must line up one-to-one"

for row_number, (alg, params) in enumerate(zip(MLA, gird_params)):
    alg_name = alg.__class__.__name__
    print("starting:", alg_name)

    start = time.perf_counter()
    tune_model = model_selection.GridSearchCV(alg, param_grid=params)
    tune_model.fit(train_X, train_y)
    time_taken = time.perf_counter() - start  # search + fit time; scoring is excluded

    train_score = tune_model.score(train_X, train_y)
    test_score = tune_model.score(test_X, test_y)

    # add to pandas dataframe
    MLA_Hyperparameter.loc[row_number] = [alg_name, train_score, test_score, time_taken, tune_model.best_params_]

starting: AdaBoostClassifier




KeyboardInterrupt: 

In [None]:
# Rank the tuned candidates best-first by held-out score, then display the table.
MLA_Hyperparameter = MLA_Hyperparameter.sort_values(by='Test_Score', ascending=False)
MLA_Hyperparameter

In [None]:
# Save alongside the baseline comparison table: the notebook's other score file
# is written under './scores/', so use the same directory here instead of './score/'.
MLA_Hyperparameter.to_csv('./scores/hyperparameters_mla_scores_20k.csv')

In [None]:
def ml_training(MLA, params=None):
    """Grid-search every estimator in ``MLA`` and record its scores.

    Parameters
    ----------
    MLA : iterable of estimators
        Candidate models; each one is wrapped in ``GridSearchCV``.
    params : sequence of dict, optional
        One parameter grid per estimator, in the same order as ``MLA``.
        When omitted or empty, the notebook-level ``gird_params`` is used
        (previously this argument was ignored and ``gird_params`` was
        always used).

    Returns
    -------
    pandas.DataFrame
        The notebook-level ``MLA_Hyperparameter`` frame, which is updated in
        place with one row per estimator (row labels 0..n-1).

    Raises
    ------
    ValueError
        If the number of grids differs from the number of estimators.

    Notes
    -----
    Reads ``train_X``, ``train_y``, ``test_X`` and ``test_y`` from the
    notebook namespace.
    """
    models = list(MLA)
    # Fall back to the notebook-level grids only when the caller supplied none.
    if params is not None and len(params) > 0:
        grids = list(params)
    else:
        grids = list(gird_params)

    # zip() would silently drop unmatched entries and mis-pair nothing visibly,
    # so fail loudly before any expensive fitting starts.
    if len(models) != len(grids):
        raise ValueError(
            "expected one parameter grid per estimator, got %d estimators and %d grids"
            % (len(models), len(grids))
        )

    for row_number, (alg, grid) in enumerate(zip(models, grids)):
        alg_name = alg.__class__.__name__
        print("starting:", alg_name)

        start = time.time()
        tune_model = model_selection.GridSearchCV(alg, param_grid=grid)
        tune_model.fit(train_X, train_y)
        end = time.time()
        time_taken = end - start

        train_score = tune_model.score(train_X, train_y)
        test_score = tune_model.score(test_X, test_y)

        # add to pandas dataframe
        MLA_Hyperparameter.loc[row_number] = [alg_name, train_score, test_score, time_taken, tune_model.best_params_]

    return MLA_Hyperparameter
        