# Diplodatos Kaggle Competition

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def transform_data(train_data_fname, test_data_fname):
    """Build per-visit feature matrices from the raw train/test CSV files.

    Both files are read with ``pd.read_csv`` and concatenated so that the
    one-hot encoded columns are identical for the two sets. Rows are then
    aggregated to one row per (VisitNumber, Weekday) by summing.

    Parameters
    ----------
    train_data_fname : str or path-like
        CSV with the training rows; must contain a ``TripType`` column.
    test_data_fname : str or path-like
        CSV with the test rows (no ``TripType`` column required).

    Returns
    -------
    X : pandas.DataFrame
        Aggregated training features (``VisitNumber``, ``ScanCount``, one
        column per department and per weekday, plus the ``*_nan`` columns).
    y : pandas.Series
        ``TripType`` per training visit. It matches the rows of ``X`` by
        position only (``y`` has a fresh 0..n-1 index, ``X`` keeps the index
        of the combined frame).
    XX : pandas.DataFrame
        Aggregated test features, same columns as ``X``.
    yy : None
        Placeholder for the test predictions.
    """
    group_keys = ["VisitNumber", "Weekday"]

    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # One TripType per visit. Select the column *before* aggregating so that
    # max() is not also computed over every other column (including the
    # string-valued DepartmentDescription).
    y = df_train.groupby(group_keys, as_index=False)["TripType"].max().TripType

    # Remove the label and stack train + test so that one-hot encoding yields
    # the same columns for both datasets.
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])

    # drop the columns we won't use (it may be good to use them somehow)
    df = df.drop(["Upc", "FinelineNumber"], axis=1)

    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # Aggregate to one row per visit. The train/test flag is part of the key
    # (instead of being summed) so a visit key that appears in both files is
    # never merged into a single row, and the flag stays a clean 0/1 value.
    df = df.groupby(group_keys + ["is_train_set"], as_index=False).sum()

    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]

    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

Load the data...

In [3]:
# X / y: aggregated training features and labels; XX: aggregated test features;
# yy: placeholder (None) that later receives the test predictions.
X, y, XX, yy = transform_data(
    train_data_fname="./data/train.csv",
    test_data_fname="./data/test.csv",
)

Create the model and evaluate it

In [4]:
# Hold out 30% of the labelled visits as a "validation" set.
# (It is not used in this example, since model selection relies on
# cross-validation, but it may be useful depending on your approach.)
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.3,
)

X_train

Unnamed: 0,VisitNumber,ScanCount,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,DepartmentDescription_BOOKS AND MAGAZINES,...,DepartmentDescription_WIRELESS,DepartmentDescription_nan,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday,Weekday_nan
81954,163907,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
32578,65166,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
86578,173052,5,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
85079,170137,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9788,19404,32,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52950,106158,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8809,17442,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
78302,156542,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1229,2404,8,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# Empty table that collects, per evaluated model, the classifier ('clf')
# and its best score ('best_acc').
results = pd.DataFrame(columns=['clf', 'best_acc'])

In [62]:
"""
# Original:
tree_param = {
    'criterion': ('gini', 'entropy'),
    'min_samples_leaf': (1, 2, 5),
    'min_samples_split': (2, 3, 5, 10, 50, 100)
}
tree = DecisionTreeClassifier(random_state=42)
"""

"""
Best models so far (from better to worse):
MLPClassifier(
    random_state=42,
    max_iter=300,
    alpha=0.02,
    hidden_layer_sizes=(64,),
)
DecisionTreeClassifier(
    random_state=42,
    min_samples_leaf=5,
    min_samples_split=101,
    max_depth=59,
    class_weight={999: 0.49}
)
"""

# Hyper-parameter grid handed to the search (a single candidate right now;
# 'learning_rate_init': (0.004,) is currently left out).
model_param = {'alpha': (0.02,)}

model = MLPClassifier(
    hidden_layer_sizes=(96,),
    max_iter=300,
    random_state=42,
    verbose=True,
)

# 3-fold cross-validated search scored on plain accuracy
# ('balanced_accuracy' is a commented-out alternative).
search = GridSearchCV(estimator=model, param_grid=model_param, scoring='accuracy', cv=3)

# NOTICE: VisitNumber is left out of the features; all it does is confuse our model.
search.fit(X_train.drop('VisitNumber', axis=1), y_train)

best_model_clf = search.best_estimator_

Iteration 1, loss = 2.64678491
Iteration 2, loss = 1.52249391
Iteration 3, loss = 1.25002378
Iteration 4, loss = 1.15240720
Iteration 5, loss = 1.10134120
Iteration 6, loss = 1.06640185
Iteration 7, loss = 1.04016634
Iteration 8, loss = 1.02234745
Iteration 9, loss = 1.00483227
Iteration 10, loss = 0.99190553
Iteration 11, loss = 0.97980417
Iteration 12, loss = 0.96981355
Iteration 13, loss = 0.96115705
Iteration 14, loss = 0.95219038
Iteration 15, loss = 0.94573773
Iteration 16, loss = 0.93911502
Iteration 17, loss = 0.93142012
Iteration 18, loss = 0.92576354
Iteration 19, loss = 0.92098344
Iteration 20, loss = 0.91574826
Iteration 21, loss = 0.91091674
Iteration 22, loss = 0.90511703
Iteration 23, loss = 0.90086184
Iteration 24, loss = 0.89763433
Iteration 25, loss = 0.89300720
Iteration 26, loss = 0.89038250
Iteration 27, loss = 0.88723421
Iteration 28, loss = 0.88525967
Iteration 29, loss = 0.88041988
Iteration 30, loss = 0.87724221
Iteration 31, loss = 0.87373057
Iteration 32, los



Iteration 2, loss = 1.52370257
Iteration 3, loss = 1.25849906
Iteration 4, loss = 1.15986531
Iteration 5, loss = 1.10779738
Iteration 6, loss = 1.07327832
Iteration 7, loss = 1.04511953
Iteration 8, loss = 1.02593724
Iteration 9, loss = 1.00725642
Iteration 10, loss = 0.99493582
Iteration 11, loss = 0.98207310
Iteration 12, loss = 0.97169291
Iteration 13, loss = 0.96170225
Iteration 14, loss = 0.95301906
Iteration 15, loss = 0.94568219
Iteration 16, loss = 0.93876658
Iteration 17, loss = 0.93195829
Iteration 18, loss = 0.92642300
Iteration 19, loss = 0.92144374
Iteration 20, loss = 0.91521536
Iteration 21, loss = 0.91184347
Iteration 22, loss = 0.90656117
Iteration 23, loss = 0.90272134
Iteration 24, loss = 0.89992510
Iteration 25, loss = 0.89490605
Iteration 26, loss = 0.89131546
Iteration 27, loss = 0.88804385
Iteration 28, loss = 0.88579580
Iteration 29, loss = 0.88247492
Iteration 30, loss = 0.87872466
Iteration 31, loss = 0.87622254
Iteration 32, loss = 0.87432970
Iteration 33, lo



Iteration 2, loss = 1.52314244
Iteration 3, loss = 1.25286155
Iteration 4, loss = 1.15471672
Iteration 5, loss = 1.09829962
Iteration 6, loss = 1.06227865
Iteration 7, loss = 1.03650481
Iteration 8, loss = 1.01622374
Iteration 9, loss = 0.99895769
Iteration 10, loss = 0.98658810
Iteration 11, loss = 0.97392359
Iteration 12, loss = 0.96557719
Iteration 13, loss = 0.95532473
Iteration 14, loss = 0.94693293
Iteration 15, loss = 0.93947167
Iteration 16, loss = 0.93214560
Iteration 17, loss = 0.92523559
Iteration 18, loss = 0.91964124
Iteration 19, loss = 0.91451842
Iteration 20, loss = 0.90981585
Iteration 21, loss = 0.90462148
Iteration 22, loss = 0.89982537
Iteration 23, loss = 0.89584216
Iteration 24, loss = 0.89165468
Iteration 25, loss = 0.88841322
Iteration 26, loss = 0.88655738
Iteration 27, loss = 0.88227292
Iteration 28, loss = 0.87819707
Iteration 29, loss = 0.87723628
Iteration 30, loss = 0.87269610
Iteration 31, loss = 0.87041769
Iteration 32, loss = 0.86753779
Iteration 33, lo



Iteration 1, loss = 2.31435926
Iteration 2, loss = 1.31313205
Iteration 3, loss = 1.15130124
Iteration 4, loss = 1.08647985
Iteration 5, loss = 1.04743943
Iteration 6, loss = 1.02160805
Iteration 7, loss = 1.00305308
Iteration 8, loss = 0.98778206
Iteration 9, loss = 0.97468677
Iteration 10, loss = 0.96219650
Iteration 11, loss = 0.95311526
Iteration 12, loss = 0.94466233
Iteration 13, loss = 0.93640780
Iteration 14, loss = 0.93014133
Iteration 15, loss = 0.92357671
Iteration 16, loss = 0.91805317
Iteration 17, loss = 0.91386789
Iteration 18, loss = 0.90788682
Iteration 19, loss = 0.90437827
Iteration 20, loss = 0.89957109
Iteration 21, loss = 0.89642926
Iteration 22, loss = 0.89267368
Iteration 23, loss = 0.88912378
Iteration 24, loss = 0.88584359
Iteration 25, loss = 0.88402536
Iteration 26, loss = 0.88110722
Iteration 27, loss = 0.87851491
Iteration 28, loss = 0.87612745
Iteration 29, loss = 0.87402645
Iteration 30, loss = 0.87246637
Iteration 31, loss = 0.86971484
Iteration 32, los



In [63]:
# Report the best cross-validated score of the search that was just run.
# The label is model-agnostic: the tuned estimator is not necessarily a
# decision tree (the search cell currently tunes an MLPClassifier).
print('Best cross-validated accuracy: ', search.best_score_)
print(best_model_clf)

# Record this run. DataFrame.append was removed in pandas 2.0, so the new row
# is added with pd.concat instead.
results = pd.concat(
    [results, pd.DataFrame([{'clf': best_model_clf, 'best_acc': search.best_score_}])],
    ignore_index=True,
)

print('The best classifier so far is: ')
# 'best_acc' can be object dtype because `results` starts out as an empty
# frame; cast to float so idxmax compares numbers.
best_clf = results.loc[results['best_acc'].astype(float).idxmax()]['clf']
print(best_clf)

Best Decision Tree accuracy:  0.6879795396419436
MLPClassifier(alpha=0.02, hidden_layer_sizes=(96,), max_iter=300,
              random_state=42, verbose=True)
The best classifier so far is: 
MLPClassifier(alpha=0.02, hidden_layer_sizes=(64,), max_iter=300,
              random_state=42, verbose=True)


In [64]:
# Display every model recorded so far together with its best cross-validated accuracy.
results

Unnamed: 0,clf,best_acc
0,DecisionTreeClassifier(class_weight={999: 0.49...,0.63442
1,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.687127
2,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.673934
3,"MLPClassifier(hidden_layer_sizes=(64,), learni...",0.665196
4,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.68506
5,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.670716
6,"MLPClassifier(hidden_layer_sizes=(64,), max_it...",0.671057
7,"MLPClassifier(hidden_layer_sizes=(64,), max_it...",0.680733
8,"MLPClassifier(hidden_layer_sizes=(64,), learni...",0.672101
9,"MLPClassifier(hidden_layer_sizes=(64,), learni...",0.673039


**And finally**, we predict the unknown labels for the test set

In [55]:
# Sanity check: the train and test feature matrices should have the same number of columns.
X.shape, XX.shape

((67029, 79), (28645, 79))

In [56]:
# NOTICE: VisitNumber is dropped here because it was not used for training either.
yy = best_clf.predict(XX.drop('VisitNumber', axis=1))

The last thing we do is generate the file that should be *submitted* to Kaggle

In [57]:
# Pair every test VisitNumber with its predicted TripType (row order of XX).
submission = pd.DataFrame({
    "VisitNumber": XX.VisitNumber.to_numpy(),
    "TripType": yy,
})

In [58]:
# Write the submission with a header row and without the DataFrame index.
submission.to_csv(
    "./data/submission.csv",
    index=False,
    header=True,
)