# Diplodatos Kaggle Competition

In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import CategoricalNB, GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [2]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])
    
    # the next three operations are the ones we have just presented in the previous lines
    
    # drop the columns we won't use (it may be good to use them somehow)
    df = df.drop(["Upc", "FinelineNumber"], axis=1)

    # one-hot encoding for the DepartmentDescription
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=True)

    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=True)

    # get train and test back
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

Load the data...

In [3]:
X, y, XX, yy = transform_data("./data/train.csv", "./data/test.csv")

Create the model and evaluate it

In [4]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it could be useful for you depending on your approach)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

X_train

Unnamed: 0,VisitNumber,ScanCount,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,DepartmentDescription_BEDDING,DepartmentDescription_BOOKS AND MAGAZINES,...,DepartmentDescription_WIRELESS,DepartmentDescription_nan,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday,Weekday_nan
81954,163907,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
32578,65166,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
86578,173052,5,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
85079,170137,7,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
9788,19404,32,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52950,106158,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8809,17442,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
78302,156542,5,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1229,2404,8,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [5]:
# results dataframe is used to store the computed results
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [75]:
"""
# Original:
model_param = {
    'criterion': ('gini', 'entropy'),
    'min_samples_leaf': (1, 2, 5),
    'min_samples_split': (2, 3, 5, 10, 50, 100)
}
model = DecisionTreeClassifier(random_state=42)
"""

"""
Best models so far (from better to worse):
MLPClassifier(
    alpha=0.03,
    hidden_layer_sizes=(96,),
    max_iter=300,
    n_iter_no_change=30,
    random_state=42
)
MLPClassifier(
    alpha=0.02,
    hidden_layer_sizes=(64,),
    max_iter=300,
    random_state=42
)
DecisionTreeClassifier(
    random_state=42,
    min_samples_leaf=5,
    min_samples_split=101,
    max_depth=59,
    class_weight={999: 0.49}
)
"""

model_param = {}
model = MLPClassifier(random_state=42,
                      verbose=True,
                      max_iter=300,
                      alpha=0.03,
                      # learning_rate_init=0.002,
                      n_iter_no_change=30,
                      hidden_layer_sizes=(96,))

search = GridSearchCV(model, model_param, cv=3, scoring='accuracy') # scoring='balanced_accuracy'

# NOTICE we exclude visit number; all it does is confuse our model.
search.fit(X_train.drop(columns=['VisitNumber']), y_train)

best_model_clf = search.best_estimator_

Iteration 1, loss = 2.65077244
Iteration 2, loss = 1.52906854
Iteration 3, loss = 1.25866649
Iteration 4, loss = 1.16246583
Iteration 5, loss = 1.11252099
Iteration 6, loss = 1.07847494
Iteration 7, loss = 1.05305404
Iteration 8, loss = 1.03589541
Iteration 9, loss = 1.01901410
Iteration 10, loss = 1.00652433
Iteration 11, loss = 0.99476784
Iteration 12, loss = 0.98538098
Iteration 13, loss = 0.97727694
Iteration 14, loss = 0.96879138
Iteration 15, loss = 0.96259563
Iteration 16, loss = 0.95645002
Iteration 17, loss = 0.94899170
Iteration 18, loss = 0.94361123
Iteration 19, loss = 0.93912256
Iteration 20, loss = 0.93422943
Iteration 21, loss = 0.92975527
Iteration 22, loss = 0.92425646
Iteration 23, loss = 0.92021889
Iteration 24, loss = 0.91733927
Iteration 25, loss = 0.91296167
Iteration 26, loss = 0.91049729
Iteration 27, loss = 0.90773773
Iteration 28, loss = 0.90593885
Iteration 29, loss = 0.90130023
Iteration 30, loss = 0.89840073
Iteration 31, loss = 0.89512262
Iteration 32, los



Iteration 1, loss = 2.65192348
Iteration 2, loss = 1.53010121
Iteration 3, loss = 1.26704637
Iteration 4, loss = 1.16990112
Iteration 5, loss = 1.11903721
Iteration 6, loss = 1.08541089
Iteration 7, loss = 1.05801043
Iteration 8, loss = 1.03952680
Iteration 9, loss = 1.02142677
Iteration 10, loss = 1.00975189
Iteration 11, loss = 0.99741099
Iteration 12, loss = 0.98753235
Iteration 13, loss = 0.97804482
Iteration 14, loss = 0.96981408
Iteration 15, loss = 0.96279314
Iteration 16, loss = 0.95636548
Iteration 17, loss = 0.94979144
Iteration 18, loss = 0.94444412
Iteration 19, loss = 0.94005714
Iteration 20, loss = 0.93403602
Iteration 21, loss = 0.93096842
Iteration 22, loss = 0.92603549
Iteration 23, loss = 0.92269823
Iteration 24, loss = 0.92008604
Iteration 25, loss = 0.91552814
Iteration 26, loss = 0.91214095
Iteration 27, loss = 0.90932991
Iteration 28, loss = 0.90733550
Iteration 29, loss = 0.90396280
Iteration 30, loss = 0.90056193
Iteration 31, loss = 0.89841398
Iteration 32, los



Iteration 2, loss = 1.52961685
Iteration 3, loss = 1.26150181
Iteration 4, loss = 1.16491556
Iteration 5, loss = 1.10943807
Iteration 6, loss = 1.07438193
Iteration 7, loss = 1.04933488
Iteration 8, loss = 1.02972078
Iteration 9, loss = 1.01312534
Iteration 10, loss = 1.00120592
Iteration 11, loss = 0.98916749
Iteration 12, loss = 0.98121889
Iteration 13, loss = 0.97133273
Iteration 14, loss = 0.96338130
Iteration 15, loss = 0.95643144
Iteration 16, loss = 0.94946519
Iteration 17, loss = 0.94294302
Iteration 18, loss = 0.93770478
Iteration 19, loss = 0.93298880
Iteration 20, loss = 0.92850879
Iteration 21, loss = 0.92349849
Iteration 22, loss = 0.91904488
Iteration 23, loss = 0.91528045
Iteration 24, loss = 0.91143937
Iteration 25, loss = 0.90853880
Iteration 26, loss = 0.90675774
Iteration 27, loss = 0.90287850
Iteration 28, loss = 0.89899336
Iteration 29, loss = 0.89837907
Iteration 30, loss = 0.89402717
Iteration 31, loss = 0.89202771
Iteration 32, loss = 0.88921351
Iteration 33, lo



Iteration 1, loss = 2.31898690
Iteration 2, loss = 1.32116981
Iteration 3, loss = 1.16118608
Iteration 4, loss = 1.09785834
Iteration 5, loss = 1.05998432
Iteration 6, loss = 1.03507629
Iteration 7, loss = 1.01734657
Iteration 8, loss = 1.00281886
Iteration 9, loss = 0.99041811
Iteration 10, loss = 0.97850160
Iteration 11, loss = 0.96989282
Iteration 12, loss = 0.96188902
Iteration 13, loss = 0.95402727
Iteration 14, loss = 0.94829042
Iteration 15, loss = 0.94195380
Iteration 16, loss = 0.93682163
Iteration 17, loss = 0.93275311
Iteration 18, loss = 0.92706803
Iteration 19, loss = 0.92364380
Iteration 20, loss = 0.91928078
Iteration 21, loss = 0.91635307
Iteration 22, loss = 0.91278000
Iteration 23, loss = 0.90928386
Iteration 24, loss = 0.90627275
Iteration 25, loss = 0.90468188
Iteration 26, loss = 0.90150412
Iteration 27, loss = 0.89934738
Iteration 28, loss = 0.89718506
Iteration 29, loss = 0.89520978
Iteration 30, loss = 0.89381426
Iteration 31, loss = 0.89120649
Iteration 32, los



In [76]:
print('Best Decision Tree accuracy: ', search.best_score_)
print(best_model_clf)
results = results.append({'clf': best_model_clf, 'best_acc': search.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
best_clf = results.loc[results['best_acc'].idxmax()]['clf']
print(best_clf)

Best Decision Tree accuracy:  0.6926257459505542
MLPClassifier(alpha=0.03, hidden_layer_sizes=(96,), max_iter=300,
              n_iter_no_change=30, random_state=42, verbose=True)
The best classifier so far is: 
MLPClassifier(alpha=0.03, hidden_layer_sizes=(96,), max_iter=300,
              n_iter_no_change=30, random_state=42, verbose=True)


In [77]:
results

Unnamed: 0,clf,best_acc
0,DecisionTreeClassifier(class_weight={999: 0.49...,0.63442
1,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.687127
2,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.673934
3,"MLPClassifier(hidden_layer_sizes=(64,), learni...",0.665196
4,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.68506
5,"MLPClassifier(alpha=0.01, hidden_layer_sizes=(...",0.670716
6,"MLPClassifier(hidden_layer_sizes=(64,), max_it...",0.671057
7,"MLPClassifier(hidden_layer_sizes=(64,), max_it...",0.680733
8,"MLPClassifier(hidden_layer_sizes=(64,), learni...",0.672101
9,"MLPClassifier(hidden_layer_sizes=(64,), learni...",0.673039


**And finally**, we predict the unknown label for the testing set

In [78]:
X.shape, XX.shape

((67029, 79), (28645, 79))

In [79]:
# NOTICE we exclude visit number becuase we didn't use it for training.
yy = best_clf.predict(XX.drop(columns=['VisitNumber']))

The last thing we do is generating a file that should be *submitted* on kaggle

In [80]:
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), columns=["VisitNumber", "TripType"])

In [81]:
submission.to_csv("./data/submission.csv", header=True, index=False)