# Diplodatos Kaggle Competition

This competition is part of the practice work for the subject called 'Aprendizaje Supervisado' of the edition 2020  Diplodatos . 
This is a simplified version of the competition for recruiting done by Walmart in the plataform Kaggle (Walmart Recruiting: Trip Type Classification).
The goal is try to predict the trip types of purchases into the supermarket made by costomers. These types are codified in the original dataset.


The main goals are:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

Create a function to load the datasets.

**Some important decisions made here**:
1. We used one hot encoding for Weekday and DepartmentDescription. But, we didn't add an aditional column for the NaN values. All transformations are applied to the training and testing datasets...
1. We dropped NaN values, only in the training dataset, and we could see that it's better for our model. 
1. We didn't drop the Upc and FinelineNumber to simplify the process, and we didn't use one hot encoding for that columns, the model went better when we just used those columns as a number.

In [1]:
def transform_data(train_data_fname, test_data_fname):
    df_train = pd.read_csv(train_data_fname)
    df_train['is_train_set'] = 1
    df_test = pd.read_csv(test_data_fname)
    df_test['is_train_set'] = 0

    
    # we  get the TripType for the train set. To do that, we group by VisitNumber and
    # then we get the max (or min or avg)
    df_train=df_train.dropna(subset=['Upc'])
    y = df_train.groupby(["VisitNumber", "Weekday"], as_index=False).max().TripType
    

    # we remove the TripType now, and concat training and testing data
    # the concat is done so that we have the same columns for both datasets
    # after one-hot encoding
    df_train = df_train.drop("TripType", axis=1)
    df = pd.concat([df_train, df_test])

    # one-hot encoding for the DepartmentDescription    
    df = pd.get_dummies(df, columns=["DepartmentDescription"], dummy_na=False)
   

    # now we add the groupby values
    df = df.groupby(["VisitNumber", "Weekday"], as_index=False).sum()
    
    # finally, we do one-hot encoding for the Weekday
    df = pd.get_dummies(df, columns=["Weekday"], dummy_na=False)
    
    
    # get train and test back
    df_train = df[df.is_train_set != 0]
    df_test = df[df.is_train_set == 0]
    
    X = df_train.drop(["is_train_set"], axis=1)
    yy = None
    XX = df_test.drop(["is_train_set"], axis=1)

    return X, y, XX, yy

Load the data...

In [425]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [426]:
X.shape

(66077, 79)

In [427]:
X.head()

Unnamed: 0,VisitNumber,Upc,ScanCount,FinelineNumber,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,...,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
0,5,68113150000.0,-1,1000.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,9,1070088000.0,3,5719.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,10,17009270000.0,3,10073.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,11,22738950000.0,4,7499.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6,12,183217300000.0,7,15639.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [428]:
y.shape

(66077,)

In [429]:
y.head()

0    999
1      8
2      8
3     35
4     41
Name: TripType, dtype: int64

In [430]:
XX.shape

(28645, 79)

In [431]:
XX.head()

Unnamed: 0,VisitNumber,Upc,ScanCount,FinelineNumber,DepartmentDescription_1-HR PHOTO,DepartmentDescription_ACCESSORIES,DepartmentDescription_AUTOMOTIVE,DepartmentDescription_BAKERY,DepartmentDescription_BATH AND SHOWER,DepartmentDescription_BEAUTY,...,DepartmentDescription_SWIMWEAR/OUTERWEAR,DepartmentDescription_TOYS,DepartmentDescription_WIRELESS,Weekday_Friday,Weekday_Monday,Weekday_Saturday,Weekday_Sunday,Weekday_Thursday,Weekday_Tuesday,Weekday_Wednesday
1,7,67949630000.0,2,13435.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,8,425923900000.0,28,58669.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
7,15,1019672000000.0,9,7880.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,19,1733264000000.0,9,38013.0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
11,23,692249600000.0,2,8506.0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [432]:
XX.columns

Index(['VisitNumber', 'Upc', 'ScanCount', 'FinelineNumber',
       'DepartmentDescription_1-HR PHOTO', 'DepartmentDescription_ACCESSORIES',
       'DepartmentDescription_AUTOMOTIVE', 'DepartmentDescription_BAKERY',
       'DepartmentDescription_BATH AND SHOWER', 'DepartmentDescription_BEAUTY',
       'DepartmentDescription_BEDDING',
       'DepartmentDescription_BOOKS AND MAGAZINES',
       'DepartmentDescription_BOYS WEAR',
       'DepartmentDescription_BRAS & SHAPEWEAR',
       'DepartmentDescription_CAMERAS AND SUPPLIES',
       'DepartmentDescription_CANDY, TOBACCO, COOKIES',
       'DepartmentDescription_CELEBRATION', 'DepartmentDescription_COMM BREAD',
       'DepartmentDescription_CONCEPT STORES',
       'DepartmentDescription_COOK AND DINE', 'DepartmentDescription_DAIRY',
       'DepartmentDescription_DSD GROCERY',
       'DepartmentDescription_ELECTRONICS',
       'DepartmentDescription_FABRICS AND CRAFTS',
       'DepartmentDescription_FINANCIAL SERVICES',
       'DepartmentD

In [433]:
import xgboost

In [434]:
data_dmatrix = xgboost.DMatrix(data=X,label=y)

Create the model and evaluate it

In [435]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [436]:
from xgboost import XGBClassifier

In [466]:
XGBClassifier??

In [420]:
my_model = XGBClassifier(objective='binary:logistic', n_estimators=1000, learning_rate=0.1, subsample=0.5)
my_model.fit(X_train, y_train, 
             early_stopping_rounds=5, 
             eval_set=[(X_valid, y_valid)], 
             verbose=False)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=1000, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.5,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [421]:
from sklearn.metrics import mean_absolute_error

predictions = my_model.predict(X_valid)
print("Mean Absolute Error: " + str(mean_absolute_error(predictions, y_valid)))

Mean Absolute Error: 16.726089588377725


In [422]:
from sklearn.metrics import accuracy_score

In [423]:
print("Accuracy: " + str(accuracy_score(y_valid,predictions)))

Accuracy: 0.711410411622276


**And finally**, we predict the unknown label for the testing set

In [437]:
X.shape, XX.shape

((66077, 79), (28645, 79))

In [457]:
# results dataframe is used to store the computed results
results = pd.DataFrame(columns=('clf', 'best_acc'))

In [458]:
results = results.append({'clf': predictions.reshape(-1,1), 'best_acc': accuracy_score(y_valid,predictions)}, ignore_index=True)

In [459]:
results

Unnamed: 0,clf,best_acc
0,"[[7], [20], [42], [7], [25], [4], [38], [9], [...",0.71141


In [462]:
yy = my_model.predict(XX)

The last thing we do is generating a file that should be *submitted* on kaggle

In [463]:
submission = pd.DataFrame(list(zip(XX.VisitNumber, yy)), columns=["VisitNumber", "TripType"])

In [465]:
submission

Unnamed: 0,VisitNumber,TripType
0,7,9
1,8,40
2,15,21
3,19,25
4,23,9
5,25,32
6,47,39
7,57,8
8,61,26
9,63,36


In [464]:
submission.to_csv("../data/submission.csv", header=True, index=False)