In [6]:
import pandas as pd
import numpy as np
import spacy
import matplotlib.pyplot as plt
from pandarallel import pandarallel
import json

In [7]:
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [8]:
df_train = pd.read_csv('Data/X_train_update.csv', index_col = 0)
df_test = pd.read_csv('Data/X_test_update.csv', index_col = 0)
y_train = pd.read_csv('Data/Y_train_CVw08PX.csv', index_col = 0).iloc[df_train.index].values.ravel()

In [9]:
df_train.columns

Index(['designation', 'description', 'productid', 'imageid'], dtype='object')

In [10]:
df_train.drop(['description', 'productid', 'imageid'], axis = 1, inplace = True)
df_test.drop(['description', 'productid', 'imageid'], axis = 1, inplace = True)

In [13]:
len(set(y_train))

27

In [7]:
df_train.head()

Unnamed: 0,designation
0,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,La Guerre Des Tuques


In [8]:
nlp_fr = spacy.load("fr_core_news_sm")

In [9]:
def normalize_accent(string):
    string = string.replace('á', 'a')
    string = string.replace('â', 'a')

    string = string.replace('é', 'e')
    string = string.replace('è', 'e')
    string = string.replace('ê', 'e')
    string = string.replace('ë', 'e')

    string = string.replace('î', 'i')
    string = string.replace('ï', 'i')

    string = string.replace('ö', 'o')
    string = string.replace('ô', 'o')
    string = string.replace('ò', 'o')
    string = string.replace('ó', 'o')

    string = string.replace('ù', 'u')
    string = string.replace('û', 'u')
    string = string.replace('ü', 'u')

    string = string.replace('ç', 'c')
    
    return string

In [10]:
def remove_digits(string):
    result = ''.join([i for i in string if not i.isdigit()])
    return result

In [11]:
def raw_to_tokens(raw_string, spacy_nlp):
    # Write code for lower-casing
    string = raw_string.lower()
    
    string = normalize_accent(string)
    
    string = remove_digits(string)
    
    spacy_tokens = spacy_nlp(string)
        
    string_tokens = [token.orth_ for token in spacy_tokens if not token.is_punct and not token.is_stop]
    
    clean_string = " ".join(string_tokens)
    
    return clean_string

In [12]:
X_train.shape

NameError: name 'X_train' is not defined

# Algorithms

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [13]:
pd.read_csv('Results/res_grid_tree.csv')

Unnamed: 0.1,Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,0,197.473755,0.334555,0.600453,0.073182,balanced,10,1,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.325561,0.308795,0.313887,0.316082,0.007019,11,0.334549,0.319083,0.33387,0.329167,0.007136
1,1,173.223137,16.914126,0.347319,0.165659,balanced,10,5,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.325717,0.308051,0.314481,0.316084,0.007301,10,0.332457,0.316082,0.331189,0.326576,0.007438
2,2,161.190176,1.997853,0.284537,0.109032,balanced,10,10,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.325591,0.307536,0.313201,0.315444,0.00754,12,0.330185,0.314212,0.329851,0.324749,0.007452
3,3,376.978065,4.206181,0.463485,0.046787,balanced,30,1,"{'class_weight': 'balanced', 'max_depth': 30, ...",0.533022,0.527458,0.521533,0.527339,0.004691,9,0.788382,0.778363,0.748667,0.771804,0.016864
4,4,365.727744,5.52583,0.25119,0.062679,balanced,30,5,"{'class_weight': 'balanced', 'max_depth': 30, ...",0.533765,0.527729,0.523908,0.528468,0.004058,8,0.699257,0.688302,0.668193,0.685251,0.012864
5,5,345.360968,5.012525,0.213559,0.038751,balanced,30,10,"{'class_weight': 'balanced', 'max_depth': 30, ...",0.537937,0.533754,0.523515,0.531737,0.006058,7,0.655108,0.651515,0.62776,0.644795,0.012134
6,6,422.293742,11.213587,0.22502,0.016997,balanced,100,1,"{'class_weight': 'balanced', 'max_depth': 100,...",0.541238,0.541021,0.5363,0.53952,0.002278,4,0.99378,0.993725,0.99366,0.993722,4.9e-05
7,7,388.532567,8.853651,0.459289,0.064754,balanced,100,5,"{'class_weight': 'balanced', 'max_depth': 100,...",0.541169,0.538799,0.538061,0.539343,0.001326,6,0.798489,0.796198,0.798063,0.797584,0.000995
8,8,363.50231,11.957434,0.35014,0.045005,balanced,100,10,"{'class_weight': 'balanced', 'max_depth': 100,...",0.544072,0.541238,0.536136,0.540483,0.003283,2,0.715515,0.715109,0.715147,0.715257,0.000183
9,9,422.1915,8.661335,0.379185,0.079111,balanced,300,1,"{'class_weight': 'balanced', 'max_depth': 300,...",0.541396,0.541851,0.537815,0.540354,0.001805,3,0.99378,0.993725,0.99366,0.993722,4.9e-05


In [14]:
from sklearn.ensemble import BaggingClassifier

base_estimator_1 = DecisionTreeClassifier(max_depth=10, class_weight='balanced')
base_estimator_2 = DecisionTreeClassifier(max_depth=30, class_weight='balanced')
base_estimator_3 = DecisionTreeClassifier(max_depth=100, class_weight='balanced')

parameters = {'base_estimator':[base_estimator_1, base_estimator_2, base_estimator_3], 'n_estimators':[10]}
bagging_tree = BaggingClassifier(n_jobs=-1)

clf = GridSearchCV(
    bagging_tree, 
    parameters,
    scoring = 'f1_weighted', 
    cv=3, 
    return_train_score=True,
    verbose=2,
    n_jobs=-1)

clf = clf.fit(X_train_pca, y_train)
res_grid_bagging = pd.DataFrame(clf.cv_results_)
best_grid_bagging = pd.DataFrame(zip(clf.best_params_))

res_grid_bagging.to_csv('Results/res_grid_bagging.csv')
best_grid_bagging.to_csv('Results/best_grid_bagging.csv')

print(best_grid_bagging)
res_grid_bagging

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of   9 | elapsed: 76.7min remaining: 21.9min
[Parallel(n_jobs=-1)]: Done   9 out of   9 | elapsed: 89.9min finished


                0
0  base_estimator
1    n_estimators


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_base_estimator,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,1593.19326,32.361594,14.823171,9.328475,DecisionTreeClassifier(class_weight='balanced'...,10,{'base_estimator': DecisionTreeClassifier(clas...,0.38635,0.358104,0.335984,0.360152,0.020613,3,0.398161,0.369876,0.358949,0.375662,0.016523
1,2849.551167,240.861407,12.834806,3.557258,DecisionTreeClassifier(class_weight='balanced'...,10,{'base_estimator': DecisionTreeClassifier(clas...,0.638693,0.635587,0.627317,0.633867,0.004801,2,0.901133,0.913769,0.885061,0.899987,0.011748
2,1933.053942,724.95165,4.657761,1.442087,DecisionTreeClassifier(class_weight='balanced'...,10,{'base_estimator': DecisionTreeClassifier(clas...,0.646968,0.648558,0.642874,0.646134,0.002394,1,0.988176,0.988927,0.988336,0.98848,0.000323


In [15]:
from sklearn.ensemble import RandomForestClassifier

parameters = {'max_depth':[10, 30, 100], 'n_estimators':[10], 'min_samples_leaf':[1, 5], 'class_weight':['balanced']}
rf = RandomForestClassifier(random_state=0, n_jobs=-1)

clf = GridSearchCV(
    rf,
    parameters,
    scoring = 'f1_weighted',
    cv = 3,
    return_train_score = True,
    verbose = 2, 
    n_jobs=-1)

clf = clf.fit(X_train_pca, y_train)
res_grid_rf = pd.DataFrame(clf.cv_results_)
best_grid_rf = pd.DataFrame(zip(clf.best_params_))

res_grid_rf.to_csv('Results/res_grid_rf.csv')
best_grid_rf.to_csv('Results/best_grid_rf.csv')

print(best_grid_rf)
res_grid_rf

Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 out of  18 | elapsed:  3.9min finished


                  0
0      class_weight
1         max_depth
2  min_samples_leaf
3      n_estimators


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_max_depth,param_min_samples_leaf,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,31.193221,0.683313,0.945918,0.213192,balanced,10,1,10,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.513159,0.520822,0.503314,0.512433,0.007165,6,0.563521,0.574485,0.564251,0.567419,0.005005
1,30.37116,0.271174,0.456743,0.094352,balanced,10,5,10,"{'class_weight': 'balanced', 'max_depth': 10, ...",0.530176,0.512263,0.510629,0.517692,0.008855,5,0.568663,0.553305,0.558757,0.560242,0.006357
2,61.470233,1.916997,1.080846,0.078038,balanced,30,1,10,"{'class_weight': 'balanced', 'max_depth': 30, ...",0.631562,0.626048,0.620206,0.62594,0.004637,3,0.962227,0.963486,0.961607,0.96244,0.000782
3,53.596864,1.721497,0.974505,0.036374,balanced,30,5,10,"{'class_weight': 'balanced', 'max_depth': 30, ...",0.635012,0.634907,0.629175,0.633032,0.002727,2,0.87147,0.883917,0.893931,0.883106,0.009188
4,61.42438,2.20062,0.934377,0.208853,balanced,100,1,10,"{'class_weight': 'balanced', 'max_depth': 100,...",0.624755,0.625117,0.620882,0.623585,0.001917,4,0.990729,0.990699,0.990457,0.990629,0.000122
5,43.355388,6.499594,0.429324,0.168758,balanced,100,5,10,"{'class_weight': 'balanced', 'max_depth': 100,...",0.641154,0.63448,0.637319,0.637652,0.002735,1,0.918949,0.921393,0.918433,0.919591,0.001291


In [16]:
from sklearn.ensemble import GradientBoostingClassifier

parameters = {'n_estimators':[10],
              'max_depth': [3, 7, 12], 'learning_rate': [0.03, 0.1, 0.3]}
gb = GradientBoostingClassifier()

clf = GridSearchCV(
    gb,
    parameters,
    scoring='f1_weighted',
    cv=3,
    return_train_score=True,
    verbose=2, 
    n_jobs=3)

clf = clf.fit(X_train_pca[:25000], y_train[:25000])
res_grid_gb = pd.DataFrame(clf.cv_results_)
best_grid_gb = pd.DataFrame(zip(clf.best_params_))

res_grid_gb.to_csv('Results/res_grid_gb.csv')
best_grid_gb.to_csv('Results/best_grid_gb.csv')

print(best_grid_gb)
res_grid_gb

Fitting 3 folds for each of 9 candidates, totalling 27 fits


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done  27 out of  27 | elapsed: 318.2min finished


               0
0  learning_rate
1      max_depth
2   n_estimators


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,714.019847,5.331306,0.460934,0.178482,0.03,3,10,"{'learning_rate': 0.03, 'max_depth': 3, 'n_est...",0.447111,0.437856,0.44322,0.442731,0.003795,9,0.493282,0.488485,0.482928,0.488232,0.00423
1,2103.410218,4.688002,0.499041,0.088777,0.03,7,10,"{'learning_rate': 0.03, 'max_depth': 7, 'n_est...",0.502239,0.506676,0.510618,0.506507,0.003423,8,0.672234,0.673093,0.667404,0.67091,0.002504
2,4036.539929,12.052654,0.587334,0.066803,0.03,12,10,"{'learning_rate': 0.03, 'max_depth': 12, 'n_es...",0.507433,0.511876,0.519892,0.513062,0.005155,7,0.801566,0.799998,0.796181,0.799248,0.002261
3,572.974932,1.171121,0.271306,0.054787,0.1,3,10,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.529371,0.523417,0.529006,0.527265,0.002725,6,0.609565,0.607613,0.606432,0.60787,0.001292
4,1791.854611,6.468201,0.351017,0.049304,0.1,7,10,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.556922,0.56531,0.573756,0.565322,0.006873,1,0.850901,0.850592,0.844838,0.848777,0.002788
5,4438.896466,13.526749,0.570763,0.011838,0.1,12,10,"{'learning_rate': 0.1, 'max_depth': 12, 'n_est...",0.543361,0.545918,0.557846,0.549035,0.006311,3,0.945046,0.950122,0.951851,0.949006,0.002888
6,589.690584,1.573393,0.281,0.036697,0.3,3,10,"{'learning_rate': 0.3, 'max_depth': 3, 'n_esti...",0.557269,0.560241,0.571687,0.56306,0.006214,2,0.696732,0.694409,0.6946,0.695247,0.001053
7,1571.878136,4.919483,0.403217,0.05864,0.3,7,10,"{'learning_rate': 0.3, 'max_depth': 7, 'n_esti...",0.544937,0.542256,0.552113,0.546432,0.00416,4,0.903031,0.900562,0.898437,0.900677,0.001878
8,3244.828225,7.71324,0.438039,0.0506,0.3,12,10,"{'learning_rate': 0.3, 'max_depth': 12, 'n_est...",0.524774,0.541115,0.542112,0.535993,0.007951,5,0.989677,0.994248,0.992419,0.992115,0.001878


In [17]:
import xgboost as xgb

parameters = {'n_estimators':[10],
              'max_depth': [3, 7, 12], 'eta':[0.05, 0.1, 0.2, 0.3]}
xgb = xgb.XGBClassifier(objective='multi:softmax')

clf = GridSearchCV(
    xgb,
    parameters,
    scoring='f1_weighted',
    cv=3,
    return_train_score=True,
    verbose=2,
    n_jobs=3)

clf = clf.fit(X_train_pca[:25000], y_train[:25000])
res_grid_xgb = pd.DataFrame(clf.cv_results_)
best_grid_xgb = pd.DataFrame(zip(clf.best_params_))

res_grid_xgb.to_csv('Results/res_grid_xgb.csv')
best_grid_xgb.to_csv('Results/best_grid_xgb.csv')

print(best_grid_xgb)
res_grid_xgb

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] eta=0.05, max_depth=3, n_estimators=10 ..........................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[04:55:28] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[04:55:29] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[04:55:30] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[04:55:31] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[04:55:32] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[04:55:33] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[04:55:34] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[04:55:35] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[04:55:35] INFO: src/tree/updater_prune.cc:74: tree pruni

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  6.0min remaining:    0.0s


[05:01:29] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[05:01:30] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[05:01:31] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[05:01:32] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 10 extra nodes, 0 pruned nodes, max_depth=3
[05:01:33] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[05:01:34] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 14 extra nodes, 0 pruned nodes, max_depth=3
[05:01:35] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 6 extra nodes, 0 pruned nodes, max_depth=3
[05:01:36] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 12 extra nodes, 0 pruned nodes, max_depth=3
[05:01:37] INFO: src/tree/updater_prune.cc:74: tree pruni

[Parallel(n_jobs=1)]: Done  36 out of  36 | elapsed: 440.3min finished


[12:15:54] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 226 extra nodes, 0 pruned nodes, max_depth=12
[12:16:02] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 156 extra nodes, 0 pruned nodes, max_depth=12
[12:16:08] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 150 extra nodes, 0 pruned nodes, max_depth=12
[12:16:14] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 50 extra nodes, 0 pruned nodes, max_depth=12
[12:16:20] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 114 extra nodes, 0 pruned nodes, max_depth=12
[12:16:26] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 62 extra nodes, 0 pruned nodes, max_depth=12
[12:16:32] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 38 extra nodes, 0 pruned nodes, max_depth=12
[12:16:38] INFO: src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 252 extra nodes, 0 pruned nodes, max_depth=12
[12:16:44] INFO: src/tree/updater_prune.cc:

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_eta,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,mean_train_score,std_train_score
0,351.301025,1.922713,3.595205,0.058503,0.05,3,10,"{'eta': 0.05, 'max_depth': 3, 'n_estimators': 10}",0.560631,0.561505,0.562676,0.561603,0.000838,9,0.61245,0.608999,0.609333,0.610261,0.001554
1,697.52349,1.967992,3.782681,0.085168,0.05,7,10,"{'eta': 0.05, 'max_depth': 7, 'n_estimators': 10}",0.631193,0.63516,0.643861,0.636733,0.00529,5,0.828265,0.828593,0.834863,0.830573,0.003036
2,1079.245505,6.426192,3.973507,0.058581,0.05,12,10,"{'eta': 0.05, 'max_depth': 12, 'n_estimators':...",0.640463,0.649276,0.654848,0.64819,0.005923,1,0.951599,0.949807,0.953056,0.951487,0.001329
3,355.572879,1.573162,3.660999,0.050834,0.1,3,10,"{'eta': 0.1, 'max_depth': 3, 'n_estimators': 10}",0.560631,0.561505,0.562676,0.561603,0.000838,9,0.61245,0.608999,0.609333,0.610261,0.001554
4,698.635305,2.407644,3.908108,0.056669,0.1,7,10,"{'eta': 0.1, 'max_depth': 7, 'n_estimators': 10}",0.631193,0.63516,0.643861,0.636733,0.00529,5,0.828265,0.828593,0.834863,0.830573,0.003036
5,1083.575697,6.032767,3.884159,0.036725,0.1,12,10,"{'eta': 0.1, 'max_depth': 12, 'n_estimators': 10}",0.640463,0.649276,0.654848,0.64819,0.005923,1,0.951599,0.949807,0.953056,0.951487,0.001329
6,358.494365,0.664636,3.59748,0.063335,0.2,3,10,"{'eta': 0.2, 'max_depth': 3, 'n_estimators': 10}",0.560631,0.561505,0.562676,0.561603,0.000838,9,0.61245,0.608999,0.609333,0.610261,0.001554
7,707.988656,1.009221,3.805507,0.064736,0.2,7,10,"{'eta': 0.2, 'max_depth': 7, 'n_estimators': 10}",0.631193,0.63516,0.643861,0.636733,0.00529,5,0.828265,0.828593,0.834863,0.830573,0.003036
8,1086.649182,4.274854,4.17901,0.083456,0.2,12,10,"{'eta': 0.2, 'max_depth': 12, 'n_estimators': 10}",0.640463,0.649276,0.654848,0.64819,0.005923,1,0.951599,0.949807,0.953056,0.951487,0.001329
9,361.561309,0.267082,3.600784,0.033255,0.3,3,10,"{'eta': 0.3, 'max_depth': 3, 'n_estimators': 10}",0.560631,0.561505,0.562676,0.561603,0.000838,9,0.61245,0.608999,0.609333,0.610261,0.001554


In [None]:
Decision Trees, Bagging, Random forests, Boosting, Gradient Boosted Trees, AdaBoost, etc.