In [1]:
import json
import unicodedata

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from pandarallel import pandarallel

In [2]:
# Set up pandarallel with its progress bar enabled.
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [3]:
# Load the train/test tables, using the first CSV column as the index.
df_train = pd.read_csv('Data/X_train_update.csv', index_col = 0)
df_test = pd.read_csv('Data/X_test_update.csv', index_col = 0)
# Align the targets with df_train by index *label* (.loc). The previous .iloc
# treated the labels as row positions, which is only correct when the index
# happens to be exactly 0..n-1; .loc gives the same result in that case and
# fails loudly (KeyError) instead of silently misaligning otherwise.
y_train = pd.read_csv('Data/Y_train_CVw08PX.csv', index_col = 0).loc[df_train.index].values.ravel()

In [4]:
# Inspect which columns the training table provides.
df_train.columns

Index(['designation', 'description', 'productid', 'imageid'], dtype='object')

In [5]:
# Drop the three listed columns from both tables.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError because the columns
# were already removed by a previous execution.
UNUSED_COLUMNS = ['description', 'productid', 'imageid']
df_train = df_train.drop(columns=UNUSED_COLUMNS, errors='ignore')
df_test = df_test.drop(columns=UNUSED_COLUMNS, errors='ignore')

In [6]:
# Preview the first rows of the training table.
df_train.head()

Unnamed: 0,designation
0,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,La Guerre Des Tuques


In [7]:
# Load the spaCy pipeline named "fr_core_news_sm" (the package must be installed).
nlp_fr = spacy.load("fr_core_news_sm")

In [8]:
def normalize_accent(string):
    """Strip diacritics from ``string``, mapping accented letters to their base letter.

    The previous hand-written replacement table missed common French letters
    (e.g. 'à', 'ä', 'ÿ'), every uppercase accented letter and all other Latin
    diacritics. Unicode decomposition covers all of them at once.

    Parameters
    ----------
    string : str
        Text to normalize.

    Returns
    -------
    str
        ``string`` with combining marks removed; characters without a
        diacritic are returned unchanged.
    """
    # NFD splits each accented letter into its base letter followed by one or
    # more combining marks (e.g. 'é' -> 'e' + U+0301).
    decomposed = unicodedata.normalize('NFD', string)
    # Keep only characters that are not combining marks.
    stripped = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Recompose so characters that NFD split for reasons other than diacritics
    # (e.g. Hangul syllables) are restored to their original form.
    return unicodedata.normalize('NFC', stripped)

In [9]:
def remove_digits(string):
    """Return ``string`` without the characters for which ``str.isdigit()`` is true."""
    kept_chars = filter(lambda ch: not ch.isdigit(), string)
    return ''.join(kept_chars)

In [10]:
def raw_to_tokens(raw_string, spacy_nlp):
    """Clean ``raw_string`` and return the kept tokens joined by single spaces.

    Steps, in order: lower-case, strip accents (``normalize_accent``), remove
    digits (``remove_digits``), tokenize with ``spacy_nlp``, then discard
    punctuation, stop-word and whitespace tokens.

    Parameters
    ----------
    raw_string : str
        Text to clean.
    spacy_nlp : callable
        Called on the cleaned string; the result is iterated and each item
        must expose ``orth_``, ``is_punct``, ``is_stop`` and ``is_space``
        (e.g. a loaded spaCy pipeline).

    Returns
    -------
    str
        The surviving tokens joined with ``" "`` (a string, not a list).
    """
    string = raw_string.lower()
    string = normalize_accent(string)
    string = remove_digits(string)

    spacy_tokens = spacy_nlp(string)

    # Removing digits leaves runs of spaces behind ("n 133 du" -> "n  du"),
    # which spaCy emits as separate whitespace tokens; drop them too so they
    # do not end up in the joined output.
    string_tokens = [
        token.orth_
        for token in spacy_tokens
        if not (token.is_punct or token.is_stop or token.is_space)
    ]

    return " ".join(string_tokens)

In [11]:
# Load precomputed feature matrices for the train and test sets, using the
# first CSV column as the index.
# NOTE(review): the file names suggest 1000-component PCA features; how they
# were produced is not shown in this notebook — confirm.
X_train_pca = pd.read_csv('Data/X_train_pca_1000.csv', index_col=0)
X_test_pca = pd.read_csv('Data/X_test_pca_1000.csv', index_col=0)

In [12]:
# Check the dimensions of the training feature matrix.
X_train_pca.shape

(84916, 1000)

# Algorithms

### Decision-tree ensembles

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.ensemble import BaggingClassifier

# Candidate base learners: class-balanced decision trees of increasing depth.
# random_state is fixed (as in the random-forest search) so repeated runs of
# this cell are comparable.
base_estimator_1 = DecisionTreeClassifier(max_depth=10, class_weight='balanced', random_state=0)
base_estimator_2 = DecisionTreeClassifier(max_depth=30, class_weight='balanced', random_state=0)
base_estimator_3 = DecisionTreeClassifier(max_depth=100, class_weight='balanced', random_state=0)

# NOTE(review): scikit-learn >= 1.2 renames BaggingClassifier's `base_estimator`
# parameter to `estimator` (the old name is removed in 1.4) — rename the key
# below if the installed version requires it.
parameters = {'base_estimator':[base_estimator_1, base_estimator_2, base_estimator_3], 'n_estimators':[10]}
bagging_tree = BaggingClassifier(random_state=0, n_jobs=-1)

# 3-fold grid search scored with the weighted F1.
clf = GridSearchCV(
    bagging_tree, 
    parameters,
    scoring = 'f1_weighted', 
    cv=3, 
    return_train_score=True,
    verbose=2,
    n_jobs=-1)

clf = clf.fit(X_train_pca, y_train)
res_grid_bagging = pd.DataFrame(clf.cv_results_)
# best_params_ is a dict; zip(dict) iterates its keys only, so the previous
# code stored the parameter names without their values. Keep both.
best_grid_bagging = pd.DataFrame(list(clf.best_params_.items()), columns=['parameter', 'value'])

res_grid_bagging.to_csv('Results/res_grid_bagging.csv')
best_grid_bagging.to_csv('Results/best_grid_bagging.csv')

print(best_grid_bagging)
res_grid_bagging

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid over tree depth and minimum leaf size for a class-balanced random forest.
parameters = {'max_depth':[10, 30, 100], 'n_estimators':[10], 'min_samples_leaf':[1, 5], 'class_weight':['balanced']}
rf = RandomForestClassifier(random_state=0, n_jobs=-1)

# 3-fold grid search scored with the weighted F1.
clf = GridSearchCV(
    rf,
    parameters,
    scoring = 'f1_weighted',
    cv = 3,
    return_train_score = True,
    verbose = 2, 
    n_jobs=-1)

clf = clf.fit(X_train_pca, y_train)
res_grid_rf = pd.DataFrame(clf.cv_results_)
# best_params_ is a dict; zip(dict) iterates its keys only, so the previous
# code stored the parameter names without their values. Keep both.
best_grid_rf = pd.DataFrame(list(clf.best_params_.items()), columns=['parameter', 'value'])

res_grid_rf.to_csv('Results/res_grid_rf.csv')
best_grid_rf.to_csv('Results/best_grid_rf.csv')

print(best_grid_rf)
res_grid_rf

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

# Grid over tree depth for gradient-boosted trees.
parameters = {'n_estimators':[10],
              'max_depth': [10, 30, 100]}
# random_state is fixed (as in the random-forest search) so repeated runs of
# this cell are comparable.
gb = GradientBoostingClassifier(random_state=0)

# 3-fold grid search scored with the weighted F1.
clf = GridSearchCV(
    gb,
    parameters,
    scoring='f1_weighted',
    cv=3,
    return_train_score=True,
    verbose=2, 
    n_jobs=-1)

clf = clf.fit(X_train_pca, y_train)
res_grid_gb = pd.DataFrame(clf.cv_results_)
# best_params_ is a dict; zip(dict) iterates its keys only, so the previous
# code stored the parameter names without their values. Keep both.
best_grid_gb = pd.DataFrame(list(clf.best_params_.items()), columns=['parameter', 'value'])

res_grid_gb.to_csv('Results/res_grid_gb.csv')
best_grid_gb.to_csv('Results/best_grid_gb.csv')

print(best_grid_gb)
res_grid_gb

In [None]:
import xgboost as xgb

# Grid over tree depth for XGBoost.
parameters = {'n_estimators':[10],
              'max_depth': [10, 30, 100]}
# Bind the estimator to its own name: the previous `xgb = xgb.XGBClassifier(...)`
# replaced the `xgb` module alias with an estimator instance, so any later
# `xgb.<something>` module access would fail.
# NOTE(review): recent xgboost releases require class labels encoded as
# 0..n_classes-1 — confirm y_train satisfies this or encode it first.
xgb_clf = xgb.XGBClassifier(objective='multi:softmax', verbosity=2, n_jobs=-1)

# 3-fold grid search scored with the weighted F1.
clf = GridSearchCV(
    xgb_clf,
    parameters,
    scoring='f1_weighted',
    cv=3,
    return_train_score=True,
    verbose=2,
    n_jobs=-1)

clf = clf.fit(X_train_pca, y_train)
res_grid_xgb = pd.DataFrame(clf.cv_results_)
# best_params_ is a dict; zip(dict) iterates its keys only, so the previous
# code stored the parameter names without their values. Keep both.
best_grid_xgb = pd.DataFrame(list(clf.best_params_.items()), columns=['parameter', 'value'])

res_grid_xgb.to_csv('Results/res_grid_xgb.csv')
best_grid_xgb.to_csv('Results/best_grid_xgb.csv')

print(best_grid_xgb)
res_grid_xgb

In [None]:
# Tree-based methods: Decision Trees, Bagging, Random Forests, Boosting, Gradient Boosted Trees, AdaBoost, etc.