In [1]:
import pandas as pd
import numpy as np
import spacy
from pandarallel import pandarallel

In [2]:
# Start pandarallel's worker pool and enable its progress bar for parallel operations.
# NOTE(review): no parallel_* call appears in the visible cells — confirm this
# initialization is still needed.
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 16 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [3]:
# Load the raw train/test feature tables; the first CSV column becomes the index.
df_train = pd.read_csv('X_train_update.csv', index_col=0)
df_test = pd.read_csv('X_test_update.csv', index_col=0)

# Load the training labels and flatten them into a 1-D array.
y_train = pd.read_csv('Y_train_CVw08PX.csv', index_col=0).values.ravel()

In [4]:
# List the columns available in the raw training table.
df_train.columns

Index(['designation', 'description', 'productid', 'imageid'], dtype='object')

In [5]:
# Discard the columns that are not used as model input.
# The frames are re-assigned (instead of dropped in place) and missing columns
# are ignored, so this cell can be re-run safely: a second in-place drop would
# raise KeyError because the columns are already gone.
unused_columns = ['description', 'productid', 'imageid']
df_train = df_train.drop(columns=unused_columns, errors='ignore')
df_test = df_test.drop(columns=unused_columns, errors='ignore')

In [6]:
# Preview the first rows of the training frame after the column drop.
df_train.head()

Unnamed: 0,designation
0,Olivia: Personalisiertes Notizbuch / 150 Seite...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...
4,La Guerre Des Tuques


In [7]:
# Load spaCy's small French pipeline; the "fr_core_news_sm" model package must be installed.
nlp_fr = spacy.load("fr_core_news_sm")

In [8]:
def normalize_accent(string):
    """Return ``string`` with diacritical marks removed from its letters.

    The text is canonically decomposed (NFD) so every accented letter becomes
    a base letter followed by combining marks; the combining marks are then
    dropped and the result is recomposed (NFC). This covers every letter the
    previous hand-written replacement list handled (``é``, ``è``, ``ç``, ...)
    and also the ones it missed, such as ``à``, ``ä``, ``ÿ``, upper-case
    accented letters, and input that is already in decomposed form.

    Characters without a canonical decomposition (for example the ``œ``
    ligature) are left unchanged.

    Args:
        string: text to normalize.

    Returns:
        The text without combining accent marks.
    """
    # Local import: the notebook's import cell does not bring unicodedata in.
    import unicodedata

    decomposed = unicodedata.normalize('NFD', string)
    # combining() is non-zero for combining marks such as acute, grave, cedilla.
    without_marks = ''.join(
        char for char in decomposed if not unicodedata.combining(char)
    )
    return unicodedata.normalize('NFC', without_marks)

In [9]:
def remove_digits(string):
    """Return ``string`` without the characters for which ``str.isdigit()`` is true."""
    kept_chars = []
    for char in string:
        if char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [10]:
def raw_to_tokens(raw_string, spacy_nlp):
    """Clean a raw text and return it as one space-separated string.

    The text is lower-cased, passed through ``normalize_accent`` and
    ``remove_digits``, and then tokenized by calling ``spacy_nlp`` on it.
    Tokens flagged as punctuation or as stop words are discarded; the verbatim
    text (``orth_``) of the remaining tokens is joined with single spaces.

    Args:
        raw_string: text to clean.
        spacy_nlp: callable that turns a string into an iterable of tokens
            exposing ``is_punct``, ``is_stop`` and ``orth_``.

    Returns:
        The cleaned text as a single string (not a list of tokens).
    """
    cleaned_text = remove_digits(normalize_accent(raw_string.lower()))

    # NOTE(review): stop-word detection runs on accent-stripped text — confirm
    # the pipeline's stop list still matches such forms.
    kept_tokens = []
    for token in spacy_nlp(cleaned_text):
        if token.is_punct or token.is_stop:
            continue
        kept_tokens.append(token.orth_)

    return " ".join(kept_tokens)

In [11]:
# Replace the raw frames with previously cleaned versions stored on disk.
# NOTE(review): the cell that wrote these CSV files is not part of the visible
# notebook — presumably raw_to_tokens was applied to `designation` and the
# result saved; confirm and restore that step so the notebook runs from a
# fresh kernel.
df_train = pd.read_csv('df_train_simple_preprocess.csv', index_col=0)
df_test = pd.read_csv('df_test_simple_preprocess.csv', index_col=0)


In [12]:
# Report the number of missing values per column in both frames.
print('Null in Train: {} \n Null in Test:{}'.format(df_train.isnull().sum(), df_test.isnull().sum()))

Null in Train: designation    12
dtype: int64 
 Null in Test:designation    1
dtype: int64


In [13]:
# Replace missing values with empty strings so every designation is a str.
df_train = df_train.fillna('')
df_test = df_test.fillna('')

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features. A float min_df is a document-frequency
# proportion: terms appearing in a smaller share of documents are ignored.
vectorizer = TfidfVectorizer(
    max_df=1.0,
    min_df=0.00005,
    strip_accents='ascii',
    analyzer='word')

# .toarray() returns a plain ndarray. The previous .todense() returned an
# np.matrix, which current scikit-learn estimators refuse as input.
# The vocabulary is fitted on the training texts only and reused for the test set.
# NOTE: densifying is memory-hungry (rows x vocabulary float64 values); the
# tree-based models used later also accept the sparse matrices directly.
X_train = vectorizer.fit_transform(df_train.designation.values).toarray()
X_test = vectorizer.transform(df_test.designation.values).toarray()

In [15]:
# Shape of the TF-IDF training matrix: (number of documents, vocabulary size).
X_train.shape

(84916, 12940)

In [None]:
from sklearn.decomposition import SparsePCA

# Reduce the TF-IDF features to 100 sparse principal components.
# random_state is fixed so repeated runs produce the same components.
# NOTE(review): this cell has no recorded execution; SparsePCA on a dense
# matrix of this size may be very slow — confirm it is feasible.
pca = SparsePCA(n_components=100, random_state=0)
X_train_pca = pca.fit_transform(X_train)  # components are learned on the training set only
X_test_pca = pca.transform(X_test)

In [None]:
# SparsePCA returns NumPy arrays, which have no .to_csv() method; wrap them in
# DataFrames before writing them to disk.
pd.DataFrame(X_train_pca).to_csv('X_train_pca')
pd.DataFrame(X_test_pca).to_csv('X_test_pca')

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

# Exhaustive search over 3 tree depths x 3 forest sizes.
parameters = {
    'max_depth': [2, 5, 7],
    'n_estimators': [10, 30, 100],
}
rf = RandomForestClassifier(random_state=0, n_jobs=-1, verbose=2)

# Each candidate is scored with 3-fold cross-validation on weighted F1;
# training scores are kept as well for comparison.
clf = GridSearchCV(
    estimator=rf,
    param_grid=parameters,
    scoring='f1_weighted',
    cv=3,
    return_train_score=True,
    verbose=2,
)

search = clf.fit(X_train, y_train)

In [None]:
# Parameter combination with the best mean cross-validated score in the search.
search.best_params_

In [None]:
# Tabulate the per-candidate cross-validation results for inspection.
res = pd.DataFrame(search.cv_results_)
res

In [None]:
# Sanity check of the label container type (built with .values.ravel() when loaded).
type(y_train)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

# 5-fold cross-validation (weighted F1) of a small fixed random forest.
# random_state=0 matches the forest used in the grid search and makes the
# scores reproducible between runs. The forest itself uses all cores
# (n_jobs=-1), so the folds are evaluated sequentially (n_jobs=1).
rf = RandomForestClassifier(max_depth=4, n_estimators=16, random_state=0, verbose=1, n_jobs=-1)
cross_val_score(rf, X_train, y=y_train, scoring='f1_weighted', cv=5, n_jobs=1, verbose=2)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
import xgboost as xgb

# Every entry must be a list (or a distribution) of candidate values; a bare
# string such as 'multi:softmax' is not a valid candidate list.
parameters = {'objective': ['multi:softmax'],
              'n_estimators': [10, 30, 100],
              'max_depth': [2, 3, 5]}

# The estimator gets its own name so the `xgb` module alias is not overwritten
# (later cells still need `xgb.XGBClassifier`).
# NOTE(review): recent XGBoost releases expect class labels encoded as
# 0..n_classes-1 — confirm y_train satisfies this or encode it first.
xgb_clf = xgb.XGBClassifier(verbosity=2, n_jobs=-1)

# Only 9 combinations exist, fewer than the default n_iter=10, so the
# randomized search ends up evaluating all of them.
clf = RandomizedSearchCV(
    xgb_clf,
    parameters,
    scoring='f1_weighted',
    cv=3,
    return_train_score=True,
    n_jobs=-1,
    verbose=2)

search = clf.fit(X_train, y_train)
# No evals_result() call here: the search object has no such method, and the
# XGBoost estimator only records evaluation history when an eval_set is
# passed to fit, which is not done in this cell.

In [None]:
# Parameter combination with the best mean cross-validated score in the XGBoost search.
search.best_params_

In [None]:
# Tabulate the per-candidate cross-validation results of the XGBoost search.
res = pd.DataFrame(search.cv_results_)
res

In [None]:
from sklearn.model_selection import cross_val_score
# Re-bind the module alias: an earlier cell (or a previous run of this one)
# may have reused the name `xgb` for an estimator instance, in which case
# `xgb.XGBClassifier` would raise AttributeError.
import xgboost as xgb

# Re-evaluate the best configuration found by the search with 5-fold CV (weighted F1).
parameters = search.best_params_
xgb_best = xgb.XGBClassifier(**parameters, verbosity=2, n_jobs=-1)
cross_val_score(xgb_best, X_train, y=y_train, scoring='f1_weighted', cv=5, n_jobs=-1, verbose=2)

In [None]:
# Tree-based and ensemble methods: Decision Trees, Bagging, Random Forests, Boosting, Gradient Boosted Trees, AdaBoost, etc.