In [7]:
import pandas as pd
import numpy as np
import spacy
from pandarallel import pandarallel

In [8]:
# Initialise pandarallel (with progress bars) before any `parallel_apply` call below.
pandarallel.initialize(progress_bar = True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.


In [9]:
# Read the two feature tables and the training labels; in each file the first
# CSV column is used as the index.
df_train, df_test, y_train = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('X_train_update.csv', 'X_test_update.csv', 'Y_train_CVw08PX.csv')
)

In [None]:
# Print the designation stored under index keys 0 through 9.
for row_key in range(10):
    print(row_key, df_train['designation'][row_key])

# First of all, let's identify the language for each product
Because the computation takes a long time, the results have been saved to a CSV file.
See the corresponding code below.

In [None]:
def _pick_language(designation_guess, description_guess,
                   designation_threshold=0.95, description_threshold=0.9,
                   default_language='fr'):
    """Choose one language code from the two per-field detector results.

    Both guesses are dicts with a 'language' and a 'score' key. The designation
    wins when its score beats the description's score and exceeds
    `designation_threshold`; otherwise the description wins when its score
    exceeds `description_threshold`; otherwise `default_language` is returned.
    """
    designation_score = designation_guess['score']
    description_score = description_guess['score']
    if designation_score > description_score and designation_score > designation_threshold:
        return designation_guess['language']
    if description_score > description_threshold:
        return description_guess['language']
    return default_language


def language_detection(df, designation_threshold=0.95, description_threshold=0.9,
                       default_language='fr'):
    """Add a 'language' column to `df` in place.

    The language detector is run separately on 'designation' and 'description';
    the two results are then merged row by row with `_pick_language`. The two
    intermediate columns are dropped before returning.

    Parameters
    ----------
    df : DataFrame with 'designation' and 'description' columns (modified in place).
    designation_threshold : minimum detector score for the designation to win.
    description_threshold : minimum detector score for the description to win.
    default_language : code used when neither field is confident enough.

    Returns
    -------
    None
    """
    from spacy_langdetect import LanguageDetector
    nlp = spacy.load('en_core_web_sm')
    nlp.add_pipe(LanguageDetector(), name='language_detector', last=True)

    def detect(text):
        # A missing text cannot be fed to the pipeline; give it a zero-score
        # placeholder so it can never win the comparison. The designation is
        # now guarded the same way the description already was.
        if pd.isnull(text):
            return {'language': 'unknown', 'score': 0}
        return nlp(text)._.language

    df['language_designation'] = df['designation'].parallel_apply(detect)
    df['language_description'] = df['description'].parallel_apply(detect)

    df['language'] = df.parallel_apply(
        lambda row: _pick_language(
            row['language_designation'],
            row['language_description'],
            designation_threshold,
            description_threshold,
            default_language),
        axis=1)

    df.drop(['language_designation', 'language_description'], axis=1, inplace=True)

In [None]:
# Tag both splits with a detected language (each frame is modified in place).
for frame in (df_train, df_test):
    language_detection(frame)

In [None]:
# Preview the first ten rows, now including the detected `language` column.
df_train.head(10)

In [None]:
# Number of products per detected language, for each split.
for heading, frame in (('train : \n', df_train), ('test : \n', df_test)):
    print(heading, frame.groupby('language')['productid'].count())

# Apply Preprocessing 
### With corresponding language model

In [None]:
# One spaCy pipeline per supported language: French, English, German (loaded in that order).
nlp_fr, nlp_en, nlp_de = map(
    spacy.load,
    ("fr_core_news_sm", "en_core_web_sm", "de_core_news_sm"),
)

In [None]:
def normalize_accent(string):
    """Return `string` with accents and other combining marks removed.

    The text is canonically decomposed (NFD), every combining mark is dropped,
    and the result is recomposed (NFC). This covers the characters the previous
    hand-written table handled (e.g. 'é' -> 'e', 'ç' -> 'c') and also those it
    missed, such as 'à', 'ä', 'í', 'ú' and 'ñ', in both lower and upper case.
    Characters without a canonical decomposition (for example '°' or 'œ') are
    left untouched.

    Parameters
    ----------
    string : str
        Text to normalise.

    Returns
    -------
    str
        The text with combining marks removed.
    """
    import unicodedata

    decomposed = unicodedata.normalize('NFD', string)
    stripped = ''.join(char for char in decomposed if not unicodedata.combining(char))
    # Recompose so that scripts NFD splits without combining marks (e.g. Hangul
    # syllables) come back in their usual precomposed form.
    return unicodedata.normalize('NFC', stripped)

In [None]:
def remove_digits(string):
    """Return `string` without the characters for which `str.isdigit()` is true."""
    kept_chars = []
    for char in string:
        if char.isdigit():
            continue
        kept_chars.append(char)
    return ''.join(kept_chars)

In [None]:
def raw_to_tokens(raw_string, spacy_nlp):
    """Clean a raw text and return its remaining tokens joined by single spaces.

    Steps, in order: lower-case, strip accents (`normalize_accent`), remove
    digits (`remove_digits`), tokenise with `spacy_nlp`, then drop punctuation,
    stop words and whitespace-only tokens.

    Whitespace tokens are dropped because removing digits leaves runs of
    spaces behind (e.g. "/ 150 seiten" -> "/  seiten"); the tokeniser emits
    them as separate tokens, and they previously ended up in the output as
    repeated spaces.

    Parameters
    ----------
    raw_string : str
        Text to clean.
    spacy_nlp : callable
        A pipeline that maps a string to an iterable of tokens exposing
        `orth_`, `is_punct`, `is_stop` and `is_space`.

    Returns
    -------
    str
        The kept tokens separated by one space each.
    """
    string = remove_digits(normalize_accent(raw_string.lower()))

    kept_tokens = [
        token.orth_
        for token in spacy_nlp(string)
        if not (token.is_punct or token.is_stop or token.is_space)
    ]

    return " ".join(kept_tokens)

In [None]:
def raw_to_tokens_with_language(df):
    """Add 'designation_preprocessed' and 'description_preprocessed' to `df` in place.

    Each text is cleaned with `raw_to_tokens`, using the German model for rows
    whose 'language' is 'de', the English model for 'en', and the French model
    for everything else. A missing description yields an empty string.
    """
    df['designation_preprocessed'] = df.parallel_apply(
        lambda row: raw_to_tokens(
            row['designation'],
            nlp_de if row['language'] == 'de'
            else nlp_en if row['language'] == 'en'
            else nlp_fr),
        axis=1)

    df['description_preprocessed'] = df.parallel_apply(
        lambda row: '' if pd.isnull(row['description']) else raw_to_tokens(
            row['description'],
            nlp_de if row['language'] == 'de'
            else nlp_en if row['language'] == 'en'
            else nlp_fr),
        axis=1)

In [None]:
# Preprocess both splits in place, then preview the training result.
for frame in (df_train, df_test):
    raw_to_tokens_with_language(frame)
df_train.head(5)

In [None]:
def spellcheck(df, language='en'):
    """Add spell-corrected copies of 'designation' and 'description' to `df` in place.

    Each text is split into words with the checker's own tokenizer; every word
    the checker does not know is replaced by its suggested correction (or kept
    unchanged when no correction is offered). Words stay in their original
    order and are joined by single spaces.

    The previous version passed the raw string to `spell.known` /
    `spell.unknown`, which iterate their argument, so single characters were
    checked instead of words. It also glued the "known" and "corrected" parts
    together without a separator and lost the word order.

    Parameters
    ----------
    df : DataFrame with 'designation' and 'description' columns (modified in place).
    language : dictionary passed to `SpellChecker`. Defaults to 'en', which is
        what a bare `SpellChecker()` was using before.
        NOTE(review): most rows look French; consider `language='fr'`.

    Returns
    -------
    None. Results go to 'designation_ckecked' and 'description_ckecked'
    (column names kept as they were, misspelling included, so existing
    consumers keep working). Missing texts yield NaN.
    """
    from spellchecker import SpellChecker
    spell = SpellChecker(language=language)

    def correct_text(text):
        if pd.isnull(text):
            return np.nan
        corrected_words = []
        for word in spell.split_words(text):
            # `unknown` returns the subset of the given words it cannot find.
            if spell.unknown([word]):
                word = spell.correction(word) or word
            corrected_words.append(word)
        return ' '.join(corrected_words)

    df['designation_ckecked'] = df['designation'].parallel_apply(correct_text)
    df['description_ckecked'] = df['description'].parallel_apply(correct_text)

In [None]:
# Spell-check the training split only; the test split is not processed here.
spellcheck(df_train)

In [None]:
def get_chunks(df):
    """Add 'design_noun_chunks' and 'descr_noun_chunks' to `df` in place.

    For each row the text is parsed with the German model when 'language' is
    'de', the English model when it is 'en', and the French model otherwise;
    the texts of the resulting noun chunks are joined with single spaces.
    A missing description yields NaN.
    """
    df['design_noun_chunks'] = df.parallel_apply(
        lambda row: ' '.join(
            chunk.text
            for chunk in (
                nlp_de if row['language'] == 'de'
                else nlp_en if row['language'] == 'en'
                else nlp_fr
            )(row['designation']).noun_chunks),
        axis=1)

    df['descr_noun_chunks'] = df.parallel_apply(
        lambda row: np.nan if pd.isnull(row['description']) else ' '.join(
            chunk.text
            for chunk in (
                nlp_de if row['language'] == 'de'
                else nlp_en if row['language'] == 'en'
                else nlp_fr
            )(row['description']).noun_chunks),
        axis=1)

In [None]:
# Extract noun chunks for both splits in place, then preview the training result.
for frame in (df_train, df_test):
    get_chunks(frame)
df_train.head(5)

In [None]:
# Persist the preprocessed splits (without the row index) so later sections can reload them.
for frame, csv_path in ((df_train, 'df_train_preprocessed_full_2.csv'),
                        (df_test, 'df_test_preprocessed_full_2.csv')):
    frame.to_csv(csv_path, index=False)

# Translation

In [17]:
from translate import Translator
# Shared translator targeting French. No source language is given, so the
# library's default applies — NOTE(review): confirm which one that is.
translator= Translator(to_lang="fr")
# Smoke test on a short English sentence (this calls the translation service).
translation = translator.translate("This is a pen.")
# Display the translated sentence as the cell output.
translation



In [18]:
# Reload the preprocessed splits saved earlier.
df_train, df_test = map(
    pd.read_csv,
    ('df_train_preprocessed_full_2.csv', 'df_test_preprocessed_full_2.csv'),
)

In [14]:
# Preview the first rows of the reloaded training frame.
df_train.head()

Unnamed: 0,designation,description,productid,imageid,language,designation_preprocessed,description_preprocessed,design_noun_chunks,descr_noun_chunks
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,de,olivia personalisiertes notizbuch seiten pun...,,"[Olivia, / 150 Seiten, / Punktraster, / Ca, Di...",[]
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,fr,journal arts n° art marche salon art asiatiq...,,"[Journal, Des Arts (, - L'art Et Son Marche Sa...",[]
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,fr,grand stylet ergonomique bleu gamepad nintendo...,pilot style touch pen marque speedlink style...,[Grand Stylet Ergonomique Bleu Gamepad Nintend...,"[PILOT STYLE, Touch, Pen de marque Speedlink e..."
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,fr,peluche donald europe disneyland marionnette...,,"[Peluche Donald, - Europe, Disneyland 2000, À ...",[]
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,fr,guerre tuques,luc id&eacute;es grandeur veut organiser jeu g...,"[La Guerre, Des Tuques]","[Luc, des id&eacute;es de grandeur, Il, un jeu..."


In [16]:
def translate(df):
    """Add French versions of four text columns to `df` in place.

    For each of 'designation', 'description', 'descr_noun_chunks' and
    'design_noun_chunks', a '<column>_trs' column is created:

    * missing text          -> NaN
    * row language is 'fr'  -> the text unchanged
    * any other language    -> the text translated to French

    The previous version sent every non-French row through one shared
    `Translator(to_lang="fr")`, i.e. with that object's default source
    language regardless of the row's detected language (German rows
    included). A translator is now built per source language, using the row's
    'language' value as `from_lang`. Rows without a usable language code fall
    back to 'en'. NOTE(review): 'en' is assumed to be the library default the
    old code relied on — confirm.

    Every non-French cell triggers a call to the translation service, so this
    is slow and depends on network availability.

    Parameters
    ----------
    df : DataFrame holding the four text columns and a 'language' column
        (modified in place).

    Returns
    -------
    None
    """
    from translate import Translator

    translators = {}  # source language code -> Translator targeting French

    def to_french(text, language):
        if pd.isnull(text):
            return np.nan
        if language == 'fr':
            return text
        source = language if isinstance(language, str) and language else 'en'
        if source not in translators:
            translators[source] = Translator(to_lang='fr', from_lang=source)
        return translators[source].translate(text)

    # Same column order as before, so the resulting frame layout is unchanged.
    for column in ('designation', 'description', 'descr_noun_chunks', 'design_noun_chunks'):
        df[column + '_trs'] = df.parallel_apply(
            lambda row, column=column: to_french(row[column], row['language']),
            axis=1)
        
# Translate both splits in place first, then write each one (row index included) to its own CSV.
frames_and_paths = (
    (df_train, 'df_train_preprocessed_full_trs.csv'),
    (df_test, 'df_test_preprocessed_full_trs.csv'),
)
for frame, _ in frames_and_paths:
    translate(frame)

for frame, csv_path in frames_and_paths:
    frame.to_csv(csv_path)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=21229), Label(value='0 / 21229')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=21229), Label(value='0 / 21229')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=21229), Label(value='0 / 21229')))…

Process ForkPoolWorker-21:
Process ForkPoolWorker-17:
Process ForkPoolWorker-20:
Process ForkPoolWorker-18:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
    self.run()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/process.py", line 99, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/pool.py", line 110, in worker
    task = get()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/queues.py", line 352, in get
    res = self._reader.recv_bytes()
  File "/opt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 216, in recv_bytes
    buf = self._recv_bytes(maxlength)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 407, in _recv_bytes
    buf = self._recv(4)
  File "/opt/anaconda3/lib/python3.7/multiprocessing/connection.py", line 379, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


KeyboardInterrupt: 

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
    httplib_response = conn.getresponse(buffering=True)
  File "/opt/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
    httplib_response = conn.getresponse(buffering=True)
  File "/opt/anaconda3/lib/python3.7/site-packages/urllib3/connectionpool.py", line 377, in _make_request
    httplib_response = conn.getresponse(buffering=True)
TypeError: getresponse() got an unexpected keyword argument 'buffering'
TypeError: getresponse() got an unexpected keyword argument 'buffering'
TypeError: getresponse() got an unexpected keyword argument 'buffering'

During handling of the above exception, another exception occurred:


During handling of the above exception, another exception occurred:


During handling of the above exception, another exce

# Forecasts using designation

In [None]:
# Start the modelling section from the preprocessed CSVs and preview the training split.
df_train, df_test = map(
    pd.read_csv,
    ('df_train_preprocessed_full_2.csv', 'df_test_preprocessed_full_2.csv'),
)
df_train.head()

In [None]:
# Number of distinct target classes.
# `y_train` is loaded with `pd.read_csv`, i.e. as a DataFrame, and iterating a
# DataFrame yields its column names, so `len(set(y_train))` counted columns
# rather than labels. `squeeze()` turns a one-column frame into a Series (and
# leaves a Series as it is), so this also works once `y_train` has been
# rebound to a Series further down.
y_train.squeeze().nunique()

In [None]:
# Attach the labels by index, drop rows without a designation, then split the
# labels back out so that features and target stay row-aligned.
labelled_train = (
    df_train
    .assign(label=pd.DataFrame(y_train))
    .dropna(subset=['designation'])
)
y_train = labelled_train['label']
df_train = labelled_train.drop(columns=['label'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the raw designations. Terms present in more than 70% of the
# documents, or in fewer than 0.01% of them, are discarded; accents are
# stripped to ASCII before tokenising on words.
vectorizer = TfidfVectorizer(
    max_df=0.7,
    min_df=0.0001,
    strip_accents='ascii',
    analyzer='word')
# `.toarray()` gives a regular ndarray; `.todense()` returned an `np.matrix`,
# a type NumPy discourages and recent scikit-learn releases refuse as input.
# NOTE(review): the result is still fully dense and can be very large; the
# sparse output could be kept if no downstream step needs dense input.
X_train = vectorizer.fit_transform(df_train.designation.values).toarray()
X_test = vectorizer.transform(df_test.designation.values).toarray()

In [None]:
# Dimensions of the TF-IDF training matrix built above.
X_train.shape

In [None]:
# Load the PCA-reduced feature tables from disk; the first CSV column is the index.
X_train_pca, X_test_pca = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('X_train_pca_1000.csv', 'X_test_pca_1000.csv')
)

In [None]:
# NOTE(review): `pca` is not defined anywhere in the visible part of this
# notebook (the reduced features are read from CSV instead), so this cell
# presumably fails on a fresh kernel — confirm where the fitted PCA comes from.
print(pca.explained_variance_ratio_)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier


# Baseline: 5-fold cross-validated weighted F1 of a fixed random forest on the PCA features.
parameters = dict(max_depth=[7], n_estimators=[100])
rf = RandomForestClassifier(
    n_estimators=100,
    max_depth=7,
    random_state=0,
    n_jobs=-1,
    verbose=1,
)

cross_val_score(
    rf,
    X_train_pca,
    y=y_train,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2,
)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import warnings

# Randomised search over a small random-forest grid (2 x 2 candidates,
# 3-fold CV, weighted F1) on the TF-IDF features.
parameters = {'max_depth': [7, 15], 'n_estimators': [10, 50]}
rf = RandomForestClassifier(random_state=0, n_jobs=-1, verbose=2)

clf = RandomizedSearchCV(
    rf,
    parameters,
    scoring='f1_weighted',
    cv=3,
    n_iter=4,
    return_train_score=True,
    verbose=2,
    n_jobs=-1)

# Silence warnings only for the duration of the fit. The context manager
# restores the previous filter state on exit, even if `fit` raises; the
# earlier `filterwarnings('ignore')` / `filterwarnings('default')` pair left
# a new 'default' filter installed instead of restoring what was there.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    search = clf.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the search.
search.best_params_

In [None]:
# Tabulate the per-candidate cross-validation results for inspection.
res = pd.DataFrame(search.cv_results_)
res

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Randomised search over a small XGBoost grid (5 sampled candidates, 5-fold
# CV, weighted F1) on the PCA features.
# Every entry of the search space must be a list (or a distribution), so the
# fixed objective is wrapped in a one-element list rather than passed as a
# bare string.
parameters = {'objective': ['multi:softmax'],
              'n_estimators': [3, 10, 30, 100],
              'max_depth': [2, 3, 4]}
# The estimator gets its own name so that `xgb` keeps referring to the
# xgboost module after this cell.
xgb_clf = xgb.XGBClassifier(verbosity=2, n_jobs=-1)

clf = RandomizedSearchCV(
    xgb_clf,
    parameters,
    scoring='f1_weighted',
    cv=5,
    n_iter=5,
    return_train_score=True,
    n_jobs=-1,
    verbose=2)

search = clf.fit(X_train_pca, y_train)
# `RandomizedSearchCV` has no `evals_result()` method, so the old call failed
# right after the fit. Per-round evaluation history belongs to the fitted
# XGBoost model and is only recorded when an `eval_set` is passed to `fit`,
# which is not done here, so this is expected to be None.
# NOTE(review): `evals_result_` is the attribute xgboost's sklearn wrapper
# appears to store the history in — confirm against the installed version.
evals_result = getattr(search.best_estimator_, 'evals_result_', None)

In [None]:
# Parameter combination selected by the XGBoost search.
search.best_params_

In [None]:
# Tabulate the per-candidate cross-validation results for inspection.
res = pd.DataFrame(search.cv_results_)
res

# Forecasts using designation & description

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over the product descriptions (default vectoriser settings).
# Descriptions are frequently missing, and the vectoriser rejects NaN
# documents, so missing values are replaced by empty strings first.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(df_train.description.fillna('').values)
X_test = vectorizer.transform(df_test.description.fillna('').values)