## Purpose: Predict classes of new tokens

### 1 : Library imports

In [1]:
import os
import nltk
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime, date
from langdetect import detect
from joblib import dump, load
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn import tree

### 2 : Definition of word pre-processing functions

In [2]:
def stem(list_text):
    """Return the Porter stem of each string in ``list_text``.

    Every entry is handed as-is to ``PorterStemmer.stem``; the output list
    keeps the input order.
    """
    porter = nltk.stem.porter.PorterStemmer()
    return list(map(porter.stem, list_text))

def remove_punctuation(list_text):
    """Delete every ``string.punctuation`` character from each string.

    Punctuation is removed outright (not replaced by a space); the output
    list keeps the input order.
    """
    # Third argument of maketrans: characters that translate() must drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    return [text.translate(deletion_table) for text in list_text]

def delete_stop_words(list_text):
    """Remove French and English stop words from each string.

    Each string is split on whitespace, every word found in the NLTK French
    or English stop-word lists is dropped, and the remaining words are joined
    back with single spaces. Matching is exact, hence case-sensitive.

    Args:
        list_text: Iterable of strings to filter.

    Returns:
        A list with one filtered string per input string, in the same order.
    """
    # Build a set once: membership tests are then constant-time instead of
    # rescanning the concatenated word lists for every single word.
    stop_words = set(stopwords.words('french')) | set(stopwords.words('english'))
    return [
        " ".join(word for word in text.split() if word not in stop_words)
        for text in list_text
    ]

def pre_process_tokenizer(list_text):
    """Run the full pre-processing pipeline on a list of strings.

    The steps are applied in this order: stemming, punctuation removal,
    stop-word deletion.
    """
    stemmed = stem(list_text)
    without_punctuation = remove_punctuation(stemmed)
    return delete_stop_words(without_punctuation)


### 3 : Loading the model and word base

In [3]:
def load_saved_version(saving_directory):
    """Load the object stored in the ``save`` file of ``saving_directory``.

    The file is opened through its joined path, so the working directory of
    the process is never changed.

    Args:
        saving_directory: Path of the directory that contains the ``save`` file.

    Returns:
        The object reconstructed by ``joblib.load``.
    """
    # NOTE: joblib.load unpickles the file, which can execute arbitrary code;
    # only load saves that come from a trusted source.
    return load(os.path.join(saving_directory, 'save'))
    
    
# Saved objects for each supported language, keyed by language code.
dict_matching = {}
root_path = os.getcwd()

for language in ['en', 'fr']:
    # os.path.join avoids the invalid '\s' escape sequence and the
    # hard-coded Windows path separator.
    save_directory = os.path.join(root_path, 'save_03-10-2020_' + language)
    dict_matching[language] = load_saved_version(save_directory)

### 4 : Prediction

#### 4.1 : Creation of a dataframe of new tokens to classify

In [4]:
from langdetect import DetectorFactory

# langdetect is non-deterministic on short or ambiguous text; fixing the seed
# before the first detection makes the detected languages reproducible.
DetectorFactory.seed = 0

# New client column names to classify.
df_2_predict = pd.DataFrame({
    'column_client': [
        'Produit Nom',
        'ref. item',
        'item color',
        'coloris du produit',
        'Dim.',
        'dimensions of item',
    ],
})
# detect() can return codes other than 'en' / 'fr'; the per-language cells
# below only select rows tagged 'en' or 'fr'.
df_2_predict['Language_client'] = df_2_predict['column_client'].apply(detect)

df_2_predict

Unnamed: 0,column_client,Language_client
0,Produit Nom,fr
1,ref. item,en
2,item color,pt
3,coloris du produit,fr
4,Dim.,de
5,dimensions of item,en


#### 4.2 : Projection of new tokens onto the word base

In [5]:
# Projected feature matrix for each language, keyed by language code.
X = {}

for language in ['en', 'fr']:
    base = dict_matching[language]['base']
    is_language = df_2_predict['Language_client'] == language
    serie_2_project = df_2_predict.loc[is_language, 'column_client']

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2 in favour of get_feature_names_out(); use whichever one exists.
    if hasattr(base, 'get_feature_names_out'):
        feature_names = base.get_feature_names_out()
    else:
        feature_names = base.get_feature_names()

    projected = base.transform(pre_process_tokenizer(serie_2_project.to_list()))
    X[language] = pd.DataFrame(projected.toarray(), columns=feature_names)

In [6]:
# Show the projection of the tokens detected as English.
X['en']

Unnamed: 0,id,identifi,item,numb,product
0,0,0,1,0,0
1,0,0,1,0,0


In [7]:
# Show the projection of the tokens detected as French.
X['fr']

Unnamed: 0,coloris,couleur,dimens,dimension,dimensions,identifi,masse,numéro,poids,product,produit,référence,teintes
0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,1,0,0


#### 4.3 : Displaying the results

In [8]:
# Predicted class of every client column, one dataframe per language.
dict_result = {}

for language in ['en', 'fr']:
    saved = dict_matching[language]
    is_language = df_2_predict['Language_client'] == language
    client_columns = df_2_predict.loc[is_language, 'column_client']

    if X[language].empty:
        # No token was detected in this language. Skip predict(), since
        # scikit-learn estimators typically reject zero-sample input.
        predicted_labels = []
    else:
        encoded = saved['model'].predict(X[language])
        predicted_labels = saved['label_encoder'].inverse_transform(encoded).tolist()

    # Building from the Series keeps the original row labels of df_2_predict.
    result = pd.DataFrame({'column_client': client_columns})
    result['column_result'] = predicted_labels
    dict_result[language] = result

In [9]:
# Show the predicted class of each token detected as English.
dict_result['en']

Unnamed: 0,column_client,column_result
1,ref. item,Product reference
5,dimensions of item,Product reference


In [10]:
# Show the predicted class of each token detected as French.
dict_result['fr']

Unnamed: 0,column_client,column_result
0,Produit Nom,Product name
3,coloris du produit,Product color
