In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import string

from nltk.corpus import stopwords

In [2]:
# Load the product catalogue; the first CSV column is used as the row index.
df_original = pd.read_csv("X.csv", index_col = 0)
# Work on an independent copy: a plain `df = df_original` only aliases the same
# object, so the "text" column added later would also show up in df_original.
df = df_original.copy()
df.head(5)

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


In [3]:
def column_junction(column1,column2):
    """Join two cell values into one space-separated string, skipping missing ones.

    Parameters
    ----------
    column1, column2 : scalar
        Values to join (used below for a product's designation and
        description). Each present value is converted with ``str``.

    Returns
    -------
    str
        ``"<column1> <column2>"``; only the present value when the other one
        is missing; ``""`` when both are missing.
    """
    # pd.isna recognises real missing values (NaN, None, pd.NA). The previous
    # check compared str(column2).lower() with "nan", which also discarded a
    # genuine text value spelled "nan" and let a missing column1 leak the
    # literal token "nan" into the text.
    present = [str(value) for value in (column1, column2) if not pd.isna(value)]
    return " ".join(present)

# Build the combined "text" column, one row at a time, from the designation
# and description columns.
df["text"] = df.apply(
    lambda row: column_junction(row["designation"], row["description"]),
    axis=1,
)

In [4]:
# Lower-case the combined text.
df["text"]= df["text"].str.lower()

In [5]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` removed.

    Characters are deleted, not replaced by a space, so the text on either
    side of a punctuation mark ends up joined together.
    """
    drop_table = str.maketrans("", "", string.punctuation)
    return text.translate(drop_table)
# Strip punctuation from every row of the text column.
df["text"] = df["text"].apply(remove_punctuation)

In [6]:
# English stop words from NLTK; remove_stopwords() reads this module-level name when it is called.
stopword = stopwords.words('english')
def remove_stopwords(text, stopword_list=None):
    """Drop stop words from ``text`` and re-join the remaining words with single spaces.

    Parameters
    ----------
    text : any
        Input text; converted with ``str`` and split on whitespace.
    stopword_list : iterable of str, optional
        Words to remove. When omitted, the module-level ``stopword`` is used;
        it is looked up at call time, so rebinding that name changes what
        gets removed.

    Returns
    -------
    str
        The words of ``text`` that are not stop words, separated by one space.
    """
    # A set makes each membership test O(1) instead of scanning a list once
    # per word.
    excluded = set(stopword if stopword_list is None else stopword_list)
    return " ".join(word for word in str(text).split() if word not in excluded)
# Strip the stop words currently held in `stopword` (the English list here).
df["text"] = df["text"].apply(remove_stopwords)


In [7]:
# Second pass with the French list: remove_stopwords() reads the module-level
# `stopword`, so rebinding it here is what switches the language.
stopword = stopwords.words('french')
df["text"] = df["text"].apply(remove_stopwords)

In [8]:
# Preview the first rows, now including the cleaned "text" column.
df.head()

Unnamed: 0,designation,description,productid,imageid,text
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,olivia personalisiertes notizbuch 150 seiten p...
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,journal arts n° 133 28092001 lart marche salon...
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,peluche donald europe disneyland 2000 marionne...
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,guerre tuques luc ideacutees grandeur veut org...


In [9]:
from nltk.stem.snowball import SnowballStemmer
# French Snowball stemmer shared by stem_words() below.
stemmer = SnowballStemmer("french")
def stem_words(text):
    """Stem each whitespace-separated word of ``text`` with the module-level ``stemmer``.

    Returns the stemmed words joined by single spaces.
    """
    stemmed = map(stemmer.stem, text.split())
    return " ".join(stemmed)
# Stem every row of the text column.
df["text"] = df["text"].apply(stem_words)

In [10]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words term counts; min_df=50 drops terms that appear in fewer than 50 documents.
vectorizer = CountVectorizer(min_df= 50)
# Fit and vectorize in a single pass. The previous code discarded the result of
# fit_transform() and then ran transform() over the whole corpus again.
# toarray() returns a plain ndarray, whereas todense() returned the discouraged
# np.matrix type.
# NOTE(review): the dense matrix is large (one row per document, one column per
# vocabulary term); consider keeping it sparse if memory becomes a problem.
X = vectorizer.fit_transform(df["text"]).toarray()
# Mapping term -> column index learned during the fit.
tokenized = vectorizer.vocabulary_

In [11]:
# (number of documents, vocabulary size)
X.shape

(84916, 7130)

In [12]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps the raw prdtypecode values to integer class ids.
le = LabelEncoder()

# Labels file; the first CSV column is used as the row index.
# NOTE(review): assumes Y.csv rows are in the same order as X.csv — confirm.
y = pd.read_csv("Y.csv", index_col = 0)
# Replace the raw codes by their encoded ids (the model below is trained with
# sparse_categorical_crossentropy on this column).
y["prdtypecode"] = le.fit_transform(y["prdtypecode"])

In [13]:
# Number of distinct classes; used as the width of the softmax output layer.
output_dim = y["prdtypecode"].nunique()

In [14]:
# Sanity check: the row count should match X.
y.shape

(84916, 1)

In [15]:

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1000)

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D , Dropout ,MaxPooling2D,Flatten, Dense, Input, Reshape, Embedding
from tensorflow.keras.models import Model

# Number of bag-of-words features per sample.
input_dim = X_train.shape[1]

# Small fully connected classifier: one hidden ReLU layer followed by a
# softmax over the product-type classes.
model = Sequential()
# `shape` must be a tuple: `(input_dim )` without the trailing comma is just an
# int wrapped in parentheses.
model.add(Input(shape=(input_dim,), name="Input"))
model.add(Dense(30, activation='relu'))
model.add(Dense(output_dim, activation='softmax'))

# sparse_categorical_crossentropy consumes integer class ids directly, so the
# labels do not need one-hot encoding.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.summary()

# NOTE(review): the held-out test split is also used as validation data, so no
# separate untouched evaluation set remains in this notebook.
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 30)                213930    
                                                                 
 dense_1 (Dense)             (None, 27)                837       
                                                                 
Total params: 214,767
Trainable params: 214,767
Non-trainable params: 0
_________________________________________________________________
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [27]:
import joblib

# Persist the fitted preprocessing objects next to the notebook so inference
# can rebuild the same features and map predictions back to product codes.
CV_filename = "Rakuten_CountVectorizer.sav"
joblib.dump(vectorizer, CV_filename)
LE_filename = "Rakuten_LabelEncoder.sav"
joblib.dump(le, LE_filename)
# Path the Keras model is written to. The variable used to hold
# 'Rakuten_model.sav' but was never used, while the model itself was saved to a
# hard-coded 'Rakuten_model'; both now agree on the path that is loaded later.
model_filename = 'Rakuten_model'
model.save(model_filename)



INFO:tensorflow:Assets written to: Rakuten_model\assets


INFO:tensorflow:Assets written to: Rakuten_model\assets


In [29]:
from tensorflow.keras.models import load_model

def text_processing(title,desc):
    """Turn a product title and description into a feature row for the model.

    Mirrors the training cells: join the two fields, lower-case, delete
    ``string.punctuation`` characters, drop English and French stop words,
    apply the French Snowball stemmer, then vectorize with the persisted
    CountVectorizer.

    Parameters
    ----------
    title, desc : str or missing
        Product designation and description. A missing value (None/NaN) is
        skipped rather than turned into the literal text "None"/"nan".

    Returns
    -------
    numpy.ndarray
        Dense count matrix with a single row for the given product.
    """
    # Skip missing parts so inference matches the training join, which leaves
    # out a missing description.
    parts = [str(part) for part in (title, desc) if not pd.isna(part)]
    text = " ".join(parts).lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Filtering with the English list and then the French list is the same as
    # filtering once with their union; a set keeps each lookup O(1).
    excluded = set(stopwords.words('english')) | set(stopwords.words('french'))
    kept_words = [word for word in text.split() if word not in excluded]
    stemmer = SnowballStemmer("french")
    text = " ".join(stemmer.stem(word) for word in kept_words)
    # joblib.load unpickles the file: only load artifacts produced by this
    # notebook or another trusted source.
    vectorizer = joblib.load("Rakuten_CountVectorizer.sav")
    # toarray() yields a regular ndarray; todense() produced np.matrix.
    return vectorizer.transform([text]).toarray()

# Example product used to try out the inference functions in this cell.
title = "Super jouet de folie"
desc = "une figurine qui fera rever les enfants"

def prediction(X):
    """Predict the product type for already-vectorized text.

    Parameters
    ----------
    X : array-like
        Feature rows, as returned by ``text_processing``.

    Returns
    -------
    tuple
        ``(classe, proba)``: ``classe`` is the array of original product type
        codes (one per input row, decoded with the persisted LabelEncoder) and
        ``proba`` is the largest probability found in the model output.
    """
    model = load_model('Rakuten_model')
    # Run inference once and derive both results from the same output; the
    # model used to be evaluated twice on the same input.
    probabilities = model.predict(X)
    proba = np.max(probabilities)
    pred_class = np.argmax(probabilities, axis=-1)
    # joblib.load unpickles the file: only load trusted artifacts.
    le = joblib.load("Rakuten_LabelEncoder.sav")
    classe = le.inverse_transform(pred_class)
    return (classe, proba)



# Keep the result in dedicated names: assigning it to `X` overwrote the
# training feature matrix, so re-running an earlier cell afterwards would fail.
predicted_class, predicted_proba = prediction(text_processing(title,desc))
print(predicted_class)
# Confidence as a whole-number percentage.
print(round(predicted_proba*100))


[1140]
50
