In [1]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import re
import nltk
nltk.download('punkt')
from tqdm.auto import tqdm
from wordcloud import WordCloud
import html
from langdetect import detect, detect_langs, LangDetectException
from google_trans_new import google_translator
import random
from collections import Counter
import unicodedata
from sklearn.feature_extraction.text import TfidfVectorizer

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeremyrava/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Roadmap to Text Mining

Text mining est le processus d'extraction d'informations à partir de grandes quantités de données non structurées. Cela implique plusieurs étapes, qui peuvent être décomposées en ces étapes suivantes :

1. Collecte des données : La première étape du text mining est la collecte de grande quantité de données non structurées. Cette donnée peut provenir de diverses sources telles que les plateformes sociaux, les sites Web, les blogs et les articles de presse.

2. Prétraitement : Une fois les données recueillies, elles doivent être prétraitées. Cela consiste à nettoyer les données en supprimant les caractères indésirables ou les mots, en corrigeant les erreurs orthographiques et en convertissant le texte en minuscule.

3. Tokenisation : Une fois les données prétraitées, elles sont tokenisées. Cela signifie que le texte est divisé en mots individuels ou des tokens. Cette étape est importante car elle permet d'analyser la fréquence de chaque mot dans le texte.

4. Suppression des mots-clés : Une fois le texte tokenisé, nous pouvons supprimer les mots-clés. Les mots-clés sont des mots courants tels que "le", "et" et "est" qui n'apportent pas de sens particulier et peuvent être supprimés sans affecter l'analyse globale.

5. Reduction morphologique ou normalisation du lemme : La prochaine étape est la reduction morphologique ou la normalisation du lemme. Cela consiste à réduire chaque mot à sa forme de base, également connu sous le nom de racine de mot. Par exemple, "courir" serait réduit à "court". Cela est fait pour regrouper des mots similaires ensemble, comme différents temps verbaux du même mot.

6. Extraction des fonctionnalités : Une fois le texte prétraité et nettoyé, nous pouvons extraire les fonctionnalités à partir de lui. Ces fonctionnalités peuvent inclure des choses telles que la fréquence des mots, l'analyse du sentiment ou le modélisation des sujets.

7. Analyse : Enfin, nous pouvons analyser les données à l'aide de diverses techniques telles que l'agrégation ou la classification pour obtenir des informations sur le texte.

In [2]:
# CONSTANTES
FIGSIZE = (12, 4)
PATH_DATA = "../data/"
PATH_RAW = PATH_DATA + "raw/"
PATH_PROCESSED = PATH_DATA + "processed/"
PATH_EXTERNAL = PATH_DATA + "external/"

pd.set_option("display.max_columns", 20)
pd.set_option("display.max_rows", 200)

In [12]:
X_train = pd.read_csv(PATH_RAW + "x_train.csv", index_col=0)
X_test = pd.read_csv(PATH_RAW + "x_test.csv", index_col=0)
display(X_train.head())
display(X_test.head())

target = pd.read_csv(PATH_RAW + "y_train.csv", index_col=0)

display(target.describe())
df_origin = pd.read_csv(PATH_RAW + "x_train.csv", index_col=0)

Unnamed: 0,designation,description,productid,imageid
0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


Unnamed: 0,designation,description,productid,imageid
84916,Folkmanis Puppets - 2732 - Marionnette Et Théâ...,,516376098,1019294171
84917,Porte Flamme Gaxix - Flamebringer Gaxix - 136/...,,133389013,1274228667
84918,Pompe de filtration Speck Badu 95,,4128438366,1295960357
84919,Robot de piscine électrique,<p>Ce robot de piscine d&#39;un design innovan...,3929899732,1265224052
84920,Hsm Destructeur Securio C16 Coupe Crois¿E: 4 X...,,152993898,940543690


Unnamed: 0,prdtypecode
count,84916.0
mean,1773.2199
std,788.179885
min,10.0
25%,1281.0
50%,1920.0
75%,2522.0
max,2905.0


In [13]:
stop_words_french = pd.read_json(PATH_EXTERNAL + "stop_words_french.json")
print(stop_words_french.shape)
stop_words = []
stop_words.extend(stop_words_french[0].tolist())
stop_words.extend(["cm", "mm"])
print(len(stop_words))
stop_words[:10]

(496, 1)
498


['a', 'à', 'â', 'abord', 'afin', 'ah', 'ai', 'aie', 'ainsi', 'allaient']

# PRE-PROCESSING LEXIQUE FRANÇAIS

In [14]:
def strip_accents(texts):
    sentences = []
    for sentence in tqdm(texts):
        if sentence is np.nan:
            sentences.append(np.nan)
        else:
            s = "".join(
                c
                for c in unicodedata.normalize("NFD", sentence)
                if unicodedata.category(c) != "Mn"
            )
            sentences.append(s)
    return sentences

In [15]:
lexique = pd.read_table(PATH_EXTERNAL + "Lexique383/Lexique383.tsv", delimiter="\t")
lexique_filtred_unique = lexique.drop_duplicates(subset="ortho")[["ortho", "lemme"]]
lexique_filtred_unique.dropna(subset=["ortho", "lemme"], axis="index", inplace=True)
# lexique_filtred_unique["ortho"] = strip_accents(lexique_filtred_unique["ortho"])
# lexique_filtred_unique["lemme"] = strip_accents(lexique_filtred_unique["lemme"])
lexique_filtred_unique = lexique_filtred_unique.drop_duplicates(subset="ortho")[
    ["ortho", "lemme"]
]

lexique_filtred_unique

Unnamed: 0,ortho,lemme
0,a,a
3,a capella,a capella
4,a cappella,a cappella
5,a contrario,a contrario
6,a fortiori,a fortiori
...,...,...
142685,ôtèrent,ôter
142686,ôté,ôter
142688,ôtée,ôter
142690,ôtées,ôter


# Premier nettoyage
### pour pouvoir traduire les textes d'une autre langue en français

In [16]:
def first_cleaning(sentences):
    # Pre-compile regex patterns
    HTML_TAGS_RE = re.compile("<[^>]*")
    URL_RE = re.compile(r"https?://[-_.?&~;+=/#0-9A-Za-z]{1,2076}")
    MAIL_RE = re.compile(
        r"[-_.0-9A-Za-z]{1,64}@[-_0-9A-Za-z]{1,255}[-_.0-9A-Za-z]{1,255}"
    )
    SPE_CHAR_RE = re.compile("[^a-zA-ZÀ-ÿ]")
    SPACES_RE = re.compile("\s+")
    SPACES_RE_2 = re.compile(" +")

    sentences_cleaned = []

    for sentence in tqdm(sentences):
        # Decode HTML entities
        sentence = html.unescape(sentence)

        # Replace HTML tags with spaces
        sentence = HTML_TAGS_RE.sub("", sentence)

        # Remove URL and e-mail adress
        sentence = URL_RE.sub("", sentence)
        sentence = MAIL_RE.sub("", sentence)

        # Replace special characters with spaces while keeping accents
        sentence = SPE_CHAR_RE.sub(" ", sentence)

        # Remove unnecessary spaces from the sentence using regular expressions
        cleaned_sentence = SPACES_RE.sub(" ", sentence)
        cleaned_sentence = SPACES_RE_2.sub(" ", cleaned_sentence)
        sentences_cleaned.append(cleaned_sentence)

    return sentences_cleaned

X_train

In [None]:
X_train["designation"] = first_cleaning(X_train["designation"])
X_train["description"].loc[~X_train["description"].isna()] = first_cleaning(
    X_train["description"].loc[~X_train["description"].isna()]
)

# join texts
X_train["text"] = np.where(
    X_train["description"].isna(),
    X_train["designation"].astype(str),
    X_train["designation"].astype(str) + " " + X_train["description"].astype(str),
)
X_train.head()

X_test

In [17]:
X_test["designation"] = first_cleaning(X_test["designation"])
X_test["description"].loc[~X_test["description"].isna()] = first_cleaning(
    X_test["description"].loc[~X_test["description"].isna()]
)

# join texts
X_test["text"] = np.where(
    X_test["description"].isna(),
    X_test["designation"].astype(str),
    X_test["designation"].astype(str) + " " + X_test["description"].astype(str),
)
X_test.head()

  0%|          | 0/13812 [00:00<?, ?it/s]

  0%|          | 0/8926 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test["description"].loc[~X_test["description"].isna()] = first_cleaning(


Unnamed: 0,designation,description,productid,imageid,text
84916,Folkmanis Puppets Marionnette Et Théâtre Mini ...,,516376098,1019294171,Folkmanis Puppets Marionnette Et Théâtre Mini ...
84917,Porte Flamme Gaxix Flamebringer Gaxix U Twilig...,,133389013,1274228667,Porte Flamme Gaxix Flamebringer Gaxix U Twilig...
84918,Pompe de filtration Speck Badu,,4128438366,1295960357,Pompe de filtration Speck Badu
84919,Robot de piscine électrique,Ce robot de piscine d un design innovant et é...,3929899732,1265224052,Robot de piscine électrique Ce robot de pisci...
84920,Hsm Destructeur Securio C Coupe Crois E X Mm,,152993898,940543690,Hsm Destructeur Securio C Coupe Crois E X Mm


# Detection des langues

In [18]:
def detect_language(sentences):

    UNKNOWN = "unknown"
    FR = "fr"

    languages = []
    k = 15
    lexique_set = set(lexique_filtred_unique["ortho"].str.strip().str.lower())

    for sentence in tqdm(sentences):
        # Split the sentence into words
        words = nltk.word_tokenize(sentence, language="french")

        # If there are more than k words, sample k words randomly.
        if len(words) > k:
            words = random.sample(words, k)

        # Use a list comprehension with a ternary conditional operator for language detection.
        df_la = []
        for word in words:
            if word.lower() in lexique_set:
                df_la.append(FR)
            else:
                try:
                    df_la.append(detect(word))
                except LangDetectException:
                    df_la.append(UNKNOWN)

        # Use Counter to find the most common language.
        reel_lang = Counter(df_la).most_common(1)[0][0] if len(df_la) > 0 else UNKNOWN
        languages.append(reel_lang)

    return languages

X_train

In [None]:
X_train["language"] = detect_language(X_train["text"])
X_train.to_csv(PATH_PROCESSED + "X_train_detected_language.csv")
display(X_train.head())
X_train["language"].value_counts()

X_test

In [22]:
X_test["language"] = detect_language(X_test["text"])
X_test.to_csv(PATH_PROCESSED + "X_test_detected_language.csv")
display(X_test.head())
X_test["language"].value_counts()

  0%|          | 0/13812 [00:00<?, ?it/s]

Unnamed: 0,designation,description,productid,imageid,text,language
84916,Folkmanis Puppets Marionnette Et Théâtre Mini ...,,516376098,1019294171,Folkmanis Puppets Marionnette Et Théâtre Mini ...,fr
84917,Porte Flamme Gaxix Flamebringer Gaxix U Twilig...,,133389013,1274228667,Porte Flamme Gaxix Flamebringer Gaxix U Twilig...,fr
84918,Pompe de filtration Speck Badu,,4128438366,1295960357,Pompe de filtration Speck Badu,fr
84919,Robot de piscine électrique,Ce robot de piscine d un design innovant et é...,3929899732,1265224052,Robot de piscine électrique Ce robot de pisci...,fr
84920,Hsm Destructeur Securio C Coupe Crois E X Mm,,152993898,940543690,Hsm Destructeur Securio C Coupe Crois E X Mm,fr


language
fr    13124
en      295
de      113
tl       29
it       28
ro       25
fi       21
so       20
sl       19
id       17
pt       13
pl       12
no       11
es       11
sw       10
af        9
nl        9
ca        8
cs        7
cy        7
da        6
lt        6
sv        3
tr        3
sq        2
hu        1
et        1
vi        1
sk        1
Name: count, dtype: int64

# Traduction en français

In [23]:
def translate_texts(translator: google_translator, sentences, language):
    texts_translated = translator.translate(sentences, lang_tgt="fr", lang_src=language)
    return texts_translated

X_train

In [None]:
X_train = pd.read_csv(PATH_PROCESSED + "X_train_detected_language.csv", index_col=0)

# si error durant la traduction -> reprendre à l'index done
done = 16

# google translate API
translator = google_translator(timeout=10000000)

X_train_copy = X_train.copy()
j = 0
for language in tqdm(X_train_copy["language"].value_counts().index):
    j += 1
    if j > done and language != "fr":
        texts_translated = []
        df_lang = X_train_copy["text"].loc[X_train["language"] == language].tolist()

        for i in tqdm(range(0, len(df_lang)), desc=language):
            texts_translated.append(translate_texts(translator, df_lang[i], language))
        print(texts_translated[:10])
        X_train_copy["text"].loc[X_train["language"] == language] = texts_translated
        # save
        X_train_copy.to_csv(PATH_PROCESSED + "X_train_translated.csv")

X_train = pd.read_csv(PATH_PROCESSED + "X_train_translated.csv", index_col=0)
X_train["text"].loc[X_train["language"] == "de"]

print("is NaN :")
indexes = X_train[X_train["text"].isna()].index
df_origin.iloc[indexes]

X_test

In [26]:
X_test = pd.read_csv(PATH_PROCESSED + "X_test_detected_language.csv", index_col=0)

# si error durant la traduction -> reprendre à l'index done
done = 0

# google translate API
translator = google_translator(timeout=10000000)

X_test_copy = X_test.copy()
j = 0
for language in tqdm(X_test_copy["language"].value_counts().index):
    j += 1
    if j > done and language != "fr":
        texts_translated = []
        df_lang = X_test_copy["text"].loc[X_test["language"] == language].tolist()

        for i in tqdm(range(0, len(df_lang)), desc=language):
            texts_translated.append(translate_texts(translator, df_lang[i], language))
        print(texts_translated[:10])
        X_test_copy["text"].loc[X_test["language"] == language] = texts_translated
        # save
        X_test_copy.to_csv(PATH_PROCESSED + "X_test_translated.csv")

X_test = pd.read_csv(PATH_PROCESSED + "X_test_translated.csv", index_col=0)
X_test["text"].loc[X_test["language"] == "de"]

print("is NaN :")
indexes = X_test[X_test["text"].isna()].index
df_origin.iloc[indexes]

  0%|          | 0/29 [00:00<?, ?it/s]

en:   0%|          | 0/295 [00:00<?, ?it/s]

["Xbox One à l'édition Call of Duty Advanced Warfare ", 'Kinect Joy Ride Jeu Kinect Jeu Xbox ', 'Annuaire de conception néerlandaise ', 'Enfants Soft Plux Doll Simulation mignon Cat Artisanat Toy Car Decoration GiftZen Enfants Soft Plux Dol Doltesimulation Cat Crafts Toy Car Decoration Cadeau Gifts Les décorations ou les cadeaux de vacances Cat est suffisant pour que vous vous sentiez mieux. ', 'Bouteille de vins Sacs de couverture Décoration Home Party Santa Claus Christmashpp Christmas Gift Sac Sac Candy Merry Christmas Sacs de bonbons de Noël décorféature de haute qualité PC MATÉRIAU Arbres de Noël ou décoration de lieu et ainsi de suite contenu du paquet x sac cadeau de Noël sac de bonbons joyeux Noël sacs de bonbons décor de Noël sans pomme ', 'Les Berenstain Bears visitent le dentiste ', 'Le jeu de chenilles très affamé PA Game de comptage des couleurs et des contrastes P ', "Couleur métallique de luxe Skin étanche PVC Stickers pour DJI Osmo Pocketzen Générique Couleur métallique

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


de:   0%|          | 0/113 [00:00<?, ?it/s]

['Liberté individuelle et collective en droit du travail ', 'Amano du Soleil Amano Herald of the Sun U Trône des marées ', 'Moment de ma vie ', 'Spark Hill GH GP Italie S Spark Fabricant Spark Echelle Ref Fab S Type Hill GH GP Italie Couleur Blanc Rouge Ean Ref Little Bolide ', 'Locomotive ge ton commutateur sud du Pacifique REC Ech Bachmann Bachmann ', 'King Richard Shakespeare Peter Ure Methuen ', 'Bitz Warhammer Fenêtre ', 'Moshi Monsters Glitter and Glow Moshi Monsters brille et brille ', 'Sac de couchage töllner confortable ', 'Oui ma fille chérie ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


tl:   0%|          | 0/29 [00:00<?, ?it/s]

['PS Slim Go GT ', "Génération de puissance conventionnelle et alternative à l'atténuation et à la durabilité de la thermodynamique ", 'Catalogue Casio Calculatrices Micro Ornineurs PB P P P P P P P ', 'Easy le fait pour la phonologie ', 'Classic Tales Magic Cook Pot AB ED ', 'Nobunaga no yabou super famicom ', "Journal de Pitman n du l'étudiant des notes de conservation des notes et des nouvelles de l'enseignement des nouvelles de la banque inférieure AMALGAMATION ", 'Colgante Douard Dangly Elefante Playgro M ', 'Magic Grow Animaux de safari ', 'Débutants de Big Ben ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


it:   0%|          | 0/28 [00:00<?, ?it/s]

['Pontiac GTO juge Orange Motormax Motormax Pontiac GTO juge Orange Motormax ', "Alliances d'armure viscérides VO ", "Il est temps du volume musical recueil est le temps de la musique est une méthode destinée aux jeunes étudiants qui entreprennent le chemin musical est divisé en trois volumes organisés en unités dans chacune des exercices théoriques théoriques avec des résumés et des tests de vérification des applications pratiques sont traités pour parler à une et deux voix et rythmique à une et deux parties, une attention particulière est accordée avec de courtes notes à la fin de chacune des unités qui composent les volumes dans les sujets suivants le volume de volume des instruments de musique Pour la systématicité de la méthode et pour son contenu inspiré par le nouveau programdidactique, le travail est en particulier aux élèves du collège avec une adresse musicale et en général à tous ceux qui veulent entreprendre l'étude de la musique ", 'Lola t norev ', 'Valentino Rossi le jeu 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


ro:   0%|          | 0/25 [00:00<?, ?it/s]

['Mug ML Scar Call of Duty Infini Warfare ', 'Arabes manuscritos del Lebano ', 'Spacemaster Future Law ', 'Livrets Lindner R Belgique Supplement Année de cisaillement Numéro H Nombre de pages ', 'Ty beanie bébé chanceux la coccinelle pty beanie bébés chanceux la coccinelle ', "Acier en acier inoxydable à thé épice d'épices à l'infuseur Filtre de file du maillage de maille avec couvercle Chainhpp en acier inoxydable Balle de thé à thé Infuseur Filtre de maillage de maille avec couvercle Chandescriptif matériau inoxydable STEE SLIVER TAILLE M CMNOTE MADÉE par acier inoxydable est non toxic Les feuilles de thé Pawan peuvent être retirées du thé étranger peut être recyclé, comprend des épices de thé PCS à l'étranger ", 'Heures le mans ixo ', 'Jeu de cartes à collectionner naruto naruto uzumaki pr ', 'Torune ninja dans le livre Naruto Shippuden VF ', 'Microfluidics et nanofluidiques motivés électrokinétiquement ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


fi:   0%|          | 0/21 [00:00<?, ?it/s]

['Maito Chevrolet Camaro SS RS Police Pol Topta ', 'Fallout Vault Swimshort S ', 'Armure plus un camion ', 'Figurine Lucky Luke Dalton Avell Happy Meal McDo ', 'CONSES BI DI GUO ZHU YI LUN ', 'Noir Blanc Feuiilajou Feuiloutan Flamajou Flamoutan Flotajou Flotoutan ', 'Glizer Polaris Blister Glizer Polaris CM Honeym ', 'Karasuno Aobajohsai Nekoma Fukurodani Goodies Havyu ', 'Jikkyou puissant pro yakyuuu ', "Boku wa crochets pour le pont de l'importation ds japonisis "]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


so:   0%|          | 0/20 [00:00<?, ?it/s]

['Mynock dagobah star wars ccg ls ', 'Cadilles de luxe Cadilla Crossover Collectbibles de luxe ', 'Xbox Go ', 'Boit la fièvre umd vidéo ', 'Microsoft Crackdown Xbox One Bassic Xboog One VideogiococococoCo Crackdown Xboosoft Xboosoft Xboo Jebox ', 'Xbox Kinect Noir ', 'Glom Hs Undefted ', 'Jedi Lévitation Dagobah Star Wars CCG ', 'Star Wars Bust Ups Clone Wars Padmala Amidala ', 'Dark Horse Comics ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


sl:   0%|          | 0/19 [00:00<?, ?it/s]



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


id:   0%|          | 0/17 [00:00<?, ?it/s]

["Arbre d'essieu arrière en titane pour Ball R Japan Importation ", 'Paraguas CMS Automático Baggy Aguacero Paraguas CMS Automático Baggy Aguacero ', 'Salamandra Ultra Monster Japan Importation ', 'Bruna Kimono Wedding Doll Japan Import ', 'Cyberdimension Neptunia Goddess en ligne ', 'PROFAIN PERFECT ADDON NURNBERG SAALFELD ', 'Monolithe de basalte révisé VO ', 'Tantam d le métier ', 'Ultraman super-héros de la série Ultra Hero Ultraman Astra Japan Importation ', 'Kamen Rider Blade Ross Emission Shadow Chaser Japan Import ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


pt:   0%|          | 0/13 [00:00<?, ?it/s]

['Remooraid PV ', 'Promouvoir Ferrari Modena Coupé Blister Pro Promotion ', 'Ortide Édition Ed ', "Forme de Dieu forme d'un serviteur ", 'Gundam HCM Pro Riser ', 'Rivaux Emergeans X Nidoran Nidorina Nidorino ', 'Nendoroid Annegasaki Nene Pvc Costurine ', 'Gundam sd croix os gundam x ', 'Armodon Dressé Formé Armodon Magic Mtg Tempète C ', 'Gundam x Gundam Ashtaron Scale ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


pl:   0%|          | 0/12 [00:00<?, ?it/s]

["Manches en silicone en acier inoxydable Paies de consommation d'alcool pour oz tasses Oz Orhpp Silicone Gouetter Pour nettoyer la longueur de taille mm diamètre mmpackage teneur en acier inoxydable paille en acier inoxydable ", 'Sony Playstation Slim Go ', 'Sony PS Vita Noir ', 'Carte WWE Slam Attax Todd Grisham Raw ', 'Brosse à dents parlante chan après après chii votre outil après Chan Japan Importation ', 'Attaque spyro des Rhynocs ', 'Sony Playstation Slim Go ', 'Kid klown en foulard fou ', 'Zoids Grande-Bretagne GB Bio Ptera Scale ', 'Sony Playstation Slim Go ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


no:   0%|          | 0/11 [00:00<?, ?it/s]

['Vds res ', 'Norev Renault Jetcar Noreev ', 'Oxford Manuel de médecine aiguë ', 'Harbinger ', 'Noreev Peugeot t Noreev ', 'Inuyasha naraku no wana mayoi no mori no shoutaijou ', 'La légende de Zelda Zelda no densetsu fushigi no boshi minish capride version japonaise ', 'Héroclix Council Token Avengers Everett K Ross Avengers B ', 'HARRY POTTER PULLOVER HOODIE Girl Poufsouffle XL ', 'Non à la vache ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


es:   0%|          | 0/11 [00:00<?, ?it/s]

['Mercedes Unimog Pompiers et solide ', "Piscine TOI Ethnique X x cm Piscine amovible TOI ethnique cm cm long cm large et cm de haut le complément de votre jardin avec lequel vous obtiendrez une augmentation du confort et de la qualité de vie de cet été fabriqué dans des matériaux résistants aux murs en acier laquée Fermeture en acier avec double vis à double vis Rangée et recouverte d'un boîtier en polyéthylène décoratif de haute densité et de qualité photographique, le filtre de bain d'été le plus rafraîchissant et le plus sûr, volez l'escalier décoratif de décoce de décoce et de la série de décoration de la piscine Traité en PVC, vous pouvez renouveler votre système de décoration de pool de décoco, vous pouvez renouveler votre Pool mural en acier TOI TOI Chaque année, les couvertures du système de piscine DeCoh sont fabriquées en polyéthyle haute densité et qualité photographique créée dans des matériaux biodégradables avec l'environnement le moyen économique et le plus simple le pl

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


sw:   0%|          | 0/10 [00:00<?, ?it/s]

['Masha et Michka Activites Avec Masha ', 'MP Fr Espoir Barian Cxyz ', 'Ichiban Assassinat Assassination Classroom Peluche Koro Sensei ', 'Momification dédiée Amakna ', 'Bokujou Monogatari Mineral Town No Nakama Tachi ', 'Jeu Call of Duty MW Wii ', 'Carte dédiée Yokaï, édition Firefoux Pandala ', 'Wii u édition Zelda Wind Waker Jeuux ', 'San Ku Kai Robot Sidero Popy Japan ', 'Aussi Carrot Sayaka Takai Swimsuit Ver PVC Figure Scale ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


af:   0%|          | 0/9 [00:00<?, ?it/s]

['Notebook Journal Journal grand cahier liné pour tous les projets moutarde ', 'Personnages Disney Crystalux Rapunzel CM ', "Jewels de l'île tropicale perdue ", 'Overwatch Legendary Edition ', 'Walking Dead Mug Maggie ', 'Mon cahier carnet en blanc ', 'Toys Dinky Meccano Paquebot La Normandie Dinky Toys ', 'Monde magique Disney ', 'DK Eyewitness Travel Guide London ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


nl:   0%|          | 0/9 [00:00<?, ?it/s]

['Fleut detlef schrempf n ', 'Devanar Damir Wars TCG déchiffrer ', 'Fleu Clifford Robinson N ', 'Opel Ascona S R I Yellow White Neo Limited PCS Gelb Weiss Neo ', "Pin d'articulation degrés degrés FRX Pin de charnière Brace deg ", 'Lame de la reine Shizuka Megahouse ', 'Volcaropode de Limagma néo ', 'Ferrari FXX Yellow Hotheels Elite Mattel Hot Wheels ', "Avancement de la physiopathologie de l'AVC cérébral "]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


ca:   0%|          | 0/8 [00:00<?, ?it/s]

['Ex Libris Magica R ', 'Mortel kombat mug klassic ', 'Emerald Dragonfly Magic MTG Chronicles C ', 'Ecran tactile inférieur nintendo ds ', 'Duaigües JARGE J Actividad physique pour les personnages avec disque ', 'Aqua Kitty DX Limited Run ', 'Exercice pour les os forts ', 'Pokénav Trainer PV EMERADE FEE ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


cs:   0%|          | 0/7 [00:00<?, ?it/s]

['Pokémon R Rondoud Reverse ', 'Pokémon R ColimUcus inverse ', 'Pokémon Edition Blanca ', 'Pokémon R Electrode Niv Inverse ', 'Pokémon R Milobellus Inverse ', 'Par Désir Project Toyosakomimino Miko Scale ', 'Pokémon Makuhita NIV PV ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


cy:   0%|          | 0/7 [00:00<?, ?it/s]

['Boyds Grannie Annie Wishkabibble par Boyds Bears Boyds Grannie Annie Wishkabble par Boyds Bears ', 'Farcry xbox ', 'BYCMO Subaru Impreza Tunning Edition ', 'Dawn of War Winter Assault Ajouter sur Du Jeu Dawn of War ', 'Japon Import Turtle Purdle ', 'Soyez sage mon fils et rendez mon cœur heureux ', 'Molloy de Samuel Beckett ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


da:   0%|          | 0/6 [00:00<?, ?it/s]

['Eternal Champions Mega CD ', 'Magnét européen valladolid ', 'Vampire la lutte éternelle Agrippina Jyhad Vo ', 'Skylanders Giants Fright Rider ', 'Tiger Woods PGA Tour ', 'Outils créatifs crochets magnétiques à cochons de casier scolaire réfrigérateur faim bluehpp outils créatifs crochets magnétiques cordons de casier scolaire réfrigérateur faim blue dispose de toute marque et de haute qualité en acier inoxydable en plastique magnétique durable et de longue date Couleur Bluemax Roueur kg taille CMPackage comprend x crochet ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


lt:   0%|          | 0/6 [00:00<?, ?it/s]

['Carburant Tenseei Eme Cercle ', 'Upsie Downsie êtes-vous asep ', 'Lamborghini Miura Sv Rouge Italia Rosso Red Deagostini Deagostini ', 'Gravity Rush PS uniquement ', 'Mariokart Jeu DS Nintendo ', 'Painiac mtg instable vo c ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


sv:   0%|          | 0/3 [00:00<?, ?it/s]

['MUSTEFLOTT HOLO NIV RIVUX EMERSIANTS PV ', 'Harn nyl régulation ora ', 'Hasbro Nerf Elite Rayven ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


tr:   0%|          | 0/3 [00:00<?, ?it/s]

['Construisez-le Miami Beach Resort ', 'Lindner Pologne Supplement Year Numéro Nombre de pages ', 'Hitman Import UK ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


sq:   0%|          | 0/2 [00:00<?, ?it/s]

['Shin Megami Tensei personne ', 'LOTR CCG úLAIRE NERTAA HUNTER ENTÉ ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


hu:   0%|          | 0/1 [00:00<?, ?it/s]

['EVOVE Edition Benelux ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


et:   0%|          | 0/1 [00:00<?, ?it/s]

['Solino Mitsubishi Pajero MPR Mitsubishi ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


vi:   0%|          | 0/1 [00:00<?, ?it/s]

['CV Tourring Belge ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


sk:   0%|          | 0/1 [00:00<?, ?it/s]

['Robo force ennemi le dictateur ']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test_copy["text"].loc[X_test["language"] == language] = texts_translated


is NaN :


Unnamed: 0,designation,description,productid,imageid


# Nettoyage et lemmatisation

In [27]:
def clean(text) -> list:
    """
    This function removes stop words from a given text and returns a list of sentences without stop words. It also removes unnecessary spaces, converts all characters to lowercase, and trims leading and trailing spaces.

    Parameters:
        text (str or list): The input text can be either a string containing multiple sentences separated by newline characters ('\n') or a list of strings representing individual sentences.

    Returns:
        list: A list of sentences without stop words and other modifications as described above.
    """

    # Initialize empty list for sentences
    sentences = []
    lemmes = []

    # Pre-compile regex patterns
    SPE_CHAR_RE = re.compile("[^a-zA-ZÀ-ÿ]")
    SPACES_RE = re.compile(r"\s+")

    # Convert 'ortho' column to lowercase and strip spaces, then create a dictionary for quick lookup
    lexique_dict = {
        row["ortho"].lower().strip(): row["lemme"]
        for _, row in lexique_filtred_unique.iterrows()
    }

    # Iterate over each sentence in the text
    for sentence in tqdm(text):
        if sentence is np.nan:
            sentences.append(sentence)
            lemmes.append(sentence)
        else:
            # Replace special characters with spaces while keeping accents
            sentence = SPE_CHAR_RE.sub(" ", sentence)

            # Convert sentence to lowercase and split
            words = nltk.word_tokenize(sentence.lower(), language="french")

            # Look up lemmes in the pre-processed dictionary
            lemme_words = [lexique_dict.get(word, word) for word in words]

            # Join words back into a sentence and strip leading/trailing spaces
            cleaned_sentence = SPACES_RE.sub(" ", " ".join(words)).strip()
            cleaned_lemmes = SPACES_RE.sub(" ", " ".join(lemme_words)).strip()

            # Append the cleaned sentence to the list of sentences
            sentences.append(cleaned_sentence)
            lemmes.append(cleaned_lemmes)

    return sentences, lemmes

X_train

In [26]:
X_train = pd.read_csv(PATH_PROCESSED + "X_train_translated.csv", index_col=0)
# X_train["text"] = strip_accents(X_train["text"])
X_train["text"], X_train["lemmes"] = clean(X_train["text"])
# len texts
X_train["len_text"] = X_train["text"].str.len()
X_train["len_lemmes"] = X_train["lemmes"].str.len()

# add target column
X_train[target.columns[0]] = target

X_train.drop(["designation", "description", "language"], axis="columns", inplace=True)

# save dataframe
X_train.to_csv(PATH_PROCESSED + "X_train_preprocessed.csv")
display(X_train.head())
X_train.describe()

  0%|          | 0/84916 [00:00<?, ?it/s]

Unnamed: 0,productid,imageid,text,lemmes,len_text,len_lemmes,prdtypecode
0,3804725264,1263597046,pages de carnet personnalisées olivia dot grid...,page de carnet personnalisé olivia dot grid ca...,68,64,10
1,436067568,1008141237,journal des arts le n du l art et son marche s...,journal des art le ne du l art et son marche s...,177,172,2280
2,201115110,938777978,grand stylet ergonomique bleu gamepad nintendo...,grand stylet ergonomique bleu gamepad nintendo...,731,732,50
3,50418756,457047496,peluche donald europe disneyland marionnette à...,peluche donald europe disneyland marionnette à...,52,52,1280
4,278535884,1077757786,la guerre des tuques luc a des idées de grande...,la guerre des tuques luc a des idée de grandeu...,203,212,2705


Unnamed: 0,productid,imageid,len_text,len_lemmes,prdtypecode
count,84916.0,84916.0,84916.0,84916.0,84916.0
mean,2555468000.0,1152691000.0,520.531537,517.319422,1773.2199
std,1588656000.0,175142700.0,664.802838,660.806625,788.179885
min,183912.0,67284.0,4.0,4.0,10.0
25%,676051900.0,1056269000.0,57.0,56.0,1281.0
50%,3190506000.0,1213354000.0,265.0,263.0,1920.0
75%,3995599000.0,1275646000.0,806.0,803.0,2522.0
max,4252012000.0,1328824000.0,11919.0,12066.0,2905.0


X_test

In [28]:
X_test = pd.read_csv(PATH_PROCESSED + "X_test_translated.csv", index_col=0)
# X_test["text"] = strip_accents(X_test["text"])
X_test["text"], X_test["lemmes"] = clean(X_test["text"])
# len texts
X_test["len_text"] = X_test["text"].str.len()
X_test["len_lemmes"] = X_test["lemmes"].str.len()

X_test.drop(["designation", "description", "language"], axis="columns", inplace=True)

# save dataframe
X_test.to_csv(PATH_PROCESSED + "X_test_preprocessed.csv")
display(X_test.head())
X_test.describe()

  0%|          | 0/13812 [00:00<?, ?it/s]

Unnamed: 0,productid,imageid,text,lemmes,len_text,len_lemmes
84916,516376098,1019294171,folkmanis puppets marionnette et théâtre mini ...,folkmanis puppets marionnette et théâtre mini ...,52,52
84917,133389013,1274228667,porte flamme gaxix flamebringer gaxix u twilig...,porte flamme gaxix flamebringer gaxix u twilig...,63,62
84918,4128438366,1295960357,pompe de filtration speck badu,pompe de filtration speck badu,30,30
84919,3929899732,1265224052,robot de piscine électrique ce robot de piscin...,robot de piscine électrique ce robot de piscin...,937,934
84920,152993898,940543690,hsm destructeur securio c coupe crois e x mm,hsm destructeur securio c coupe croire 2e x mm,44,46


Unnamed: 0,productid,imageid,len_text,len_lemmes
count,13812.0,13812.0,13812.0,13812.0
mean,2549060000.0,1153300000.0,520.550463,517.21865
std,1593114000.0,170474100.0,695.102422,690.471314
min,184794.0,482661.0,6.0,6.0
25%,614405900.0,1055618000.0,56.0,56.0
50%,3195801000.0,1212607000.0,253.0,250.0
75%,3993171000.0,1275573000.0,805.0,803.0
max,4252011000.0,1328823000.0,19483.0,19169.0


# WORDCLOUD mots normaux

In [None]:
for code in X_train["prdtypecode"].unique():
    # make a map of word of X_train['text']
    word_map = {}
    total_text = ""
    code_df = X_train["text"].loc[X_train["prdtypecode"] == code]
    print(f"########### {code} ###########")
    for i in tqdm(code_df.index, total=len(code_df)):
        text = code_df[i]
        if text is not np.nan:
            total_text += text + " "
            for j in text.split(" "):
                if j not in word_map and j != "":
                    word_map[j] = 1
                elif j != "":
                    word_map[j] += 1

    # sort the map by value
    word_map = sorted(word_map.items(), key=lambda x: x[1], reverse=True)

    # Setting up the subplot for bar plot and word cloud
    fig, axs = plt.subplots(
        1, 2, figsize=FIGSIZE
    )  # Corrected to subplots for creating a 1x2 grid
    fig.suptitle(f"Most frequent words of prdtypecode: {code}")  # Corrected method name

    # Bar plot
    n_words = 30
    axs[0].bar([i[0] for i in word_map[:n_words]], [i[1] for i in word_map[:n_words]])
    axs[0].tick_params(
        axis="x", rotation=90
    )  # Corrected method for setting x-ticks rotation

    # Word cloud
    wordcloud = WordCloud(
        background_color="white",
        max_words=500,
        width=640,
        height=360,
        collocations=False,
    ).generate(total_text)
    axs[1].imshow(wordcloud, interpolation="bilinear")
    axs[1].axis("off")  # Moved inside the loop to apply to each word cloud subplot

    plt.tight_layout(
        rect=[0, 0, 1, 0.96]
    )  # Adjust layout to not overlap with the suptitle
    plt.show()

# WORDCLOUD lemmes

In [None]:
for code in X_train["prdtypecode"].unique():
    # make a map of word of X_train['text']
    word_map = {}
    total_text = ""
    code_df = X_train["lemmes"].loc[X_train["prdtypecode"] == code]
    print(f"########### {code} ###########")
    for i in tqdm(code_df.index, total=len(code_df)):
        text = code_df[i]
        if text is not np.nan:
            total_text += text + " "
            for j in text.split(" "):
                if j not in word_map and j != "":
                    word_map[j] = 1
                elif j != "":
                    word_map[j] += 1

    # sort the map by value
    word_map = sorted(word_map.items(), key=lambda x: x[1], reverse=True)

    # Setting up the subplot for bar plot and word cloud
    fig, axs = plt.subplots(
        1, 2, figsize=FIGSIZE
    )  # Corrected to subplots for creating a 1x2 grid
    fig.suptitle(
        f"Most frequent lemmes of prdtypecode: {code}"
    )  # Corrected method name

    # Bar plot
    n_words = 30
    axs[0].bar([i[0] for i in word_map[:n_words]], [i[1] for i in word_map[:n_words]])
    axs[0].tick_params(
        axis="x", rotation=90
    )  # Corrected method for setting x-ticks rotation

    # Word cloud
    wordcloud = WordCloud(
        background_color="white",
        max_words=500,
        width=640,
        height=360,
        collocations=False,
    ).generate(total_text)
    axs[1].imshow(wordcloud, interpolation="bilinear")
    axs[1].axis("off")  # Moved inside the loop to apply to each word cloud subplot

    plt.tight_layout(
        rect=[0, 0, 1, 0.96]
    )  # Adjust layout to not overlap with the suptitle
    plt.show()

# WORDCLOUD sans pre-processing

In [None]:
df_origin[target.columns[0]] = target
# join texts
df_origin["text"] = np.where(
    df_origin["description"].isna(),
    df_origin["designation"].astype(str),
    df_origin["designation"].astype(str) + " " + df_origin["description"].astype(str),
)
for code in df_origin["prdtypecode"].unique():
    # make a map of word of X_train['text']
    word_map = {}
    total_text = ""
    code_df = df_origin["text"].loc[df_origin["prdtypecode"] == code]
    print(f"########### {code} ###########")
    for i in tqdm(code_df.index, total=len(code_df)):
        text = code_df[i]
        total_text += text + " "
        for j in text.split(" "):
            if j not in word_map and j != "":
                word_map[j] = 1
            elif j != "":
                word_map[j] += 1

    # sort the map by value
    word_map = sorted(word_map.items(), key=lambda x: x[1], reverse=True)

    # Setting up the subplot for bar plot and word cloud
    fig, axs = plt.subplots(
        1, 2, figsize=FIGSIZE
    )  # Corrected to subplots for creating a 1x2 grid
    fig.suptitle(
        f"Most frequent words (without cleaning) of prdtypecode: {code}"
    )  # Corrected method name

    # Bar plot
    n_words = 30
    axs[0].bar([i[0] for i in word_map[:n_words]], [i[1] for i in word_map[:n_words]])
    axs[0].tick_params(
        axis="x", rotation=90
    )  # Corrected method for setting x-ticks rotation

    # Word cloud
    wordcloud = WordCloud(
        background_color="white",
        max_words=500,
        width=640,
        height=360,
        collocations=False,
    ).generate(total_text)
    axs[1].imshow(wordcloud, interpolation="bilinear")
    axs[1].axis("off")  # Moved inside the loop to apply to each word cloud subplot

    plt.tight_layout(
        rect=[0, 0, 1, 0.96]
    )  # Adjust layout to not overlap with the suptitle
    plt.show()