In [None]:
import pandas as pd
import nltk
import json  
import sys  

import seaborn as sns  
import matplotlib.pyplot as plt  

from nltk.stem.snowball import EnglishStemmer
import plotly.express as px  
from sklearn.preprocessing import LabelEncoder  

# nltk.download('punkt')
# nltk.download('stopwords')

import tensorflow as tf
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GlobalAveragePooling2D, GlobalAveragePooling1D, Flatten, Dense, Dropout 
from tensorflow.keras.layers import Rescaling, RandomFlip, RandomRotation, RandomZoom
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.vgg16 import preprocess_input as preprocess_input_vgg16
from tensorflow.keras.applications.vgg19 import preprocess_input as preprocess_input_vgg19
from tensorflow.keras.preprocessing.image import load_img  as load_img, img_to_array  as img_to_array
from tensorflow.keras.utils import to_categorical
from glob import glob


# TEXTE

## Recupérer et explorer les données

Je commence par lire mon fichier d'entrée

In [None]:
df = pd.read_csv("./../input/flipkart_com-ecommerce_sample_1050.csv")

In [None]:
print("Shape is", df.shape)
df.head()

Je n'ai pas besoin de toutes les colonnes, je ne conserve que celles qui m'intéressent.

In [None]:
df = df[["uniq_id","product_name","product_category_tree", "description"]]
df.head()

In [None]:
df.info()

In [None]:
df.nunique()

In [None]:
df.isna().sum()

Je mets les catégories sous forme de colonnes hiérarchiques (une colonne par niveau).

In [None]:
# Split the single category path of a row into one column per hierarchy level.
def create_category_columns(row):
    """
    Expand the JSON-encoded category path of one row into level columns.

    required arguments:
    -------------------

    row: mapping / pd.Series : must expose a "product_category_tree" entry
         holding a JSON array with exactly one string such as "A >> B >> C"

    return:
    -------------------

    the same row object, with "category_lvl_1", "category_lvl_2", ... added
    (stripped and lower-cased), one entry per level found in the path

    raises:
    -------------------

    ValueError : when the JSON array does not contain exactly one path
    """
    category_paths = json.loads(row["product_category_tree"])

    # Raise instead of calling sys.exit(): exiting from inside DataFrame.apply
    # would stop the whole kernel and hide which row was at fault.
    if len(category_paths) != 1:
        raise ValueError(
            "Expected exactly one category path in 'product_category_tree', "
            f"got {len(category_paths)}: update the script to cover that."
        )

    # Levels are numbered from 1 so that the column names read naturally.
    for level, category in enumerate(category_paths[0].split(">>"), start=1):
        row[f"category_lvl_{level}"] = category.strip().lower()
    return row

# Apply the funcion to all rows
df = df.assign(**df.apply(lambda row: create_category_columns(row), axis=1))  


In [None]:
df.head()

In [None]:
df.isna().sum()

In [None]:
# Levels 3 to 7 are empty for products whose category path is shallower.
# The filled columns are assigned back to the frame: calling
# fillna(inplace=True) on a column selection is a chained assignment, which
# under pandas Copy-on-Write only updates a temporary and leaves df untouched.
optional_levels = [f"category_lvl_{level}" for level in range(3, 8)]
df[optional_levels] = df[optional_levels].fillna("undefined")
df.isna().sum()


Je regarde la distribution de mes valeurs sur la première catégorie

In [None]:
df['category_lvl_1'].value_counts()

In [None]:
# Assuming 'df' is your DataFrame  
category_counts = df['category_lvl_1'].value_counts()  
  
# Set color palette from seaborn  
colors = sns.color_palette('Set3')  
  
# Create pie chart  
plt.figure(figsize=(5, 5))  
plt.pie(category_counts, labels=category_counts.index, autopct='%1.1f%%', startangle=90, colors=colors)  
plt.axis('equal')  
plt.title("Value counts of 'category_lvl_1'")  
  
# Show the chart  
plt.show()  

La distribution est similaire pour chaque produits, j'affiche la 2e catégorie via le graphique sunburst

In [None]:
fig = px.sunburst(df, path=['category_lvl_1', 'category_lvl_2'])

fig.update_layout(  
    margin=dict(t=0, l=0, r=0, b=0),  
    width=500,  
    height=500,  
    title="Zoomable Sunburst Chart"  
)  

fig.show()

Je visualise les mêmes données avec un Treemap afin de comparer le rendu avec le sunburst et de choisir lequel utiliser lors de la présentation.

In [None]:
fig = px.treemap(df, path=['category_lvl_1', 'category_lvl_2', 'category_lvl_3'])  
  
fig.update_layout(  
    margin=dict(t=0, l=0, r=0, b=0),  
    width=800,  
    height=600,  
    title="Treemap Chart"  
)  
  
fig.show()

Je suis ok au niveau de la visualisation, ce qui m'interesse sera de classifier uniquement la catégorie lvl 1.  
Je vais donc mettre à jour mon dataframe

In [None]:
df = df[["uniq_id", "product_name", "description", "category_lvl_1"]]
df.head()

J'effectue maintenant un label encoding sur mes categories

In [None]:
df = df.rename(columns={'category_lvl_1': 'cat'})
original_df = df.copy()

encoder = LabelEncoder()  
df["cat_e"] = encoder.fit_transform(df["cat"])  

In [None]:
df.head()

Je vérifie si j'ai des doublons dans la description

In [None]:
idx = df.duplicated(subset="description", keep="first")
df.loc[idx,:].sort_values("description")

## Nettoyer la description

Je vais maintenant nettoyer la colonne description, ce qui consiste à analyser le nombre original de token (mots) et de les réduire au maximum tout en conservant la pertinence des mots. C'est à dire retirer les mots unique, retirer les mots qui ne sont pas anglais, ...

En nettoyant cette colonne, je vais pouvoir réduire le bruit et étudier au mieux la faisabilité de classification.

Je commence d'abord par créer mon corpus, le document qui va contenir toutes les description de tous les produits.

In [None]:

# HELPERS

import os, sys, time
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize, wordpunct_tokenize

from nltk.corpus import words, stopwords
from nltk.tokenize import RegexpTokenizer

from wordcloud import WordCloud
from PIL import Image
from collections import Counter  

from pandarallel import pandarallel

# ---- 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  
from sklearn.decomposition import PCA  
from sklearn.manifold import TSNE  
from sklearn.metrics import adjusted_rand_score
from sklearn.cluster import KMeans  
from umap import UMAP  
from sklearn.cluster import DBSCAN  
from sklearn.decomposition import TruncatedSVD  
from sklearn.decomposition import LatentDirichletAllocation  
from sklearn.metrics import accuracy_score  
from sklearn.metrics import confusion_matrix  
from gensim.models import Word2Vec  

# nltk.download("omw-1.4")
# nltk.download("wordnet")
# nltk.download("stopwords")
# nltk.download("words")

max_workers = os.cpu_count()  
print("Maximum number of workers:", max_workers)  

pandarallel.initialize(progress_bar=True, nb_workers=max_workers)

sns.set()

# Summarise a token list: total count, distinct count and a short preview.
def display_token_info(tokens):
    """
    Print the size of a token list followed by its first 30 tokens.

    required arguments:
    -------------------

    tokens: list : the tokens to summarise

    return:
    -------------------

    None (the summary is written to standard output)
    """
    total = len(tokens)
    distinct = len(set(tokens))
    preview = tokens[:30]

    print(f"nb tokens {total}, nb token uniques {distinct}")
    print(preview)
    
english_stop_words = set(stopwords.words("english"))


def process_text_step_1(doc):
    """
    Step 1 of the cleaning pipeline: lower-case the text and tokenize it.

    required arguments:
    -------------------

    doc: str : the document to process

    return:
    -------------------

    a list of tokens
    """
    normalized = doc.lower().strip()

    # r"\w+" keeps runs of word characters, so punctuation and whitespace
    # act as separators and are dropped.
    return RegexpTokenizer(r"\w+").tokenize(normalized)

def process_text_step_2(doc, stopwords):
    """
    Step 2 of the cleaning pipeline: step 1, then stop-word removal.

    required arguments:
    -------------------

    doc: str : the document to process
    stopwords: collection of str : the stop words to drop from the tokens

    return:
    -------------------

    a list of tokens
    """
    kept_tokens = []
    for token in process_text_step_1(doc):
        if token in stopwords:
            continue
        kept_tokens.append(token)

    return kept_tokens

def process_text_step_3(doc, stopwords):
    """
    Step 3 of the cleaning pipeline: step 2, then keep alphabetic tokens only.

    required arguments:
    -------------------

    doc: str : the document to process
    stopwords: collection of str : the stop words to drop from the tokens

    return:
    -------------------

    a list of tokens
    """
    tokens_without_stopwords = process_text_step_2(doc, stopwords)

    # str.isalpha rejects any token holding a digit or an underscore.
    return list(filter(str.isalpha, tokens_without_stopwords))

def process_text_step_4(doc, 
                        stopwords, 
                        delete_words
                        ):
    """
    Step 4 of the cleaning pipeline: step 3, then removal of unwanted words.

    required arguments:
    -------------------

    doc: str : the document to process
    stopwords: collection of str : the stop words to drop from the tokens
    delete_words: iterable of str : extra words to drop from the tokens

    return:
    -------------------

    a list of tokens
    """
    processed_tokens = process_text_step_3(doc, stopwords)

    # delete_words is usually a long list; a set makes each membership test
    # constant-time instead of a linear scan per token.
    banned_words = set(delete_words)
    processed_tokens = [w for w in processed_tokens if w not in banned_words]

    return processed_tokens

def process_text_step_5(doc, 
                        stopwords, 
                        delete_words,
                        min_word_length
                        ):
    """
    Step 5 of the cleaning pipeline: step 4, then removal of short tokens.

    required arguments:
    -------------------

    doc: str : the document to process
    stopwords: collection of str : the stop words to drop from the tokens
    delete_words: list : a list of words to remove from the tokens
    min_word_length: int : minimum number of characters a token needs to be kept

    return:
    -------------------

    a list of tokens
    """
    long_enough_tokens = []
    for token in process_text_step_4(doc, stopwords, delete_words):
        if len(token) < min_word_length:
            continue
        long_enough_tokens.append(token)

    return long_enough_tokens

def process_text_step_6(doc, 
                        stopwords, 
                        delete_words,
                        min_word_length,
                        use_lemm,
                        ):
    """
    Step 6 of the cleaning pipeline: step 5, then lemmatization or stemming.

    required arguments:
    -------------------

    doc: str : the document to process
    stopwords: collection of str : the stop words to drop from the tokens
    delete_words: iterable of str : extra words to drop from the tokens
    min_word_length: int : minimum number of characters a token needs to be kept
    use_lemm: bool : True to use WordNetLemmatizer, False to use PorterStemmer

    return:
    -------------------

    a list of tokens
    """
    processed_tokens = process_text_step_5(doc, stopwords, delete_words, min_word_length)

    if use_lemm:
        normalize = WordNetLemmatizer().lemmatize
    else:
        normalize = PorterStemmer().stem

    # Set lookup is constant-time, whereas a list would be scanned per token.
    banned_words = set(delete_words)

    # Normalization can shorten a token or turn it into a banned word, so the
    # two filters of the earlier steps are applied once more: tokens shorter
    # than min_word_length and tokens listed in delete_words are dropped.
    return [
        token
        for token in map(normalize, processed_tokens)
        if len(token) >= min_word_length and token not in banned_words
    ]

def final_process(doc, 
                        stopwords, 
                        delete_words,
                        min_word_length,
                        use_lemm,
                        ):
    """
    Run the full cleaning pipeline (steps 1 to 6) and return a single string.

    required arguments:
    -------------------

    doc: str : the document to process
    stopwords: collection of str : the stop words to drop from the tokens
    delete_words: iterable of str : extra words to drop from the tokens
    min_word_length: int : minimum number of characters a token needs to be kept
    use_lemm: bool : True to use the lemmatizer, False to use the stemmer

    return:
    -------------------

    a str made of the cleaned tokens joined by single spaces
    """
    # process_text_step_6 already lemmatizes/stems and re-applies the length
    # and delete_words filters. Running the stemmer a second time here would
    # stem the stems again, so the per-row text would no longer match the
    # tokens analysed with step 6 on the whole corpus.
    processed_tokens = process_text_step_6(doc, stopwords, delete_words, min_word_length, use_lemm)

    return " ".join(processed_tokens)

def process_text_step_7_deprecated(doc, 
                        stopwords, 
                        delete_words,
                        min_word_length,
                        use_lemm,
                        allow_words,
                        ):
    """
    Step 6, then keep only the tokens found in an allow-list (deprecated).

    required arguments:
    -------------------

    doc: str : the document to process
    stopwords: collection of str : the stop words to drop from the tokens
    delete_words: list : a list of words to remove from the tokens
    min_word_length: int : minimum number of characters a token needs to be kept
    use_lemm: bool : True to use the lemmatizer, False to use the stemmer
    allow_words: iterable of str : the only words allowed to remain

    return:
    -------------------

    a list of tokens
    """
    processed_tokens = process_text_step_6(doc, stopwords, delete_words, min_word_length, use_lemm)

    # The allow-list can be a whole dictionary; converting it to a set once
    # avoids scanning the full list for every token.
    allowed = set(allow_words)
    processed_tokens = [i for i in processed_tokens if i in allowed]
    return processed_tokens


In [None]:
raw_corpus = " ".join(df["description"].values)
raw_corpus

J'observe le nombre de charactères

In [None]:
len(raw_corpus)

**Première analyse:**
- Je tokenize mon document en conservant les caractères alphanumérique et en mettant chaque mot en minuscule

In [None]:
tokens = process_text_step_1(raw_corpus)
display_token_info(tokens)

J'ai donc un nombre de token initial de 81,219 token et 6,284 unique.  
Je vais tenter de réduire ce nombre au maximum, par itérations successives.  

Je souhaite retirer tous les mots qui sont considérés comme stopwords en Anglais, des mots qui ne seront pas utile à définir une catégorie (le, la, les, ...)

**Deuxième analyse:**
- Je tokenize mon document en conservant les caractères alphanumérique et en mettant chaque mot en minuscule
- Je supprime les token qui comprennent les stopwords Anglais.

In [None]:
tokens = process_text_step_2(raw_corpus, english_stop_words)
display_token_info(tokens)

Déjà ~20k tokens retiré, c'est que le début

Maintenant je souhaite retirer toutes les valeurs qui sont des nombres, je ne pense pas que les valeurs numériques me permettent de mieux distinguer une catégorie.

**Troisième analyse:**
- Je tokenize mon document en conservant les caractères alphanumérique et en mettant chaque mot en minuscule
- Je supprime les token qui comprennent les stopwords Anglais.
- Je supprime tous les mots numériques

In [None]:
tokens = process_text_step_3(raw_corpus, english_stop_words)
display_token_info(tokens)

Environ 6k tokens en moins, dont 1k tokens uniques.  
Maintenant, je pense que les mots présents une seule fois dans le corpus ne sont pas utiles pour déterminer une catégorie, je vais donc les supprimer.

In [None]:
pd.Series(tokens).value_counts()

In [None]:
all_tokens = pd.Series(tokens).value_counts()
tokens_one_occurence = all_tokens[all_tokens==1]
tokens_one_occurence = list(tokens_one_occurence.index)

print("len",len(tokens_one_occurence))
tokens_one_occurence[:5]

**Quatrième analyse:**
- Je tokenize mon document en conservant les caractères alphanumérique et en mettant chaque mot en minuscule
- Je supprime les token qui comprennent les stopwords Anglais.
- Je supprime tous les mots numériques
- Je supprime tous les mots qui sont présent qu'une seul fois


In [None]:
tokens = process_text_step_4(raw_corpus, english_stop_words, delete_words=tokens_one_occurence)
display_token_info(tokens)

~1500 tokens unique retiré. Je continue.  
Je vais observer tous les mots qui ont moins de 3 caractères, je ne suis pas sur que ces mots définissent une catégorie facilement.

Rien d'interessant a vue d'oeil.  
Je les supprime.

**Cinquième analyse:**
- Je tokenize mon document en conservant les caractères alphanumérique et en mettant chaque mot en minuscule
- Je supprime les token qui comprennent les stopwords Anglais.
- Je supprime tous les mots numériques
- Je supprime tous les mots qui sont présent qu'une seul fois
- Je supprime tous les mots qui font moins de 3 caractères


In [None]:
tokens = process_text_step_5(raw_corpus, english_stop_words, delete_words=tokens_one_occurence, min_word_length=3)
display_token_info(tokens)

Pas beaucoup de tokens retirés, c'est mieux que rien.  
Maintenant le plus intéressant, stemmer ou lemmatizer ?

- Stemmer: réduit les mots à leur racine (supprime les affixes) (rapide et ne tient pas en compte le contexte)
- Lemmatizer: Normalize les mots depuis un dictionnaire (lent et tient en compte le sens des mots)

Ici, je souhaite classifier une catégorie, je ne pense pas que le sens des mots soit important comparé à une analyse de positivité d'un text.  
Je pense utiliser celui qui me supprimera le plus de token. Je pourrais aussi retenter mon approche plus bas avec l'autre méthode.

**Sixième analyse:**
- Je tokenize mon document en conservant les caractères alphanumérique et en mettant chaque mot en minuscule
- Je supprime les token qui comprennent les stopwords Anglais.
- Je supprime tous les mots numériques
- Je supprime tous les mots qui sont présent qu'une seul fois
- Je supprime tous les mots qui font moins de 3 caractères
- J'effectue un Stemmer ou Lemmatizer


In [None]:
tokens = process_text_step_6(raw_corpus, 
                             english_stop_words, 
                             delete_words=tokens_one_occurence, 
                             min_word_length=3,
                             use_lemm=True)
display_token_info(tokens)

In [None]:
tokens = process_text_step_6(raw_corpus, 
                             english_stop_words, 
                             delete_words=tokens_one_occurence, 
                             min_word_length=3,
                             use_lemm=False)
display_token_info(tokens)

Plus rapide et moins de token par stemmer, je vais donc utiliser cette méthode par la suite.  
Ça commence à être pas trop mal. 

Je vais maintenant tester de retirer tous les mots qui ne sont pas dans la langue anglaise.

In [None]:
eng_words = [i.lower() for i in words.words()]
len(set(eng_words))

In [None]:
tokens = process_text_step_7_deprecated(raw_corpus, 
                             english_stop_words, 
                             delete_words=tokens_one_occurence, 
                             min_word_length=3,
                             use_lemm=False,
                             allow_words=eng_words)
display_token_info(tokens)

Je suis pas fan, j'ai peur de retirer trop de mots. Je perds "featur", même si je modifie le step 7 pour ne pas faire de stemmer, je perds également le mot "feature".  
Je n'ai pas confiance en cette méthode, je ne l'utiliserai pas.

C'est déjà pas mal, je vais observer les mots les plus courant pour chaque catégorie puis vérifier si il y a des doublons entre toute les catégories et supprimer ce que je trouve inutile.

In [None]:
df.columns

In [None]:
tokens_per_cat={}
for idx, cat in enumerate(df["cat"].unique()):
    df_cat = df[df['cat'] == cat]
    cat_corpus = " ".join(df_cat["description"])
    
    tokens_per_cat[cat] = process_text_step_6(cat_corpus, 
                             english_stop_words, 
                             delete_words=tokens_one_occurence, 
                             min_word_length=3,
                             use_lemm=False)
    
    print(f"{cat} has {len(cat_corpus)} characters for {len(tokens_per_cat[cat])} words")
    word_frequencies = Counter(tokens_per_cat[cat])  

    wordcloud = WordCloud(background_color="white",
                      stopwords=[],
                      max_words=50).generate_from_frequencies(frequencies=word_frequencies)

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    
    

In [None]:
category_keys = list(tokens_per_cat.keys())  
  
lengths = [len(tokens_per_cat[key]) for key in category_keys]  
df_cat = pd.DataFrame({"Category": category_keys, "Length": lengths})  
df_cat = df_cat.sort_values(by="Length", ascending=False)  
  
color_palette = sns.color_palette("Set3")  
sns.set(style="whitegrid")  
sns.barplot(x="Category", y="Length", data=df_cat, palette=color_palette)  
plt.xticks(rotation=90)  # Rotate x-axis labels if necessary  
plt.xlabel("Category")  
plt.ylabel("Length")  
plt.title("Category Lengths")  
plt.show()  

In [None]:
categories = list(tokens_per_cat.keys())

# One vocabulary set per remaining category, built once: membership checks
# are then constant-time instead of rebuilding a Series for every token.
other_vocabularies = [set(tokens_per_cat[name]) for name in categories[1:]]

# Tokens of the first category, most frequent first (value_counts order).
first_category_tokens = pd.Series(tokens_per_cat[categories[0]]).value_counts().index

# A token is "duplicated" when it appears in every category.
duplicated_values = [
    value
    for value in first_category_tokens
    if all(value in vocabulary for vocabulary in other_vocabularies)
]

print(len(duplicated_values))
print(duplicated_values)

J'ai 79 mots présents dans toutes les catégories. Ils ne permettent pas de distinguer une catégorie d'une autre et ajoutent du bruit, je préfère donc les retirer.  
Pour rappel, le résultat précédent que je conserve est: `nb tokens 51097, nb token uniques 3123`

In [None]:
words_deleted_manually = []
# words_deleted_manually = ["com","guarante","best","onlin","warranti","print"]
new_delete_words = list(set(tokens_one_occurence + duplicated_values + words_deleted_manually))
tokens = process_text_step_6(raw_corpus, 
                             english_stop_words, 
                             delete_words=new_delete_words, 
                             min_word_length=3,
                             use_lemm=True)
display_token_info(tokens)

Beaucoup de mots retirés mais je suis confiant que c'est utile.

In [None]:
tokens_per_cat[cat]

In [None]:
tokens_per_cat={}
for idx, cat in enumerate(df["cat"].unique()):
    df_cat = df[df['cat'] == cat]
    cat_corpus = " ".join(df_cat["description"])
    
    tokens_per_cat[cat] = process_text_step_6(cat_corpus, 
                             english_stop_words, 
                             delete_words=new_delete_words, 
                             min_word_length=3,
                             use_lemm=False)
    
    print(f"{cat} has {len(cat_corpus)} characters for {len(tokens_per_cat[cat])} words")
    word_frequencies = Counter(tokens_per_cat[cat])  

    wordcloud = WordCloud(background_color="white",
                      stopwords=[],
                      max_words=50).generate_from_frequencies(frequencies=word_frequencies)

    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()

Je suis satisfait de ce nettoyage.

Pour rappel, step initial: `nb tokens 81219, nb token uniques 6284`  
maintenant: `nb tokens 39942, nb token uniques 3030`

Réduction de plus de 50%.
Je vais donc lancer cette transformation sur toutes les lignes de mon dataframe et rajouter une colonne "clean_desc".

In [None]:
df_clean_lemm = df.copy()
df_clean_stemm = df.copy()

# Stemmed variant of the cleaned description.
df_clean_stemm["clean_desc"] = df_clean_stemm["description"].parallel_apply(lambda x: final_process(x, 
                             english_stop_words, 
                             delete_words=new_delete_words, 
                             min_word_length=3,
                             use_lemm=False))

# Lemmatized variant: use_lemm must be True here, otherwise this frame is a
# plain copy of the stemmed one and the later STEMM/LEMM comparisons are void.
df_clean_lemm["clean_desc"] = df_clean_lemm["description"].parallel_apply(lambda x: final_process(x, 
                             english_stop_words, 
                             delete_words=new_delete_words, 
                             min_word_length=3,
                             use_lemm=True))



In [None]:
### BONUS FOR TESTING
df_clean = df.copy()
df_clean["clean_desc_step1"] = df_clean_stemm["description"].parallel_apply(lambda x: " ".join(process_text_step_1(x)))

df_clean["clean_desc_step2"] = df_clean_stemm["description"].parallel_apply(lambda x: " ".join(process_text_step_2(x, 
                             english_stop_words)))

df_clean["clean_desc_step3"] = df_clean_stemm["description"].parallel_apply(lambda x: " ".join(process_text_step_3(x, 
                             english_stop_words)))

df_clean["clean_desc_step4"] = df_clean_stemm["description"].parallel_apply(lambda x: " ".join(process_text_step_4(x, 
                             english_stop_words, 
                             delete_words=new_delete_words)))

df_clean["clean_desc_step5"] = df_clean_stemm["description"].parallel_apply(lambda x: " ".join(process_text_step_5(x, 
                             english_stop_words, 
                             delete_words=new_delete_words, 
                             min_word_length=3)))

J'affiche la colonne description originale et celle nettoyée pour les comparer.

In [None]:
# None means "no truncation"; the former -1 sentinel is deprecated and is
# rejected by recent pandas versions.
pd.set_option('display.max_colwidth', None)
df_clean_stemm[["description","clean_desc"]].head(3)

In [None]:
#df[df["clean_desc"].isna()]["description"]
df_clean_stemm[df_clean_stemm["clean_desc"] == ""]
# df = df.drop(df[df["clean_desc"] == ""].index)

Je sauvegarde mon dataframe dans un csv.

In [None]:
df_clean_stemm.to_csv("./../input/df_cleaned_stemm.csv", index=False)
df_clean_lemm.to_csv("./../input/df_cleaned_lemm.csv", index=False)

À noter : je n'ai pas utilisé de bi-grams ni de tri-grams car, en première réflexion, je ne pense pas qu'ils soient utiles pour classifier des catégories.

Néanmoins, pour valider cette approche, je le ferai tout de même par la suite afin d'observer la différence de précision.

In [None]:
# Add the total and distinct token counts of the cleaned description to both
# frames. The computation is vectorized and avoids a local variable named
# `words`, which would shadow the `nltk.corpus.words` import used earlier.
for frame in (df_clean_stemm, df_clean_lemm):
    # str.split() without argument yields an empty list for an empty
    # description, so such rows count 0 words (split(" ") would count 1).
    token_lists = frame["clean_desc"].str.split()

    frame["word_count_clean"] = token_lists.str.len()
    frame["unique_word_count_clean"] = token_lists.apply(lambda tokens: len(set(tokens)))

In [None]:
sorted_df_clean = df_clean_stemm.sort_values("word_count_clean", ascending=False)    
top_names = sorted_df_clean["product_name"].head(20)  
  
top_word_counts = sorted_df_clean["word_count_clean"].head(20)  
top_unique_counts = sorted_df_clean["unique_word_count_clean"].head(20)  

plt.figure(figsize=(15, 10))  
plt.subplot(2, 1, 1)

# Create a horizontal barplot using seaborn    
sns.barplot(y=top_names, x=top_word_counts, orient="h",  color="#e7a44a", label="Word Count")    
sns.barplot(y=top_names, x=top_unique_counts, orient="h",  color="#313c59", label="Unique Word Count")    

# Set the title and axes labels    
plt.title("Top 20 Names by Word Count (STEMM)")    
plt.xlabel("Word Count")    
plt.ylabel("Name")    

# Rotate x-axis labels for better readability    
plt.xticks(rotation=90)
    
plt.subplot(2, 1, 2)
sorted_df_clean = df_clean_lemm.sort_values("word_count_clean", ascending=False)    
top_names = sorted_df_clean["product_name"].head(20)  
  
top_word_counts = sorted_df_clean["word_count_clean"].head(20)  
top_unique_counts = sorted_df_clean["unique_word_count_clean"].head(20)  

# Create a horizontal barplot using seaborn    
sns.barplot(y=top_names, x=top_word_counts, orient="h",  color="#e7a44a", label="Word Count")    
sns.barplot(y=top_names, x=top_unique_counts, orient="h",  color="#313c59", label="Unique Word Count")    

# Set the title and axes labels    
plt.title("Top 20 Names by Word Count (LEMM)")    
plt.xlabel("Word Count")    
plt.ylabel("Name")    

# Rotate x-axis labels for better readability    
plt.xticks(rotation=90)

# Display the plot 
plt.legend()
plt.tight_layout()  
plt.show()  


## Classification text non supervisé & supervisé

Étapes:
- **Non supervisé**
  - Extraction des features
  - Réduction en 2 dimensions
  - Kmeans
  - Visualisation Graphique
  - Score

- **Supervisé**
  - Extraction des features
  - Réduction en 2 dimensions
  - Classification avec apprentissage
  - Visualisation Graphique
  - Score


In [None]:
from transformers import BertTokenizer, BertModel  
import tensorflow as tf  
import tensorflow_hub as hub  
from sklearn.model_selection import train_test_split  
from sklearn.ensemble import RandomForestClassifier  

def plot_confusion_matrix(y_true, y_pred, labels, title):
    """
    Draw the confusion matrix of y_true against y_pred as an annotated heatmap
    on the current matplotlib axes.

    labels is used for the tick labels of both axes; title is appended to
    "Confusion matrix " to form the plot title.
    """
    axes = sns.heatmap(
        data=confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',
        xticklabels=labels,
        yticklabels=labels,
    )
    axes.set_xlabel('Predicted')
    axes.set_ylabel('True')
    axes.set_title(f"Confusion matrix {title}")
  
# Plot the confusion matrix  
# labels = np.unique(test_df['orginal_cat'])  # Unique labels in target labels  
# plot_confusion_matrix(test_df['orginal_cat'], test_df['predicted_cat'], labels)  


def modify_labels(y_true, y_pred):
    """
    Rename predicted cluster ids so that they line up with the true labels.

    True labels are visited in increasing order; each one is paired with the
    not-yet-used predicted id that has the largest confusion-matrix count on
    its row (first one wins on ties). Predictions are then rewritten with the
    paired true label.

    Returns a modified copy of y_pred; y_pred itself is left untouched.
    """
    counts = confusion_matrix(y_true, y_pred)
    class_ids = range(len(np.unique(y_true)))
    relabelled = np.copy(y_pred)

    assignment = {}
    for true_label in class_ids:
        # Predicted ids already paired with an earlier true label.
        already_taken = set(assignment.values())
        best_count = -1
        for candidate in class_ids:
            if candidate in already_taken:
                continue
            if counts[true_label, candidate] > best_count:
                best_count = counts[true_label, candidate]
                assignment[true_label] = candidate

    # Masks are computed on the original predictions so that one renaming
    # cannot cascade into another.
    for true_label, cluster_id in assignment.items():
        relabelled[y_pred == cluster_id] = true_label

    return relabelled
  
# # Modify the predicted values based on the confusion matrix  
# test_df['predicted_cat_adjusted'] = modify_labels(test_df['orginal_cat'], test_df['predicted_cat'])
# plot_confusion_matrix(test_df['orginal_cat'], test_df['predicted_cat_adjusted'], labels)  

def check_performance(df, 
                      reduction_name, 
                      reduction_method,
                      
                      clustering_name,
                      clustering_method,
                      
                      original_df,
                      draw_graph=False):
    """
    Reduce a feature matrix, cluster the reduced points and score the clusters
    against the encoded categories.

    Parameters
    ----------
    df : feature matrix passed to ``reduction_method.fit_transform``
    reduction_name : str, label of the reduction used in titles and in the result
    reduction_method : estimator exposing ``fit_transform``
    clustering_name : str, label of the clustering used in titles
    clustering_method : estimator exposing ``fit_predict``
    original_df : frame providing the ``cat_e`` (encoded) and ``cat`` columns
        used as ground truth
    draw_graph : bool, when True print the scores and draw the scatter plots
        and confusion matrices

    Returns
    -------
    dict with keys ``label`` (the reduction name), ``ari_score``, ``accuracy``
    (accuracy of the re-labelled clusters, not of the raw ones) and
    ``clusters`` (the raw cluster ids).
    """
    
    # Apply a dimensional reduction  
    reduced_data = reduction_method.fit_transform(df)  

    # Determine new clusters on the reduced data
    clusters = clustering_method.fit_predict(reduced_data)

    # Re-label the cluster ids so they can be compared with the encoded categories
    clusters_adjusted = modify_labels(original_df['cat_e'], clusters)
    # ARI and raw accuracy are both computed on the raw cluster ids
    ari_score = adjusted_rand_score(original_df['cat_e'], clusters)
    accuracy = accuracy_score(original_df['cat_e'], clusters)  
    
    # Accuracy after re-labelling; this is the value returned as "accuracy"
    accuracy_adjusted = accuracy_score(original_df['cat_e'], clusters_adjusted)  
    
    if(draw_graph):
        print(f"ARI Score ({clustering_name} - {reduction_name}) : %.2f" % ari_score)  
        print(f"ACCURACY ({clustering_name} - {reduction_name}) : %.2f" % accuracy)  
        print(f"ACCURACY Adjusted ({clustering_name} - {reduction_name}) : %.2f" % accuracy_adjusted)  
        
        # Plot on the left the datapoints with existing categories  
        plt.figure(figsize=(12, 3))  
        plt.subplot(1, 2, 1)  
        sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=original_df['cat'], palette='Set1')  
        plt.title(f"Real Categories {clustering_name} - {reduction_name}")  
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)  
        
        # Plot on the right the datapoints with determined categories  
        plt.subplot(1, 2, 2)  
        sns.scatterplot(x=reduced_data[:, 0], y=reduced_data[:, 1], hue=clusters, palette='Set1')  
        plt.title(f"Predicted Categories {clustering_name} - {reduction_name}")  
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)  
    
        plt.tight_layout()  
        plt.show()
        
        # Second figure: confusion matrices before (left) and after (right) re-labelling
        plt.figure(figsize=(12, 3))
        plt.subplot(1, 2, 1)
        plot_confusion_matrix(original_df['cat_e'], clusters, np.unique(original_df['cat_e']), "Predicted")
        plt.subplot(1, 2, 2)
        plot_confusion_matrix(original_df['cat_e'], clusters_adjusted, np.unique(original_df['cat_e']), "Predicted and adjusted")
        plt.tight_layout()  
        plt.show()
    
    
    # "label" only carries the reduction name; the clustering name is not included
    return {"label": f"{reduction_name}", "ari_score": ari_score,"accuracy": accuracy_adjusted, "clusters": clusters}

In [None]:
# kmeans = KMeans(n_clusters=7, n_init=10, init='k-means++', max_iter=300, random_state=23)
# tsne = PCA(n_components=2)
# test_res = check_performance(tfidf_df, 
#                             "tsne", 
#                             tsne, 
#                             "kmeans", 
#                             kmeans, 
#                             df,
#                             draw_graph=True)
# kmeans = KMeans(n_clusters=7, n_init=10, init='k-means++', max_iter=300, random_state=23)
# tsne = TSNE(n_components=2, random_state=23)
# test_res = check_performance(tfidf_df, 
#                             "tsne", 
#                             tsne, 
#                             "kmeans", 
#                             kmeans, 
#                             df,
#                             draw_graph=True)

In [None]:
# Every stochastic estimator gets an explicit random_state so that a
# Restart & Run All reproduces the same projections and scores.
clusterings = [
    {"name": "KMEANS", "model": KMeans(n_clusters=7, n_init=10, init='k-means++', max_iter=300, random_state=42)},
    # {"name": "DBSCAN", "model": DBSCAN(eps=0.5, min_samples=5)  }
]

dimension_reduction = [
    {"name": "PCA", "model": PCA(n_components=2, random_state=42)},
    {"name": "t-SNE", "model": TSNE(n_components=2, perplexity=40, random_state=42)  },
    {"name": "UMAP", "model": UMAP(n_components=2, random_state=42) },
    {"name": "SVD", "model": TruncatedSVD(n_components=2, random_state=42) },
]


def check_one_df(dataFrame, draw_graph=False):
    """Score one feature matrix with every reduction / clustering combination.

    Iterates over the module-level ``dimension_reduction`` and ``clusterings``
    lists and delegates each pair to ``check_performance``, passing the global
    ``df`` as the reference frame.

    Args:
        dataFrame: feature matrix to evaluate (one row per document).
        draw_graph: forwarded to ``check_performance``.

    Returns:
        DataFrame with columns ``label``, ``ari`` and ``accuracy``, one row per
        (reduction, clustering) pair, in iteration order.
    """
    # Collect plain records and build the frame once: concatenating onto an
    # empty frame inside the loop is quadratic and relies on deprecated
    # pandas dtype handling for empty entries.
    records = []
    for dim_reduc in dimension_reduction:
        for clustering in clusterings:
            res = check_performance(dataFrame,
                                    dim_reduc["name"],
                                    dim_reduc["model"],
                                    clustering["name"],
                                    clustering["model"],
                                    df,
                                    draw_graph)
            records.append({"label": res["label"], "ari": res["ari_score"], "accuracy": res["accuracy"]})

    # Explicit columns keep the schema stable even when nothing was evaluated
    return pd.DataFrame(records, columns=["label", "ari", "accuracy"])

In [None]:
# Bag-of-words counts of the df_clean_stemm descriptions, scored without plots
count_vectorizer = CountVectorizer()
bow_matrix = count_vectorizer.fit_transform(df_clean_stemm["clean_desc"])
bow_df = pd.DataFrame(
    data=bow_matrix.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)
bow_df_scores = check_one_df(bow_df, draw_graph=False)

In [None]:
# Display the df_clean_lemm descriptions that the TF-IDF cell below vectorizes
df_clean_lemm['clean_desc']

In [None]:
# TF-IDF weights of the df_clean_lemm descriptions, scored with plots enabled
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_clean_lemm["clean_desc"])
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

tfidf_df_scores = check_one_df(tfidf_df, draw_graph=True)

In [None]:

# Placeholder so the get_score_* helpers in this cell can reference the name;
# the real implementation is defined later on, in the supervised section.
def check_one_df_supervised(dataFrame=None, draw_graph=False):
    """Stub with the same call shape as the later, real implementation.

    The helpers call ``check_one_df_supervised(frame, False)``, so the stub
    accepts (and ignores) both arguments instead of raising ``TypeError``.

    Returns:
        ``np.nan``, signalling that no supervised score is available yet.
    """
    return np.nan

def get_score_bert(texts, supervised=False):
    """Embed each text with 'bert-base-uncased' and score the embeddings.

    Args:
        texts: iterable of strings, one document each.
        supervised: when truthy the embeddings go to ``check_one_df_supervised``,
            otherwise to ``check_one_df`` (plots disabled in both cases).

    Returns:
        Whatever the selected scoring function returns.
    """
    # The pre-trained tokenizer and encoder are loaded on every call
    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    bert_model = BertModel.from_pretrained('bert-base-uncased')

    def embed(description):
        # Documents are truncated to 512 tokens, then the last hidden states
        # are averaged over the token axis to get one vector per document
        encoded = bert_tokenizer(description, padding=True, truncation=True, return_tensors="pt", max_length=512)
        hidden_states = bert_model(**encoded).last_hidden_state
        return hidden_states.mean(dim=1).detach().numpy()[0]

    bert_df = pd.DataFrame([embed(description) for description in texts])

    scorer = check_one_df_supervised if supervised else check_one_df
    return scorer(bert_df, False)

def get_score_use(texts, supervised=False):
    """Embed each text with the Universal Sentence Encoder and score the result.

    Args:
        texts: iterable of strings, one document each.
        supervised: when truthy the embeddings go to ``check_one_df_supervised``,
            otherwise to ``check_one_df`` (plots disabled in both cases).

    Returns:
        Whatever the selected scoring function returns.
    """
    # The encoder is fetched through hub.load on every call
    encoder = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/frameworks/TensorFlow2/variations/universal-sentence-encoder/versions/2")

    # Encode one description per call and keep the single returned vector
    use_df = pd.DataFrame([encoder([description]).numpy()[0] for description in texts])

    scorer = check_one_df_supervised if supervised else check_one_df
    return scorer(use_df, False)
  
def get_score_count_vectorizer(texts, supervised=False):
    """Build a bag-of-words matrix from ``texts`` and score it.

    Args:
        texts: iterable of strings, one document each.
        supervised: when truthy the matrix goes to ``check_one_df_supervised``,
            otherwise to ``check_one_df`` (plots disabled in both cases).

    Returns:
        Whatever the selected scoring function returns.
    """
    vectorizer = CountVectorizer()
    term_counts = vectorizer.fit_transform(texts)
    # Dense frame with one column per vocabulary term
    df_model = pd.DataFrame(term_counts.toarray(), columns=vectorizer.get_feature_names_out())

    scorer = check_one_df_supervised if supervised else check_one_df
    return scorer(df_model, False)
    
def get_score_tfidf(texts, supervised=False):
    """Build a TF-IDF matrix from ``texts`` and score it.

    Args:
        texts: iterable of strings, one document each.
        supervised: when truthy the matrix goes to ``check_one_df_supervised``,
            otherwise to ``check_one_df`` (plots disabled in both cases).

    Returns:
        Whatever the selected scoring function returns.
    """
    vectorizer = TfidfVectorizer()
    weights = vectorizer.fit_transform(texts)
    # Dense frame with one column per vocabulary term
    df_model = pd.DataFrame(weights.toarray(), columns=vectorizer.get_feature_names_out())

    scorer = check_one_df_supervised if supervised else check_one_df
    return scorer(df_model, False)
def get_score_word2vec(texts, supervised=False):
    """Train Word2Vec on ``texts`` and score the averaged document vectors.

    Args:
        texts: iterable of strings; each one is split on whitespace into tokens.
        supervised: when truthy the vectors go to ``check_one_df_supervised``,
            otherwise to ``check_one_df`` (plots disabled in both cases).

    Returns:
        Whatever the selected scoring function returns.
    """
    sentences = [doc.split() for doc in texts]
    model = Word2Vec(sentences, vector_size=100, window=5, min_count=2)

    def average_vector(tokens):
        # Only tokens present in the trained vocabulary contribute
        known = [model.wv[token] for token in tokens if token in model.wv]
        if known:
            return np.mean(known, axis=0)
        # Documents without any known token fall back to an all-zero vector
        return np.zeros(model.vector_size)

    word2vec_df = pd.DataFrame([average_vector(tokens) for tokens in sentences])

    scorer = check_one_df_supervised if supervised else check_one_df
    return scorer(word2vec_df, False)

def calculate_accuracy_from_df(texts, supervised=False):
    """Score ``texts`` with every feature-extraction method.

    Args:
        texts: iterable of strings, one document each.
        supervised: forwarded to each ``get_score_*`` helper. When it is falsy
            the Word2Vec, BERT and USE embeddings are scored as well; when it
            is truthy only the two vectorizers are evaluated.

    Returns:
        List of ``{"name": <extraction method>, "values": <helper result>}``
        dicts, in evaluation order.
    """
    scores = [
        # Calculate using count vectorizer
        {"name": "count-vectorizer", "values": get_score_count_vectorizer(texts, supervised)},
        # Calculate using tf-idf
        {"name": "tf-idf", "values": get_score_tfidf(texts, supervised)},
    ]

    # Truthiness check, consistent with the `if supervised` tests in the helpers
    if not supervised:
        # Calculate using Word2Vec
        scores.append({"name": "word2vec", "values": get_score_word2vec(texts, supervised)})
        # Calculate using bert
        scores.append({"name": "bert", "values": get_score_bert(texts, supervised)})
        # Calculate using use
        scores.append({"name": "use", "values": get_score_use(texts, supervised)})

    return scores

In [None]:
def show_score_for_one_df(df_scores):
    """Print the best ARI / accuracy and plot both metrics per reduction method.

    Args:
        df_scores: list of ``{"name": ..., "values": DataFrame}`` entries whose
            frames carry ``label``, ``ari`` and ``accuracy`` columns (the shape
            returned by ``calculate_accuracy_from_df``).
    """
    # Stack every score table, tagging each row with its extraction method
    tagged = [entry['values'].assign(source=entry['name']) for entry in df_scores]
    combined_df = pd.concat(tagged)

    def best_row(metric):
        # Highest value of the metric plus the source / label of the first row reaching it
        best_value = combined_df[metric].max()
        winners = combined_df.loc[combined_df[metric] == best_value]
        return best_value, winners["source"].values[0], winners["label"].values[0]

    max_accuracy, max_accuracy_name, max_accuracy_label = best_row("accuracy")
    max_ari, max_ari_name, max_ari_label = best_row("ari")

    print("Max accuracy", round(max_accuracy,2), "from",max_accuracy_name, "with", max_accuracy_label)
    print("Max ari", round(max_ari,2), "from",max_ari_name, "with", max_ari_label)

    def draw_panel(metric, y_label, title):
        # One point plot on the current axes: a line per extraction method
        sns.pointplot(x='label', y=metric, hue='source', data=combined_df, palette='colorblind')
        plt.xlabel('Methods')
        plt.ylabel(y_label)
        plt.title(title)
        plt.xticks(rotation=45)
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

    plt.figure(figsize=(15, 3))

    # Left panel: ARI
    plt.subplot(1, 2, 1)
    sns.set(style='darkgrid')
    draw_panel('ari', 'ARI', 'ARI per methods using KMEANS')

    # Right panel: accuracy
    plt.subplot(1, 2, 2)
    draw_panel('accuracy', 'Accuracy', 'Accuracy per methods KMEANS')

    plt.tight_layout()  # Adjust spacing between subplots
    plt.show()
    

In [None]:
# Benchmark every feature-extraction method (unsupervised mode) on the df_clean_lemm descriptions
scs = calculate_accuracy_from_df(df_clean_lemm["clean_desc"])

In [None]:
# Print the best scores and plot ARI / accuracy for the benchmark stored in `scs`
show_score_for_one_df(scs)

In [None]:
# Text variants to benchmark in unsupervised mode, as (leaderboard name, texts) pairs
text_variants = [
    ("all_with_lemm", df_clean_lemm["clean_desc"]),
    ("all_with_stemm", df_clean_stemm["clean_desc"]),
    ("raw", df["description"]),
    ("step1", df_clean["clean_desc_step1"]),
    ("step2", df_clean["clean_desc_step2"]),
    ("step3", df_clean["clean_desc_step3"]),
    ("step4", df_clean["clean_desc_step4"]),
    ("step5", df_clean["clean_desc_step5"]),
]

# One full benchmark per variant, evaluated in the order listed above
scores_by_df = [
    {"name": variant_name, "scores": calculate_accuracy_from_df(variant_texts)}
    for variant_name, variant_texts in text_variants
]

In [None]:
# Flatten the nested benchmark results (text variant -> extraction method ->
# score table) into a single leaderboard frame.
leaderboard_columns = ["dataframe", "feature_extraction", "reduction_method", "ari", "accuracy", "full_label"]
leaderboard_frames = []
for df_item in scores_by_df:
    for global_score in df_item["scores"]:
        # Work on a copy: the frames stored in scores_by_df stay untouched, so
        # re-running this cell (or reusing those frames) behaves the same.
        score_frame = (
            global_score["values"]
            .rename(columns={"label": "reduction_method"})
            .assign(dataframe=df_item["name"], feature_extraction=global_score["name"])
        )
        # "<text variant>-<extraction method>-<reduction method>"
        score_frame["full_label"] = (
            f'{df_item["name"]}-{global_score["name"]}-' + score_frame["reduction_method"].astype(str)
        )
        leaderboard_frames.append(score_frame)

# Concatenate once; pd.concat raises on an empty list, hence the fallback
if leaderboard_frames:
    full_scores_dfs = pd.concat(leaderboard_frames, ignore_index=True)[leaderboard_columns]
else:
    full_scores_dfs = pd.DataFrame(columns=leaderboard_columns)

In [None]:
# Display the combined leaderboard frame built in the previous cell
full_scores_dfs

In [None]:
# Leaderboard: the 20 configurations with the highest accuracy
sorted_df = full_scores_dfs.sort_values(by='accuracy', ascending=False)
top_20_df = sorted_df.head(20).reset_index()

# Horizontal bar chart, one bar per configuration
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_20_df, x='accuracy', y='full_label', orient='h', ax=ax)
ax.set_xlabel('Accuracy')
ax.set_ylabel('Full Label')
ax.set_title('Top 20 by Accuracy')

# Write each accuracy value near the left edge of its bar
for bar_position, accuracy in enumerate(top_20_df['accuracy']):
    ax.text(0.05, bar_position + 0.2, f"{accuracy:.3f}", color='black', ha="right")

plt.show()

### Supervised Text

In [None]:
from sklearn.naive_bayes import MultinomialNB  
from sklearn.svm import SVC 

# Classifiers evaluated by check_one_df_supervised.
# NOTE: this cell rebinds `clusterings` and `dimension_reduction`, the names
# that check_one_df reads; re-running the unsupervised benchmark after this
# cell would therefore use these supervised lists.
clusterings = [
    # Seeded so the forest (and the leaderboard built from it) is reproducible
    {"name": "RandomForestClassifier", "model": RandomForestClassifier(random_state=42)},
    # {"name": "Naive Bayes", "model": MultinomialNB(alpha=1.0)},  
    {"name": "Support Vector Machines", "model": SVC(kernel='linear', C=1.0)}
]

# Feature projections applied before classification; "None" keeps the full
# feature matrix (its np.nan model is a placeholder).
dimension_reduction = [
    {"name": "None", "model": np.nan},
    {"name": "PCA", "model": PCA(n_components=2, random_state=42)},
    {"name": "t-SNE", "model": TSNE(n_components=2, perplexity=40, random_state=42)},
    {"name": "UMAP", "model": UMAP(n_components=2, random_state=42)},
    {"name": "SVD", "model": TruncatedSVD(n_components=2, random_state=42)},
]


def check_one_df_supervised(dataFrame, draw_graph=False):
    """Score one feature matrix with every reduction / classifier combination.

    Iterates over the module-level ``dimension_reduction`` and ``clusterings``
    lists and delegates each pair to ``check_performance_supervised``, passing
    the global ``df`` as the reference frame.

    Args:
        dataFrame: feature matrix to evaluate (one row per document).
        draw_graph: forwarded to ``check_performance_supervised``.

    Returns:
        DataFrame with columns ``label``, ``ari``, ``accuracy`` and ``model``,
        one row per (reduction, classifier) pair, in iteration order.
    """
    # Collect plain records and build the frame once: concatenating onto an
    # empty frame inside the loop is quadratic and relies on deprecated
    # pandas dtype handling for empty entries.
    records = []
    for dim_reduc in dimension_reduction:
        for clustering in clusterings:
            res = check_performance_supervised(dataFrame,
                                               dim_reduc["name"],
                                               dim_reduc["model"],
                                               clustering["name"],
                                               clustering["model"],
                                               df,
                                               draw_graph)
            records.append({"label": res["label"], "ari": res["ari_score"], "model": res["model"], "accuracy": res["accuracy"]})

    # Same column order the row-by-row concatenation used to produce
    return pd.DataFrame(records, columns=["label", "ari", "accuracy", "model"])

def check_performance_supervised(df, 
                      reduction_name,
                      reduction_method,
                      
                      supervised_name,
                      supervised_method,
                      
                      original_df,
                      draw_graph=False):
    """Train and evaluate one classifier on (optionally reduced) features.

    Args:
        df: feature matrix, one row per document.
        reduction_name: label of the projection; the string "None" disables it.
        reduction_method: object exposing ``fit_transform``; ignored when
            ``reduction_name`` is "None".
        supervised_name: label of the classifier.
        supervised_method: estimator exposing ``fit`` and ``predict``.
        original_df: frame providing the ``cat_e`` target column and the
            ``cat`` column used to colour the plots.
        draw_graph: when True and a projection is applied, plot the real and
            predicted categories on the first two projected components.

    Returns:
        Dict with ``label`` (reduction name), ``model`` (classifier name),
        ``ari_score`` and ``accuracy`` computed on the 30 % hold-out split,
        and ``clusters`` (predictions for every row).
    """
    print(reduction_name, supervised_method, df.shape)
    y = original_df["cat_e"]

    if reduction_name == "None":
        X_reduced = None
        X = df
    else:
        # The classifier must be trained on the projected features; the
        # projection used to be computed for plotting only, which made every
        # reduction report the scores of the unreduced matrix.
        # NOTE(review): the projection is fitted on train and test rows
        # together (some reducers, e.g. t-SNE, expose no separate transform),
        # so scores on reduced features are somewhat optimistic.
        X_reduced = reduction_method.fit_transform(df)
        X = X_reduced

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    supervised_method.fit(X_train, y_train)

    y_pred = supervised_method.predict(X_test)
    # Predictions for every row (train rows included), used for the plot and
    # returned as "clusters"
    y_pred_full = supervised_method.predict(X)

    # Both metrics are computed on the hold-out rows only
    ari_score = adjusted_rand_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)

    if draw_graph and X_reduced is not None:
        print(f"ARI Score ({supervised_name} - {reduction_name}) : %.2f" % ari_score)  
        print(f"ACCURACY ({supervised_name} - {reduction_name}) : %.2f" % accuracy)  

        # Plot on the left the datapoints with existing categories
        plt.figure(figsize=(12, 3))
        plt.subplot(1, 2, 1)
        sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=original_df['cat'], palette='Set1')
        plt.title(f"Real Categories {supervised_name} - {reduction_name}")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

        # Plot on the right the datapoints with determined categories
        plt.subplot(1, 2, 2)
        sns.scatterplot(x=X_reduced[:, 0], y=X_reduced[:, 1], hue=y_pred_full, palette='Set1')
        plt.title(f"Predicted Categories {supervised_name} - {reduction_name}")
        plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)

        plt.tight_layout()
        plt.show()

    return {"label": f"{reduction_name}", "model": f"{supervised_name}", "ari_score": ari_score,"accuracy": accuracy, "clusters": y_pred_full}

Maintenant pour ma propre connaissance, je veux faire un test de précision sur plusieurs modèles ainsi que plusieurs hypothèses.
Je regroupe le tout dans un tableau qui sera mon benchmark leaderboard.

Un peu overkill, mais très utile pour ma compréhension et mes futurs projets.

In [None]:
# Text variants to benchmark in supervised mode, as (leaderboard name, texts) pairs
text_variants_supervised = [
    ("all_with_lemm", df_clean_lemm["clean_desc"]),
    ("all_with_stemm", df_clean_stemm["clean_desc"]),
    ("raw", df["description"]),
    ("step1", df_clean["clean_desc_step1"]),
    ("step2", df_clean["clean_desc_step2"]),
    ("step3", df_clean["clean_desc_step3"]),
    ("step4", df_clean["clean_desc_step4"]),
    ("step5", df_clean["clean_desc_step5"]),
]

# One supervised benchmark per variant, evaluated in the order listed above
scores_by_df_supervised = [
    {"name": variant_name, "scores": calculate_accuracy_from_df(variant_texts, True)}
    for variant_name, variant_texts in text_variants_supervised
]

In [None]:
# Flatten the nested supervised results (text variant -> extraction method ->
# score table) into a single leaderboard frame.
leaderboard_columns_supervised = ["dataframe", "feature_extraction", "reduction_method", "ari", "accuracy", "full_label", "model"]
leaderboard_frames_supervised = []
for df_item in scores_by_df_supervised:
    for global_score in df_item["scores"]:
        # Work on a copy: the frames stored in scores_by_df_supervised stay
        # untouched, so re-running this cell behaves the same.
        score_frame = (
            global_score["values"]
            .rename(columns={"label": "reduction_method"})
            .assign(dataframe=df_item["name"], feature_extraction=global_score["name"])
        )
        # "<text variant>-<extraction method>-<reduction method>-<classifier>"
        score_frame["full_label"] = (
            f'{df_item["name"]}-{global_score["name"]}-'
            + score_frame["reduction_method"].astype(str)
            + "-"
            + score_frame["model"].astype(str)
        )
        leaderboard_frames_supervised.append(score_frame)

# Concatenate once; pd.concat raises on an empty list, hence the fallback
if leaderboard_frames_supervised:
    full_scores_dfs_supervised = pd.concat(leaderboard_frames_supervised, ignore_index=True)[leaderboard_columns_supervised]
else:
    full_scores_dfs_supervised = pd.DataFrame(columns=leaderboard_columns_supervised)

J'affiche le top 20 puis le bottom 20

In [None]:
# Leaderboard: the 20 supervised configurations with the highest accuracy
sorted_df_supervised = full_scores_dfs_supervised.sort_values(by='accuracy', ascending=False)
top_20_df_supervised = sorted_df_supervised.head(20).reset_index()

# Horizontal bar chart, one bar per configuration
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_20_df_supervised, x='accuracy', y='full_label', orient='h', ax=ax)
ax.set_xlabel('Accuracy')
ax.set_ylabel('Full Label')
ax.set_title('Top 20 by Accuracy')

# Write each accuracy value near the left edge of its bar
for bar_position, accuracy in enumerate(top_20_df_supervised['accuracy']):
    ax.text(0.06, bar_position + 0.2, f"{accuracy:.3f}", color='black', ha="right")

plt.show()

In [None]:
# Sort by accuracy in ascending order (this rebinds sorted_df_supervised)
sorted_df_supervised = full_scores_dfs_supervised.sort_values(by='accuracy', ascending=True)

# Keep the 20 lowest-scoring configurations
low_20_df_supervised = sorted_df_supervised.head(20).reset_index()

# Horizontal bar chart, one bar per configuration
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=low_20_df_supervised, x='accuracy', y='full_label', orient='h', ax=ax)
ax.set_xlabel('Accuracy')
ax.set_ylabel('Full Label')
ax.set_title('Worst 20 by Accuracy')

# Write each accuracy value near the left edge of its bar
for bar_position, accuracy in enumerate(low_20_df_supervised['accuracy']):
    ax.text(0.06, bar_position + 0.2, f"{accuracy:.3f}", color='black', ha="right")

plt.show()

J'observe les clusters sur le meilleur modèle

In [None]:
# Plot the real and predicted categories on the best-performing dataframe:
# TF-IDF features of the df_clean_stemm descriptions, scored with plots enabled
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_clean_stemm["clean_desc"])
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)

tfidf_df_scores = check_one_df_supervised(tfidf_df, draw_graph=True)

Axes d'amélioration potentiels :
- Prédire les niveaux 2/3/n de la hiérarchie de catégories
- Ajouter des bigrammes/trigrammes