In [1]:
import ast

import pandas as pd
import spacy

In [2]:
# Load the raw reviews; lines=True reads the file as one JSON object per line.
reviews_path = 'reviews.jsonl'
df = pd.read_json(reviews_path, lines=True)

# Preview the first 20 rows of the unfiltered frame.
print("Aperçu des données initiales :")
print(df.head(n=20))

Aperçu des données initiales :
    rating                                              title  \
0        4                   No white background! It’s clear!   
1        5                Awesome!  Great price!  Works well!   
2        5                 Worked but took an hour to install   
3        4                                             Decent   
4        5                                           LOVE IT!   
5        5        Works Great with my IPhone 13 & Magna Case!   
6        5                       Great item! Easy to install!   
7        4                                         Four Stars   
8        5  It is a great value & protects the phone from ...   
9        5                         Good to have these around!   
10       5                                        These work!   
11       5                 Finally something I can Hang onto!   
12       5                               Great, great, great!   
13       5                                       Great valu

In [3]:
# Keep only the rating, title and text columns. The explicit .copy() makes the
# result an independent frame, so adding a column to it later does not trigger
# pandas' SettingWithCopyWarning.
df = df[['rating', 'title', 'text']].copy()
print("\nDonnées avec champs sélectionnés :")
print(df.head())


Données avec champs sélectionnés :
   rating                                title  \
0       4     No white background! It’s clear!   
1       5  Awesome!  Great price!  Works well!   
2       5   Worked but took an hour to install   
3       4                               Decent   
4       5                             LOVE IT!   

                                                text  
0  I bought this bc I thought it had the nice whi...  
1  Perfect. How pissed am I that I recently paid ...  
2  Overall very happy with the end result. If you...  
3  Lasted about 9 months then the lock button bro...  
4  LOVE THIS CASE! Works better than my expensive...  


In [4]:
# Load the spaCy pipeline; `nlp` is what the preprocessing function calls.
SPACY_MODEL_NAME = "en_core_web_sm"
nlp = spacy.load(SPACY_MODEL_NAME)

In [5]:
# Preprocessing function
def preprocess_text_spacy(text):
    """Turn one raw review text into a list of lemmas.

    The text is lower-cased and run through the module-level spaCy pipeline
    ``nlp``. A token is kept only if it is not a stop word, is purely
    alphabetic, and is longer than two characters; the kept tokens are
    returned as their lemmas, in document order.

    Args:
        text: The review text. Anything that is not a non-blank ``str``
            (``None``, ``NaN`` from a missing value, ``""``) is treated as
            having no content.

    Returns:
        list: The lemmas of the retained tokens; ``[]`` when there is
        nothing to process.
    """
    # Missing values arrive as None/NaN when this is applied to a DataFrame
    # column; they have no .lower() and would otherwise raise AttributeError.
    if not isinstance(text, str) or not text.strip():
        return []

    doc = nlp(text.lower())
    return [
        token.lemma_
        for token in doc
        if not token.is_stop  # drop stop words
        and token.is_alpha  # drop symbols, digits and punctuation
        and len(token) > 2  # drop very short words
    ]

In [6]:
# Run the preprocessing function over every review text and store the
# resulting token lists in a new column.
token_lists = df['text'].map(preprocess_text_spacy)
df['processed_tokens'] = token_lists
print("\nDonnées après prétraitement :")
print(df.head())


Données après prétraitement :
   rating                                title  \
0       4     No white background! It’s clear!   
1       5  Awesome!  Great price!  Works well!   
2       5   Worked but took an hour to install   
3       4                               Decent   
4       5                             LOVE IT!   

                                                text  \
0  I bought this bc I thought it had the nice whi...   
1  Perfect. How pissed am I that I recently paid ...   
2  Overall very happy with the end result. If you...   
3  Lasted about 9 months then the lock button bro...   
4  LOVE THIS CASE! Works better than my expensive...   

                                    processed_tokens  
0  [buy, think, nice, white, background, turn, cl...  
1  [perfect, pissed, recently, pay, fitbit, cable...  
2  [overall, happy, end, result, hate, puzzle, lo...  
3  [last, month, lock, button, break, decent, pro...  
4     [love, case, work, well, expensive, case, lol]  


In [7]:
# Persist the preprocessed frame as CSV, without the index column. CSV is
# plain text, so the token lists are written out as strings.
df.to_csv(path_or_buf='processed_reviews.csv', index=False)
print("\nLes données prétraitées ont été sauvegardées dans 'processed_reviews.csv'.")


Les données prétraitées ont été sauvegardées dans 'processed_reviews.csv'.


In [8]:
#Étape 2 : Génération des embeddings

In [9]:
# Reload the preprocessed reviews. The CSV holds each token list as a string,
# so it has to be parsed back into a Python list.
df = pd.read_csv('processed_reviews.csv')

# ast.literal_eval only accepts Python literals, unlike eval(), which would
# execute any expression found in the file.
token_lists = df['processed_tokens'].apply(ast.literal_eval)

# Join each token list into one space-separated string per review; these
# strings are what the vectorizer is fitted on.
documents = [" ".join(tokens) for tokens in token_lists]
print(f"Nombre de documents : {len(documents)}")

Nombre de documents : 1000


In [26]:
# 2. Embedding generation
# Option 1: TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit TF-IDF on the documents, capping the vocabulary at 5000 features.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_embeddings = tfidf_vectorizer.fit_transform(documents)
print(f"Taille de la matrice TF-IDF : {tfidf_embeddings.shape}")

# The TF-IDF matrix is sparse; convert only the first few rows to a dense
# array for display instead of materialising the whole matrix.
print("Exemple d'embedding TF-IDF :")
print(tfidf_embeddings[:5].toarray())


Taille de la matrice TF-IDF : (1000, 2592)
Exemple d'embedding TF-IDF :
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
#Option 2 : Utiliser SentenceTransformers (all-MiniLM-L6-v2)
#from sentence_transformers import SentenceTransformer

# Charger le modèle SentenceTransformers
#model = SentenceTransformer('all-MiniLM-L6-v2')

# Générer les embeddings pour chaque document
#embeddings = model.encode(documents, show_progress_bar=True)
#print(f"Taille des embeddings : {embeddings.shape}")


In [19]:
# 3. Clustering
# Option: DBSCAN
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Cluster the TF-IDF vectors with DBSCAN using cosine distance.
dbscan = DBSCAN(metric='cosine', eps=0.5, min_samples=5)
dbscan_labels = dbscan.fit_predict(tfidf_embeddings)

# Count the distinct labels, leaving out -1 (the noise label).
cluster_ids = {label for label in dbscan_labels if label != -1}
num_clusters = len(cluster_ids)
print(f"Nombre de clusters identifiés : {num_clusters}")

Nombre de clusters identifiés : 8


In [25]:
# 4. Cluster analysis

from collections import defaultdict

# Group the documents by the cluster label DBSCAN assigned to them
# (label -1 collects the noise points).
clusters = defaultdict(list)
for label, document in zip(dbscan_labels, documents):
    clusters[label].append(document)

# Report the most frequent words of each cluster
from sklearn.feature_extraction.text import CountVectorizer

for cluster_id, docs in clusters.items():
    print(f"\nCluster {cluster_id} :")
    vectorizer = CountVectorizer(max_features=20)  # keep at most 20 keywords
    word_counts = vectorizer.fit_transform(docs)
    words = vectorizer.get_feature_names_out()
    # Sum each word's column to get its total count across the cluster's
    # documents; .A1 flattens the single-row result to a 1-D array.
    freqs = word_counts.sum(axis=0).A1
    # Cast counts to plain int so the printed list shows 726 rather than
    # np.int64(726); the descending sort order is unchanged.
    keywords = sorted(
        ((word, int(freq)) for word, freq in zip(words, freqs)),
        key=lambda pair: -pair[1],
    )
    print("Mots-clés les plus fréquents :", keywords)



Cluster -1 :
Mots-clés les plus fréquents : [('phone', np.int64(726)), ('case', np.int64(551)), ('work', np.int64(246)), ('like', np.int64(215)), ('great', np.int64(212)), ('charge', np.int64(196)), ('fit', np.int64(193)), ('good', np.int64(188)), ('use', np.int64(172)), ('love', np.int64(167)), ('screen', np.int64(152)), ('easy', np.int64(151)), ('look', np.int64(145)), ('time', np.int64(133)), ('need', np.int64(117)), ('get', np.int64(115)), ('cover', np.int64(109)), ('nice', np.int64(106)), ('iphone', np.int64(104)), ('new', np.int64(97))]

Cluster 0 :
Mots-clés les plus fréquents : [('great', np.int64(43)), ('work', np.int64(30)), ('love', np.int64(29)), ('good', np.int64(27)), ('case', np.int64(23)), ('fit', np.int64(14)), ('car', np.int64(13)), ('phone', np.int64(13)), ('quality', np.int64(13)), ('perfect', np.int64(9)), ('look', np.int64(8)), ('nice', np.int64(8)), ('price', np.int64(8)), ('color', np.int64(6)), ('durable', np.int64(6)), ('product', np.int64(6)), ('feel', np.in