In [1]:
import pandas as pd
import numpy as np
import os
import sys
from dotenv import load_dotenv

# 1. Load the variables declared in the .env file into the process environment.
load_dotenv()

# Path of the reviews file, taken from the INPUT_REVIEWS environment variable.
FICHIER_AVIS = os.getenv("INPUT_REVIEWS")
if not FICHIER_AVIS:
    # Fail early with an actionable message instead of handing None (or an
    # empty string) to pandas and getting an unrelated-looking error.
    raise ValueError(
        "Environment variable INPUT_REVIEWS is not set; define it in .env "
        "or in the shell before running this notebook."
    )

# Number of reviews kept for the experiments (size of the first chunk).
NB_AVIS = 50000

# The file is read as JSON Lines, lazily and chunk by chunk; only the first
# chunk of NB_AVIS reviews is kept. The reader is used as a context manager so
# the underlying file handle is closed as soon as that chunk is extracted.
with pd.read_json(FICHIER_AVIS, lines=True, chunksize=NB_AVIS) as reader:
    df = next(reader, None)

if df is None:
    # The reader yielded no chunk at all, i.e. the file contains no record.
    raise ValueError(f"No review could be read from {FICHIER_AVIS!r}")

# 2. Target 1: polarity (sentiment)
def get_sentiment(stars):
    """Map a star rating to a numeric sentiment class.

    Returns 2 (positive) when ``stars`` is greater than 3, 0 (negative) when
    it is lower than 3, and 1 (neutral) in every other case. Numeric codes are
    used so the labels can be fed directly to the models.
    """
    if stars > 3:
        return 2  # positive
    if stars < 3:
        return 0  # negative
    return 1  # neutral

# Derive the two targets and the model input text in a single step:
#   - label_sentiment: sentiment class computed from 'stars' by get_sentiment.
#   - label_rating (3. Target 2: score / rating): 'stars' shifted by -1, because
#     models usually work better with labels starting at 0 (the original note
#     states stars range from 1 to 5, giving 0-4).
#   - clean_text (4. basic cleaning): the review text lower-cased. Only
#     lower-casing is applied here; punctuation is left untouched.
df = df.assign(
    label_sentiment=df['stars'].apply(get_sentiment),
    label_rating=df['stars'] - 1,
    clean_text=df['text'].str.lower(),
)


In [2]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

# Features: the lower-cased review text. Target: the sentiment label
# (use df['label_rating'] instead for the rating task).
X = df['clean_text']
y = df['label_sentiment']

# Fixed seed so the train/test partition, and therefore every metric computed
# on it, is the same each time the notebook is re-run.
RANDOM_STATE = 42

# Train / test split (20 % held out for testing). Stratifying on y keeps the
# class proportions identical in both subsets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)

# 1. Bag of Words
# Vocabulary capped at 5000 terms to limit memory use. The vectorizer is
# fitted on the training texts only and then applied to the test texts.
bow_vectorizer = CountVectorizer(max_features=5000)
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# 2. TF-IDF
# Same vocabulary cap and same fit-on-train / transform-on-test discipline.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer

# Sentence-embedding model (described in the original note as fast and
# well-performing).
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
model_emb = SentenceTransformer(EMBEDDING_MODEL_NAME)

# encode() is given plain Python lists of strings rather than pandas Series.
train_sentences = X_train.tolist()
test_sentences = X_test.tolist()

# Turn every review into a dense vector (384 dimensions for this model,
# per the original note), showing a progress bar while encoding.
X_train_emb = model_emb.encode(train_sentences, show_progress_bar=True)
X_test_emb = model_emb.encode(test_sentences, show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 168.48it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Batches:  22%|██▏       | 280/1250 [19:20<1:07:28,  4.17s/it]

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Example: logistic regression trained on the TF-IDF features.
# fit() returns the estimator itself, so construction and training are chained.
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
pred = clf.predict(X_test_tfidf)

# Per-class precision / recall / F1 on the held-out test set.
print("Logistic Regression + TF-IDF :")
report = classification_report(y_test, pred)
print(report)

In [None]:
import tensorflow as tf
from tensorflow.keras import layers

# Simple MLP (multi-layer perceptron) example trained on the sentence embeddings.

# The input width is read from the training matrix instead of being hard-coded,
# so the network stays consistent with whatever features are fed to fit()
# (384 was the value previously hard-coded for the MiniLM embeddings).
n_features = X_train_emb.shape[1]
# One output unit per sentiment class: negative, neutral, positive.
n_classes = 3

model = tf.keras.Sequential([
    layers.Input(shape=(n_features,)),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(n_classes, activation='softmax'),
])

# The sparse categorical loss is used with y_train passed as-is, i.e. the
# integer class ids 0 / 1 / 2 produced by get_sentiment, not one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_emb, y_train, epochs=5, batch_size=32)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

# Checkpoint to fine-tune; the original note describes it as lighter than
# standard BERT.
model_name = "distilbert-base-uncased"
# Size of the classification head: one label per sentiment class.
num_sentiment_labels = 3

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=num_sentiment_labels
)

# TODO: a Hugging Face-compatible Dataset still has to be built here
# (this is often the part that needs a bit more formatting code).

In [None]:
# Concept: give the review text to an LLM and ask it for the sentiment.

# Example review injected into the prompt. It is taken from the raw 'text'
# column (not the lower-cased copy) of the loaded reviews; without this
# definition the f-string below raises NameError on a fresh kernel.
un_avis_du_dataset = df['text'].iloc[0]

prompt = f"""
Analyse le sentiment de cet avis Yelp.
Avis : "{un_avis_du_dataset}"
Réponds uniquement par : POSITIF, NÉGATIF ou NEUTRE.
"""
# Next step: send this prompt to the API and compare the answer with the
# true rating (stars).