In [None]:
import json
import re
import pandas as pd
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import os
from dotenv import load_dotenv

# =========================
# CONFIG
# =========================

load_dotenv()

FICHIER_AVIS = os.getenv("INPUT_REVIEWS")
FICHIER_BUSINESS = os.getenv("INPUT_BUSINESS")
FICHIER_SORTIE = os.getenv("OUTPUT_FILE")

FILES = {
    "tout": FICHIER_AVIS,
    "Health_Medical": os.getenv("HEALTH_MEDICAL"),
    "Hotels": os.getenv("HOTELS"),
    "Restaurants": os.getenv("RESTAURANTS"),
    "Shopping": os.getenv("SHOPPING"),
}

print("Fichiers utilisés :", FILES)


# =========================
# FONCTIONS UTILES
# =========================

def clean_text(text):
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"[^a-zàâçéèêëîïôûùüÿñæœ\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text.strip()


def load_dataset(path):
    data = []

    # Vérification simple si le fichier existe
    if not path or not os.path.exists(path):
        print(f"⚠️ Fichier introuvable : {path}")
        return pd.DataFrame(columns=["text", "stars"])

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            try:
                # La magie opère ici : json.loads comprend automatiquement le format
                review = json.loads(line)

                # On extrait proprement
                data.append({
                    "text": review.get("text", ""), # .get évite le crash si la clé manque
                    "stars": review.get("stars", 0)
                })
            except json.JSONDecodeError:
                continue # On saute juste la ligne si elle est illisible

    return pd.DataFrame(data)


# =========================
# BOUCLE PRINCIPALE
# =========================

for etablissement, filepath in FILES.items():

    print(f"\n=============================")
    print(f" MODELE POUR : {etablissement.upper()}")
    print(f"=============================")

    # 1. Chargement
    df = load_dataset(filepath)
    print("Nb lignes :", len(df))
    print(df["stars"].value_counts())

    # 2. Nettoyage
    df["clean_text"] = df["text"].apply(clean_text)

    # (optionnel) réduire pour aller plus vite
    df = df.sample(200000, random_state=42)

    # 3. Split
    X_train, X_test, y_train, y_test = train_test_split(
        df["clean_text"],
        df["stars"],
        test_size=0.2,
        random_state=42,
        stratify=df["stars"]
    )

    # 4. TF-IDF (UN par type)
    vectorizer = TfidfVectorizer(
        max_features=200000,
        ngram_range=(1, 2),
        stop_words="english"
    )

    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # 5. Modèle
    model = svm.SVC(kernel='linear')

    model.fit(X_train_vec, y_train)

    # 6. Évaluation
    y_pred = model.predict(X_test_vec)

    print("\n--- RESULTATS ---")
    print(classification_report(y_test, y_pred))

    accuracy = (y_test == y_pred).mean()
    print(f"Accuracy : {accuracy:.4f}")

    # 7. Sauvegarde prédictions
    df_pred = pd.DataFrame({
        "true_stars": y_test.values,
        "predicted_stars": y_pred
    })

    output_file = f"predictions_{etablissement}.csv"
    df_pred.to_csv(output_file, index=False)

    print(f"Fichier sauvegardé : {output_file}")


Fichiers utilisés : {'tout': 'C:/Users/floco/OneDrive/Documents/S6/Sae/yelp_academic_reviews4students.jsonl', 'Health_Medical': 'C:/Users/floco/OneDrive/Documents/S6/Sae/donnees_triees/yelp_Health_Medical.json', 'Hotels': 'C:/Users/floco/OneDrive/Documents/S6/Sae/donnees_triees/yelp_Hotels.json', 'Restaurants': 'C:/Users/floco/OneDrive/Documents/S6/Sae/donnees_triees/yelp_Restaurants.json', 'Shopping': 'C:/Users/floco/OneDrive/Documents/S6/Sae/donnees_triees/yelp_Shopping.jsonl'}

 MODELE POUR : TOUT
Nb lignes : 1000000
stars
5    462646
4    207953
1    153057
3     98714
2     77630
Name: count, dtype: int64


In [3]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['label_sentiment'] # Ou label_rating selon la tâche

# Séparation Train / Test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 1. Bag of Words
bow_vectorizer = CountVectorizer(max_features=5000) # On limite à 5000 mots pour la mémoire
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# 2. TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [4]:
# pip install sentence-transformers
from sentence_transformers import SentenceTransformer

model_emb = SentenceTransformer('all-MiniLM-L6-v2') # Modèle rapide et performant

# Cela convertit les phrases en vecteurs de 384 dimensions
X_train_emb = model_emb.encode(X_train.tolist(), show_progress_bar=True)
X_test_emb = model_emb.encode(X_test.tolist(), show_progress_bar=True)

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 103/103 [00:00<00:00, 534.96it/s, Materializing param=pooler.dense.weight]                             
BertModel LOAD REPORT from: sentence-transformers/all-MiniLM-L6-v2
Key                     | Status     |  | 
------------------------+------------+--+-
embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
Batches: 100%|██████████| 1250/1250 [10:14<00:00,  2.03it/s]
Batches: 100%|██████████| 313/313 [02:41<00:00,  1.94it/s]


In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Exemple Régression Logistique sur TF-IDF
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train_tfidf, y_train)
pred = clf.predict(X_test_tfidf)

print("Logistic Regression + TF-IDF :")
print(classification_report(y_test, pred))

Logistic Regression + TF-IDF :
              precision    recall  f1-score   support

           0       0.82      0.85      0.84      2297
           1       0.48      0.24      0.32       950
           2       0.90      0.96      0.93      6753

    accuracy                           0.86     10000
   macro avg       0.74      0.68      0.70     10000
weighted avg       0.85      0.86      0.85     10000



In [6]:
import tensorflow as tf
from tensorflow.keras import layers

# Exemple simple MLP (Multi-Layer Perceptron)
model = tf.keras.Sequential([
    layers.Input(shape=(384,)), # 384 si vous utilisez les embeddings MiniLM, 5000 si TF-IDF
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(3, activation='softmax') # 3 sorties pour (Neg, Neutre, Pos)
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.fit(X_train_emb, y_train, epochs=5, batch_size=32)

Epoch 1/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.8200 - loss: 0.4732
Epoch 2/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.8393 - loss: 0.4186
Epoch 3/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8433 - loss: 0.4068
Epoch 4/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8473 - loss: 0.3971
Epoch 5/5
[1m1250/1250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step - accuracy: 0.8516 - loss: 0.3890


<keras.src.callbacks.history.History at 0x2a30feb7620>

In [7]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

model_name = "distilbert-base-uncased" # Plus léger que BERT standard
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Il faudra créer un Dataset compatible HuggingFace ici...
# (C'est souvent la partie qui demande un peu plus de code de formatting)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 677.09it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/archite

In [8]:
# Concept : On donne le texte et on demande la réponse
prompt = f"""
Analyse le sentiment de cet avis Yelp.
Avis : "{un_avis_du_dataset}"
Réponds uniquement par : POSITIF, NÉGATIF ou NEUTRE.
"""
# Envoyer ce prompt à l'API et comparer la réponse avec la vraie note (stars)

NameError: name 'un_avis_du_dataset' is not defined