In [None]:
import os
import re

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from dotenv import load_dotenv
from imblearn.over_sampling import SMOTE
from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.svm import LinearSVC


🔹 Chargement des variables d'environnement

In [None]:
# Read the directory settings from the environment (a local .env file is loaded first).
load_dotenv()
DATA_PROCESSED = os.getenv("DATA_PROCESSED")
DATA_MODEL = os.getenv("DATA_MODEL")
DATA_REPORT = os.getenv("DATA_REPORT")

# os.getenv returns None for an unset variable, which would otherwise surface later as an
# opaque TypeError inside os.makedirs / os.path.join. Fail here, naming what is missing.
_missing_env = [
    env_name
    for env_name, env_value in (
        ("DATA_PROCESSED", DATA_PROCESSED),
        ("DATA_MODEL", DATA_MODEL),
        ("DATA_REPORT", DATA_REPORT),
    )
    if not env_value
]
if _missing_env:
    raise RuntimeError(
        f"Variables d'environnement manquantes : {', '.join(_missing_env)}"
    )

# Output directories are created on demand; the input directory is only read from.
os.makedirs(DATA_MODEL, exist_ok=True)
os.makedirs(DATA_REPORT, exist_ok=True)


1️⃣ Chargement des données

In [None]:
# Location of the preprocessed reviews export inside the DATA_PROCESSED directory.
file_path = os.path.join(
    DATA_PROCESSED,
    "export_preprocess_clean_avis.csv",
)
print("\n📥 Chargement des données...")
# Read the whole CSV into memory and list its columns as a quick sanity check.
df = pd.read_csv(filepath_or_buffer=file_path)
print(f"📄 Colonnes du fichier : {df.columns.tolist()}")


2️⃣ Nettoyage

In [None]:
# Keep only the rows whose 'commentaire' is present and not made of whitespace only.
df = (
    df
    .dropna(subset=['commentaire'])
    .loc[lambda frame: frame['commentaire'].str.strip().astype(bool)]
)
print(f"✅ Nombre de lignes après nettoyage : {len(df)}")


3️⃣ Gestion de la négation et préparation des cibles (sentiment et note)

In [None]:
def mark_negation(text, window=3):
    """Prefix the words that follow a French negation word with ``NOT_``.

    The text is split on whitespace. After a negation word, up to ``window``
    following tokens are rewritten as ``NOT_<token>``. The negated span ends early
    at a coordinating conjunction or at punctuation.

    Punctuation is usually glued to the neighbouring word after a whitespace split
    ("bon," rather than "bon" ","), so tokens are compared with their leading and
    trailing punctuation stripped, and a token that ends with punctuation closes
    the negated span after itself.

    Args:
        text: The sentence or comment to rewrite (a string).
        window: Maximum number of tokens negated after each negation word.

    Returns:
        The rewritten text, tokens joined by single spaces.
    """
    negation_words = {"ne", "pas", "plus", "jamais", "rien", "aucun", "sans", "nul"}
    punctuation = {".", ",", ";", ":", "!", "?"}
    stop_words = {"mais", "et", "ou", "donc", "or", "ni", "car"}
    punctuation_chars = "".join(punctuation)

    new_tokens = []
    neg_countdown = 0
    for tok in text.split():
        tok_lower = tok.lower()
        # Word without surrounding punctuation; empty for a punctuation-only token.
        core = tok_lower.strip(punctuation_chars)
        # split() never yields empty tokens, so indexing the last character is safe.
        ends_clause = tok_lower[-1] in punctuation

        if core in negation_words:
            new_tokens.append(tok)
            # "jamais!" is a negation word, but the clause stops right there.
            neg_countdown = 0 if ends_clause else window
        elif neg_countdown > 0:
            if not core or core in stop_words:
                # Bare punctuation or a conjunction: the negated span is over.
                neg_countdown = 0
                new_tokens.append(tok)
            else:
                new_tokens.append("NOT_" + tok)
                neg_countdown = 0 if ends_clause else neg_countdown - 1
        else:
            new_tokens.append(tok)
    return " ".join(new_tokens)

# Store the negation-marked version of every comment next to the original text.
df['commentaire_preprocessed'] = df['commentaire'].map(mark_negation)
# Model input (marked text) and the rating target read from 'note_commentaire'.
X, y_notes = df['commentaire_preprocessed'], df['note_commentaire']

def map_sentiment(note):
    """Translate a rating into a sentiment label.

    A rating equal to 1 gives 'negatif', a rating equal to 5 gives 'positif',
    and every other value gives 'neutre'.
    """
    if note == 1:
        return 'negatif'
    return 'positif' if note == 5 else 'neutre'

# Three-class sentiment target derived element-wise from the ratings.
y_sentiment = y_notes.map(map_sentiment)


4️⃣ TF-IDF

In [None]:
print("\n✍️ Vectorisation TF-IDF avec bigrammes...")
# Unigrams and bigrams, vocabulary capped at 3000 terms.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
# Learn the vocabulary / IDF weights and build the sparse document-term matrix.
# The fit uses every row of X.
X_vect = tfidf.fit_transform(X)
# Persist the fitted vectorizer in the model directory.
vectorizer_path = os.path.join(DATA_MODEL, "tfidf_vectorizer_dual.pkl")
joblib.dump(tfidf, vectorizer_path)
print(f"💾 TF-IDF vectorizer sauvegardé dans {DATA_MODEL}")


5️⃣ Split train/test stratifié

In [None]:
# Single stratified 80/20 split, stratified on the sentiment label.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1: the generator yields exactly one (train, test) pair of positional indices.
# X_vect is only used here for its number of rows.
train_index, test_index = next(sss.split(X_vect, y_sentiment))

# Avoid train/test leakage: a vectorizer fitted on the whole corpus has already seen the
# test documents (vocabulary selection and IDF weights). Re-fit an unfitted copy with the
# same hyper-parameters on the training rows only, then merely transform the test rows.
tfidf = clone(tfidf)
X_train = tfidf.fit_transform(X.iloc[train_index])
X_test = tfidf.transform(X.iloc[test_index])
# Overwrite the saved vectorizer so the file on disk matches the one the models are trained with.
joblib.dump(tfidf, os.path.join(DATA_MODEL, "tfidf_vectorizer_dual.pkl"))

# Both targets share the same row split.
y_train_sent, y_test_sent = y_sentiment.iloc[train_index], y_sentiment.iloc[test_index]
y_train_note, y_test_note = y_notes.iloc[train_index], y_notes.iloc[test_index]


5.1️⃣ Ré-échantillonnage SMOTE

In [None]:
print("\n📈 Application du ré-échantillonnage pour équilibrer les classes...")
# One sampler per target, both seeded identically.
smote_sent = SMOTE(random_state=42)
smote_note = SMOTE(random_state=42)
# Only the training split is resampled; X_test and the test labels are left untouched.
# NOTE(review): y_train_sent / y_train_note are both inputs and outputs of this cell, so
# running it a second time feeds the already-resampled labels back in with X_train.
X_train_sent, y_train_sent = smote_sent.fit_resample(X_train, y_train_sent)
X_train_note, y_train_note = smote_note.fit_resample(X_train, y_train_note)
print(f"✅ Classes équilibrées pour sentiment et notes.")


6️⃣ Fonction pour créer le modèle

In [None]:
def get_model(name):
    """Build a fresh, unfitted classifier for the given model name.

    Args:
        name: One of "LogisticRegression", "LinearSVC" or "RandomForest".

    Returns:
        A new estimator instance configured with the notebook's settings.

    Raises:
        ValueError: If ``name`` is not one of the supported model names. Without
            this, an unknown name would return None and only fail later on
            ``model.fit`` with an unrelated AttributeError.
    """
    if name == "LogisticRegression":
        return LogisticRegression(max_iter=1000, class_weight=None)
    if name == "LinearSVC":
        return LinearSVC(class_weight=None)
    if name == "RandomForest":
        return RandomForestClassifier(n_estimators=100, class_weight=None, random_state=42)
    raise ValueError(
        f"Unknown model name: {name!r} "
        "(expected 'LogisticRegression', 'LinearSVC' or 'RandomForest')"
    )


7️⃣ Définition des tâches

In [None]:
# One entry per prediction task:
# (train features, train labels, test features, test labels, class labels for the report).
# Both tasks are evaluated on the same X_test.
tasks = dict(
    sentiment=(
        X_train_sent,
        y_train_sent,
        X_test,
        y_test_sent,
        ['negatif', 'neutre', 'positif'],
    ),
    note=(
        X_train_note,
        y_train_note,
        X_test,
        y_test_note,
        [1, 2, 3, 4, 5],
    ),
)

# Accumulates one metrics row per (task, model) pair.
results = list()


Fonction graphique pour top features

In [None]:
def plot_top_features(model, feature_names, task, model_name, n_features=20):
    """Save a horizontal bar chart of the model's highest-weighted features.

    The per-feature score comes from ``model.coef_`` when present, otherwise from
    ``model.feature_importances_``; models exposing neither are skipped silently.
    The chart is written as a PNG into DATA_REPORT. Failures are reported with a
    printed message instead of being raised, so one broken plot does not stop the
    caller.

    Args:
        model: A fitted estimator.
        feature_names: Feature names, positionally aligned with the model's weights.
        task: Task identifier, used in the title and the file name.
        model_name: Model identifier, used in the title and the file name.
        n_features: Number of features to display.

    Returns:
        None.
    """
    fig = None
    try:
        if hasattr(model, 'coef_'):
            coef = model.coef_
            # A single row of coefficients keeps its sign; several rows are collapsed
            # into one score per feature by averaging their absolute values.
            coef = coef.flatten() if coef.shape[0] == 1 else np.mean(np.abs(coef), axis=0)
        elif hasattr(model, 'feature_importances_'):
            coef = model.feature_importances_
        else:
            return

        # Strongest feature first: together with the inverted y axis below, this puts
        # the most influential feature at the top of the chart.
        indices = np.argsort(np.abs(coef))[::-1][:n_features]
        # Accept any sequence of names, not only a NumPy array.
        top_words = np.asarray(feature_names)[indices]
        top_values = coef[indices]

        fig, ax = plt.subplots(figsize=(10, 6))
        colors = ['green' if v > 0 else 'red' for v in top_values] if hasattr(model, 'coef_') else 'blue'
        ax.barh(top_words, top_values, color=colors)
        ax.invert_yaxis()
        ax.set_title(f"Top {len(indices)} features - {model_name} ({task})")
        fig.tight_layout()
        filename = f"report_preprocess_top{n_features}_{model_name.lower()}_{task}.png"
        fig.savefig(os.path.join(DATA_REPORT, filename))
    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller carry on.
        print(f"❌ Erreur top features {model_name} ({task}): {e}")
    finally:
        # Always release the figure, including when plotting or saving failed.
        if fig is not None:
            plt.close(fig)


8️⃣ Entraînement et évaluation

In [None]:
# The whole train / evaluate / plot / save sequence is kept in ONE cell. When the loop
# body is spread over several cells, only the first cell actually loops; the following
# ones run a single time afterwards and see nothing but the variables left by the last
# iteration, so a single model would be plotted, saved and recorded.

# Start from an empty list so that re-running this cell does not append duplicate rows.
results = []
# The feature names do not depend on the task or the model: compute them once.
feature_names = np.array(tfidf.get_feature_names_out())

for task, (X_tr, y_tr, X_te, y_te, labels) in tasks.items():
    print(f"\n===== 🔹 Tâche : {task.upper()} =====")
    for name in ["LogisticRegression","LinearSVC","RandomForest"]:
        print(f"\n⚡ Entraînement du modèle {name}...")
        model = get_model(name)
        model.fit(X_tr, y_tr)
        y_pred = model.predict(X_te)
        acc = accuracy_score(y_te, y_pred)
        f1 = f1_score(y_te, y_pred, average='macro', zero_division=0)
        print(f"📊 Accuracy: {acc*100:.2f}% | F1-score macro: {f1:.4f}")
        print(classification_report(y_te, y_pred, zero_division=0))

        # Confusion matrix, saved as a PNG in the report directory.
        cm = confusion_matrix(y_te, y_pred, labels=labels)
        fig, ax = plt.subplots(figsize=(6,4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=labels, yticklabels=labels, ax=ax)
        ax.set_title(f"Matrice de confusion - {name} ({task})")
        fig.tight_layout()
        fig.savefig(os.path.join(DATA_REPORT,f"report_preprocess_confusion_{name.lower()}_{task}.png"))
        # Close explicitly: one figure is created per (task, model) pair.
        plt.close(fig)

        # Top features chart for this model.
        plot_top_features(model, feature_names, task, name)

        # Persist the fitted model and record its metrics.
        joblib.dump(model, os.path.join(DATA_MODEL,f"{name.lower()}_{task}.pkl"))
        results.append({"Tâche":task,"Modèle":name,"Accuracy":round(acc,4),"F1_score_macro":round(f1,4)})


9️⃣ Résumé comparatif

In [None]:
# One row per (task, model) pair, built from the accumulated metric dictionaries.
df_results = pd.DataFrame(data=results)
print("\n📋 Résumé comparatif des modèles :")
print(df_results)
# Export the comparison table (without the index column) into DATA_PROCESSED.
results_csv_path = os.path.join(DATA_PROCESSED, "resultats_modeles.csv")
df_results.to_csv(results_csv_path, index=False)


🔟 Tests manuels

In [None]:
# Hand-written sentences grouped by the sentiment they are meant to express.
phrases = {
    "positif":["Super service, je suis très satisfait !","Livraison rapide et produit conforme","Expérience excellente du début à la fin"],
    "neutre":["C'était correct, sans plus","Pas de problème mais rien d'extraordinaire","Service moyen, livraison standard"],
    "negatif":["Service catastrophique, à fuir","Je ne suis pas content du tout","Produit défectueux et aucune réponse du SAV"]
}

# Load every saved model ONCE instead of re-reading it from disk for each phrase.
# These files are pickles written by this notebook; never load pickles from an untrusted source.
loaded_models = {}
for task in ["sentiment","note"]:
    for name in ["LogisticRegression","LinearSVC","RandomForest"]:
        model_path = os.path.join(DATA_MODEL,f"{name.lower()}_{task}.pkl")
        if os.path.exists(model_path):
            loaded_models[(task, name)] = joblib.load(model_path)
        else:
            # Make a missing model visible instead of silently dropping its predictions.
            print(f"⚠️ Modèle introuvable : {model_path}")

print("\n🧪 Tests manuels :")
for cat,samples in phrases.items():
    for phrase in samples:
        # Same preprocessing as for training: negation marking, then TF-IDF transform.
        vec = tfidf.transform([mark_negation(phrase)])
        print(f"\n💬 Phrase ({cat}): {phrase}")
        for task in ["sentiment","note"]:
            print(f"🔹 Prédictions {task}:", end=" ")
            for name in ["LogisticRegression","LinearSVC","RandomForest"]:
                mdl = loaded_models.get((task, name))
                if mdl is not None:
                    pred = mdl.predict(vec)[0]
                    print(f"{name}={pred}", end=" | ")
            print()

print("\n✅ Script complet terminé.")
