# 1 - Chargement & configuration

In [None]:
import sys
import os

# Make `src/` and the project root (needed for `from src import ...`) importable.
# Absolute paths keep the imports working if the working directory changes later,
# and the membership test keeps sys.path free of duplicates when this cell is re-run.
# Order matters for module resolution: `src/` is searched before the project root.
for _extra_path in (
    os.path.abspath(os.path.join(os.getcwd(), '..', 'src')),
    os.path.abspath(os.path.join(os.getcwd(), '..')),
):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

# NOTE(review): this star import is presumably what supplies `pd`, `train_test_split`
# and `accuracy_score` to the cells below — confirm, and prefer explicit imports.
from requirements import *
from src import data_preprocessing as dp
from src import model_training as mt
from src import evaluate as ev
from src import utils

import mlflow

# Every run started in this notebook is recorded under this experiment name.
mlflow.set_experiment("Sentiment Analysis Project")

## 1.1 - Affichage de la structure du dossier

In [None]:
# Show the layout of the parent directory using the project helper.
# NOTE(review): max_niveaux = 1 presumably limits the listing to one level — confirm in src/utils.
utils.afficher_structure_dossier("..", max_niveaux = 1)

## 1.2 - Chargement des données

In [None]:
# Location of the raw dataset, relative to the notebook's working directory.
# NOTE(review): other cells treat ".." as the project root — confirm that data/ really
# sits next to this notebook rather than under the project root.
data_path = "data/tweets.csv"

# Fail fast: every later cell needs `tweets`, so a missing file must stop the run here
# instead of surfacing later as an unrelated NameError.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"❌ Le fichier tweets.csv est introuvable. ({os.path.abspath(data_path)})"
    )

tweets = pd.read_csv(data_path, encoding="ISO-8859-1")
print("✅ Dataset chargé avec succès !")

# 2 - Exploration & nettoyage
## 2.1 - Exploration rapide

In [None]:
# DataFrame.info() writes its summary to stdout and returns None, so it must not be
# wrapped in print() (that would emit a stray "None" line).
tweets.info()

# Class balance of the target column, displayed as the cell's output.
tweets['label'].value_counts()

## 2.2 - Nettoyage avancé

In [None]:
# Run the project's preprocessing routine on the raw frame. Later cells read the
# 'text' and 'label' columns of the result.
tweets_cleaned = dp.preprocess_tweets_parallel(tweets)

# 3 - Vader scoring

In [None]:
# Compute VADER scores for the cleaned tweets via the project helper.
# The result is not referenced by any later cell shown in this notebook.
vader_scores = dp.compute_vader_scores(tweets_cleaned)

# 4 - Vectorisation des tweets

In [None]:
tweet_texts = tweets_cleaned['text']

# 1 % subsample of the texts handed to the vectoriser as its second argument.
# The seed (same value as the train/test splits below) makes the subsample, and
# therefore everything derived from it, reproducible across kernel restarts.
# NOTE(review): what vectorize_tweets does with this sample is not visible here — confirm.
tweet_texts_sample = tweet_texts.sample(frac=0.01, random_state=70)

# Four feature representations of the same tweets: bag-of-words, TF-IDF, FastText, USE.
X_bow, X_tfidf, X_fasttext, X_use = dp.vectorize_tweets(tweet_texts, tweet_texts_sample)

# 5 - Modélisation Classique (TF-IDF + Régression Logistique)

In [None]:
# Stratified 80/20 split of the TF-IDF features; the fixed seed keeps the split reproducible.
labels = tweets_cleaned['label']
X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(
    X_tfidf,
    labels,
    test_size=0.2,
    random_state=70,
    stratify=labels,
)

# Fit the logistic-regression baseline on the training portion only.
log_reg_model = mt.train_logistic_regression_with_cv(X_tfidf_train, y_train)

# 6 - Modèles Avancés (Random Forest / LSTM / LightGBM / DistilBERT)
## 6.1 - FastText + Random Forest

In [None]:
# Same stratified 80/20 split parameters as the TF-IDF cell, applied to the FastText features.
# Note that y_train / y_test are reassigned here.
labels = tweets_cleaned['label']
X_ft_train, X_ft_test, y_train, y_test = train_test_split(
    X_fasttext,
    labels,
    test_size=0.2,
    random_state=70,
    stratify=labels,
)

# Fit the random forest on the FastText training portion.
rf_model = mt.train_random_forest(X_ft_train, y_train)

## 6.2 - FastText + LSTM

In [None]:
# Train the LSTM on the full FastText matrix and labels. The helper hands back the model,
# an (X, y) pair that the comparison cell later uses as the LSTM evaluation set, and a
# third value stored as `history`.
# NOTE(review): the train/test split presumably happens inside train_lstm_model — confirm
# it uses the same seed/stratification as the other models so the scores are comparable.
lstm_model, (X_ft_test_reshaped, y_ft_test), history = mt.train_lstm_model(X_fasttext, tweets_cleaned['label'])

## 6.3 - USE + LightGBM

In [None]:
# Same stratified 80/20 split parameters as the previous cells, applied to the USE features.
# Note that y_train / y_test are reassigned here.
labels = tweets_cleaned['label']
X_use_train, X_use_test, y_train, y_test = train_test_split(
    X_use,
    labels,
    test_size=0.2,
    random_state=70,
    stratify=labels,
)

# NOTE(review): the held-out split is passed into training. If train_lightgbm uses it for
# early stopping or tuning, the test scores reported later are optimistic — confirm, and
# carve out a separate validation split if so.
lgbm_model = mt.train_lightgbm(X_use_train, y_train, X_use_test, y_test)

## 6.4 - DistilBERT
### 6.4.1 - Préparation

In [None]:
# Build the DistilBERT input frame from the cleaned tweets, then tokenise it.
# NOTE(review): the name `df_sample` suggests a subsample is drawn — confirm in
# src/data_preprocessing whether (and how) it is seeded.
df_sample = dp.prepare_distilbert_dataset(tweets_cleaned)
tokenized = dp.tokenize_distilbert_dataset(df_sample)

### 6.4.2 - Fine-tuning

In [None]:
# Fine-tune DistilBERT on the tokenised dataset. The helper returns three values;
# only the model and the trainer are kept, the third is discarded.
model, trainer, _ = mt.train_distilbert_model(tokenized)

# 7 - Évaluation (DistilBERT)

In [None]:
# Score the fine-tuned DistilBERT model; the two returned values are stored as its
# accuracy and F1 and reused in the final comparison.
# NOTE(review): which split of `tokenized` is evaluated is not visible here — confirm it is held-out data.
distilbert_acc, distilbert_f1 = ev.evaluate_distilbert_model(model, tokenized)

# 8 - Comparaison finale des modèles

In [None]:
# DistilBERT is represented by its already-computed metrics rather than by a model object.
distilbert_metrics = {'accuracy': distilbert_acc, 'f1': distilbert_f1}

# Fitted models, keyed by the short names passed on to the scoring helper.
models_dict = {
    'logreg': log_reg_model,
    'rf': rf_model,
    'lstm': lstm_model,
    'lgbm': lgbm_model,
    'distilbert_metrics': distilbert_metrics,
}

# The LSTM entry is the (X, y) tuple returned by its own training helper; the other
# entries pair each feature matrix with the current y_test.
# NOTE(review): y_test was last assigned by the USE split; it matches the other splits only
# because they share the seed, the stratification and (presumably) the row count — confirm.
tfidf_eval = {'X_test': X_tfidf_test, 'y_test': y_test}
fasttext_eval = {'X_test': X_ft_test, 'y_test': y_test}
lstm_eval = (X_ft_test_reshaped, y_ft_test)
use_eval = {'X_test': X_use_test, 'y_test': y_test}

datasets_dict = {
    'tfidf': tfidf_eval,
    'fasttext': fasttext_eval,
    'lstm': lstm_eval,
    'use': use_eval,
}

In [None]:
# Hand the fitted models and their evaluation sets to the project's scoring helper.
# NOTE(review): the name suggests the result is a DataFrame of per-model scores — confirm in src/evaluate.
df_results = ev.get_all_model_scores(models_dict, datasets_dict)

# 9 - Tracking MLflow

In [None]:
# Predictions of the logistic-regression baseline on its own TF-IDF test split.
# Computed before the run is opened so a prediction failure does not leave an empty run behind.
# NOTE(review): assumes log_reg_model exposes a scikit-learn style .predict() — confirm in src/model_training.
y_pred = log_reg_model.predict(X_tfidf_test)

with mlflow.start_run():
    # Record which model this run describes and its test accuracy.
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))