# 1 - Chargement & configuration

In [1]:
import sys
import os
sys.path.append('../src')
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

from requirements import *
from src import data_preprocessing as dp
from src import model_training as mt
from src import evaluate as ev
from src import utils

import mlflow
mlflow.set_experiment("Sentiment Analysis Project")

  from .autonotebook import tqdm as notebook_tqdm




✅ Toutes les librairies sont présentes et prêtes à être utilisées !



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\motar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\motar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


<Experiment: artifact_location='file:///c:/Users/motar/Desktop/1-openclassrooms/AI_Engineer/1-projets/P07/P7_sentiment_MLOps/notebooks/mlruns/906586012259731436', creation_time=1742576058928, experiment_id='906586012259731436', last_update_time=1742576058928, lifecycle_stage='active', name='Sentiment Analysis Project', tags={}>

## 1.1 - Affichage de la structure dossier

In [2]:
utils.afficher_structure_dossier("..", max_niveaux = 1)

├── .git
├── ├── COMMIT_EDITMSG
├── ├── FETCH_HEAD
├── ├── HEAD
├── ├── config
├── ├── description
├── ├── hooks
├── ├── index
├── ├── info
├── ├── logs
├── ├── objects
├── └── refs
├── .gitattributes
├── .gitignore
├── README.md
├── data
├── └── tweets.csv
├── distilbert_model
├── ├── config.json
├── └── model.safetensors
├── distilbert_output
├── └── checkpoint-10000
├── distilbert_results
├── └── checkpoint-32
├── env_p7_MLOps
├── ├── Include
├── ├── Lib
├── ├── Scripts
├── ├── etc
├── ├── pyvenv.cfg
├── └── share
├── logs
├── ├── events.out.tfevents.1741361244.PC-ARNAUD.37024.0
├── ├── events.out.tfevents.1741362034.PC-ARNAUD.9628.0
├── ├── events.out.tfevents.1741364804.PC-ARNAUD.38328.0
├── ├── events.out.tfevents.1741507476.PC-ARNAUD.3024.0
├── ├── events.out.tfevents.1741507646.PC-ARNAUD.3024.1
├── ├── events.out.tfevents.1741513667.PC-ARNAUD.13340.0
├── ├── events.out.tfevents.1741599815.PC-ARNAUD.27736.0
├── ├── events.out.tfevents.1741603247.PC-ARNAUD.3852.0
├── ├── events.o

## 1.2 - Chargement des données

In [3]:
print(f"📂 Répertoire courant : {os.getcwd()}")

📂 Répertoire courant : c:\Users\motar\Desktop\1-openclassrooms\AI_Engineer\1-projets\P07\P7_sentiment_MLOps\notebooks


In [4]:
data_path = "../data/tweets.csv"
if os.path.exists(data_path):
    tweets = pd.read_csv(data_path, encoding = "ISO-8859-1")
    print("✅ Dataset chargé avec succès !")
else:
    print("❌ Le fichier tweets.csv est introuvable.")

✅ Dataset chargé avec succès !


# 2 - Exploration & nettoyage
## 2.1 - Nettoyage initial

In [5]:
print(tweets.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599999 entries, 0 to 1599998
Data columns (total 6 columns):
 #   Column                                                                                                               Non-Null Count    Dtype 
---  ------                                                                                                               --------------    ----- 
 0   0                                                                                                                    1599999 non-null  int64 
 1   1467810369                                                                                                           1599999 non-null  int64 
 2   Mon Apr 06 22:19:45 PDT 2009                                                                                         1599999 non-null  object
 3   NO_QUERY                                                                                                             1599999 non-null  object
 4   _

In [6]:
# Renommage des colonnes
tweets.columns = ["label", "id", "date", "query", "user", "text"]

In [7]:
# Suppression des colonnes inutiles
tweets = tweets.drop(columns=["id", "date", "query", "user"])

In [8]:
# Conversion des labels (0 et 4 → 0 et 1)
tweets['label'] = tweets['label'].map({0: 0, 4: 1})

In [9]:
# Reset index
tweets = tweets.reset_index(drop=True)

## 2.2 - Nettoyage avancé

In [None]:
tweets_cleaned = dp.preprocess_tweets_parallel(tweets)

🚀 Nettoyage des tweets en cours...


# 3 - Vader scoring

In [None]:
vader_scores = dp.compute_vader_scores(tweets_cleaned)

# 4 - Vectorisation des tweets

In [None]:
X_bow, X_tfidf, X_fasttext, X_use = dp.vectorize_tweets(tweets_cleaned['text'], tweets_cleaned['text'].sample(frac = 0.01))

# 5 - Modélisation Classique (TF-IDF + Régression Logistique)

In [None]:
X_tfidf_train, X_tfidf_test, y_train, y_test = train_test_split(
    X_tfidf, tweets_cleaned['label'], test_size = 0.2, random_state = 70, stratify=tweets_cleaned['label']
)
log_reg_model = mt.train_logistic_regression_with_cv(X_tfidf_train, y_train)

# 6 - Modèles Avancés (Random Forest / LightGBM / LSTM)
## 6.1 - FastText + Random Forest

In [None]:
X_ft_train, X_ft_test, y_train, y_test = train_test_split(
    X_fasttext, tweets_cleaned['label'], test_size = 0.2, random_state = 70, stratify = tweets_cleaned['label']
)
rf_model = mt.train_random_forest(X_ft_train, y_train)

## 6.2 - FastText + LSTM

In [None]:
lstm_model, (X_ft_test_reshaped, y_ft_test), history = mt.train_lstm_model(X_fasttext, tweets_cleaned['label'])

## 6.3 - USE + LightGBM

In [None]:
X_use_train, X_use_test, y_train, y_test = train_test_split(
    X_use, tweets_cleaned['label'], test_size = 0.2, random_state = 70, stratify = tweets_cleaned['label']
)
lgbm_model = mt.train_lightgbm(X_use_train, y_train, X_use_test, y_test)

## 6.4 - DistilBERT
### 6.4.1 - Préparation

In [None]:
df_sample = dp.prepare_distilbert_dataset(tweets_cleaned)
tokenized = dp.tokenize_distilbert_dataset(df_sample)

### 6.2.2 - Fine-tuning 

In [None]:
model, trainer, _ = mt.train_distilbert_model(tokenized)

# 7 - Evaluation

In [None]:
distilbert_acc, distilbert_f1 = ev.evaluate_distilbert_model(model, tokenized)

# 8 - Comparaison finale des modèles

In [None]:
models_dict = {
    'logreg': log_reg_model,
    'rf': rf_model,
    'lstm': lstm_model,
    'lgbm': lgbm_model,
    'distilbert_metrics': {
        'accuracy': distilbert_acc,
        'f1': distilbert_f1
    }
}

datasets_dict = {
    'tfidf': {'X_test': X_tfidf_test, 'y_test': y_test},
    'fasttext': {'X_test': X_ft_test, 'y_test': y_test},
    'lstm': (X_ft_test_reshaped, y_ft_test),
    'use': {'X_test': X_use_test, 'y_test': y_test}
}

In [None]:
df_results = ev.get_all_model_scores(models_dict, datasets_dict)

# 8 - Tracking MLFlow

In [None]:
with mlflow.start_run():
    # entraînement modèle, log params & metrics
    mlflow.log_param("model", "Logistic Regression")
    mlflow.log_metric("accuracy", accuracy_score(y_test, y_pred))