In [1]:
pip install tensorflow==2.13.0 tensorflow-intel==2.13.0 transformers==4.52.4 mlflow==2.22.0 cloudpickle==3.1.1 jinja2==3.1.6 numpy==1.24.3 pandas==2.2.3 scikit-learn==1.6.1 pyyaml transformers datasets tensorflow==2.13.0 tensorflow_hub

Collecting tensorflow==2.13.0
  Using cached tensorflow-2.13.0-cp310-cp310-win_amd64.whl.metadata (2.6 kB)
Collecting tensorflow-intel==2.13.0
  Using cached tensorflow_intel-2.13.0-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Collecting transformers==4.52.4
  Using cached transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Collecting mlflow==2.22.0
  Using cached mlflow-2.22.0-py3-none-any.whl.metadata (30 kB)
Collecting cloudpickle==3.1.1
  Using cached cloudpickle-3.1.1-py3-none-any.whl.metadata (7.1 kB)
Collecting numpy==1.24.3
  Using cached numpy-1.24.3-cp310-cp310-win_amd64.whl.metadata (5.6 kB)
Collecting pandas==2.2.3
  Using cached pandas-2.2.3-cp310-cp310-win_amd64.whl.metadata (19 kB)
Collecting scikit-learn==1.6.1
  Using cached scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting datasets
  Using cached datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting tensorflow_hub
  Using cached tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 8.30.0 requires typing-extensions>=4.6; python_version < "3.12", but you have typing-extensions 4.5.0 which is incompatible.


In [1]:
pip install streamlit

Collecting streamlitNote: you may need to restart the kernel to use updated packages.


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.14.0 which is incompatible.



  Downloading streamlit-1.46.0-py3-none-any.whl.metadata (9.0 kB)
Collecting altair<6,>=4.0 (from streamlit)
  Downloading altair-5.5.0-py3-none-any.whl.metadata (11 kB)
Collecting tenacity<10,>=8.1.0 (from streamlit)
  Downloading tenacity-9.1.2-py3-none-any.whl.metadata (1.2 kB)
Collecting toml<2,>=0.10.1 (from streamlit)
  Downloading toml-0.10.2-py2.py3-none-any.whl.metadata (7.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-win_amd64.whl.metadata (44 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting narwhals>=1.14.2 (from altair<6,>=4.0->streamlit)
  Downloading narwhals-1.44.0-py3-none-any.whl.metadata (11 kB)
Collecting typing-extensions<5,>=4.4.0 (from streamlit)
  Using cached typing_extensions-4.14.0-py3-none-any.whl.metadata (3.0 kB)
Downloading streamlit-1.46.0-py3-none-any.whl (10.1 MB)
   ---------------------------------------- 0.0/10.1 MB ? eta -:--:

In [1]:
import mlflow
import mlflow.sklearn
import mlflow.lightgbm
import mlflow.tensorflow
import yaml
import os

from utils import load_data_tfidf, load_data_use
from models.use_model import train_use
from models.lstm_model import train_lstm
from models.bert_model import build_and_train_bert


from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score, log_loss, roc_auc_score

# Load the YAML configuration file
def load_config(config_path="config.yml"):
    """Read a YAML configuration file and return its parsed content.

    Args:
        config_path: Path of the YAML file to read. Defaults to ``"config.yml"``,
            which is resolved against the current working directory.

    Returns:
        The object produced by ``yaml.safe_load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Decode explicitly as UTF-8: without it, open() falls back to the platform
    # locale encoding, which would garble non-ASCII characters in the config.
    with open(config_path, "r", encoding="utf-8") as f:
        return yaml.safe_load(f)

# Generic training routine for scikit-learn style classifiers
def train_model_sklearn(model, model_name, X_train, X_test, y_train, y_test):
    """Fit ``model``, evaluate it on the test split and record the run in MLflow.

    The whole routine runs inside one MLflow run named ``model_name``. It logs
    the model name as a parameter, four metrics (accuracy, F1, log-loss and
    ROC AUC), the fitted model under the ``"model"`` artifact path, and prints
    a one-line summary.

    Args:
        model: Estimator exposing ``fit``, ``predict``, ``predict_proba`` and ``score``.
        model_name: Label used for the MLflow run name, the logged parameter
            and the printed summary.
        X_train, y_train: Training features and labels passed to ``fit``.
        X_test, y_test: Evaluation features and labels.

    Returns:
        None.
    """
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)

        # Hard predictions, plus the scores found in column 1 of predict_proba.
        y_pred = model.predict(X_test)
        y_proba = model.predict_proba(X_test)[:, 1]

        # Insertion order matters: it is the order in which the metrics are
        # computed and then logged.
        metrics = {
            "accuracy": model.score(X_test, y_test),
            "f1_score": f1_score(y_test, y_pred),
            "log_loss": log_loss(y_test, y_proba),
            "roc_auc": roc_auc_score(y_test, y_proba),
        }

        mlflow.log_param("model", model_name)
        for metric_name, metric_value in metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Store the fitted estimator with the run.
        mlflow.sklearn.log_model(model, artifact_path="model")

        acc = metrics["accuracy"]
        f1_val = metrics["f1_score"]
        auc = metrics["roc_auc"]
        print(f"✅ {model_name} terminé avec précision={acc:.2f} | F1={f1_val:.2f} | AUC={auc:.2f}")


# Entry point: trains the classical TF-IDF models first, then the USE, LSTM and BERT models.
if __name__ == "__main__":
    config = load_config()
    grid_cfg = config["grid_search"]

    # --- TF-IDF features shared by the three classical models ---
    # load_data_tfidf() returns five values; the last one is not used here.
    X_train_tfidf, X_test_tfidf, y_train, y_test, _ = load_data_tfidf()
    tfidf_split = (X_train_tfidf, X_test_tfidf, y_train, y_test)

    # For every hyper-parameter, only the first element of its configured
    # entry is used (no search is run in this cell).

    # --- Random Forest ---
    rf_params = {name: values[0] for name, values in grid_cfg["random_forest"].items()}
    rf_model = RandomForestClassifier(**rf_params)
    train_model_sklearn(rf_model, "RandomForest", *tfidf_split)

    # --- Logistic Regression ---
    logreg_params = {name: values[0] for name, values in grid_cfg["logistic_regression"].items()}
    logreg_model = LogisticRegression(**logreg_params)
    train_model_sklearn(logreg_model, "LogisticRegression", *tfidf_split)

    # --- LightGBM ---
    lgbm_params = {name: values[0] for name, values in grid_cfg["lightgbm"].items()}
    lgbm_model = LGBMClassifier(**lgbm_params)
    train_model_sklearn(lgbm_model, "LightGBM", *tfidf_split)

    # --- Universal Sentence Encoder ---
    # This model gets its own data split and the "use" section of the config.
    X_train_use, X_test_use, y_train_use, y_test_use = load_data_use()
    train_use(config["use"], X_train_use, X_test_use, y_train_use, y_test_use)

    # --- LSTM ---
    train_lstm()

    # --- BERT ---
    print("📦 Modèle BERT")
    build_and_train_bert()

  from pkg_resources import parse_version
  from .autonotebook import tqdm as notebook_tqdm


✅ RandomForest terminé avec précision=0.71 | F1=0.73 | AUC=0.79




✅ LogisticRegression terminé avec précision=0.71 | F1=0.70 | AUC=0.77




✅ LightGBM terminé avec précision=0.73 | F1=0.74 | AUC=0.81
🔄 Embedding avec Universal Sentence Encoder...




✅ USE + MLP terminé avec accuracy=0.77 | F1=0.75 | AUC=0.85 | log_loss=0.49
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7




INFO:tensorflow:Assets written to: C:\Users\Djamel\AppData\Local\Temp\tmpxb6ilild\model\data\model\assets


INFO:tensorflow:Assets written to: C:\Users\Djamel\AppData\Local\Temp\tmpxb6ilild\model\data\model\assets


✅ Bidirectional LSTM terminé avec acc=0.71 | f1=0.72 | auc=0.77
📦 Modèle BERT


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'DistilBertTokenizer'. 
The class this function is called from is 'BertTokenizer'.


📥 Chargement des données BERT...


You are using a model of type distilbert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.


🧠 Initialisation du modèle BERT...


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['distilbert.transformer.layer.0.sa_layer_norm.bias', 'distilbert.transformer.layer.2.attention.out_lin.bias', 'distilbert.transformer.layer.0.ffn.lin1.bias', 'vocab_transform.weight', 'distilbert.transformer.layer.2.ffn.lin1.bias', 'distilbert.transformer.layer.1.ffn.lin1.bias', 'distilbert.transformer.layer.4.attention.k_lin.weight', 'distilbert.transformer.layer.1.ffn.lin2.bias', 'distilbert.transformer.layer.4.attention.out_lin.weight', 'distilbert.transformer.layer.5.attention.q_lin.weight', 'distilbert.transformer.layer.1.attention.out_lin.weight', 'distilbert.transformer.layer.3.output_layer_norm.weight', 'distilbert.transformer.layer.5.sa_layer_norm.weight', 'distilbert.transformer.layer.5.sa_layer_norm.bias', 'distilbert.transformer.layer.5.ffn.lin1.weight', 'distilbert.transformer.layer.2.attention.v_lin.weight', 'distilbert.transformer.layer.0.attention.q_lin.b

🚀 Entraînement en cours...
Epoch 1/4
 13/400 [..............................] - ETA: 7:50:17 - loss: 2.4144 - accuracy: 0.4760

KeyboardInterrupt: 

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

# Load the sampled tweets; the CSV is decoded as ISO-8859-1.
df = pd.read_csv("data/sampled_sentiment140.csv", encoding="ISO-8859-1")
comments = df["comment"]

# Pick one tweet by position to inspect.
index = 56
tweet = comments.iloc[index]
print("📌 Tweet original :", tweet)

# Fit the TF-IDF vectorizer on every tweet (vocabulary capped via max_features).
vectorizer = TfidfVectorizer(max_features=5000)
tfidf_matrix = vectorizer.fit_transform(comments)

# Dense 1-D TF-IDF vector of the selected tweet.
tweet_vector = tfidf_matrix[index].toarray().ravel()

# Positions of the strictly positive weights, and the vocabulary terms they map to.
feature_names = vectorizer.get_feature_names_out()
nonzero_indices = np.flatnonzero(tweet_vector > 0)

print("\n📊 Mots avec valeurs TF-IDF non nulles :")
for term, weight in zip(feature_names[nonzero_indices], tweet_vector[nonzero_indices]):
    print(f"{term}: {weight:.4f}")


📌 Tweet original : If anyone wanted to attend TEDMED but can't make the date, I'll happily take your place, and will live-blog it for you too 

📊 Mots avec valeurs TF-IDF non nulles :
and: 0.1177
anyone: 0.2720
attend: 0.3595
blog: 0.2784
but: 0.1487
can: 0.1635
date: 0.3045
for: 0.1299
if: 0.1973
it: 0.1197
live: 0.2570
ll: 0.2009
make: 0.2205
place: 0.2732
take: 0.2394
the: 0.0984
to: 0.0953
too: 0.1806
wanted: 0.2697
will: 0.1884
you: 0.1214
your: 0.1800


In [17]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pandas as pd
import numpy as np

# Sizes shared by the tokenizer, the padding step and the embedding layer.
VOCAB_SIZE = 5000
MAX_LEN = 100
EMBEDDING_DIM = 128

# 1. Load one tweet
df = pd.read_csv("data/sampled_sentiment140.csv", encoding="ISO-8859-1")
tweet = df["comment"].iloc[56]
print("Tweet :", tweet)

# 2. Tokenisation: fit the word index on all tweets, then encode and pad the selected one
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(df["comment"])
sequence = tokenizer.texts_to_sequences([tweet])
padded_sequence = pad_sequences(sequence, maxlen=MAX_LEN)

# 3. Embedding layer (randomly initialised here, but could be pre-trained)
embedding_layer = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LEN)

# 4. Get the embedding vectors of the tweet
embedded_vector = embedding_layer(padded_sequence)
print("Shape embedding LSTM :", embedded_vector.shape)  # (1, MAX_LEN, EMBEDDING_DIM)

# pad_sequences pads on the LEFT by default, so position 0 normally holds the
# padding id 0 rather than a word. Look up the first non-zero token id instead
# (the Keras Tokenizer never assigns id 0 to a word).
token_positions = np.flatnonzero(padded_sequence[0])
# Fall back to position 0 when the tweet contains no in-vocabulary word at all.
first_word_pos = int(token_positions[0]) if token_positions.size else 0
print("Embedding vector (premier mot) :", embedded_vector[0][first_word_pos].numpy())


Tweet : If anyone wanted to attend TEDMED but can't make the date, I'll happily take your place, and will live-blog it for you too 
Shape embedding LSTM : (1, 100, 128)
Embedding vector (premier mot) : [-0.00056794 -0.00934255  0.01322973 -0.0229493  -0.00227188  0.04573922
  0.03027841  0.01007912  0.01360359  0.0447765  -0.0397891   0.04101736
 -0.02597667 -0.00192901  0.03441172  0.039655   -0.02648856  0.00503271
 -0.01114555 -0.00398059  0.01175957  0.00555298 -0.03172211 -0.03819232
  0.04475028  0.04827727  0.01463798 -0.00806355 -0.03557465 -0.00364597
 -0.02177804 -0.02211501  0.03880891 -0.01855807  0.00566362 -0.02702664
 -0.00902102  0.00501378  0.03940764  0.01299422 -0.0100211   0.01595975
  0.00175556  0.02726612  0.00046525 -0.03092632  0.01677897  0.04038279
  0.01179936  0.00149968 -0.0215245  -0.00365239 -0.02854688 -0.02607759
 -0.03241494  0.01076878 -0.01018733 -0.03252889 -0.00963283 -0.02760348
 -0.03535657 -0.03059613  0.03183253  0.0002993  -0.00627763 -0.0002

In [18]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf
import pandas as pd

# Checkpoint name shared by the tokenizer and the encoder.
MODEL_NAME = "bert-base-uncased"

# 1. Load one tweet
df = pd.read_csv("data/sampled_sentiment140.csv", encoding="ISO-8859-1")
tweet = df["comment"].iloc[56]
print("Tweet :", tweet)

# 2. BERT tokenizer: encode the tweet as TensorFlow tensors
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
inputs = tokenizer(tweet, padding=True, truncation=True, return_tensors="tf")

# 3. Load the BERT encoder
model = TFBertModel.from_pretrained(MODEL_NAME)

# 4. Run the encoder and keep the hidden state at sequence position 0 ([CLS] token)
outputs = model(**inputs)
last_hidden_state = outputs.last_hidden_state
cls_embedding = last_hidden_state[:, 0, :]
print("Shape embedding BERT (CLS) :", cls_embedding.shape)
print("Embedding vector CLS :", cls_embedding.numpy())


Tweet : If anyone wanted to attend TEDMED but can't make the date, I'll happily take your place, and will live-blog it for you too 


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Shape embedding BERT (CLS) : (1, 768)
Embedding vector CLS : [[ 2.65063703e-01 -1.59233928e-01  2.06310973e-01 -1.16271228e-01
  -2.69432217e-01 -4.91885662e-01  3.05916011e-01  7.24431157e-01
  -1.41839311e-02 -2.26781070e-01 -1.15854718e-01 -1.86690539e-02
   2.93296903e-01  1.80884317e-01  2.31919855e-01 -1.70843810e-01
  -6.24235630e-01  5.12621284e-01  1.03402294e-01 -3.95187527e-01
   1.20114893e-01 -4.31352973e-01  1.78198665e-01  6.13646507e-02
   1.43020213e-01 -1.10118166e-01 -3.20776910e-01  6.77842647e-02
   4.60163876e-03 -1.08175993e-01  4.98781689e-02  3.25281143e-01
  -1.86604649e-01 -2.90946290e-03  4.77709681e-01 -1.68603763e-01
   3.59182246e-02  8.66905674e-02  4.55407538e-02  1.73311800e-01
  -2.84502774e-01  1.35069750e-02  4.66617346e-01  5.56073152e-02
   5.04833013e-02 -3.53936315e-01 -3.13654613e+00  1.45910773e-02
  -1.92105100e-02 -1.72098041e-01  2.76868761e-01 -2.94239670e-02
   4.11841601e-01  2.18596190e-01  2.46845633e-01  2.35620394e-01
  -2.84144193e-