In [1]:
! pip install gensim



In [2]:
# Imports essentiels
import numpy as np
import pandas as pd
import re
from collections import Counter
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import mlflow
import mlflow.pytorch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, fbeta_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
from PIL import Image
import io

from gensim.models import Word2Vec

In [3]:
# Chargement et pr√©paration des donn√©es
df = pd.read_csv('datas/training.1600000.processed.noemoticon.csv',
                 sep=',', encoding='latin-1', header=None)
df.columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df.target = df.target.replace(4, 1)  # Binarisation

# √âchantillonnage pour r√©duction
df_pos = df[df.target == 1].sample(8000, random_state=42)
df_neg = df[df.target == 0].sample(8000, random_state=42)
df_reduced = pd.concat([df_pos, df_neg]).sample(frac=1, random_state=42)

texts = df_reduced.text.astype(str).values
labels = df_reduced.target.values


In [4]:
# Tokenization et vocabulaire
def simple_tokenize(text):
    return re.findall(r"\w+", text.lower())

def build_vocab(texts, vocab_size=10000):
    counter = Counter()
    for t in texts:
        counter.update(simple_tokenize(t))
    most_common = counter.most_common(vocab_size - 2)
    
    word2idx = {'<PAD>': 0, '<OOV>': 1}
    for i, (word, _) in enumerate(most_common, start=2):
        word2idx[word] = i
    return word2idx

def texts_to_sequences(texts, word2idx):
    sequences = []
    for t in texts:
        tokens = simple_tokenize(t)
        seq = [word2idx.get(tok, 1) for tok in tokens]
        sequences.append(seq)
    return sequences

def pad_sequences(sequences, maxlen=50):
    arr = np.zeros((len(sequences), maxlen), dtype=np.int64)
    for i, seq in enumerate(sequences):
        s = seq[:maxlen]
        arr[i, :len(s)] = s
    return arr


In [5]:
# Pr√©paration des donn√©es
vocab_size = 10000
maxlen = 50

word2idx = build_vocab(texts, vocab_size)
sequences = texts_to_sequences(texts, word2idx)
X = pad_sequences(sequences, maxlen)

X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42)


In [6]:
# Entra√Ænement Word2Vec sur nos donn√©es
embedding_dim = 100

# Pr√©paration des phrases tokenis√©es pour Word2Vec
tokenized_texts = [simple_tokenize(text) for text in texts]

# Entra√Ænement du mod√®le Word2Vec
print("Entra√Ænement Word2Vec...")
w2v_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=embedding_dim,
    window=5,
    min_count=5,
    workers=4,
    sg=1,
    epochs=10
)

# Cr√©ation de la matrice d'embedding
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, idx in word2idx.items():
    if idx < vocab_size and word in w2v_model.wv:
        embedding_matrix[idx] = w2v_model.wv[word]

print(f"Embedding matrix cr√©√©e: {embedding_matrix.shape}")


Entra√Ænement Word2Vec...
Embedding matrix cr√©√©e: (10000, 100)


In [7]:
# Mod√®le LSTM PyTorch avec Word2Vec
class LSTMSentiment(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim=64, maxlen=50):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_matrix))
        self.embedding.weight.requires_grad = False  # Word2Vec non trainable
        
        self.lstm = nn.LSTM(embedding_dim, hidden_dim,
                           batch_first=True, dropout=0.2, num_layers=1)
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_dim, 1)
    
    def forward(self, x):
        x = self.embedding(x)
        lstm_out, (hn, cn) = self.lstm(x)
        hn = self.dropout(hn[-1])
        out = torch.sigmoid(self.fc(hn)).squeeze()
        return out


In [8]:
# Pr√©paration des donn√©es PyTorch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

X_train_torch = torch.LongTensor(X_train).to(device)
y_train_torch = torch.FloatTensor(y_train).to(device)
X_test_torch = torch.LongTensor(X_test).to(device)
y_test_torch = torch.FloatTensor(y_test).to(device)

train_dataset = TensorDataset(X_train_torch, y_train_torch)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Configuration MLflow
mlflow.set_tracking_uri("http://127.0.0.1:8080")
mlflow.set_experiment("Twitter_Sentiment_Models")


Using device: cuda


<Experiment: artifact_location='mlflow-artifacts:/330540034538193051', creation_time=1763668642900, experiment_id='330540034538193051', last_update_time=1763668642900, lifecycle_stage='active', name='Twitter_Sentiment_Models', tags={}>

In [9]:
# Fonction pour plot et log confusion matrix
def plot_and_log_confusion_matrix(y_true, y_pred, model_name):
    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(5, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.xlabel('Pr√©dictions')
    plt.ylabel('Valeurs r√©elles')
    plt.title(f'Matrice de confusion - {model_name}')
    
    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    img = Image.open(buf)
    mlflow.log_image(img, f"confusion_matrix_{model_name}.png")
    plt.close()


In [10]:
# Entra√Ænement et logging MLflow
with mlflow.start_run(run_name="LSTMWord2Vec"):
    model = LSTMSentiment(vocab_size, embedding_dim).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCELoss()
    
    # Entra√Ænement
    model.train()
    for epoch in range(3):
        total_loss = 0.0
        for batch_x, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()
        
        print(f'Epoch {epoch+1}/3 - loss {total_loss/len(train_loader):.4f}')
    
    # √âvaluation
    model.eval()
    with torch.no_grad():
        preds_proba = model(X_test_torch).cpu().numpy()
    preds = (preds_proba > 0.5).astype(int)
    
    # M√©triques
    acc = accuracy_score(y_test, preds)
    prec = precision_score(y_test, preds)
    rec = recall_score(y_test, preds)
    f1 = f1_score(y_test, preds)
    f2 = fbeta_score(y_test, preds, beta=2)
    roc_auc = roc_auc_score(y_test, preds_proba)
    
    # Logging m√©triques
    mlflow.log_metrics({
        'accuracy': acc,
        'precision': prec,
        'recall': rec,
        'f1_score': f1,
        'f2_score': f2,
        'roc_auc': roc_auc
    })
    
    # Logging mod√®le
    mlflow.pytorch.log_model(model, "lstm_word2vec_model")
    
    # Confusion matrix
    plot_and_log_confusion_matrix(y_test, preds, "LSTMWord2VecPyTorch")
    
    print("LSTM Word2Vec")
    print(f"Accuracy: {acc:.4f}, F1: {f1:.4f}, ROC-AUC: {roc_auc:.4f}")
    mlflow.end_run()





Epoch 1/3 - loss 0.6933
Epoch 2/3 - loss 0.6934




Epoch 3/3 - loss 0.6932




LSTM Word2Vec
Accuracy: 0.5053, F1: 0.6714, ROC-AUC: 0.5012
üèÉ View run LSTMWord2Vec at: http://127.0.0.1:8080/#/experiments/330540034538193051/runs/c64576d201fd4cb6a16895eb3c5551fe
üß™ View experiment at: http://127.0.0.1:8080/#/experiments/330540034538193051
