In [1]:
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
# Load the training data and keep a reproducible random subset of at most
# 50,000 rows so that experiments stay fast.
df = pd.read_csv("data/train.csv")
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n is larger than the frame (sampling without replacement).
df = df.sample(n=min(50000, len(df)), random_state=42).reset_index(drop=True)

In [2]:
# Text input for the model: the tweet text followed by its hashtags.
# In the hashtag column every run of characters that is neither a word
# character nor '#' is collapsed to a single space; missing values become "".
hashtags_clean = df["hashtags"].fillna("").str.replace(r"[^\w#]+", " ", regex=True)
tweet_text = df["text"].fillna("")
df["full_text"] = tweet_text + " " + hashtags_clean

# Regression target: the raw retweet count as float32.
# (A log1p-transformed target was tried earlier and is currently disabled.)
y = df["retweet_count"].values.astype(np.float32)

In [3]:

# Numeric user features, standardised column-wise with StandardScaler.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# is made further down, so test-set statistics leak into the scaling; consider
# fitting on the training rows only.
num_cols = [
    "user_verified",
    "user_statuses_count",
    "user_followers_count",
    "user_friends_count",
]
# Cast the verified flag to an integer so it can be scaled like the counts.
df["user_verified"] = df["user_verified"].astype(int)
numeric_raw = df[num_cols].values
scaler = StandardScaler()
scaler.fit(numeric_raw)
X_num = scaler.transform(numeric_raw).astype(np.float32)

# 2) Tokenizer maison + vocabulaire
def simple_tokenizer(text):
    """Lower-case ``text`` and return its runs of word characters, in order.

    Anything that is not a word character (punctuation, whitespace, '#', ...)
    acts as a separator and is dropped. A string without any word character
    yields an empty list.
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w+", lowered)]



In [4]:

from collections import Counter

# Count every token produced by simple_tokenizer over the full_text column.
counter = Counter()
counter.update(tok for txt in df["full_text"] for tok in simple_tokenizer(txt))

# Index-to-string table: the two special tokens first, then all observed
# tokens in sorted order. stoi is the inverse mapping.
specials = ["<pad>", "<unk>"]
itos = [*specials, *sorted(counter)]
stoi = dict(zip(itos, range(len(itos))))
pad_idx = stoi["<pad>"]
unk_idx = stoi["<unk>"]

In [5]:
# 3) Dataset & DataLoader
from torch.utils.data import random_split
class TweetDataset(Dataset):
    """Dataset pairing each text with its numeric feature row and its label.

    Indexing tokenizes the text with ``simple_tokenizer`` and maps tokens to
    ids through the module-level ``stoi`` table (unknown tokens -> ``unk_idx``).
    """

    def __init__(self, texts, numerics, labels):
        self.texts = texts
        self.numerics = numerics
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, i):
        toks = simple_tokenizer(self.texts[i])
        if not toks:
            # A text without any token is represented by a single "<unk>",
            # so every sample has a sequence length of at least one.
            toks = ["<unk>"]
        token_ids = [stoi.get(t, unk_idx) for t in toks]
        sample = {
            "tokens":   torch.tensor(token_ids, dtype=torch.long),
            "numerics": torch.tensor(self.numerics[i], dtype=torch.float32),
            "label":    torch.tensor(self.labels[i], dtype=torch.float32),
        }
        return sample


def collate_fn(batch):
    """Merge a list of dataset samples into batch tensors.

    Returns a tuple ``(padded, lengths, nums, labs)``:
      * padded  - token ids padded with ``pad_idx`` to the longest sequence,
                  batch dimension first;
      * lengths - original (unpadded) length of each sequence, as int64;
      * nums    - stacked numeric feature tensors;
      * labs    - stacked label tensors.
    """
    seqs, numeric_rows, targets = [], [], []
    for sample in batch:
        seqs.append(sample["tokens"])
        numeric_rows.append(sample["numerics"])
        targets.append(sample["label"])

    lengths = torch.tensor(list(map(len, seqs)), dtype=torch.long)
    padded = nn.utils.rnn.pad_sequence(seqs, batch_first=True, padding_value=pad_idx)
    return padded, lengths, torch.stack(numeric_rows), torch.stack(targets)

# 80/20 train/test split; the generator is seeded so the partition is the
# same on every run.
texts = df["full_text"].tolist()
full_ds = TweetDataset(texts, X_num, y)
n_train = int(0.8 * len(full_ds))
n_test = len(full_ds) - n_train
train_ds, test_ds = random_split(
    full_ds,
    lengths=[n_train, n_test],
    generator=torch.Generator().manual_seed(42),
)

# Only the training loader shuffles; both pad their batches via collate_fn.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=512, collate_fn=collate_fn)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=512, collate_fn=collate_fn)


In [6]:

# 4) Définition du modèle hybride
class LSTM_MLP(nn.Module):
    """Hybrid regressor: an LSTM over token embeddings plus an MLP over numeric features.

    The last hidden state of the LSTM is concatenated with a 64-dimensional
    projection of the numeric features and fed to a small head that outputs
    one scalar per sample.

    Args:
        vocab_size: number of rows of the embedding table.
        emb_dim: size of each token embedding.
        lstm_hid: hidden size of the (single-layer, unidirectional) LSTM.
        num_feat_dim: number of numeric input features.

    The embedding uses the module-level ``pad_idx`` as its padding index.
    """

    def __init__(self, vocab_size, emb_dim=100, lstm_hid=128, num_feat_dim=4):
        super().__init__()
        num_hid = 64  # width of the numeric-feature representation

        # Text branch: embedding lookup followed by an LSTM.
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(emb_dim, lstm_hid, batch_first=True)

        # Numeric branch: one linear layer with ReLU and light dropout.
        self.num_mlp = nn.Sequential(
            nn.Linear(num_feat_dim, num_hid),
            nn.ReLU(),
            nn.Dropout(0.1),
        )

        # Fusion head applied to the concatenated representations.
        self.head = nn.Sequential(
            nn.Linear(lstm_hid + num_hid, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 1),
        )

    def forward(self, tokens, lengths, numerics):
        """Return one prediction per sample, shape ``(B,)``.

        ``tokens`` are padded token ids (batch first), ``lengths`` the true
        sequence lengths and ``numerics`` the numeric feature rows.
        """
        embedded = self.embedding(tokens)                    # (B, L, emb_dim)
        # Pack so the LSTM skips padded positions; lengths are passed on CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)
        # hidden[-1] is the final hidden state of the last LSTM layer: (B, lstm_hid)
        fused = torch.cat((hidden[-1], self.num_mlp(numerics)), dim=1)
        return self.head(fused).squeeze(1)                   # (B,)


In [7]:
# Seed PyTorch's global RNG so that weight initialisation, batch shuffling and
# dropout are as repeatable as the backend allows, in line with the fixed
# seeds already used for the row sampling and the train/test split.
torch.manual_seed(42)

device    = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model     = LSTM_MLP(len(itos)).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
criterion = nn.L1Loss()  # MAE, used for both training and the final evaluation

n_epochs  = 100


def _batch_to_device(batch, device):
    """Move a collated batch to ``device``.

    ``lengths`` is returned untouched: the model converts it to a CPU tensor
    before packing, so copying it to the device first would be a wasted
    round trip.
    """
    tokens, lengths, numerics, labels = batch
    return tokens.to(device), lengths, numerics.to(device), labels.to(device)


# ——— Training ———
# NOTE(review): there is no validation monitoring or early stopping here; the
# recorded run shows the train MAE falling far below the final test MAE.
for epoch in range(1, n_epochs + 1):
    model.train()
    total_train_loss = 0.0

    for batch in train_loader:
        tokens, lengths, numerics, labels = _batch_to_device(batch, device)

        optimizer.zero_grad()
        preds = model(tokens, lengths, numerics)
        loss = criterion(preds, labels)
        loss.backward()
        optimizer.step()

        # The criterion returns the batch mean; weight it by the batch size so
        # the epoch figure is a per-sample mean even if the last batch is smaller.
        total_train_loss += loss.item() * tokens.size(0)

    avg_train_loss = total_train_loss / len(train_ds)
    print(f"Epoch {epoch}/{n_epochs} — Train MAE: {avg_train_loss:.4f}")

# ——— Final evaluation on the test set ———
model.eval()
total_test_loss = 0.0

with torch.no_grad():
    for batch in test_loader:
        tokens, lengths, numerics, labels = _batch_to_device(batch, device)

        preds = model(tokens, lengths, numerics)
        total_test_loss += criterion(preds, labels).item() * tokens.size(0)

avg_test_loss = total_test_loss / len(test_ds)
print(f"Final Test MAE: {avg_test_loss:.4f}")


Epoch 1/100 — Train MAE: 134.3818
Epoch 2/100 — Train MAE: 133.6907
Epoch 3/100 — Train MAE: 132.0033
Epoch 4/100 — Train MAE: 128.9764
Epoch 5/100 — Train MAE: 124.9385
Epoch 6/100 — Train MAE: 120.9878
Epoch 7/100 — Train MAE: 116.2255
Epoch 8/100 — Train MAE: 112.3328
Epoch 9/100 — Train MAE: 106.3381
Epoch 10/100 — Train MAE: 102.8333
Epoch 11/100 — Train MAE: 99.0770
Epoch 12/100 — Train MAE: 96.2867
Epoch 13/100 — Train MAE: 93.2358
Epoch 14/100 — Train MAE: 90.1555
Epoch 15/100 — Train MAE: 86.1277
Epoch 16/100 — Train MAE: 84.8720
Epoch 17/100 — Train MAE: 82.0888
Epoch 18/100 — Train MAE: 80.3633
Epoch 19/100 — Train MAE: 76.2942
Epoch 20/100 — Train MAE: 76.7045
Epoch 21/100 — Train MAE: 75.0718
Epoch 22/100 — Train MAE: 72.6284
Epoch 23/100 — Train MAE: 71.5428
Epoch 24/100 — Train MAE: 70.0765
Epoch 25/100 — Train MAE: 69.9302
Epoch 26/100 — Train MAE: 67.6412
Epoch 27/100 — Train MAE: 68.2561
Epoch 28/100 — Train MAE: 65.7613
Epoch 29/100 — Train MAE: 67.0376
Epoch 30/100 

In [8]:
# ——— Final evaluation on the held-out test set ———
# Switch to eval mode (disables dropout) and average the per-batch MAE,
# weighted by batch size, over the whole test split.
model.eval()
criterion = nn.L1Loss()  # MAE
total_test_loss = 0.0

with torch.no_grad():
    for tokens, lengths, numerics, labels in test_loader:
        tokens = tokens.to(device)
        lengths = lengths.to(device)
        numerics = numerics.to(device)
        labels = labels.to(device)

        preds = model(tokens, lengths, numerics)
        batch_mae = criterion(preds, labels).item()
        total_test_loss += batch_mae * tokens.size(0)

avg_test_loss = total_test_loss / len(test_ds)
print(f"Final Test MAE: {avg_test_loss:.4f}")


Final Test MAE: 126.0021
