In [None]:
import re
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler
# Load the training table, then keep a reproducible random subset of at most
# 10k rows. The sample size is capped at the table length because sampling
# more rows than exist (without replacement) raises ValueError.
df = pd.read_csv("data/train.csv")
df = df.sample(n=min(10000, len(df)), random_state=42).reset_index(drop=True)

In [None]:
# Discard every row whose "text" value is missing: the cleaning step calls
# str methods on this column and would fail on NaN.
df = df.dropna(axis="index", how="any", subset=["text"])

In [29]:
import re
# Patterns used by clean_text, compiled once at definition time.
_URL_RE = re.compile(r"http\S+")            # anything starting with "http" up to the next whitespace
_MENTION_RE = re.compile(r"@\w+")           # @user mentions
_DISALLOWED_RE = re.compile(r"[^a-z0-9#\s]")  # everything except a-z, 0-9, '#' and whitespace
_WHITESPACE_RE = re.compile(r"\s+")         # runs of whitespace


def clean_text(t):
    """Normalise a raw tweet into a lower-case, whitespace-separated string.

    Steps, in order: lower-case, remove URLs, remove @mentions, remove every
    character that is not a-z, 0-9, '#' or whitespace (so hashtags keep their
    '#', while punctuation, underscores and accented letters are dropped),
    then collapse whitespace runs and strip the ends.

    Parameters
    ----------
    t : str
        Raw text. Any non-string value (None, NaN, numbers, ...) is treated
        as missing text instead of raising AttributeError.

    Returns
    -------
    str
        The cleaned text; "" for empty or non-string input.
    """
    if not isinstance(t, str):
        return ""
    t = t.lower()
    # URLs must go before the punctuation filter, which would otherwise leave
    # their letters behind as junk tokens.
    t = _URL_RE.sub("", t)
    t = _MENTION_RE.sub("", t)
    t = _DISALLOWED_RE.sub("", t)
    return _WHITESPACE_RE.sub(" ", t).strip()

In [None]:
# Cleaned tweet body.
df["clean_text"] = df["text"].apply(clean_text)

# Append the content of the "hashtags" column, prefixed with '#', as extra
# token(s). The value is lower-cased so the stored column already matches the
# lower-cased tokens the vocabulary is built from, and cast to str so a
# non-string cell cannot break the concatenation. Missing hashtags add nothing.
# NOTE(review): only the first hashtag gets a '#' if a cell holds several
# separated values - confirm the column format.
hashtag_suffix = (
    df["hashtags"]
    .fillna("")
    .astype(str)
    .str.lower()
    .map(lambda h: "#" + h if h else "")
)
df["clean_text"] = (df["clean_text"] + " " + hashtag_suffix).str.strip()


In [31]:
from collections import Counter

# Step 1 - whitespace tokenisation of the lower-cased cleaned text
df['tokens'] = df['clean_text'].str.lower().str.split()

# Step 2 - flatten all rows into one token list and count occurrences
all_tokens = []
for row_tokens in df['tokens']:
    all_tokens.extend(row_tokens)
freq = Counter(all_tokens)

# Step 3 - keep only the N most frequent tokens
N = 10000
most_common = freq.most_common(N)

# Step 4 - token -> index mapping; indices 0 and 1 are reserved for the
# padding and unknown-token symbols, so real tokens start at 2
vocab = dict(zip((tok for tok, _count in most_common),
                 range(2, len(most_common) + 2)))
vocab.update({'<pad>': 0, '<unk>': 1})

# Step 5 - sanity check
print(f"Vocab size (incl. pad & unk) : {len(vocab)}")
print("Exemples :", list(vocab.items())[:10])


Vocab size (incl. pad & unk) : 10002
Exemples : [('the', 2), ('to', 3), ('of', 4), ('a', 5), ('and', 6), ('in', 7), ('is', 8), ('this', 9), ('for', 10), ('covid19', 11)]


In [32]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

class TweetDataset(Dataset):
    """Map-style dataset yielding (token-index tensor, log1p(target)) pairs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``text_col`` and ``target_col``.
    vocab : dict
        Token -> index mapping; must contain the '<pad>' and '<unk>' keys.
    max_len : int
        Every sample is truncated or right-padded to exactly this many indices.
    text_col : str
        Column holding the cleaned text (default 'clean_text').
    target_col : str
        Column holding the raw regression target (default 'retweet_count').

    Raises
    ------
    KeyError
        If ``text_col`` or ``target_col`` is absent; the message lists the
        columns that are available.
    """

    def __init__(self, df, vocab, max_len=50,
                 text_col='clean_text', target_col='retweet_count'):
        missing = [c for c in (text_col, target_col) if c not in df.columns]
        if missing:
            raise KeyError(
                f"TweetDataset: column(s) {missing} not found in the DataFrame; "
                f"available columns: {list(df.columns)}"
            )
        self.texts   = df[text_col].tolist()
        # Targets are stored as log1p(target); apply expm1 to a prediction to
        # get back to the original scale.
        self.targets = torch.log1p(torch.tensor(df[target_col].values, dtype=torch.float))
        self.vocab   = vocab
        self.max_len = max_len

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return (LongTensor of shape [max_len], scalar float target) for sample ``idx``."""
        # Lower-case before splitting so lookups use the same token form as the
        # vocabulary, which is built from lower-cased tokens.
        toks = self.texts[idx].lower().split()
        unk = self.vocab['<unk>']
        idxs = [self.vocab.get(t, unk) for t in toks][:self.max_len]
        # Right-pad up to max_len (adds nothing when the sample is already full).
        idxs += [self.vocab['<pad>']] * (self.max_len - len(idxs))
        # Explicit dtype: nn.Embedding needs integer indices, including for max_len == 0.
        return torch.tensor(idxs, dtype=torch.long), self.targets[idx]


# 2) Model: Embedding + BiLSTM + regression head
class LSTMRegressor(nn.Module):
    """Bidirectional-LSTM regressor over token-index sequences.

    Pipeline: embedding -> single-layer BiLSTM -> mean over the real
    (non-padding) time steps -> dropout -> Linear + ReLU -> dropout -> Linear.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    emb_dim : int
        Embedding dimension.
    hid_dim : int
        LSTM hidden size per direction (the pooled vector has 2 * hid_dim).
    """

    def __init__(self, vocab_size, emb_dim=100, hid_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        # No `dropout=` here: nn.LSTM applies it only between stacked layers,
        # so with num_layers=1 it does nothing and only emits a UserWarning.
        self.lstm      = nn.LSTM(emb_dim, hid_dim, num_layers=1,
                                 bidirectional=True, batch_first=True)
        self.fc1       = nn.Linear(hid_dim*2, hid_dim)
        self.out       = nn.Linear(hid_dim, 1)
        self.dropout   = nn.Dropout(0.4)

    def forward(self, x):
        """Return one prediction per sequence.

        Parameters
        ----------
        x : LongTensor [B, L]
            Token indices, right-padded with the embedding's padding index.

        Returns
        -------
        Tensor [B]
        """
        # Real length of each sequence. Clamped to 1 because packing rejects
        # zero-length sequences (an all-padding row is treated as length 1).
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1)

        # Pack so the LSTM never runs over padding; lengths must be on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            self.embedding(x), lengths.cpu(), batch_first=True, enforce_sorted=False
        )
        packed_out, _ = self.lstm(packed)
        # Unpacked output is zero at padded steps.                # [B, L', 2*hid]
        seq_out, _ = nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)

        # Pooling: mean over the real time steps only (padded steps add 0 to the sum).
        pooled = seq_out.sum(dim=1) / lengths.unsqueeze(1).to(seq_out.dtype)  # [B, 2*hid]
        hidden = torch.relu(self.fc1(self.dropout(pooled)))
        return self.out(self.dropout(hidden)).squeeze(-1)


In [None]:
import torch
from torch.utils.data import random_split, DataLoader

# 1) Build the dataset over the whole frame
full_ds = TweetDataset(df, vocab, max_len=50)

# 2) Split sizes: 80 % train, 10 % validation, the remainder for test
total_size = len(full_ds)
train_size = int(0.8 * total_size)
val_size   = int(0.1 * total_size)
test_size  = total_size - train_size - val_size

# 3) Random split with a fixed generator seed
train_ds, val_ds, test_ds = random_split(
    full_ds,
    [train_size, val_size, test_size],
    generator=torch.Generator().manual_seed(42)
)

# 4) DataLoaders (only the training set is shuffled)
batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,  num_workers=4)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False, num_workers=4)
test_loader  = DataLoader(test_ds,  batch_size=batch_size, shuffle=False, num_workers=4)

# 5) Device, model, optimiser, loss
# Seed the global RNG so weight initialisation and batch shuffling start from
# a fixed state, like the split above.
torch.manual_seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model  = LSTMRegressor(len(vocab), emb_dim=100, hid_dim=64).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

# Mean absolute error, computed on the targets the dataset yields.
criterion = nn.L1Loss()


def _mean_loss(model, loader, criterion, device):
    """Per-sample average of `criterion` over `loader`, model in eval mode.

    Returns NaN when the loader's dataset is empty.
    """
    n_samples = len(loader.dataset)
    if n_samples == 0:
        return float("nan")
    model.eval()
    total = 0.0
    with torch.no_grad():
        for batch_x, batch_y in loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            # Weight each batch mean by its size: the last batch may be smaller.
            total += criterion(model(batch_x), batch_y).item() * batch_x.size(0)
    return total / n_samples


# 6) Training loop with validation; the weights of the best validation epoch
#    are kept so the test score is not taken from an arbitrary last epoch.
num_epochs = 100
best_val_loss = float("inf")
best_state = None
for epoch in range(1, num_epochs+1):
    # -- training phase --
    model.train()
    train_loss = 0.0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        preds = model(batch_x)
        loss  = criterion(preds, batch_y)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * batch_x.size(0)

    train_loss /= max(train_size, 1)

    # -- validation phase --
    val_loss = _mean_loss(model, val_loader, criterion, device)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        # Clone: state_dict() tensors alias the live parameters.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}

    print(f"Epoch {epoch}/{num_epochs} — Train Loss: {train_loss:.4f} — Val Loss: {val_loss:.4f}")

# 7) Final evaluation on the test set, using the best-validation weights
#    (falls back to the last-epoch weights if no validation loss was recorded)
if best_state is not None:
    model.load_state_dict(best_state)
test_loss = _mean_loss(model, test_loader, criterion, device)
print(f"Test Loss: {test_loss:.4f}")


KeyError: 'retweet_count'