### Maintenant on va essayer d'aller plus loin en fine-tunant un modèle pré-entrainé (RoBERTa) sur nos données.

In [None]:
# !pip install torch torchvision torchaudio transformers scikit-learn pandas numpy matplotlib seaborn mlflow skl2onnx onnxruntime requests onnx datasets onnxruntime skl2onnx

In [1]:
import torch
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_NAME = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(MODEL_NAME)

In [3]:
model = DistilBertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

print(f"Using device: {device}")  # Vérifier si MPS est activé

Using device: mps


### Chargement du dataset

In [7]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification
from transformers import AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
import time
from sklearn.utils import resample

In [15]:
ds = load_dataset("bhavyagiri/imdb-spoiler")
data_train = pd.DataFrame(ds['train']).dropna()
data_test = pd.DataFrame(ds['validation']).dropna()

In [24]:
X_train = data_train['text']
y_train = data_train['label']
X_test = data_test['text']
y_test = data_test['label']

# Combiner les features et labels en un seul DataFrame pour faciliter le traitement
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# Séparer les classes
train_majority = train_df[train_df['label'] == 0]
train_minority = train_df[train_df['label'] == 1]

# Sur-échantillonnage de la classe minoritaire
train_minority_oversampled = resample(
    train_minority,
    replace=True,  # Rééchantillonnage avec remise
    n_samples=len(train_majority),  # Équilibrer les classes
    random_state=42
)

# Fusion des classes équilibrées et mélange
train_balanced = pd.concat([train_majority, train_minority_oversampled]).sample(frac=1, random_state=42)

# Sélectionner 3000 échantillons pour le training
train_balanced = train_balanced.sample(n=3000, random_state=42)

# Sélectionner 1000 échantillons pour le test (en conservant la distribution originale)
test_sampled = test_df.sample(n=1000, random_state=42)

# Mise à jour des variables finales
X_train = train_balanced['text']
y_train = train_balanced['label']
X_test = test_sampled['text']
y_test = test_sampled['label']

In [4]:
def tokenize_function(texts, tokenizer, max_length=512):
    return tokenizer(
        texts, 
        padding="max_length", 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
    )

In [17]:
train_tokens = tokenizer(list(X_train), padding = True, truncation=True)
test_tokens = tokenizer(list(X_test), padding = True, truncation=True)

In [36]:
train_tokens.keys()

dict_keys(['input_ids', 'attention_mask'])

In [37]:
print(train_tokens['input_ids'][0])
print(tokenizer.decode(train_tokens['input_ids'][0]))

[101, 16357, 3854, 25487, 2003, 2028, 1997, 2216, 2396, 2160, 5501, 1045, 2074, 2064, 1005, 1056, 5993, 2007, 2043, 2009, 3310, 2000, 2010, 7696, 2005, 1996, 2087, 2112, 1010, 2174, 1045, 2064, 2036, 4066, 1997, 3305, 2339, 2002, 6732, 1996, 2126, 2002, 2515, 1012, 2025, 2000, 2272, 2125, 2969, 1011, 27427, 5313, 11461, 1010, 2021, 2002, 2003, 5399, 3305, 3085, 2000, 1037, 2391, 1012, 2002, 3849, 2000, 2022, 1037, 3124, 2007, 1037, 2843, 1997, 4301, 1998, 4784, 2055, 8438, 2130, 2065, 2009, 2788, 5260, 2000, 6245, 1998, 10089, 1006, 2029, 2024, 2941, 2261, 3114, 2339, 2111, 2175, 2000, 3422, 5691, 1007, 1012, 3071, 2038, 2045, 10740, 1998, 2070, 2089, 2272, 2125, 2200, 2367, 2013, 1996, 2591, 13373, 1010, 1998, 2023, 3185, 2003, 2200, 2367, 2013, 1996, 2591, 13373, 1012, 2023, 8857, 3262, 2006, 2108, 1037, 6396, 8737, 23393, 13241, 2278, 1998, 1996, 14337, 2791, 1997, 14938, 1012, 2009, 1005, 1055, 2066, 16357, 4117, 3770, 1003, 2010, 4301, 2055, 3348, 1998, 2322, 1003, 2055, 1996, 206

In [18]:
class TokenData(Dataset):
    def __init__(self, train = False):
        if train:
            self.text_data = X_train
            self.tokens = train_tokens
            self.labels = list(y_train)
        else:
            self.text_data = X_test
            self.tokens = test_tokens
            self.labels = list(y_test)

    def __len__(self):
        return len(self.text_data)

    def __getitem__(self, idx):
        sample = {}
        for k, v in self.tokens.items():
            sample[k] = torch.tensor(v[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [19]:
batch_size = 40
train_dataset = TokenData(train = True)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)

test_dataset = TokenData(train = False)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=batch_size)

In [8]:
optimizer = AdamW(model.parameters(), lr=1e-5) # Optimization function
loss_fn = torch.nn.CrossEntropyLoss() # Loss function

In [20]:
num_epochs = 3
total_batches = len(train_loader) + len(test_loader)
start_time = time.time()

for epoch in range(num_epochs):
    print(f"\n🔄 Époque {epoch + 1}/{num_epochs}")

    # Phase d'entraînement
    model.train()
    epoch_train_loss = 0
    start_epoch_time = time.time()

    for i, batch in enumerate(train_loader):
        batch = {k: v.to(device) for k, v in batch.items()}
        optimizer.zero_grad()

        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])
        loss = loss_fn(outputs.logits, batch['labels'])

        loss.backward()
        optimizer.step()

        epoch_train_loss += loss.item()
        elapsed_time = time.time() - start_time
        remaining_time = (elapsed_time / ((epoch * len(train_loader)) + i + 1)) * (total_batches - ((epoch * len(train_loader)) + i + 1))

        print(f"[Train] Batch {i+1}/{len(train_loader)} - Loss: {loss.item():.4f} | Temps écoulé: {elapsed_time:.2f}s | Temps restant estimé: {remaining_time:.2f}s")

    avg_train_loss = epoch_train_loss / len(train_loader)
    print(f"✅ Fin entraînement époque {epoch+1} - Loss moyenne: {avg_train_loss:.4f} | Temps: {time.time() - start_epoch_time:.2f}s")

    # Phase de test
    model.eval()
    epoch_test_loss = 0
    correct = 0
    total_samples = 0

    with torch.no_grad():
        for i, batch in enumerate(test_loader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'])

            loss = loss_fn(outputs.logits, batch['labels'])
            epoch_test_loss += loss.item()

            preds = outputs.logits.argmax(dim=1)
            correct += (preds == batch['labels']).sum().item()
            total_samples += batch['labels'].size(0)

            print(f"[Test] Batch {i+1}/{len(test_loader)} - Loss: {loss.item():.4f}")

    avg_test_loss = epoch_test_loss / len(test_loader)
    accuracy = correct / total_samples

    print(f"📊 Fin test époque {epoch+1} - Loss moyenne: {avg_test_loss:.4f} | Accuracy: {accuracy:.4%}")

print(f"\n⏳ Entraînement terminé en {time.time() - start_time:.2f}s")


🔄 Époque 1/3
[Train] Batch 1/13 - Loss: 0.4811 | Temps écoulé: 98.00s | Temps restant estimé: 3626.08s
[Train] Batch 2/13 - Loss: 0.5525 | Temps écoulé: 330.16s | Temps restant estimé: 5942.89s
[Train] Batch 3/13 - Loss: 0.5506 | Temps écoulé: 450.10s | Temps restant estimé: 5251.21s
[Train] Batch 4/13 - Loss: 0.5424 | Temps écoulé: 571.62s | Temps restant estimé: 4858.80s
[Train] Batch 5/13 - Loss: 0.5408 | Temps écoulé: 645.66s | Temps restant estimé: 4261.38s
[Train] Batch 6/13 - Loss: 0.5122 | Temps écoulé: 699.48s | Temps restant estimé: 3730.58s
[Train] Batch 7/13 - Loss: 0.4910 | Temps écoulé: 753.42s | Temps restant estimé: 3336.58s
[Train] Batch 8/13 - Loss: 0.6180 | Temps écoulé: 829.97s | Temps restant estimé: 3112.37s
[Train] Batch 9/13 - Loss: 0.4531 | Temps écoulé: 900.94s | Temps restant estimé: 2903.01s
[Train] Batch 10/13 - Loss: 0.5713 | Temps écoulé: 975.02s | Temps restant estimé: 2730.05s
[Train] Batch 11/13 - Loss: 0.6009 | Temps écoulé: 1046.65s | Temps restant 

In [43]:
torch.save({
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss_fn': loss_fn,
}, "checkpoint_final.pth")

### Test du modele

In [9]:
checkpoint = torch.load("models/checkpoint_final.pth", weights_only=False)

In [10]:
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
loss_fn = checkpoint['loss_fn']

On réduit le learning rate pour éviter de trop déformer le modèle pré-entrainé.

In [11]:
def preprocess_texts(texts, tokenizer, max_length=512):
    encodings = tokenizer(
        texts, 
        padding="max_length", 
        truncation=True, 
        max_length=max_length, 
        return_tensors="pt"
    )
    return {key: tensor.to(device) for key, tensor in encodings.items()} 

def make_inference(texts):
    inputs = preprocess_texts(texts, tokenizer)
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
    return predictions.cpu().numpy()  

In [16]:
texts = [
    "I love this movie but at the end we can see Aurore see her father that's strange",
    "The plot twist was completely unexpected and brilliant!",
    "Bob is the real villain of the story, and he dies in the end.",
]

type(make_inference(texts)[0])

numpy.int64