# 0. Testing environnement and import

In [39]:
try:
    import torch
    import pytorch_lightning as pl
    import transformers
    import datasets
    print("Import successful")
except ModuleNotFoundError :
    !./create_conda_env.sh

Import successful


# 1. Import Data

In [184]:
from datasets import load_dataset
from pathlib import Path
import pandas as pd

def DL_oscar_subset(n_subset:int=1e4)->pd.DataFrame:
    dataset_stream = load_dataset("oscar", "unshuffled_deduplicated_fr", split="train", streaming=True)
    
    subset = []
    
    for i, example in tqdm(enumerate(dataset_stream), total=n_subset):
        subset.append(example)
        if i+1 >= n_subset:
            break
    return pd.DataFrame(subset)

def save_dataset(dataset, name="oscar_subset.csv"):
    dataset.to_csv(name, index=False)


def get_oscar_dataset(n_subset=1e4, name="oscar_subset.csv"):
    file_path = Path.cwd() / name
    
    # Si le fichier n'existe pas, on le télécharge
    if not file_path.exists():
        print(f"Le fichier {name} n'existe pas. Téléchargement en cours...")
        subset = DL_oscar_subset(n_subset)
        save_dataset(subset)
        print("Téléchargement et enregistrement du dataset effectués.")
    
    # Chargement du dataset
    dataset = pd.read_csv(name)
    
    if len(dataset) < n_subset:
        print(f"Le dataset contient moins de {n_subset} exemples. Téléchargement supplémentaire...")
        subset = DL_oscar_subset(n_subset)
        save_dataset(subset)
        dataset = pd.read_csv(name)
    
    # Retourne un sous-ensemble du dataset (limité à n_subset exemples)
    return dataset.head(int(n_subset))

In [188]:
n_subset = 1.2e4
dataset = get_oscar_dataset(n_subset)

In [189]:
dataset

Unnamed: 0,id,text
0,0,"Média de débat d'idées, de culture et de litté..."
1,1,24 janv. 2018 Sources de donnéesData Sources. ...
2,2,"Sous-forums: Abstrait, Architecture (intérieur..."
3,3,19h45 : buffet populaire : repas ouverts aux f...
4,4,"Greg est un lycéen introverti, adepte de l’aut..."
...,...,...
11995,11995,Coiffures pour les femmes âgées visent à rendr...
11996,11996,Ce billet a été publié dans Photo et taggé ave...
11997,11997,"John Hurt, d'abord, immense acteur anglais mor..."
11998,11998,"Voici un aperçu de l’atelier balle de gages, u..."


# 2. Create Model

In [119]:
import random

import numpy as np
import pandas as pd
from datasets import Dataset

from tqdm.auto import tqdm

import torch
import torch.nn as nn

from transformers import RobertaConfig
from transformers import CamembertTokenizerFast

In [41]:
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

In [166]:
class CamembertModel(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        # Embedding layers
        self.embeddings = nn.Embedding(self.config.vocab_size, 
                                       self.config.hidden_size)
        self.position_embeddings = nn.Embedding(self.config.max_position_embeddings, 
                                                self.config.hidden_size)
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        
        # Transformer layers
        self.encoder_layers = nn.ModuleList([
            nn.TransformerEncoderLayer(d_model=self.config.hidden_size, 
                                       nhead=self.config.num_attention_heads, 
                                       dim_feedforward=self.config.intermediate_size, 
                                       dropout=self.config.hidden_dropout_prob)
            for _ in range(self.config.num_hidden_layers)
        ])
        
        self.linear = nn.Linear(self.config.hidden_size,
                                self.config.vocab_size)  # Projection to vocab_size

    def forward(self, input_ids):
        # Compute embeddings
        seq_length = input_ids.size(0)
        position_ids = torch.arange(seq_length, device=input_ids.device).unsqueeze(0)
        embeddings = self.embeddings(input_ids) + self.position_embeddings(position_ids)
        embeddings = self.dropout(embeddings)
        
        # Pass through Transformer encoder layers
        hidden_states = embeddings
        for layer in self.encoder_layers:
            hidden_states = layer(hidden_states)
        
        logits = self.linear(hidden_states)  # Output shape: (batch_size, seq_len, vocab_size)
        
        return logits


In [167]:
config = RobertaConfig.from_pretrained("camembert-base")
config

You are using a model of type camembert to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.


RobertaConfig {
  "architectures": [
    "CamembertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 5,
  "classifier_dropout": null,
  "eos_token_id": 6,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 32005
}

In [168]:
model = CamembertModel(config)
model

CamembertModel(
  (embeddings): Embedding(32005, 768)
  (position_embeddings): Embedding(514, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder_layers): ModuleList(
    (0-11): 12 x TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=3072, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
      (linear2): Linear(in_features=3072, out_features=768, bias=True)
      (norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout1): Dropout(p=0.1, inplace=False)
      (dropout2): Dropout(p=0.1, inplace=False)
    )
  )
  (linear): Linear(in_features=768, out_features=32005, bias=True)
)

# 3. Pre-Processe data
## 3.1 preprocesse function

In [81]:
def preprocess_and_mask_data(texts, tokenizer, max_length=128, mlm_probability=0.15):
    inputs = tokenizer(
        texts,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'
    )
    input_ids = inputs['input_ids']
    attention_mask = inputs['attention_mask']
    labels = input_ids.clone()

    # Create a mask
    probability_matrix = torch.full(labels.shape, mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # We only compute loss on masked tokens

    # Replace masked input tokens with tokenizer.mask_token_id
    input_ids[masked_indices] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    return input_ids, attention_mask, labels


# Example preprocessing step
texts = [
    "Bonjour, comment ça va ?", 
    "J'aime apprendre l'intelligence artificielle."
]
processed_data = preprocess_and_mask_data(texts, tokenizer)
np.array(processed_data)[0,:,:20]


array([[    5, 32004,     7,   404,   136,   198,   106,     6,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1],
       [    5,   121,    11,   660,  1891,    17,    11,  6031, 32004,
            9,     6,     1,     1,     1,     1,     1,     1,     1,
            1,     1]])

## 3.2 Dataset & Dataloader

In [126]:
class OscarDataset(Dataset):
    def __init__(self, texts, tokenizer, max_length=128, mlm_probability=0.15):
        self.tokenizer = tokenizer
        self.texts = texts
        self.max_length = max_length
        self.mlm_probability = mlm_probability

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts.iloc[idx])
        input_ids, attention_mask, labels = preprocess_and_mask_data(
            [text], self.tokenizer, self.max_length, self.mlm_probability
        )
        return {
            'input_ids': input_ids.squeeze(0),
            'attention_mask': attention_mask.squeeze(0),
            'labels': labels.squeeze(0)
        }

In [127]:
texts = dataset['text']
sample_texts = texts.sample(frac=1)

p = 0.83334
train_size = int(len(texts)*p)
test_size = len(texts) - train_size

train_texts = sample_texts.head(train_size)
test_texts = sample_texts.tail(test_size)

len(texts), len(train_texts), len(test_texts)

(12000, 10000, 2000)

In [175]:
from torch.utils.data import DataLoader
train_dataset = OscarDataset(train_texts, tokenizer) # dont try to use this :/
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_dataset = OscarDataset(test_texts, tokenizer) # dont try to use this :/
test_dataloader = DataLoader(test_dataset, batch_size=8)

# 4. Training
## 4.1 With pytorch-lightning : Trainer

In [134]:
torch.cuda.is_available() # ouch

False

In [144]:
import pytorch_lightning as pl

class CamembertTrainer(pl.LightningModule):
    def __init__(self, model, tokenizer, learning_rate=1e-4):
        super().__init__()
        self.model = model
        self.tokenizer = tokenizer
        self.loss_fn = nn.CrossEntropyLoss()
        self.learning_rate = learning_rate

    def forward(self, input_ids):
        return self.model(input_ids)

    def training_step(self, batch, batch_idx):
        input_ids, labels = batch["input_ids"], batch["labels"]
        outputs = self.model(input_ids)
        loss = self.loss_fn(outputs.view(-1, config.vocab_size), labels.view(-1))
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.model.parameters(), lr=self.learning_rate)

In [149]:
# Train
%%time 
trainer = pl.Trainer(max_epochs=1)
trainer.fit(CamembertTrainer(model, tokenizer), train_dataloaders=train_dataloader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | model   | CamembertModel   | 134 M  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
134 M     Trainable params
0         Non-trainable params
134 M     Total params
538.564   Total estimated model params size (MB)
127       Modules in train mode
0         Modules in eval mode


Epoch 0: 100%|██████████| 1250/1250 [13:39<00:00,  1.53it/s, v_num=3]

`Trainer.fit` stopped: `max_epochs=1` reached.


Epoch 0: 100%|██████████| 1250/1250 [13:48<00:00,  1.51it/s, v_num=3]


## 4.2 Regular pytorch

In [None]:
%%time
from torch.optim import AdamW
from transformers import RobertaConfig

# Load CamembertModel from the earlier definition
config = RobertaConfig.from_pretrained("camembert-base")
model = CamembertModel(
    vocab_size=config.vocab_size,
    max_position_embeddings=config.max_position_embeddings,
    num_hidden_layers=config.num_hidden_layers,
    hidden_size=config.hidden_size,
    num_attention_heads=config.num_attention_heads,
    intermediate_size=config.intermediate_size,
    hidden_dropout_prob=config.hidden_dropout_prob,
)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)

# Loss function
loss_fn = nn.CrossEntropyLoss()

# Training loop
epochs = 1
model.train()

for epoch in range(epochs):
    epoch_loss = 0.0
    for batch in dataloader:
        optimizer.zero_grad()
        
        # Move inputs and labels to device
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        
        # Forward pass
        outputs = model(input_ids)
        
        # Compute loss
        loss = loss_fn(outputs.view(-1, config.vocab_size), labels.view(-1))
        epoch_loss += loss.item()
        
        # Backward pass
        loss.backward()
        optimizer.step()
    
    print(f"Epoch {epoch + 1} | Loss: {epoch_loss / len(dataloader)}")


# 5. Save model

In [152]:
model_save_path = './models/camembert_custom_model.pth'
tokenizer_save_path = './models/camembert_custom_tokenizer'
 
torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

('./models/camembert_custom_tokenizer/tokenizer_config.json',
 './models/camembert_custom_tokenizer/special_tokens_map.json',
 './models/camembert_custom_tokenizer/tokenizer.json')

# 6. Load & Tets model

In [183]:
model_load = CamembertModel(config) # init

tokenizer = CamembertTokenizerFast.from_pretrained(tokenizer_save_path)
model_load.load_state_dict(torch.load(model_save_path))
model_load.eval()  

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  model_load.load_state_dict(torch.load(model_save_path))


In [178]:
def evaluate_model(model_load, dataloader, device):
    total_loss = 0
    total_examples = 0
    
    for batch in tqdm(dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        with torch.inference_mode(): 
            outputs = model(input_ids)

        loss_fn = torch.nn.CrossEntropyLoss(ignore_index=-100)
        # output xhape (batch_size, seq_len, vocab_size)
        logits = outputs.view(-1, model.config.vocab_size)
        labels = labels.view(-1)
        
        loss = loss_fn(logits, labels)
        total_loss += loss.item() * input_ids.size(0) 
        total_examples += input_ids.size(0) 

    avg_loss = total_loss / total_examples
    return avg_loss

In [180]:
%%time 
avg_loss = evaluate_model(model_load, test_dataloader, device)
print(f"Perte moyenne sur le dataset : {avg_loss}") # 10.51747866821289 ca marche mais bon 1 epoch sur 1000 exemples pour l'entrainement (pas d'erreur)

Evaluating: 100%|██████████| 250/250 [01:29<00:00,  2.80it/s]

Perte moyenne sur le dataset : 10.52037271118164



