In [2]:
from itertools import islice
from pathlib import Path

import pandas as pd
from datasets import load_dataset
from tqdm.auto import tqdm

def DL_oscar_subset(n_subset:int=1e4)->pd.DataFrame:
    """Stream the first ``n_subset`` examples of the French OSCAR corpus.

    Args:
        n_subset: Number of examples to collect. Floats such as ``1e4`` are
            accepted and truncated to an integer; negative values are treated
            as zero.

    Returns:
        A DataFrame with one row per streamed example (empty when
        ``n_subset`` is zero).
    """
    # tqdm's `total` and islice both expect an integer count.
    n_rows = max(int(n_subset), 0)
    dataset_stream = load_dataset("oscar", "unshuffled_deduplicated_fr", split="train", streaming=True)

    # islice stops after exactly n_rows items, so nothing extra is pulled
    # from the remote stream.
    subset = list(tqdm(islice(dataset_stream, n_rows), total=n_rows))
    return pd.DataFrame(subset)

def save_dataset(dataset, name="oscar_subset.csv"):
    """Write ``dataset`` to the CSV file ``name`` without the index column."""
    dataset.to_csv(path_or_buf=name, index=False)


def get_oscar_dataset(n_subset=1e4, name="oscar_subset.csv"):
    """Return the first ``n_subset`` rows of a locally cached OSCAR subset.

    The cache is the CSV file ``name`` resolved against the current working
    directory. It is downloaded when missing and downloaded again when it
    holds fewer rows than requested.

    Args:
        n_subset: Number of rows wanted; floats such as ``1.2e4`` are
            truncated to an integer.
        name: File name (or path) of the CSV cache.

    Returns:
        A DataFrame with at most ``n_subset`` rows.
    """
    n_rows = int(n_subset)
    # One resolved path is used for the existence check, the write and the
    # read, so a non-default `name` is honoured everywhere.
    file_path = Path.cwd() / name

    # Download the subset when there is no cache file yet.
    if not file_path.exists():
        print(f"Le fichier {name} n'existe pas. Téléchargement en cours...")
        subset = DL_oscar_subset(n_rows)
        save_dataset(subset, file_path)
        print("Téléchargement et enregistrement du dataset effectués.")

    # Load the cached dataset.
    dataset = pd.read_csv(file_path)

    # The cache is too small for this request: download again and overwrite it.
    if len(dataset) < n_rows:
        print(f"Le dataset contient moins de {n_subset} exemples. Téléchargement supplémentaire...")
        subset = DL_oscar_subset(n_rows)
        save_dataset(subset, file_path)
        dataset = pd.read_csv(file_path)

    # Return at most n_rows examples.
    return dataset.head(n_rows)

In [3]:
# Number of OSCAR examples to cache and train on. Written as an integer
# because it is a row count (it ends up in len() comparisons, head() and
# progress-bar totals).
n_subset = 12_000
dataset = get_oscar_dataset(n_subset)

In [6]:
import pandas as pd
from transformers import CamembertTokenizerFast
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import numpy as np

In [7]:
# Load the fast tokenizer published under the "camembert-base" checkpoint name;
# the same object is later used to size the model's vocabulary and to encode text.
tokenizer = CamembertTokenizerFast.from_pretrained("camembert-base")

In [8]:
def preprocess_and_mask_data(texts, tokenizer, max_length=128, mlm_probability=0.15):
    """Tokenize ``texts`` and apply random masking for masked-language modelling.

    Every non-special token is selected independently with probability
    ``mlm_probability``. Selected positions are replaced by the tokenizer's
    mask token in the inputs; all other positions get the label ``-100``.

    Args:
        texts: Batch of strings handed to ``tokenizer``.
        tokenizer: Callable tokenizer that returns ``input_ids`` and
            ``attention_mask`` tensors and exposes ``get_special_tokens_mask``,
            ``convert_tokens_to_ids`` and ``mask_token``.
        max_length: Length every sequence is truncated / padded to.
        mlm_probability: Per-token masking probability.

    Returns:
        Tuple ``(input_ids, attention_mask, labels)``.
    """
    encoded = tokenizer(
        texts,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids, attention_mask = encoded['input_ids'], encoded['attention_mask']
    labels = input_ids.clone()

    # Positions the tokenizer reports as special must never be selected.
    is_special = torch.tensor(
        [
            tokenizer.get_special_tokens_mask(row, already_has_special_tokens=True)
            for row in labels.tolist()
        ],
        dtype=torch.bool,
    )
    selection_probs = torch.full(labels.shape, mlm_probability).masked_fill(is_special, 0.0)
    masked_indices = torch.bernoulli(selection_probs).bool()

    # -100 marks the positions that carry no prediction target.
    labels[~masked_indices] = -100

    # Selected positions are overwritten (in place) with the mask token id.
    mask_id = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)
    input_ids[masked_indices] = mask_id

    return input_ids, attention_mask, labels


In [9]:
class OscarDataset(Dataset):
    """Map-style dataset producing one freshly masked example per lookup.

    ``texts`` is indexed with ``.iloc``, so it has to be a pandas Series (or
    another object offering positional ``.iloc`` access). Tokenisation and
    masking run on every ``__getitem__`` call.
    """

    def __init__(self, texts, tokenizer, max_length=128, mlm_probability=0.15):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.mlm_probability = mlm_probability

    def __len__(self):
        """Number of available texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        sample = str(self.texts.iloc[idx])
        # The helper works on batches: wrap the sample in a one-element list
        # and drop the leading dimension of each returned tensor afterwards.
        input_ids, attention_mask, labels = preprocess_and_mask_data(
            [sample], self.tokenizer, self.max_length, self.mlm_probability
        )
        named_tensors = (
            ('input_ids', input_ids),
            ('attention_mask', attention_mask),
            ('labels', labels),
        )
        return {key: tensor.squeeze(0) for key, tensor in named_tensors}


In [10]:
# Raw documents: the 'text' column of the cached OSCAR subset.
texts = dataset['text']

# Default max_length / mlm_probability of OscarDataset are used; batches of 8
# examples are drawn in shuffled order.
train_dataset = OscarDataset(texts=texts, tokenizer=tokenizer)
train_dataloader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)

In [11]:
import torch.nn as nn
from transformers import CamembertConfig


In [44]:
class CamembertForMaskedLM(nn.Module):
    """Small CamemBERT-style masked-language model built on ``nn.TransformerEncoder``.

    The model sums token, learned position and (single-segment) token-type
    embeddings, normalises them, runs a stack of PyTorch encoder layers and
    projects the hidden states to vocabulary logits.

    Args:
        config: Object exposing ``vocab_size``, ``hidden_size``,
            ``pad_token_id``, ``max_position_embeddings``, ``layer_norm_eps``,
            ``hidden_dropout_prob``, ``num_attention_heads``,
            ``intermediate_size``, ``num_hidden_layers`` and
            ``initializer_range``.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        # Embeddings
        self.embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
        self.token_type_embeddings = nn.Embedding(1, config.hidden_size)  # only one segment is used

        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

        # Transformer encoder (sequence-first layout; see the transposes in forward)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=config.hidden_size,
            nhead=config.num_attention_heads,
            dim_feedforward=config.intermediate_size,
            dropout=config.hidden_dropout_prob,
            activation='gelu'
        )
        self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=config.num_hidden_layers)

        # MLM head
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        self._init_weights()

    @property
    def device(self):
        """Device the parameters currently live on (follows ``.to(...)`` calls)."""
        return next(self.parameters()).device

    @classmethod
    def can_generate(cls):
        """Report that this model does not support text generation.

        The original author noted that the fill-mask pipeline at the end of
        the notebook could not run without this hook (not re-verified here).
        Being a classmethod, it answers on both the class and its instances.
        """
        return False

    def _init_weights(self):
        """BERT-style init: N(0, initializer_range) weights, zero biases, identity LayerNorm."""
        std = self.config.initializer_range
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.normal_(module.weight, mean=0.0, std=std)
                if module.bias is not None:
                    nn.init.zeros_(module.bias)
            elif isinstance(module, nn.Embedding):
                nn.init.normal_(module.weight, mean=0.0, std=std)
                # normal_ overwrites the zero row nn.Embedding sets up for
                # padding_idx; restore it so padding embeds to zero.
                if module.padding_idx is not None:
                    with torch.no_grad():
                        module.weight[module.padding_idx].zero_()
            elif isinstance(module, nn.LayerNorm):
                nn.init.zeros_(module.bias)
                nn.init.ones_(module.weight)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Compute vocabulary logits and, when ``labels`` is given, the MLM loss.

        Args:
            input_ids: Token ids; dimension 0 is treated as the batch and
                dimension 1 as the sequence.
            attention_mask: Optional mask with the same layout, truthy for real
                tokens and falsy for padding.
            labels: Optional target ids; positions equal to ``-100`` are
                skipped by the loss (``nn.CrossEntropyLoss`` default
                ``ignore_index``).

        Returns:
            Dict with ``'logits'`` and, if ``labels`` was provided, ``'loss'``.
        """
        # Positions are simply 0..seq_length-1 for every row.
        seq_length = input_ids.size(1)
        position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
        token_type_ids = torch.zeros_like(input_ids, dtype=torch.long, device=input_ids.device)

        inputs_embeds = (
            self.embeddings(input_ids)
            + self.position_embeddings(position_ids)
            + self.token_type_embeddings(token_type_ids)
        )
        hidden_states = self.dropout(self.layer_norm(inputs_embeds))

        # nn.TransformerEncoder expects True where a key must be ignored,
        # i.e. the inverse of the usual attention mask.
        key_padding_mask = None
        if attention_mask is not None:
            key_padding_mask = ~attention_mask.bool()

        # The encoder layers were built sequence-first, so swap batch and
        # sequence dimensions on the way in and out.
        hidden_states = self.encoder(hidden_states.transpose(0, 1), src_key_padding_mask=key_padding_mask)
        hidden_states = hidden_states.transpose(0, 1)

        prediction_scores = self.lm_head(hidden_states)
        output = {'logits': prediction_scores}

        if labels is not None:
            # Flatten batch and sequence dimensions; no shifting is involved
            # because each position predicts its own (masked) token.
            loss_fct = nn.CrossEntropyLoss()
            output['loss'] = loss_fct(
                prediction_scores.reshape(-1, self.config.vocab_size),
                labels.reshape(-1),
            )

        return output


In [13]:
config = CamembertConfig(
    # Sizes taken from the tokenizer so embeddings match the encoded inputs.
    vocab_size=tokenizer.vocab_size,
    max_position_embeddings=tokenizer.model_max_length,
    # Special-token ids.
    pad_token_id=tokenizer.pad_token_id,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    # Reduced architecture: fewer layers for faster training.
    num_hidden_layers=6,
    num_attention_heads=8,
    hidden_size=512,
    intermediate_size=2048,
    hidden_act='gelu',
    # Regularisation and initialisation.
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    initializer_range=0.02,
    layer_norm_eps=1e-12,
)

# Build the custom masked-LM from this configuration.
model = CamembertForMaskedLM(config)




In [14]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Left as the cell's last expression so the module summary is still displayed.
model.to(device)

CamembertForMaskedLM(
  (embeddings): Embedding(32005, 512, padding_idx=1)
  (position_embeddings): Embedding(512, 512)
  (token_type_embeddings): Embedding(1, 512)
  (layer_norm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=512, out_features=512, bias=True)
        )
        (linear1): Linear(in_features=512, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=512, bias=True)
        (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (lm_head): Li

In [15]:
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

# transformers.AdamW is deprecated and missing from recent releases, so the
# PyTorch optimizer is used instead. eps and weight_decay are pinned to the
# defaults of the former transformers implementation so the update rule stays
# the same as before.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3
# One scheduler step is taken per batch, hence batches-per-epoch * epochs.
total_steps = len(train_dataloader) * num_epochs

# Linear warm-up over the first 10% of the steps, then linear decay.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps
)



In [16]:
%%time 
# Training loop: for every epoch, iterate over all batches, optimise the MLM
# loss returned by the model, and print the mean batch loss of the epoch.
from torch.nn.utils import clip_grad_norm_

# Switch the module to training mode before optimising.
model.train()

for epoch in range(num_epochs):
    # Running sum of the per-batch losses of this epoch.
    epoch_loss = 0.0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{num_epochs}'):
        # Clear gradients left over from the previous step.
        optimizer.zero_grad()
        
        # Move the batch tensors to the same device as the model.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Passing labels makes the model return the loss next to the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs['loss']
        
        loss.backward()
        # Rescale gradients so that their global norm is at most 1.0.
        clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        # The learning-rate schedule advances once per batch.
        scheduler.step()
        
        epoch_loss += loss.item()
    
    # Mean of the batch losses over the epoch.
    avg_loss = epoch_loss / len(train_dataloader)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')


Epoch 1/3: 100%|██████████| 1500/1500 [17:36<00:00,  1.42it/s]


Epoch 1/3, Loss: 8.1267


Epoch 2/3: 100%|██████████| 1500/1500 [21:16<00:00,  1.18it/s]


Epoch 2/3, Loss: 7.6863


Epoch 3/3: 100%|██████████| 1500/1500 [18:51<00:00,  1.33it/s]

Epoch 3/3, Loss: 7.5965





In [20]:
model_save_path = './models/camembertPRO_custom_model.pth'
tokenizer_save_path = './models/camembertPRO_custom_tokenizer'

# torch.save does not create missing parent directories, so make sure the
# target folder exists before writing the checkpoint.
Path(model_save_path).parent.mkdir(parents=True, exist_ok=True)

# Only the weights (state dict) are stored; the class and config are needed
# again to rebuild the model when loading.
torch.save(model.state_dict(), model_save_path)
tokenizer.save_pretrained(tokenizer_save_path)

('./models/camembertPRO_custom_tokenizer/tokenizer_config.json',
 './models/camembertPRO_custom_tokenizer/special_tokens_map.json',
 './models/camembertPRO_custom_tokenizer/tokenizer.json')

In [45]:
%%time
from transformers import pipeline
from transformers import CamembertTokenizerFast

model_load = CamembertForMaskedLM(config)
tokenizer_load = CamembertTokenizerFast.from_pretrained(tokenizer_save_path)
model_load.load_state_dict(torch.load(model_save_path))

fill_mask = pipeline(
    "fill-mask",
    model=model_load,
    tokenizer=tokenizer_load,
    device=0 if torch.cuda.is_available() else -1
)

# Test sentence
sentence = "Paris est la capitale de la <mask>."
results = fill_mask(sentence)

for result in results:
    print(f"Prediction: {result['sequence']}, Score: {result['score']:.4f}") 

# ouch, mais techniquement ca marche 


Prediction: Paris est la capitale de la de ., Score: 0.0476
Prediction: Paris est la capitale de la, ., Score: 0.0289
Prediction: Paris est la capitale de la. ., Score: 0.0244
Prediction: Paris est la capitale de la la ., Score: 0.0203
Prediction: Paris est la capitale de las ., Score: 0.0203


  model_load.load_state_dict(torch.load(model_save_path))
