

1.   Dhimen Ayemane
2.   EL ANSARI Mostapha



In [1]:
import torch
import numpy as np
import random
import matplotlib.pyplot as plt
import math
import re


from transformers import BertTokenizer, BertForMaskedLM, DataCollatorForLanguageModeling,Trainer,TrainingArguments
from datasets import load_dataset
from tqdm import tqdm
from typing import Dict, List
from torch.optim import AdamW

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

print(f"Device utilisé: {device}")
print(f"CUDA disponible: {torch.cuda.is_available()}")

Device utilisé: cuda
CUDA disponible: True


# 1. Chargement et exploration du corpus

In [3]:
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Generating test split: 100%|██████████| 435

In [4]:
print(f"\nStructure du dataset:")
print(dataset)
print(f"\nTrain size: {len(dataset['train'])}")
print(f"Validation size: {len(dataset['validation'])}")
print(f"Test size: {len(dataset['test'])}")


Structure du dataset:
DatasetDict({
    test: Dataset({
        features: ['text'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['text'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['text'],
        num_rows: 3760
    })
})

Train size: 36718
Validation size: 3760
Test size: 4358


In [5]:
print("Premier exemple:")
print(dataset['train']['text'][:100])

Premier exemple:
['', ' = Valkyria Chronicles III = \n', '', ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n', " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making th

**Nettoyage des données**

In [6]:
def extract_and_clean_texts(split_dataset, max_samples=10000):
    texts = split_dataset['text'][:max_samples]
    cleaned = []
    for text in texts:
        if text.strip():  # Ignorer lignes vides
            text = re.sub(r'<unk>', '', text)
            text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
            text = re.sub(r'\s+', ' ', text).strip()
            if len(text.split()) > 1:
                cleaned.append(text)
    return cleaned

In [7]:
train_texts = extract_and_clean_texts(dataset['train'], max_samples=10000)
val_texts = extract_and_clean_texts(dataset['validation'], max_samples=2000)

In [8]:
print(f"Textes nettoyés - Train: {len(train_texts)}")
print(f"Textes nettoyés - Validation: {len(val_texts)}")

Textes nettoyés - Train: 5811
Textes nettoyés - Validation: 1172


# 2. Tokenisation

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [None]:
max_length = 128

def tokenize_texts(texts, max_len=128):
    all_input_ids = []

    for text in tqdm(texts, desc="Tokenisation"):
        encoded = tokenizer.encode(
            text,
            add_special_tokens=True,
            max_length=max_len,
            truncation=True,
            padding='max_length'
        )
        all_input_ids.append(encoded)

    return torch.tensor(all_input_ids)

In [None]:
train_input_ids = tokenize_texts(train_texts, max_length)
val_input_ids = tokenize_texts(val_texts, max_length)

In [None]:
print(f"Shape train_input_ids: {train_input_ids.shape}")
print(f"Shape val_input_ids: {val_input_ids.shape}")

# 3. Préparation du dataset

In [None]:
class MLMDataset(torch.utils.data.Dataset):
    """Dataset pour Masked Language Modeling"""
    def __init__(self, input_ids):
        self.input_ids = input_ids

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': (self.input_ids[idx] != tokenizer.pad_token_id).long()
        }

In [None]:
train_dataset = MLMDataset(train_input_ids)
val_dataset = MLMDataset(val_input_ids)

In [None]:
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15
)


# 4. Entraînement du modèle

In [None]:
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model.to(device)

In [None]:
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

In [None]:
print(f"Nombre total de paramètres: {total_params:,}")
print(f"Paramètres entraînables: {trainable_params:,}")

In [None]:
training_args = TrainingArguments(
    output_dir="./bert-mlm-wikitext2",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir='./logs',
    logging_steps=100,
    eval_strategy="steps",
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    seed=seed,
    fp16=torch.cuda.is_available(),
    report_to="none"
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
)


In [None]:
train_result = trainer.train()

In [None]:
print(f"Loss finale (train): {train_result.training_loss:.4f}")
print(f"Temps total: {train_result.metrics['train_runtime']:.2f}s")
print(f"Samples/seconde: {train_result.metrics['train_samples_per_second']:.2f}")


In [None]:
trainer.save_model("./bert-mlm-final")
tokenizer.save_pretrained("./bert-mlm-final")
print("\nModèle sauvegardé dans ./bert-mlm-final")

# 5. Évaluation

In [None]:
eval_results = trainer.evaluate()
print(f"Loss (validation): {eval_results['eval_loss']:.4f}")

In [None]:
perplexity = math.exp(eval_results['eval_loss'])
print(f"Perplexité: {perplexity:.2f}")

print("\nInterprétation de la perplexité:")
if perplexity < 20:
    print("  Excellente performance")
elif perplexity < 30:
    print("  Bonne performance")
elif perplexity < 50:
    print("  Performance correcte")
else:
    print("  Performance à améliorer")

# Courbes d'entraînement et d'évaluation

In [None]:
log_history = trainer.state.log_history

# Extraire les données
train_losses = []
train_steps = []
eval_losses = []
eval_steps = []

for log in log_history:
    if 'loss' in log:
        train_losses.append(log['loss'])
        train_steps.append(log['step'])
    if 'eval_loss' in log:
        eval_losses.append(log['eval_loss'])
        eval_steps.append(log['step'])

In [None]:
# Créer les graphiques
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# Loss d'entraînement
axes[0].plot(train_steps, train_losses, 'b-', linewidth=2, label='Train Loss')
axes[0].set_xlabel('Steps', fontsize=12)
axes[0].set_ylabel('Loss', fontsize=12)
axes[0].set_title('Training Loss', fontsize=14, fontweight='bold')
axes[0].legend(fontsize=10)
axes[0].grid(True, alpha=0.3)

# Loss de validation
axes[1].plot(eval_steps, eval_losses, 'r-', linewidth=2, label='Validation Loss')
axes[1].set_xlabel('Steps', fontsize=12)
axes[1].set_ylabel('Loss', fontsize=12)
axes[1].set_title('Validation Loss', fontsize=14, fontweight='bold')
axes[1].legend(fontsize=10)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.savefig('training_curves.png', dpi=300, bbox_inches='tight')
plt.show()

print("Courbes sauvegardées dans 'training_curves.png'")

# 6. Expérimentation : Test sur phrases masquées

In [None]:
def predict_masked_word(text: str, top_k: int = 5):
    """Prédit les mots masqués dans une phrase"""
    inputs = tokenizer(text, return_tensors="pt").to(device)

    mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    results = []
    for idx in mask_token_index:
        mask_token_logits = logits[0, idx, :]
        top_tokens = torch.topk(mask_token_logits, top_k, dim=0)

        predictions = []
        for token_id, score in zip(top_tokens.indices, top_tokens.values):
            token = tokenizer.decode([token_id])
            predictions.append((token, score.item()))

        results.append(predictions)

    return results

test_sentences = [
    "The capital of France is [MASK].",
    "I love to play [MASK] in the park.",
    "The [MASK] is shining brightly today.",
    "Albert Einstein was a famous [MASK].",
    "She went to the [MASK] to buy some groceries.",
]

print("\nPrédictions sur des phrases de test:\n")
for sentence in test_sentences:
    print(f"Phrase: {sentence}")
    predictions = predict_masked_word(sentence)

    for i, preds in enumerate(predictions):
        print(f"  Top 5 prédictions pour [MASK] #{i+1}:")
        for token, score in preds:
            print(f"    - {token.strip():15s} (score: {score:.2f})")
    print()

complex_sentences = [
    "The [MASK] of the United States is Washington.",
    "Machine learning is a subset of artificial [MASK].",
    "Water boils at [MASK] degrees Celsius.",
]

print("Phrases complexes:\n")
for sentence in complex_sentences:
    print(f"Phrase: {sentence}")
    predictions = predict_masked_word(sentence, top_k=3)

    for i, preds in enumerate(predictions):
        print(f"  Prédictions:")
        for token, score in preds:
            print(f"    - {token.strip():15s} (score: {score:.2f})")
    print()

# Analyse des performances

- **Perte** : Loss finale (train) : `2.4524`, Loss (validation) : `2.2140`.  
  La perte diminue régulièrement pendant l'entraînement, indiquant une bonne convergence.

- **Perplexité** : `9.15` sur l’ensemble de validation.  
  Cela indique une excellente performance, car une perplexité basse signifie que le modèle est confiant dans ses prédictions et capture bien le contexte linguistique.

- **Temps d'entraînement** : `235.98s`, avec `73.88 samples/seconde`.

---

# Interprétation qualitative des prédictions

Les prédictions montrent que le modèle capture bien le contexte bidirectionnel de **BERT**. Par exemple :

- Pour *"The capital of France is [MASK]."*, le modèle prédit **'paris'** avec un score élevé, démontrant une compréhension factuelle.
- Pour des phrases complexes comme *"Machine learning is a subset of artificial [MASK]."*, il propose **'intelligence'**, ce qui est correct et montre une bonne généralisation.

Cependant, sur des termes rares ou ambigus, les prédictions secondaires peuvent être moins précises, soulignant l'importance d'un corpus plus large pour améliorer la robustesse.  
Cela illustre la force de **BERT** par rapport aux modèles unidirectionnels, en utilisant le contexte complet pour des prédictions plus précises.
