In [13]:
# # Text Generation Pipeline with GPT2 on Wikitext-2 (Fixed Version)

# # --- 1. Imports ---
# import torch
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, Trainer, TrainingArguments, TrainerCallback
# from datasets import Dataset
# import logging

# # --- 2. Setup ---
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# # Configure logging
# logging.basicConfig(level=logging.INFO)
# logger = logging.getLogger(__name__)

# # --- 3. Load Pretrained Model and Tokenizer ---
# tokenizer = GPT2Tokenizer.from_pretrained("distilgpt2")
# model = GPT2LMHeadModel.from_pretrained("distilgpt2", torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32)

# # Define a padding token (GPT-2 does not have one by default)
# tokenizer.pad_token = tokenizer.eos_token

# # Move the model to the appropriate device
# model = model.to(device)

# # Enable gradient checkpointing to save memory
# model.gradient_checkpointing_enable()

# # --- 4. Use Predefined Dataset ---
# # Simulated high-quality text dataset to ensure stability
# train_texts = [
#     "Once upon a time, in a land far away, there lived a wise old king.",
#     "The quick brown fox jumps over the lazy dog.",
#     "Artificial intelligence is transforming the world in many ways.",
#     "The rain in Spain stays mainly in the plain.",
#     "In the midst of chaos, there is also opportunity."
# ] * 1000  # Repeat to simulate dataset size

# train_dataset = Dataset.from_dict({"text": train_texts})

# # --- 5. Prepare Dataset ---
# def tokenize_function(examples):
#     return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

# tokenized_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# data_collator = DataCollatorForLanguageModeling(
#     tokenizer=tokenizer, mlm=False
# )

# # --- 6. Training Arguments ---
# training_args = TrainingArguments(
#     output_dir="./textgen_model",
#     overwrite_output_dir=True,
#     num_train_epochs=3,
#     per_device_train_batch_size=1,
#     save_steps=500,
#     save_total_limit=2,
#     prediction_loss_only=True,
#     logging_steps=100,
#     fp16=torch.cuda.is_available(),
#     report_to="none",
#     max_grad_norm=1.0
# )

# # --- 7. Trainer ---
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     data_collator=data_collator,
#     train_dataset=tokenized_dataset
# )

# # Add a callback to log training loss
# class LogCallback(TrainerCallback):
#     def on_log(self, args, state, control, logs=None, **kwargs):
#         if logs is not None:
#             logger.info(f"Step {state.global_step}: {logs}")

# trainer.add_callback(LogCallback())

# # --- 8. Train ---
# trainer.train()

# # --- 9. Test Text Generation ---
# def generate_text(prompt, max_length=100):
#     inputs = tokenizer(prompt, return_tensors="pt").to(device)
#     outputs = model.generate(
#         **inputs,
#         max_length=max_length,
#         num_return_sequences=1,
#         do_sample=True,
#         top_k=50,
#         top_p=0.95
#     )
#     return tokenizer.decode(outputs[0], skip_special_tokens=True)

# # --- 10. Example Usage ---
# prompt = "The future of artificial intelligence"
# print("\n--- Generated Text ---\n")
# print(generate_text(prompt))


# Script d'entraînement GPT-2 avec LoRA sur Wikitext-2
Ce notebook structure le script en cellules pour faciliter la compréhension et l'exécution.

## 1. Importation des bibliothèques nécessaires
On importe les modules essentiels pour le traitement des données, le modèle, l'entraînement et la gestion des configurations.

In [25]:
import os
import torch
import numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    Trainer, 
    TrainingArguments,
    DataCollatorForLanguageModeling,
    set_seed
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from huggingface_hub import notebook_login
from accelerate import Accelerator

## 2. Définition des configurations
On définit les noms des modèles, les chemins de sauvegarde, les hyperparamètres d'entraînement et les paramètres LoRA.

In [26]:
MODEL_NAME = "distilgpt2"
DATASET_NAME = "wikitext"
DATASET_CONFIG = "wikitext-2-raw-v1"
OUTPUT_DIR = "./results"
SEED = 42

TRAIN_CONFIG = {
    "batch_size": 8,
    "learning_rate": 3e-4,
    "num_epochs": 3,
    "max_length": 128,
    "warmup_steps": 100,
}

LORA_CONFIG = {
    "r": 16,
    "lora_alpha": 32,
    "target_modules": ["c_attn", "c_proj"],
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": "CAUSAL_LM",
}

## 3. Fonction principale
Définition de `main()` pour organiser le flux d'exécution de bout en bout.

In [27]:
print(TRAIN_CONFIG)

def main():
    set_seed(SEED)
    accelerator = Accelerator()
    os.makedirs(OUTPUT_DIR, exist_ok=True)

    print("Chargement du dataset...")
    dataset = load_dataset(DATASET_NAME, DATASET_CONFIG)

    print("Chargement du tokenizer...")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    tokenizer.pad_token = tokenizer.eos_token

    print("Prétraitement des données...")
    def preprocess_function(examples, tokenizer, max_length):
        return tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding="max_length",
        )

    tokenized_dataset = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=4,
        remove_columns=dataset["train"].column_names,
        fn_kwargs={"tokenizer": tokenizer, "max_length": TRAIN_CONFIG["max_length"]},
    )

    print("Initialisation du modèle...")
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        torch_dtype=torch.float16,
        device_map="auto"
    )

    print("Application de LoRA pour l'entraînement efficace...")
    model = prepare_model_for_kbit_training(model)
    lora_config = LoraConfig(**LORA_CONFIG)
    model = get_peft_model(model, lora_config)

    print(f"Nombre de paramètres entraînables: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        learning_rate=TRAIN_CONFIG["learning_rate"],
        per_device_train_batch_size=TRAIN_CONFIG["batch_size"],
        per_device_eval_batch_size=TRAIN_CONFIG["batch_size"],
        num_train_epochs=TRAIN_CONFIG["num_epochs"],
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        warmup_steps=TRAIN_CONFIG["warmup_steps"],
        report_to="tensorboard",
        logging_dir=os.path.join(OUTPUT_DIR, "logs"),
        logging_strategy="steps",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        data_collator=data_collator,
    )

    print("Début de l'entraînement...")
    trainer.train()

    print("Sauvegarde du modèle...")
    model.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))
    tokenizer.save_pretrained(os.path.join(OUTPUT_DIR, "final_model"))

    print("\nGénération d'exemples de texte:")
    for _ in range(3):
        prompt = tokenizer.decode(tokenized_dataset["validation"][np.random.randint(0, len(tokenized_dataset["validation"]))]["input_ids"][:10])
        print(f"Prompt: {prompt}")

        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_length=100,
            num_return_sequences=1,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"Texte généré: {generated_text}\n")

{'batch_size': 8, 'learning_rate': 0.0003, 'num_epochs': 3, 'max_length': 128, 'warmup_steps': 100}


## 4. Point d'entrée du script
Cela permet une exécution contrôlée du programme, avec gestion d'interruptions utilisateur.

In [None]:
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print("Entraînement interrompu par l'utilisateur.")
    finally:
        print("Fin du programme.")

Chargement du dataset...
Chargement du tokenizer...
Prétraitement des données...


Map (num_proc=4):   0%|          | 0/4358 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/36718 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/3760 [00:00<?, ? examples/s]

Initialisation du modèle...
Application de LoRA pour l'entraînement efficace...




Nombre de paramètres entraînables: 811008
Début de l'entraînement...


Epoch,Training Loss,Validation Loss
