In [1]:
%pip install -U datasets

%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch \
    torchdata --quiet

%pip install \
    transformers \
    evaluate \
    rouge_score \
    loralib \
    peft
%pip install bitsandbytes==0.45.3



In [2]:
from datasets import load_dataset
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
import torch
import time
from evaluate import load
import pandas as pd
import numpy as np
import os
import bitsandbytes
from peft import LoraConfig, get_peft_model, TaskType

os.environ["WANDB_DISABLED"] = "true"

## Fine tuning de Llama 3 ou Qwen 3 1.7B

Le modèle `flan-t5-base`que nous avons utilisé jusqu'à maintenant est bien pour comprendre les principes mais c'est un modèle ancien aux performances dépassées par rapport aux modèles récents tels que Llama 3.

Dans cet exercice, vous allez charger puis fine tuner un LLM bien plus performant tout en conservant une taille acceptable de 3B de paramètres : Llama 3.2 - 3B. Nous pouvons aussi tester avec Qwen 3 1.7B (https://huggingface.co/Qwen/Qwen3-1.7B).

Afin que le modèle puisse être chargé en VRAM, nous utiliserons une version quantisée en 4bits : https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-bnb-4bit. L'utilisation de la bibliothèque `bitsandbytes`est alors indispensable.

**Redémarrer la session à ce stade pour réinitialiser la RAM et la VRAM**

### Conseils pour réaliser l'exercice : 

- Le modèle n'est plus de type _Encoder Decoder_ (Seq2Seq) mais _Decoder only_ (CausalLM). Effectuer les modifications en conséquence
- Réduire la taille du jeu de données d'entraînement pour rester dans des temps acceptables (100 exemples)
- Modifier les arguments d'entraînement (`TrainingArguments`) pour prendre accélérer le traitement : considérer les paramètres `per_device_train_batch_size`, `gradient_accumulation_steps`, `gradient_chekpointing`.

L'exercice peut prendre un certain temps, faites votre maximum et avancer pas à pas.

In [3]:
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [4]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name).filter(
    lambda x, index: index % 100 == 0, with_indices=True
)


def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example["input_ids"] = tokenizer(
        prompt, padding="max_length", truncation=True, return_tensors="pt"
    ).input_ids
    example["labels"] = tokenizer(
        example["summary"], padding="max_length", truncation=True, return_tensors="pt"
    ).input_ids

    return example


tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    [
        "id",
        "topic",
        "dialogue",
        "summary",
    ]
)

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [5]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)

# Vérifier le nombre de paramètres entraînables après LoRA plus bas

In [6]:
def build_prompt(dialogue: str) -> str:
    return f"Summarize the following conversation.\n\n{dialogue}\n\nSummary: "


def causal_tokenize(batch):
    input_ids_list, attn_list, labels_list = [], [], []
    for dialogue, summary in zip(batch["dialogue"], batch["summary"]):
        prompt = build_prompt(dialogue)
        full_text = prompt + summary
        tok_full = tokenizer(
            full_text,
            padding="max_length",
            truncation=True,
            max_length=512,
        )
        prompt_ids = tokenizer(
            prompt,
            truncation=True,
            max_length=512,
        )["input_ids"]

        labels = tok_full["input_ids"].copy()
        # Masquer les tokens du prompt pour ne pas les compter dans la loss
        labels[: len(prompt_ids)] = [-100] * len(prompt_ids)

        input_ids_list.append(tok_full["input_ids"])
        attn_list.append(tok_full["attention_mask"])
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attn_list,
        "labels": labels_list,
    }


causal_dataset = dataset.map(
    causal_tokenize, batched=True, remove_columns=dataset["train"].column_names
)
train_causal = causal_dataset["train"].select(range(100))

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [7]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)


peft_model = get_peft_model(model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 24313856
all model parameters: 1827777536
percentage of trainable model parameters: 1.33%


In [10]:
output_dir = "./training-output"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=20,
    report_to="none",
)

trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_causal,
)

# Lancement de l'entraînement (adapter max_steps / epochs si plus de temps)
trainer.train()

Step,Training Loss
1,8.1823


Step,Training Loss
1,9.1161
2,2.7848
3,5.0627
4,5.0429
5,4.8979
6,3.6223
7,2.6204
8,4.1896
9,3.5352
10,4.0297


TrainOutput(global_step=50, training_loss=3.530032465457916, metrics={'train_runtime': 316.8188, 'train_samples_per_second': 0.316, 'train_steps_per_second': 0.158, 'total_flos': 873388415385600.0, 'train_loss': 3.530032465457916, 'epoch': 1.0})

In [13]:
SAVE_DIR = "training-output"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"Sauvegardé dans {SAVE_DIR}")

Sauvegardé dans training-output


In [18]:
from peft import PeftModel

original_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)
peft_llama = PeftModel.from_pretrained(
    original_model,
    SAVE_DIR,
    is_trainable=False,
)

test_idx = 0
test_dialogue = dataset["test"][test_idx]["dialogue"]
test_summary = dataset["test"][test_idx]["summary"]

test_prompt = build_prompt(test_dialogue)
inputs = tokenizer(test_prompt, return_tensors="pt").to(peft_llama.device)

outputs = peft_llama.generate(
    **inputs,
    generation_config=GenerationConfig(max_new_tokens=128, temperature=0.7),
)
print("--- Prompt ---")
print(test_prompt[:400] + "...\n")
print("--- Réponse PEFT ---")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
print("--- Référence humaine ---")
print(test_summary)

--- Prompt ---
Summarize the following conversation.

#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official...

--- Réponse PEFT ---
Summarize the following conversation.

#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official memos. The use of Instant Message programs by employees during working hours is strictly prohibited.
#Person2#: Sir, does this apply to intra-office communica

In [None]:
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

dialogues = dataset["test"][0:10]["dialogue"]
human_baseline_summaries = dataset["test"][0:10]["summary"]

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, dialogue in tqdm(enumerate(dialogues)):
    prompt = f"""
Summarize the following conversation.

{dialogue}

Summary: """
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

    human_baseline_text_output = human_baseline_summaries[idx]

    original_model_outputs = original_model.to(device).generate(
        input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200)
    )
    original_model_text_output = tokenizer.decode(
        original_model_outputs[0], skip_special_tokens=True
    )

    peft_model_outputs = peft_model.to(device).generate(
        input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200)
    )
    peft_model_text_output = tokenizer.decode(
        peft_model_outputs[0], skip_special_tokens=True
    )
    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(
    zip(human_baseline_summaries, original_model_summaries, peft_model_summaries)
)

df = pd.DataFrame(
    zipped_summaries,
    columns=[
        "human_baseline_summaries",
        "original_model_summaries",
        "peft_model_summaries",
    ],
)

`generation_config` default values have been modified to match model-specific defaults: {'max_length': 131072, 'do_sample': True, 'temperature': 0.6, 'top_p': 0.9, 'pad_token_id': 128004, 'bos_token_id': 128000, 'eos_token_id': [128001, 128008, 128009]}. If this is not desired, please set these values explicitly.


In [None]:
from evaluate import load

rouge = load("rouge")


def compute_rouge_scores(df, column_names):
    """Compute ROUGE scores for the specified columns in the DataFrame."""
    rouge_scores = {}
    for col in column_names:
        scores = rouge.compute(
            predictions=df[col].tolist(),
            references=df["human_baseline_summaries"].tolist(),
        )
        rouge_scores[col] = scores
    return rouge_scores


rouge_scores = compute_rouge_scores(
    df,
    [
        "original_model_summaries",
        "peft_model_summaries",
    ],
)
print(pd.DataFrame(rouge_scores))

In [None]:
def calc_percentage_difference(scores, col_names):
    """Calculate the percentage difference in ROUGE scores between two columns."""
    percentage_differences = {}
    col1, col2 = col_names
    for rouge_type in scores[col1].keys():
        score1 = scores[col1][rouge_type]
        score2 = scores[col2][rouge_type]
        percentage_diff = (
            ((score2 - score1) / score1) * 100 if score1 != 0 else float("inf")
        )
        percentage_differences[rouge_type] = percentage_diff
    return percentage_differences


diff_scores = calc_percentage_difference(
    rouge_scores,
    [
        "original_model_summaries",
        "peft_model_summaries",
    ],
)

print(
    pd.DataFrame.from_dict(
        diff_scores,
        orient="index",
        columns=[
            "difference_percentage",
        ],
    )
)