In [22]:
%pip install -U datasets

%pip install --upgrade pip
%pip install --disable-pip-version-check \
    torch \
    torchdata --quiet

%pip install \
    transformers \
    evaluate \
    rouge_score \
    loralib \
    peft \
    bert_score
%pip install bitsandbytes==0.45.3



In [23]:
from datasets import load_dataset
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    GenerationConfig,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
)
import torch
import time
from evaluate import load
import pandas as pd
import numpy as np
import os
import bitsandbytes
from peft import LoraConfig, get_peft_model, TaskType
import gc

# Clear GPU memory before training
torch.cuda.empty_cache()
gc.collect()
os.environ["WANDB_DISABLED"] = "true"

## Fine tuning de Llama 3 ou Qwen 3 1.7B

Le modèle `flan-t5-base`que nous avons utilisé jusqu'à maintenant est bien pour comprendre les principes mais c'est un modèle ancien aux performances dépassées par rapport aux modèles récents tels que Llama 3.

Dans cet exercice, vous allez charger puis fine tuner un LLM bien plus performant tout en conservant une taille acceptable de 3B de paramètres : Llama 3.2 - 3B. Nous pouvons aussi tester avec Qwen 3 1.7B (https://huggingface.co/Qwen/Qwen3-1.7B).

Afin que le modèle puisse être chargé en VRAM, nous utiliserons une version quantisée en 4bits : https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-bnb-4bit. L'utilisation de la bibliothèque `bitsandbytes`est alors indispensable.

**Redémarrer la session à ce stade pour réinitialiser la RAM et la VRAM**


### Conseils pour réaliser l'exercice :

-   Le modèle n'est plus de type _Encoder Decoder_ (Seq2Seq) mais _Decoder only_ (CausalLM). Effectuer les modifications en conséquence
-   Réduire la taille du jeu de données d'entraînement pour rester dans des temps acceptables (100 exemples)
-   Modifier les arguments d'entraînement (`TrainingArguments`) pour prendre accélérer le traitement : considérer les paramètres `per_device_train_batch_size`, `gradient_accumulation_steps`, `gradient_chekpointing`.

L'exercice peut prendre un certain temps, faites votre maximum et avancer pas à pas.


In [24]:
MODEL_NAME = "unsloth/Llama-3.2-3B-Instruct-bnb-4bit"
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
)

# LLaMA models don't have a pad token - set it to eos_token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Left padding is better for causal LM generation
tokenizer.padding_side = "left"

In [25]:
huggingface_dataset_name = "knkarthick/dialogsum"
dataset = load_dataset(huggingface_dataset_name).filter(
    lambda x, index: index % 100 == 0, with_indices=True
)


def tokenize_function(example):
    start_prompt = "Summarize the following conversation.\n\n"
    end_prompt = "\n\nSummary: "
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example["input_ids"] = tokenizer(
        prompt, padding="max_length", truncation=True, return_tensors="pt"
    ).input_ids
    example["labels"] = tokenizer(
        example["summary"], padding="max_length", truncation=True, return_tensors="pt"
    ).input_ids

    return example


tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(
    [
        "id",
        "topic",
        "dialogue",
        "summary",
    ]
)

In [26]:
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)

# Vérifier le nombre de paramètres entraînables après LoRA plus bas

In [27]:
def build_prompt(dialogue: str) -> str:
    return f"Summarize the following conversation.\n\n{dialogue}\n\nSummary: "


def causal_tokenize(batch):
    input_ids_list, attn_list, labels_list = [], [], []

    # Use RIGHT padding for training (more standard for causal LM training)
    tokenizer.padding_side = "right"

    for dialogue, summary in zip(batch["dialogue"], batch["summary"]):
        prompt = build_prompt(dialogue)
        # Add EOS token at the end of summary
        full_text = prompt + summary + tokenizer.eos_token

        tok_full = tokenizer(
            full_text,
            padding="max_length",
            truncation=True,
            max_length=512,  # Back to 512 for better context
            add_special_tokens=True,
        )

        # Tokenize prompt separately to find where summary starts
        prompt_tok = tokenizer(
            prompt,
            truncation=True,
            max_length=512,
            add_special_tokens=False,  # Don't add BOS here since full_text already has it
        )
        prompt_len = len(prompt_tok["input_ids"]) + 1  # +1 for BOS token

        labels = tok_full["input_ids"].copy()
        # With RIGHT padding: mask prompt tokens at the start, and padding at the end
        for i in range(len(labels)):
            if i < prompt_len or tok_full["attention_mask"][i] == 0:
                labels[i] = -100

        input_ids_list.append(tok_full["input_ids"])
        attn_list.append(tok_full["attention_mask"])
        labels_list.append(labels)

    return {
        "input_ids": input_ids_list,
        "attention_mask": attn_list,
        "labels": labels_list,
    }


causal_dataset = dataset.map(
    causal_tokenize, batched=True, remove_columns=dataset["train"].column_names
)
# Use all available training data
train_causal = causal_dataset["train"]
print(f"Training samples: {len(train_causal)}")

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Training samples: 125


In [28]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"


lora_config = LoraConfig(
    r=16,  # Reduced from 32 to save memory
    lora_alpha=32,  # 2x rank
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
    ],  # Reduced target modules to save memory
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)


peft_model = get_peft_model(model, lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 9175040
all model parameters: 1812638720
percentage of trainable model parameters: 0.51%


In [29]:
# Clear GPU memory before training
torch.cuda.empty_cache()
gc.collect()

# Enable gradient checkpointing with use_reentrant=False for PEFT compatibility
peft_model.gradient_checkpointing_enable(
    gradient_checkpointing_kwargs={"use_reentrant": False}
)

output_dir = "./training-output"

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,  # Higher learning rate for LoRA
    num_train_epochs=5,  # More epochs
    logging_steps=5,
    warmup_ratio=0.1,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    weight_decay=0.01,
    report_to="none",
    save_strategy="no",
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    max_grad_norm=1.0,
)

trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=train_causal,
)

# Lancement de l'entraînement
trainer.train()

Step,Training Loss
5,1.5544
10,1.3919
15,1.2847
20,1.2468
25,1.0274
30,1.0048
35,1.1339
40,0.8838
45,1.0039
50,0.8516


TrainOutput(global_step=160, training_loss=0.6915067724883557, metrics={'train_runtime': 649.4924, 'train_samples_per_second': 0.962, 'train_steps_per_second': 0.246, 'total_flos': 5429611069440000.0, 'train_loss': 0.6915067724883557, 'epoch': 5.0})

In [30]:
SAVE_DIR = "training-output"
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR)
print(f"Sauvegardé dans {SAVE_DIR}")

Sauvegardé dans training-output


In [31]:
from peft import PeftModel

# Set back to left padding for generation
tokenizer.padding_side = "left"

original_model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="auto",
    trust_remote_code=True,
)
peft_llama = PeftModel.from_pretrained(
    original_model,
    SAVE_DIR,
    is_trainable=False,
)

test_idx = 0
test_dialogue = dataset["test"][test_idx]["dialogue"]
test_summary = dataset["test"][test_idx]["summary"]

test_prompt = build_prompt(test_dialogue)
inputs = tokenizer(test_prompt, return_tensors="pt").to(peft_llama.device)

outputs = peft_llama.generate(
    **inputs,
    generation_config=GenerationConfig(
        max_new_tokens=128, temperature=0.7, do_sample=True
    ),
)
# Decode only the NEW tokens (not the prompt)
generated_text = tokenizer.decode(
    outputs[0][inputs["input_ids"].shape[1] :], skip_special_tokens=True
)

print("--- Prompt ---")
print(test_prompt[:400] + "...\n")
print("--- Réponse PEFT ---")
print(generated_text)
print("--- Référence humaine ---")
print(test_summary)

--- Prompt ---
Summarize the following conversation.

#Person1#: Ms. Dawson, I need you to take a dictation for me.
#Person2#: Yes, sir...
#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?
#Person2#: Yes, sir. Go ahead.
#Person1#: Attention all staff... Effective immediately, all office communications are restricted to email correspondence and official...

--- Réponse PEFT ---
 #Person1# asks Ms. Dawson to take dictation for an intra-office memorandum. #Person1# introduces the policy that all office communications are restricted to email and official memos and those who use Instant Messaging will face termination.
--- Référence humaine ---
Ms. Dawson helps #Person1# to write a memo to inform every employee that they have to change the communication method and should not use Instant Messaging anymore.


In [32]:
from tqdm import tqdm

device = "cuda" if torch.cuda.is_available() else "cpu"

dialogues = dataset["test"][0:10]["dialogue"]
human_baseline_summaries = dataset["test"][0:10]["summary"]

original_model_summaries = []
peft_model_summaries = []

for idx, dialogue in tqdm(enumerate(dialogues)):
    prompt = build_prompt(dialogue)
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate with original model - use consistent generation config
    gen_config = GenerationConfig(
        max_new_tokens=150,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=tokenizer.eos_token_id,
        do_sample=False,  # Greedy for fair comparison
    )

    original_model_outputs = original_model.to(device).generate(
        **inputs,
        generation_config=gen_config,
    )
    original_model_text_output = tokenizer.decode(
        original_model_outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    # Generate with PEFT model
    peft_model_outputs = peft_llama.to(device).generate(
        **inputs,
        generation_config=gen_config,
    )
    peft_model_text_output = tokenizer.decode(
        peft_model_outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)


zipped_summaries = list(
    zip(human_baseline_summaries, original_model_summaries, peft_model_summaries)
)

df = pd.DataFrame(
    zipped_summaries,
    columns=[
        "human_baseline_summaries",
        "original_model_summaries",
        "peft_model_summaries",
    ],
)
df

10it [01:20,  8.08s/it]


Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,Ms. Dawson helps #Person1# to write a memo to ...,#Person1# needs Ms. Dawson to take a dictation...,#Person1# asks Ms. Dawson to take dictation fo...
1,#Person1# and Mike have a disagreement on how ...,#Person1# and #Person2# are doing an experimen...,#Person1# and #Person2# are doing a scene trai...
2,#Person1# teaches #Person2# how to upgrade sof...,#Person1# suggests #Person2# to upgrade the sy...,#Person1# recommends #Person2# to upgrade the ...
3,#Person1# is crazy for Trump and voted for him...,"#Person2# is supporting Trump, while #Person1#...","#Person1# thinks Trump isn't the right person,..."
4,#Person1# and #Person2# are talking about the ...,#Person1# and #Person2# talk about the heavy s...,#Person1# and #Person2# talk about the heavy s...
5,#Person2# tells David the plan for a tour and ...,#Person2# is making plans for a tour with #Per...,#Person2# is making plans for a tour with #Per...
6,It's #Person1#'s first time to China and #Pers...,#Person1# is starving and #Person2# recommends...,#Person1# is starving and wants to try some re...
7,#Person2# thinks #Person1#'s new suit is not w...,#Person2# thinks #Person1#'s new suit is not a...,#Person1# thinks #Person1#'s new suit is not a...
8,#Person2# tells #Person1# about the relationsh...,#Person2# tells #Person1# about his family in ...,"#Person2# tells #Person1# about Bill, his wife..."
9,#Person1# asks #Person2# to scedule an emergen...,#Person1# and #Person2# need to call an emerge...,#Person1# and #Person2# need to call an emerge...


In [33]:
from bert_score import score as bert_score


def compute_bertscore(df, column_names):
    """Compute BERTScore for the specified columns in the DataFrame."""
    bert_scores = {}
    references = df["human_baseline_summaries"].tolist()

    for col in column_names:
        predictions = df[col].tolist()
        P, R, F1 = bert_score(predictions, references, lang="en", verbose=True)
        bert_scores[col] = {
            "precision": P.mean().item(),
            "recall": R.mean().item(),
            "f1": F1.mean().item(),
        }
    return bert_scores


bert_scores = compute_bertscore(
    df,
    [
        "original_model_summaries",
        "peft_model_summaries",
    ],
)
print(pd.DataFrame(bert_scores))

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.22 seconds, 44.52 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.21 seconds, 47.25 sentences/sec
           original_model_summaries  peft_model_summaries
precision                  0.914271              0.918608
recall                     0.915814              0.914797
f1                         0.914913              0.916576


In [34]:
def calc_percentage_difference(scores, col_names):
    """Calculate the percentage difference in BERTScore between two columns."""
    percentage_differences = {}
    col1, col2 = col_names
    for metric in scores[col1].keys():
        score1 = scores[col1][metric]
        score2 = scores[col2][metric]
        percentage_diff = (
            ((score2 - score1) / score1) * 100 if score1 != 0 else float("inf")
        )
        percentage_differences[metric] = percentage_diff
    return percentage_differences


diff_scores = calc_percentage_difference(
    bert_scores,
    [
        "original_model_summaries",
        "peft_model_summaries",
    ],
)

print(
    pd.DataFrame.from_dict(
        diff_scores,
        orient="index",
        columns=[
            "difference_percentage",
        ],
    )
)

           difference_percentage
precision               0.474316
recall                 -0.111020
f1                      0.181750
