# üê≥ Install and importlibraries like unsloth and evaluate

In [None]:
!pip install evaluate unsloth rouge_score

In [1]:
from datasets import load_dataset, Dataset, DatasetDict
import evaluate
from evaluate import load
import torch
import pandas as pd
from tqdm import tqdm
from unsloth import FastLanguageModel


ü¶• Unsloth Zoo will now patch everything to make training faster!


# üê≥ Import the cnn_dailymail dataset

In [3]:
ds = load_dataset("abisee/cnn_dailymail", '3.0.0')
device = "cuda" if torch.cuda.is_available() else "cpu"
random_seed = 42
train_dataset = ds['train']
val_dataset = ds['validation']
test_dataset = ds['test']
train_dataset = train_dataset.select(range(10000))
val_dataset = val_dataset.select(range(100))
test_dataset = test_dataset.select(range(100))



README.md: 0.00B [00:00, ?B/s]

3.0.0/train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

3.0.0/train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

3.0.0/validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

3.0.0/test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

# üê≥ We will use a small language model for the summarization task (Qwen 0.5B)

## üêùWe use an Instruct model as it yields better and faster results after finetuning since it already have chat capabilities.

In [14]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2.5-0.5B-Instruct",
    max_seq_length = 32768,
    load_in_4bit = True,
    load_in_8bit = False,
    full_finetuning = False)

==((====))==  Unsloth 2025.12.9: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## üêù Here we define the chat template of the Qwen model

In [6]:
system_prompt = "You are a helpful assistant specialized in summarization."
#formatting function for the train and val dataset
def formatting_func(example):
    messages = [
        {"role": "system", "content": f"{system_prompt}"},
        {"role": "user", "content": "Summarize the following text:\n\n" + example["article"]},
        {"role": "assistant", "content": example["highlights"]},
    ]

    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False)
    return {"text": text}

# formatting function for the test dataset
# note here that both methods are correct, either as a dictionnary or a direct formatting of the text
def format_test_example(example):
    text = (
        "<|im_start|>system\n"
        f"{system_prompt}<|im_end|>\n"
        "<|im_start|>user\n"
        "Summarize the following text:\n\n"
        f"{example['article']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    return {"text": text}

train_dataset = train_dataset.map(formatting_func)
val_dataset = val_dataset.map(formatting_func,)
test_dataset = test_dataset.map(format_test_example,)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

## üêù Example of a training element and a testing element

In [None]:
#train element
print(train_dataset[0]['text'])
#test element
print(test_dataset[0]['text'])

<|im_start|>system
You are a helpful assistant specialized in summarization.<|im_end|>
<|im_start|>user
Summarize the following text:

LONDON, England (Reuters) -- Harry Potter star Daniel Radcliffe gains access to a reported ¬£20 million ($41.1 million) fortune as he turns 18 on Monday, but he insists the money won't cast a spell on him. Daniel Radcliffe as Harry Potter in "Harry Potter and the Order of the Phoenix" To the disappointment of gossip columnists around the world, the young actor says he has no plans to fritter his cash away on fast cars, drink and celebrity parties. "I don't plan to be one of those people who, as soon as they turn 18, suddenly buy themselves a massive sports car collection or something similar," he told an Australian interviewer earlier this month. "I don't think I'll be particularly extravagant. "The things I like buying are things that cost about 10 pounds -- books and CDs and DVDs." At 18, Radcliffe will be able to gamble in a casino, buy a drink in a 

## üêù Let's first evaluate the capabilities of the model on the test dataset using rouge score.

In [None]:
%%time
from transformers import set_seed
seed=42
set_seed(seed)
prompts = test_dataset['text']
human_baseline_summaries = test_dataset['highlights']
model_summaries = []
model.eval()
for idx, prompt in enumerate(prompts):
    print(f'Now summarizing article number {idx+1}')
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    model_result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    model_output = model_result[0].split('assistant')[-1].replace('\n','',1)
    model_summaries.append(model_output)
zipped_summaries = list(zip(human_baseline_summaries, model_summaries))
df_initial = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'model_summaries'])
print(df_initial)

Now summarizing article number 1
Now summarizing article number 2
Now summarizing article number 3
Now summarizing article number 4
Now summarizing article number 5
Now summarizing article number 6
Now summarizing article number 7
Now summarizing article number 8
Now summarizing article number 9
Now summarizing article number 10
Now summarizing article number 11
Now summarizing article number 12
Now summarizing article number 13
Now summarizing article number 14
Now summarizing article number 15
Now summarizing article number 16
Now summarizing article number 17
Now summarizing article number 18
Now summarizing article number 19
Now summarizing article number 20
Now summarizing article number 21
Now summarizing article number 22
Now summarizing article number 23
Now summarizing article number 24
Now summarizing article number 25
Now summarizing article number 26
Now summarizing article number 27
Now summarizing article number 28
Now summarizing article number 29
Now summarizing article

In [None]:
rouge = evaluate.load('rouge')
score = rouge.compute(predictions=df['model_summaries'].to_list(), references=df['human_baseline_summaries'].to_list())
score = pd.DataFrame(score, index=['score'])
score

Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
score,0.239075,0.069989,0.171147,0.195998


# üê≥ Now, we finetune the model using Lora and supervised finetuning for trl

In [None]:
# Add lora adapaters
model = FastLanguageModel.get_peft_model(
    model,
    r = 32,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 32,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

Unsloth 2025.12.9 patched 24 layers with 24 QKV layers, 24 O layers and 24 MLP layers.


In [None]:
from trl import SFTTrainer, SFTConfig

# Use supervised finetuning
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 8,
        gradient_accumulation_steps = 4,
        warmup_steps = 5,
        num_train_epochs = 1, # increase for more training
        learning_rate = 2e-4,
        logging_steps = 10,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none",
        eval_strategy="steps",   # evaluate every eval_steps
        eval_steps=5,
        save_strategy="steps",         # save every save_steps
        save_steps=5,
        save_total_limit=3,            # keep only last 3 checkpoints
        load_best_model_at_end=True,   # automatically load best model after training
        metric_for_best_model="eval_loss",
    ),
    )

Unsloth: Tokenizing ["text"] (num_proc=4):   0%|          | 0/10000 [00:00<?, ? examples/s]

ü¶• Unsloth: Padding-free auto-enabled, enabling faster training.


## üêù we compute the cross entropy only on the assistant response

In [None]:
from unsloth.chat_templates import train_on_responses_only
trainer = train_on_responses_only(
    trainer,
    instruction_part = "<|im_start|>user\n",
    response_part = "<|im_start|>assistant\n",
)

Map (num_proc=6):   0%|          | 0/10000 [00:00<?, ? examples/s]

In [None]:
trainer_stats = trainer.train()

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,000 | Num Epochs = 1 | Total steps = 313
O^O/ \_/ \    Batch size per device = 8 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (8 x 4 x 1) = 32
 "-____-"     Trainable parameters = 17,596,416 of 511,629,184 (3.44% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
5,No log,2.043945
10,2.080500,2.015655
15,2.080500,1.980962
20,1.848000,1.977632
25,1.848000,1.958241
30,1.825900,1.953445
35,1.825900,1.942851
40,1.787300,1.954237
45,1.787300,1.952856
50,1.780600,1.938316


Unsloth: Not an error, but Qwen2ForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


Step,Training Loss,Validation Loss
5,No log,2.043945
10,2.080500,2.015655
15,2.080500,1.980962
20,1.848000,1.977632
25,1.848000,1.958241
30,1.825900,1.953445
35,1.825900,1.942851
40,1.787300,1.954237
45,1.787300,1.952856
50,1.780600,1.938316


## üêù Save model to local

In [None]:
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/chat_template.jinja',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

## üêù Now we evaluate the finetuned model

In [25]:
from unsloth import FastLanguageModel
model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "/content/drive/MyDrive/lora_model",
        max_seq_length = 32768,
        load_in_4bit = True,
    )

==((====))==  Unsloth 2025.12.9: Fast Qwen2 patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.1+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.5.1
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [8]:
%%time
from transformers import set_seed
seed=42
set_seed(seed)
prompts = test_dataset['text']
human_baseline_summaries = test_dataset['highlights']
model_summaries = []
model.eval()
for idx, prompt in enumerate(prompts):
    print(f'Now summarizing article number {idx+1}')
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(**inputs)
    model_result = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    model_output = model_result[0].split('assistant')[-1].replace('\n','',1)
    model_summaries.append(model_output)
zipped_summaries = list(zip(human_baseline_summaries, model_summaries))
df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'model_summaries'])
rouge = evaluate.load('rouge')
score = rouge.compute(predictions=df['model_summaries'].to_list(), references=df['human_baseline_summaries'].to_list())
score = pd.DataFrame(score, index=['score'])
score

Now summarizing article number 1
Now summarizing article number 2
Now summarizing article number 3
Now summarizing article number 4
Now summarizing article number 5
Now summarizing article number 6
Now summarizing article number 7
Now summarizing article number 8
Now summarizing article number 9
Now summarizing article number 10
Now summarizing article number 11
Now summarizing article number 12
Now summarizing article number 13
Now summarizing article number 14
Now summarizing article number 15
Now summarizing article number 16
Now summarizing article number 17
Now summarizing article number 18
Now summarizing article number 19
Now summarizing article number 20
Now summarizing article number 21
Now summarizing article number 22
Now summarizing article number 23
Now summarizing article number 24
Now summarizing article number 25
Now summarizing article number 26
Now summarizing article number 27
Now summarizing article number 28
Now summarizing article number 29
Now summarizing article

Downloading builder script: 0.00B [00:00, ?B/s]

CPU times: user 4min 39s, sys: 858 ms, total: 4min 40s
Wall time: 5min


Unnamed: 0,rouge1,rouge2,rougeL,rougeLsum
score,0.267903,0.081543,0.198129,0.251434


## üêù we notice the rouge score has increased for all the levels and that the length for translations is smaller.

In [23]:
print("Our fintuned model example of summarization")
print(df.iloc[0]['model_summaries'])
print(f'length of the summary is {len(df.iloc[0]['model_summaries'].split(' '))}')
print("________________________________________________________________")
print("The initial model example of summarization")
print(df_initial.iloc[0]['model_summaries'])
print(f'length of the summary is {len(df_initial.iloc[0]['model_summaries'].split(' '))}')

Our fintuned model example of summarization
Pax Palestinians formally become 123rd member of International Criminal Court .
Court opens preliminary examination into alleged crimes in Palestinian territories .
ICC has jurisdiction over alleged crimes in Palestinian territories .
length of the summary is 30
________________________________________________________________
The initial model example of summarization
The Palestinian Authority has become the 123rd member of the International Criminal Court (ICC), marking a significant step towards expanding international cooperation and justice. The court has jurisdiction over alleged crimes committed by Palestinians, including those involving violence in Palestinian territories, such as East Jerusalem, since June 13, 2014. The Palestinian delegation attended the official opening of the ICC at The Hague on Wednesday, following the formal accession process. The ICC's initial findings were that Palestinians had been involved in serious crimes, 