# 🐋 Required packages installation

In [None]:
!pip install datasets transformers torch tqdm pandas autoawq
!pip install -q --no-deps xformers trl peft accelerate
!pip install -U bitsandbytes
!pip install evaluate rouge_score

Collecting autoawq
  Downloading autoawq-0.2.9.tar.gz (74 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.3/74.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB

# 🐋 Packages Import

In [None]:
from datasets import load_dataset, Dataset, load_from_disk, DatasetDict
from transformers import (
    pipeline,
    AutoTokenizer,
    BartForConditionalGeneration,
    AutoModelForCausalLM,
    Qwen2ForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from evaluate import load
import torch
import pandas as pd
import os
from tqdm import tqdm
import numpy as np
from accelerate import Accelerator


# 🐋 Dataset and Annotater choice:



## 🪽 Dataset: Cnn daily mails
For this task, I chose the CNN Daily Mail dataset for its short context length, which minimizes constraints on the Colab environment.

In [None]:
# Load the dataset and set device
ds = load_dataset("abisee/cnn_dailymail", '3.0.0')
device = "cuda" if torch.cuda.is_available() else "cpu"

## 🪽 Summarizer: bart-large-cnn

After experimenting with multiple LLMs like Mistral 7B, Zephyr 7B, and Gemma 2B, which all caused the session to crash even with quantization techniques like AWQ, I chose a smaller language model fine-tuned on CNN Daily Mail to avoid information loss.

In [None]:
# Load model and tokenizer
model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn")

The following piece of code is a function that take the documents to summarize and the batch size and return the summaries. The summaries are then saved into drive for further use.

In [None]:
# Function to generate summaries in batches
def generate_summaries(documents, batch_size=4):
    summaries = []
    for i in tqdm(range(0, len(documents), batch_size), desc="Generating summaries"):
        batch_docs = documents[i:i+batch_size]
        # Tokenize batch
        inputs = tokenizer(batch_docs, max_length=1024, padding=True, truncation=True, return_tensors="pt")
        # Move input tensors to device
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Generate summaries
        summary_ids = model.generate(
            inputs["input_ids"],
            num_beams=2,
            min_length=0,
            max_length=2000
        )
        # Decode summaries
        batch_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
        summaries.extend(batch_summaries)

    return summaries

#First 5000 documents
documents = [ds['train'][i]['article'] for i in range(5000)]

# Generate summaries
summaries = generate_summaries(documents, batch_size=1)
summary_data = [{"article": doc, "summary": summary} for doc, summary in zip(documents, summaries)]

# Convert to a Dataset object
summary_dataset = Dataset.from_pandas(pd.DataFrame(summary_data))

# Save the dataset
os.makedirs('/content/drive/MyDrive/summaries_dataset', exist_ok=True)
save_path = '/content/drive/MyDrive/summaries_dataset'
summary_dataset.save_to_disk(save_path)
print(f"Summaries generated and stored in Google Drive at: {save_path}")

# 🐋 Qwen 0.5B finetuning:

In [None]:
# load saved data
path = '/content/drive/MyDrive/summaries_dataset'
ds_summaries = load_from_disk(path)
# Generate train, validation and test splits
random_seed = 42
train_val_test_dataset = ds_summaries.train_test_split(test_size=0.2, seed=random_seed)
test_dataset = train_val_test_dataset['test']
train_val_dataset = train_val_test_dataset['train']
train_val_dataset = train_val_dataset.train_test_split(test_size=0.2, seed=random_seed)
train_dataset = train_val_dataset['train']
val_dataset = train_val_dataset['test']

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B", device_map="auto",quantization_config=bnb_config,trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B",return_tensors="pt", device="auto",padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

A small comparison between the human baseline summary and the Qwen summary.

In [None]:
%%time
from transformers import set_seed
seed=42
set_seed(seed)

index = 1
article = test_dataset[index]['article']
summary = test_dataset[index]['summary']

prompt = f"Instruct: Summarize the following article.\n{article}\nOutput:\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=100)
model_res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
model_output = model_res[0].split('Output:\n')[1]
prefix, success, result = model_output.partition('###')

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'Qwen MODEL:\n{prefix}')

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Summarize the following article.
(CNN) -- Millions of poor American children have untreated tooth decay, some of them because they cannot find a dentist willing to treat them, a federal report issued Tuesday said. Only 1 in 3 children in Medicaid received any dental care over a year time span, according to a new report. "Dental disease remains a significant problem for children aged 2 through 18 in Medicaid," the U.S. Government Accountability Office report concluded, referring to the federal/state health program for poor people. According to the report, which used data from 1999 to 2004, about 6.5 million children enrolled in Medicaid had untreated tooth decay in 2005 and were nearly twice as likely as children with private health insurance to have untreated tooth decay. The GAO report was ordered after widespread publicity of the case of Deamonte Driver, a 12-ye

1. LoRA Configuration Setup

In [None]:
config = LoraConfig(
    r=32, #Rank
    lora_alpha=32,
    target_modules=[
        'q_proj',
        'k_proj',
        'v_proj',
        'dense'
    ],
    bias="none",
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
)

2. Prompt Formatting Function and max length detection

In [None]:
def create_prompt_formats(sample):

    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruct: Summarize the below article."
    RESPONSE_KEY = "### Output:"
    END_KEY = "### End"

    blurb = f"\n{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}"
    input_context = f"{sample['article']}" if sample["article"] else None
    response = f"{RESPONSE_KEY}\n{sample['summary']}"
    end = f"{END_KEY}"

    parts = [part for part in [blurb, instruction, input_context, response, end] if part]

    formatted_prompt = "\n\n".join(parts)
    sample["text"] = formatted_prompt

    return sample
from functools import partial
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int,seed, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)#, batched=True)

    # Apply preprocessing to each batch of the dataset & and remove 'article' and 'summary'
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=['article', 'summary'],
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset
max_length = get_max_length(model)
print(max_length)
seed= 42
train_dataset = preprocess_dataset(tokenizer, max_length,seed, train_dataset)
val_dataset = preprocess_dataset(tokenizer, max_length,seed, val_dataset)
test_dataset = preprocess_dataset(tokenizer, max_length,seed, test_dataset)

Found max lenth: 32768
32768
Preprocessing dataset...
Preprocessing dataset...
Preprocessing dataset...


Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

3. Peft wrapper


In [None]:
model = prepare_model_for_kbit_training(model)
model.gradient_checkpointing_enable()

peft_model = get_peft_model(model, config)

## 🪽 Training:


In [None]:
import transformers

output_dir = '/content/drive/MyDrive/qwen'

# Configure training arguments
peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    max_steps=1000,
    learning_rate=2e-4,
    optim="paged_adamw_8bit",
    logging_steps=25,
    logging_dir="./logs",
    save_strategy="steps",
    save_steps=25,
    gradient_checkpointing=True,
    report_to="none",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

peft_model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),

)
peft_trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
25,2.6383
50,2.3096
75,2.5536
100,2.239
125,2.6016
150,2.2999
175,2.5363
200,2.2777
225,2.567
250,2.2328


TrainOutput(global_step=1000, training_loss=2.3832286605834963, metrics={'train_runtime': 3856.4555, 'train_samples_per_second': 1.037, 'train_steps_per_second': 0.259, 'total_flos': 7740671095580160.0, 'train_loss': 2.3832286605834963, 'epoch': 1.25})

## 🪽 Testing:


In [None]:
from peft import PeftModel
qwen_model = Qwen2ForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B", device_map="auto",quantization_config=bnb_config,trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B",return_tensors="pt", device="auto",padding_side="left",add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
ft_model = PeftModel.from_pretrained(qwen_model, "/content/drive/MyDrive/qwen/checkpoint-1000",torch_dtype=torch.float16,is_trainable=False)

Comparison between the human base summary and the newly trained model.

In [None]:
%%time
from transformers import set_seed
set_seed(seed)

index = 1
article = test_dataset[index]['article']
summary = test_dataset[index]['summary']

prompt = f"Instruct: Summarize the following article.\n{article}\nOutput:\n"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
outputs = ft_model.generate(**inputs, max_new_tokens=100)
peft_model_res = tokenizer.batch_decode(outputs, skip_special_tokens=True)
peft_model_output = peft_model_res[0].split('Output:\n')[1].split('\nAssistant')[0].strip()
prefix, success, result = peft_model_output.partition('###')

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE HUMAN SUMMARY:\n{summary}\n')
print(dash_line)
print(f'PEFT MODEL:\n{prefix}')

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


---------------------------------------------------------------------------------------------------
INPUT PROMPT:
Instruct: Summarize the following article.
(CNN) -- Millions of poor American children have untreated tooth decay, some of them because they cannot find a dentist willing to treat them, a federal report issued Tuesday said. Only 1 in 3 children in Medicaid received any dental care over a year time span, according to a new report. "Dental disease remains a significant problem for children aged 2 through 18 in Medicaid," the U.S. Government Accountability Office report concluded, referring to the federal/state health program for poor people. According to the report, which used data from 1999 to 2004, about 6.5 million children enrolled in Medicaid had untreated tooth decay in 2005 and were nearly twice as likely as children with private health insurance to have untreated tooth decay. The GAO report was ordered after widespread publicity of the case of Deamonte Driver, a 12-ye

In [None]:
peft_model_res[0].split('Output:\n')[1].split('\nAssistant')[0].strip()

'The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.'

In [None]:
import pandas as pd

articles = test_dataset[0:10]['article']
human_baseline_summaries = test_dataset[0:10]['summary']

original_model_summaries = []
instruct_model_summaries = []
peft_model_summaries = []

for idx, article in enumerate(articles):
    human_baseline_text_output = human_baseline_summaries[idx]
    prompt = f"Instruct: Summarize the following article.\n{article}\nOutput:\n"

    original_model_res = tokenizer.batch_decode(model.generate(**inputs, max_new_tokens=100), skip_special_tokens=True)
    original_model_text_output = original_model_res[0].split('Output:\n')[1]

    peft_model_res = tokenizer.batch_decode(ft_model.generate(**inputs, max_new_tokens=100), skip_special_tokens=True)
    peft_model_output = peft_model_res[0].split('Output:\n')[1].split('\nAssistant')[0].strip()
    print(peft_model_output)
    peft_model_text_output, success, result = peft_model_output.partition('###')

    original_model_summaries.append(original_model_text_output)
    peft_model_summaries.append(peft_model_text_output)

zipped_summaries = list(zip(human_baseline_summaries, original_model_summaries, peft_model_summaries))

df = pd.DataFrame(zipped_summaries, columns = ['human_baseline_summaries', 'original_model_summaries', 'peft_model_summaries'])
df

Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Setting `pad_token_id` to `eos_token_id`:151643 for open-end generation.


The U.S. Government Accountability Office report says only 1 in 3 children in Medicaid received any dental care over a year time span. The report says 14.8 percent of Medicaid recipients said their children had not gotten necessary dental care. The report says 1 in eight reportedly had never seen a dentist.


Unnamed: 0,human_baseline_summaries,original_model_summaries,peft_model_summaries
0,South Africa beat Australia by nine wickets in...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
1,GAO: 6.5 million children enrolled in Medicaid...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
2,Travel warning issued against non-essential tr...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
3,"NEW: ACLU calls decision ""a striking blow to d...",Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
4,Manchester City have sent representatives to B...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
5,White House says Democrats are holding veteran...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
6,28 illegal immigrant workers were arrested in ...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
7,"Pakistani government: More than 1,000 militant...",Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
8,Home secretary Jacqui Smith's future in doubt ...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...
9,Ed Genson had headed Blagojevich's defense tea...,Summary: The article discusses the issue of un...,The U.S. Government Accountability Office repo...


Rouge evaluation

In [None]:
import evaluate

rouge = evaluate.load('rouge')

original_model_results = rouge.compute(
    predictions=original_model_summaries,
    references=human_baseline_summaries[0:len(original_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

peft_model_results = rouge.compute(
    predictions=peft_model_summaries,
    references=human_baseline_summaries[0:len(peft_model_summaries)],
    use_aggregator=True,
    use_stemmer=True,
)

print('ORIGINAL MODEL:')
print(original_model_results)
print('PEFT MODEL:')
print(peft_model_results)

print("Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL")

improvement = (np.array(list(peft_model_results.values())) - np.array(list(original_model_results.values())))
for key, value in zip(peft_model_results.keys(), improvement):
    print(f'{key}: {value*100:.2f}%')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ORIGINAL MODEL:
{'rouge1': np.float64(0.1473124122768657), 'rouge2': np.float64(0.03010912698412698), 'rougeL': np.float64(0.10521452992487192), 'rougeLsum': np.float64(0.10453103986743659)}
PEFT MODEL:
{'rouge1': np.float64(0.15534724966149388), 'rouge2': np.float64(0.03826728459283079), 'rougeL': np.float64(0.11811324710086318), 'rougeLsum': np.float64(0.11747557086355315)}
Absolute percentage improvement of PEFT MODEL over ORIGINAL MODEL
rouge1: 0.80%
rouge2: 0.82%
rougeL: 1.29%
rougeLsum: 1.29%
