In [None]:
!pip install evaluate rouge_score
!pip install bitsandbytes
!pip install -U datasets fsspec #This installation is due to a bug in an older version of datasets. Typically not needed.

In [None]:
import os

import pandas as pd
import torch
from datasets import load_dataset,Dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, TaskType
from rouge_score import rouge_scorer
from transformers import AutoModelForCausalLM, AutoTokenizer,BitsAndBytesConfig
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
# Authenticate with the Hugging Face Hub without embedding a secret in the notebook:
# the token is read from the HF_TOKEN environment variable. When the variable is
# unset, login() receives None and falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from transformers import DataCollatorForSeq2Seq
# Let pandas display up to 3000 characters per cell so long text columns
# (e.g. predicted summaries) are not cut off when a frame is rendered.
pd.set_option("display.max_colwidth", 3000)

In [None]:
# Use the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

#### Load the dataset, and prepare the prompts.

In [None]:
# The dataset id and the subset sizes are named once so the three splits stay
# consistent and the amount of data can be changed in a single place.
DATASET_NAME = "knkarthick/samsum"
N_TRAIN, N_VAL, N_TEST = 1000, 100, 100

# Keep only the first N examples of each split.
train = load_dataset(DATASET_NAME, split="train").select(range(N_TRAIN))
val = load_dataset(DATASET_NAME, split="validation").select(range(N_VAL))
test = load_dataset(DATASET_NAME, split="test").select(range(N_TEST))

In [None]:
def format_dialogue(text,field_name = "dialogue"):
    """Wrap one example's dialogue in the summarization prompt.

    Args:
        text: mapping (e.g. a dataset row) that contains ``field_name``.
        field_name: key under which the text to summarize is stored.

    Returns:
        A dict with the single key ``"prompt"`` holding the formatted prompt.

    NOTE(review): the ``<<SYS>>`` / ``[INST]`` markers look like the Llama-2 chat
    convention while this notebook loads a Llama 3.2 Instruct model -- confirm
    whether the tokenizer's own chat template should be used instead.
    """
    system_part = "<<SYS>>\nYou are a helpful assistant.\n<</SYS>>\n\n"
    user_part = f"[INST]Summarize the following text \n{text[field_name]}\n\nSummary:\n[/INST]"
    return {"prompt": system_part + user_part}

# Add a "prompt" column to every split by applying format_dialogue to each row.
train, val, test = [split.map(format_dialogue) for split in (train, val, test)]

#### Load Llama 3.2-1B Instruct

In [None]:
model_name = "meta-llama/Llama-3.2-1B-Instruct"

# Tokenizer: the EOS token doubles as the padding token, and padding is applied
# on the left so that every prompt in a batch ends at the same position.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
tokenizer.pad_token_id = tokenizer.eos_token_id

# Model: weights are loaded in bfloat16 (no quantization is configured here)
# and then moved onto the selected device.
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
model = model.to(device)

#### Evaluate the model before fine-tuning, and save the results for further comparison.

In [None]:
# Generate output in batch
def generate_output(input_texts,batch_size = 4):
    """Greedily generate continuations for ``input_texts`` in mini-batches.

    Uses the notebook-level ``tokenizer`` and ``model``.

    Args:
        input_texts: sequence of prompt strings; it is sliced as
            ``input_texts[i:i + batch_size]``.
        batch_size: number of prompts tokenized and generated together.

    Returns:
        ``(inputs, outputs)``: two lists with one tensor per prompt. ``inputs``
        holds the padded prompt token ids of each prompt; ``outputs`` holds the
        corresponding rows returned by ``model.generate`` (up to 256 new tokens).
    """
    inputs = []
    outputs = []

    # Generation has to run in eval mode: if the model is still in training mode
    # (for example right after fine-tuning), dropout layers -- including LoRA
    # dropout -- stay active and perturb the otherwise deterministic decoding.
    was_training = model.training
    model.eval()
    try:
        for start in range(0, len(input_texts), batch_size):
            batch = input_texts[start:start + batch_size]

            encoded = tokenizer(
                batch,
                padding=True,
                truncation=True,
                return_tensors="pt"
            ).to(model.device)

            inputs.extend(encoded["input_ids"])

            with torch.no_grad():
                generated = model.generate(
                    **encoded,
                    do_sample=False,
                    temperature=None,
                    top_p=None,
                    max_new_tokens=256,
                    # Pad finished sequences with the tokenizer's pad token
                    # instead of relying on generate()'s implicit fallback.
                    pad_token_id=tokenizer.pad_token_id,
                )
            outputs.extend(generated)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)

    return inputs,outputs


# Only extract the summary for calculating Rouge score.
def get_output_only(inputs,outputs):
    """Decode only the newly generated part of each output sequence.

    For every (prompt ids, full output ids) pair, the first ``len(prompt ids)``
    positions of the output are dropped and the remainder is decoded with the
    notebook-level ``tokenizer`` (special tokens skipped, tokenization spaces
    cleaned up).

    Args:
        inputs: per-example prompt token id sequences.
        outputs: per-example output token id sequences, aligned with ``inputs``.

    Returns:
        List of decoded strings, one per example.
    """
    generated_only = []
    for prompt_ids, full_ids in zip(inputs, outputs):
        generated_only.append(full_ids[len(prompt_ids):])
    return tokenizer.batch_decode(
        generated_only,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

# This function is used to calculate rouge scores, given a list of references and a list of candidates.
# Also, we will save the results to analyze the summaries before and after fine-tuning.
def macro_rouge(references, candidates, list_score_dir = None ,avg_score_dir = None):
    """Score candidate summaries against references with ROUGE-1/2/L (F-measure).

    Args:
        references: iterable of reference summaries.
        candidates: iterable of predicted summaries, aligned with ``references``.
        list_score_dir: optional CSV path for the per-example scores.
        avg_score_dir: optional CSV path for the averaged scores.

    Returns:
        ``(avg_scores, scores)``: ``avg_scores`` is a DataFrame with columns
        ``Metrics`` and ``Score`` (mean F-measure per metric, NaN when there are
        no examples); ``scores`` is a DataFrame with the predicted summary and
        the three F-measures of every example.

    Raises:
        ValueError: if ``references`` and ``candidates`` differ in length.
    """
    references = list(references)
    candidates = list(candidates)
    # zip() would silently drop the unmatched tail, producing misleading averages.
    if len(references) != len(candidates):
        raise ValueError(
            f"references ({len(references)}) and candidates ({len(candidates)}) must have the same length"
        )

    keys = ['rouge1', 'rouge2', 'rougeL']
    scorer = rouge_scorer.RougeScorer(keys, use_stemmer=True)
    fmeasures = {key: [] for key in keys}
    for ref, cand in zip(references, candidates):
        score = scorer.score(ref, cand)
        for key in keys:
            fmeasures[key].append(score[key].fmeasure)

    # Average the scores; an empty evaluation set yields NaN instead of dividing by zero.
    averages = [
        sum(values) / len(values) if values else float("nan")
        for values in (fmeasures[key] for key in keys)
    ]
    avg_scores = pd.DataFrame({"Metrics": keys, "Score": averages})
    scores = pd.DataFrame({"Predicted_summary": candidates, **fmeasures})

    for frame, path in ((avg_scores, avg_score_dir), (scores, list_score_dir)):
        if path is None:
            continue
        # Create the target folder first so a missing directory cannot make
        # to_csv fail after all scores have already been computed.
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        frame.to_csv(path, index=False)
    return avg_scores,scores

In [None]:
# Baseline on the train set: summarize the first 200 training prompts with the
# model before fine-tuning, score them with ROUGE and save both result tables.
cat_dir = "_train_preFT.csv"
avg_score_dir = "/content/drive/MyDrive/LLMs/results/avg_score_dir"
list_score_dir = "/content/drive/MyDrive/LLMs/results/list_score_dir"

inputs, outputs = generate_output(train["prompt"][:200])
results = get_output_only(inputs, outputs)
avg_score_train, score_list_train = macro_rouge(
    train["summary"][:200],
    results,
    list_score_dir=f"{list_score_dir}{cat_dir}",
    avg_score_dir=f"{avg_score_dir}{cat_dir}",
)


In [None]:
# Mean ROUGE-1/2/L F-measures over the 200 training examples scored before fine-tuning.
avg_score_train

Unnamed: 0,Metrics,Score
0,rouge1,0.342774
1,rouge2,0.127532
2,rougeL,0.260625


In [None]:
# Per-example ROUGE scores on the train subset prior to LoRA fine-tuning (first 3 rows).
score_list_train.head(3)

Unnamed: 0,Predicted_summary,rouge1,rouge2,rougeL
0,Amanda baked cookies and offered to share them with Jerry.,0.526316,0.352941,0.526316
1,"Olivia and Oliver are discussing the upcoming election. Olivia is a liberal, and Oliver is also a liberal. They both agree that they will vote for the same candidate.",0.358974,0.108108,0.25641
2,"\nKim is having a bad day and is feeling unmotivated. She plans to do some uni stuff and clean her room, but ends up procrastinating. She suggests using the Pomodoro technique and the post-it note system to help her stay on track.",0.210526,0.072727,0.175439


In [None]:
# Baseline on the validation set: summarize every validation prompt with the
# model before fine-tuning, score with ROUGE and save both result tables.
cat_dir = "_val_preFT.csv"
avg_score_dir = "/content/drive/MyDrive/LLMs/results/avg_score_dir"
list_score_dir = "/content/drive/MyDrive/LLMs/results/list_score_dir"

inputs, outputs = generate_output(val["prompt"])
results = get_output_only(inputs, outputs)
avg_score_val, score_list_val = macro_rouge(
    val["summary"],
    results,
    list_score_dir=f"{list_score_dir}{cat_dir}",
    avg_score_dir=f"{avg_score_dir}{cat_dir}",
)

In [None]:
# Mean ROUGE-1/2/L F-measures on the validation set prior to LoRA fine-tuning.
avg_score_val

Unnamed: 0,Metrics,Score
0,rouge1,0.312233
1,rouge2,0.111795
2,rougeL,0.239238


#### Add LoRA adapters

In [None]:
# LoRA adapter settings: rank 8, alpha 16 and dropout 0.05, attached to the
# q_proj and v_proj modules of a causal language model.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"],  # adjust for your model
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
# model.gradient_checkpointing_enable() -> saves memory but makes training incredibly slow.

# Wrap the base model with the adapters, make sure it sits on the target device,
# and print how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.to(device)
model.print_trainable_parameters()

trainable params: 851,968 || all params: 1,236,666,368 || trainable%: 0.0689


#### Prepare DataLoader with Adaptive Padding

In [None]:
# Collator used by the Trainer; padding=True pads each batch dynamically to the
# longest sequence in that batch.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True, model=model)

#### Tokenize the data to get ready for fine-tuning

In [None]:
def tokenize_and_mask(example,response_field = "summary",max_length = 512):
    """Build one causal-LM training example whose loss covers only the response.

    The prompt and the response are tokenized separately with the notebook-level
    ``tokenizer`` and concatenated, followed by the EOS token. Prompt positions
    are labelled ``-100`` so they do not contribute to the loss.

    Args:
        example: mapping with a ``"prompt"`` entry and a ``response_field`` entry.
        response_field: key of the target text inside ``example``.
        max_length: upper bound on the total number of tokens
            (prompt + response + EOS); must be at least 2.

    Returns:
        Dict with ``input_ids``, ``labels`` and ``attention_mask`` lists of
        equal length, never longer than ``max_length``.

    Raises:
        ValueError: if ``max_length`` is smaller than 2.
    """
    if max_length < 2:
        raise ValueError("max_length must leave room for at least one prompt token and EOS")

    # The prompt is tokenized with special tokens enabled, exactly like
    # generate_output does at inference time, so training and evaluation see the
    # same prompt token sequence. One position is reserved for the trailing EOS.
    prompt_ids = tokenizer(
        example["prompt"],
        truncation=True,
        max_length=max_length - 1,
        add_special_tokens=True,
    )["input_ids"]

    # Whatever is left after the prompt and the EOS slot goes to the response.
    # A non-positive budget is never handed to the tokenizer.
    response_budget = max_length - 1 - len(prompt_ids)
    if response_budget > 0:
        response_ids = tokenizer(
            example[response_field],
            truncation=True,
            max_length=response_budget,
            add_special_tokens=False,
        )["input_ids"]
    else:
        response_ids = []

    target_ids = response_ids + [tokenizer.eos_token_id]
    input_ids = prompt_ids + target_ids
    labels = [-100] * len(prompt_ids) + target_ids

    return {
        "input_ids": input_ids,
        "labels": labels,
        "attention_mask": [1] * len(input_ids)
    }

# Add input_ids / labels / attention_mask columns to every split.
train, val, test = [split.map(tokenize_and_mask) for split in (train, val, test)]

#### Training Setup

In [None]:
# Training configuration: 3 epochs with a per-device batch size of 4, checkpoints
# written to ./output and logs to ./logs.
training_args = TrainingArguments(
    output_dir="./output",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_dir="./logs",
    report_to="none",  # disables wandb and other reporters
    # eval_steps only takes effect when step-based evaluation is switched on;
    # without it the validation set passed to the Trainer is never evaluated
    # during training. (Older transformers releases name this argument
    # `evaluation_strategy`.)
    eval_strategy="steps",
    eval_steps=100
)

# The Trainer fine-tunes the LoRA-wrapped model on `train` and evaluates on `val`.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train,
    eval_dataset=val,
    tokenizer=tokenizer,
    data_collator=data_collator
)


#### Train the model

In [None]:

# Run the fine-tuning loop.
trainer.train()

# Save the model (via save_pretrained) and the tokenizer into the same folder.
for artifact in (model, tokenizer):
    artifact.save_pretrained("/content/drive/MyDrive/Colab Notebooks/lora-adapter-llama3-summarizer")

#### Evaluation

In [None]:
# Train set after fine-tuning: summarize the first 200 training prompts again,
# score them with ROUGE and save both result tables.
cat_dir = "_train_postFT.csv"
avg_score_dir = "/content/drive/MyDrive/LLMs/results/avg_score_dir"
list_score_dir = "/content/drive/MyDrive/LLMs/results/list_score_dir"

inputs, outputs = generate_output(train["prompt"][:200])
results = get_output_only(inputs, outputs)
avg_score_train, score_list_train = macro_rouge(
    train["summary"][:200],
    results,
    list_score_dir=f"{list_score_dir}{cat_dir}",
    avg_score_dir=f"{avg_score_dir}{cat_dir}",
)


In [None]:
# Mean ROUGE-1/2/L F-measures over the 200 training examples scored after fine-tuning.
avg_score_train

Unnamed: 0,Metrics,Score
0,rouge1,0.487774
1,rouge2,0.230038
2,rougeL,0.3975


In [None]:
# Validation set after fine-tuning: summarize every validation prompt,
# score with ROUGE and save both result tables.
cat_dir = "_val_postFT.csv"
avg_score_dir = "/content/drive/MyDrive/LLMs/results/avg_score_dir"
list_score_dir = "/content/drive/MyDrive/LLMs/results/list_score_dir"

inputs, outputs = generate_output(val["prompt"])
results = get_output_only(inputs, outputs)
avg_score_val, score_list_val = macro_rouge(
    val["summary"],
    results,
    list_score_dir=f"{list_score_dir}{cat_dir}",
    avg_score_dir=f"{avg_score_dir}{cat_dir}",
)

In [None]:
# Mean ROUGE-1/2/L F-measures on the validation set after fine-tuning.
avg_score_val

Unnamed: 0,Metrics,Score
0,rouge1,0.488771
1,rouge2,0.241016
2,rougeL,0.409444


In [None]:
# Test set after fine-tuning: summarize every test prompt to check whether the
# fine-tuned model generalizes to non-development data, score with ROUGE and
# save both result tables.
cat_dir = "_test_postFT.csv"
avg_score_dir = "/content/drive/MyDrive/LLMs/results/avg_score_dir"
list_score_dir = "/content/drive/MyDrive/LLMs/results/list_score_dir"

inputs, outputs = generate_output(test["prompt"])
results = get_output_only(inputs, outputs)
avg_score_test, score_list_test = macro_rouge(
    test["summary"],
    results,
    list_score_dir=f"{list_score_dir}{cat_dir}",
    avg_score_dir=f"{avg_score_dir}{cat_dir}",
)

In [None]:
# Mean ROUGE-1/2/L F-measures on the test set after fine-tuning.
avg_score_test

Unnamed: 0,Metrics,Score
0,rouge1,0.486741
1,rouge2,0.214574
2,rougeL,0.390437


#### Conclusion
LoRA has improved the Rouge scores significantly.

| Metric  | Train (pre LoRA) | Train (post LoRA) | Val (pre LoRA) | Val (post LoRA) | Test (post LoRA) |
|---------|---------|---------|---------|---------|---------|
| rouge1 | 0.342774 | 0.487774 | 0.312233 | 0.488771 | 0.486741 |
| rouge2 | 0.127532 | 0.230038 | 0.111795 | 0.241016 | 0.214574 |
| rougeL | 0.260625 | 0.397500 | 0.239238 | 0.409444 | 0.390437 |

Note that the validation set was used only for evaluation, not for training, and the test set was not used for fine-tuning or even evaluated before or during fine-tuning. The performance on the test set indicates that the improvement in the ROUGE scores is not due to overfitting.