## Install dependencies

In [None]:
# install Hugging Face Libraries
!pip install peft
!pip install transformers datasets accelerate evaluate bitsandbytes loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install rouge-score

In [None]:
import os

# Turn off Weights & Biases reporting for this session by setting the
# WANDB_DISABLED environment variable before any trainer is created.
os.environ.update({"WANDB_DISABLED": "true"})

## Loading data

In [None]:
from datasets import load_dataset

# Fetch the dataset from the Hugging Face Hub by its repository id.
dataset = load_dataset(path="ANWAR101/youtube-cnn")

In [None]:
# Drop the 'Unnamed: 0' column from every split.
# presumably a leftover index column from a CSV export — verify against the dataset card
dataset = dataset.remove_columns(["Unnamed: 0"])

In [None]:
# Shuffle with a fixed seed so the example order is reproducible across runs.
dataset = dataset.shuffle(seed=42)

## Convert text to token IDs

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the base checkpoint; reused later when loading the model weights.
model_id = "facebook/bart-base"

# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [None]:
# Token budgets: inputs and reference summaries longer than these are truncated.
max_input_length = 1024
max_target_length = 600


def preprocess_function(examples):
    """Tokenize a batch of examples for sequence-to-sequence training.

    Args:
        examples: A batch (mapping of column name -> list of values) that
            provides the "text" and "summary" columns.

    Returns:
        The tokenizer output for the "text" column, truncated to
        ``max_input_length`` tokens, with an added "labels" entry holding the
        token IDs of the "summary" column truncated to ``max_target_length``.
    """
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["summary"],
        max_length=max_target_length,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


# Tokenize every split and drop the original columns so that only the
# tokenizer outputs remain. The column list is taken from `dataset` itself.
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=dataset["train"].column_names,
)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

## Fine-Tune with LoRA

In [None]:
from transformers import AutoModelForSeq2SeqLM
import torch

# Load the pretrained sequence-to-sequence checkpoint identified by `model_id`.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# Define the LoRA config: rank-8 adapters (alpha 16, dropout 0.05) injected
# into the modules named "q_proj" and "v_proj"; bias terms are left untouched.
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

# NOTE: the base model is loaded without 8-bit quantization, so no int8
# preparation step is needed here. `prepare_model_for_int8_training` is also no
# longer available in recent PEFT releases, which made this cell fail on import
# with an unpinned `pip install peft`.

# Add the LoRA adapter and report how many parameters will be trained.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator that batches tokenized examples for the seq2seq model.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook. Read it from the HF_TOKEN
# environment variable instead; when the variable is unset or empty, pass
# None so that `login` falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN") or None)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir = "lora-bart-base-fine-tuned-youtube-cnn-3"

batch_size = 8
gradient_accumulation_steps = 10
num_train_epochs = 8
# Show the training loss with every epoch. `logging_steps` is counted in
# optimizer updates, and one update consumes
# batch_size * gradient_accumulation_steps examples, so the accumulation factor
# must be part of the divisor (otherwise the loss would be logged only once
# every ~10 epochs). Clamp to 1 so that a small dataset never yields 0.
# NOTE(review): assumes a single device — with several devices, divide by the
# device count as well.
steps_per_epoch = len(tokenized_dataset["train"]) // (
    batch_size * gradient_accumulation_steps
)
logging_steps = max(1, steps_per_epoch)
model_name = model_id.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy="epoch",
    learning_rate=1e-3,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,  # run generation during evaluation (needed for ROUGE)
    logging_steps=logging_steps,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=1000,  # Gradually increase learning rate for the first 1000 steps
    lr_scheduler_type="linear",  # Linearly decrease after warmup
    push_to_hub=True,
)

In [None]:
!pip install nltk

In [None]:
import nltk
# Sentence-tokenizer data used by `sent_tokenize`. Recent NLTK releases load
# it from the "punkt_tab" resource while older ones use "punkt"; fetch both so
# the notebook works with whichever version the unpinned install provides.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

In [None]:
import evaluate
# Load the ROUGE metric implementation through the `evaluate` library.
rouge_score = evaluate.load(path="rouge")

In [None]:
import numpy as np
from nltk.tokenize import sent_tokenize


def compute_metrics(eval_pred):
    """Compute ROUGE scores for a batch of generated summaries.

    Args:
        eval_pred: A pair ``(predictions, labels)`` of token-ID arrays, as
            handed to ``compute_metrics`` by the trainer. ``predictions`` may
            also be a tuple whose first element holds the generated token IDs.

    Returns:
        A dict mapping each ROUGE metric name to its score expressed as a
        percentage and rounded to 4 decimal places.
    """
    predictions, labels = eval_pred
    # Keep only the generated token IDs if extra outputs were returned alongside.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # -100 marks ignored positions and cannot be decoded; it can show up in the
    # predictions as well as in the labels, so swap it for the pad token in both.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode generated and reference summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Scale to percentages and round for readable logs
    return {key: round(value * 100, 4) for key, value in result.items()}

In [None]:
# Assemble the trainer: model and hyper-parameters first, then the data
# splits, then the helpers used for batching, decoding and scoring.
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
)

In [None]:
# Launch fine-tuning with the trainer configured in the previous cell.
trainer.train()