## install dependancies

In [None]:
# install Hugging Face Libraries
!pip install peft
!pip install transformers datasets accelerate evaluate bitsandbytes loralib --upgrade --quiet
# install additional dependencies needed for training
!pip install rouge-score

## Loading data

In [1]:
from datasets import load_dataset

ds = load_dataset("ANWAR101/youtube-cnn")

In [2]:
from datasets import load_dataset

dataset = load_dataset("ccdv/cnn_dailymail" , "3.0.0")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [3]:
ds = ds.remove_columns('Unnamed: 0')

In [4]:
ds = ds.shuffle(seed = 42)

In [5]:
ds['train'] 

Dataset({
    features: ['text', 'summary'],
    num_rows: 16730
})

In [6]:
from datasets import concatenate_datasets
import random 
random.seed(42)

dataset["train"] = dataset["train"].rename_column('article' , 'text')
dataset["train"] = dataset["train"].rename_column('highlights' , 'summary')
dataset["train"] = dataset["train"].remove_columns('id')

In [7]:
dataset["validation"] = dataset["validation"].rename_column('article' , 'text')
dataset["validation"] = dataset["validation"].rename_column('highlights' , 'summary')
dataset["validation"] = dataset["validation"].remove_columns('id')

In [8]:
train_ds = concatenate_datasets([ds['train'] , dataset["train"].select(random.sample(range(len(ds['train'])), 5000))])
val_ds = concatenate_datasets([ds['validation'] , dataset["validation"].select(random.sample(range(len(ds['validation'])), 500))])

In [9]:
from datasets import DatasetDict
dataset = DatasetDict({
    'train': train_ds , 
    'validation': val_ds
})
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'summary'],
        num_rows: 21730
    })
    validation: Dataset({
        features: ['text', 'summary'],
        num_rows: 2750
    })
})

## Convert text to text to token IDs

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id="facebook/bart-base"

tokenizer = AutoTokenizer.from_pretrained(model_id)

In [11]:
max_input_length = 1024
max_target_length = 600


def preprocess_function(examples):
    model_inputs = tokenizer(
        examples["text"],
        max_length=max_input_length,
        truncation=True,
    )
    labels = tokenizer(
        examples["summary"], max_length=max_target_length, truncation=True
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=ds['train'].column_names)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

Keys of tokenized dataset: ['input_ids', 'attention_mask', 'labels']


## Fine-Tune with LoRA

In [12]:
from transformers import AutoModelForSeq2SeqLM
import torch

model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

In [13]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Define LoRA Config
lora_config = LoraConfig(
 r = 18,
 lora_alpha = 8,
 target_modules=["q_proj", "v_proj"],
 lora_dropout=0.05,
 bias="none",
 task_type=TaskType.SEQ_2_SEQ_LM
)


# prepare int-8 model for training
model = prepare_model_for_int8_training(model)

# add LoRA adaptor
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 995,328 || all params: 140,415,744 || trainable%: 0.7088435895051769




In [14]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer,model=model)

2024-03-04 22:27:34.166223: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-04 22:27:34.166278: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-04 22:27:34.167768: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [15]:
from huggingface_hub import login

login(token = "")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [17]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

output_dir="lora-bart-base-fine-tuned-youtube-cnn-3"

# batch_size = 10
# num_train_epochs = 8
# Show the training loss with every epoch
# logging_steps = len(tokenized_dataset["train"]) // batch_size
model_name = model_id.split("/")[-1]

args = Seq2SeqTrainingArguments(
    output_dir='./outputs',
    eval_steps=100,
    logging_steps=100,
    warmup_steps=100,
    evaluation_strategy="steps",
    report_to="all",
    log_level="debug",
    logging_dir='./logs',
    learning_rate=1e-3,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    do_train=True,
    do_eval=True,
    weight_decay=0.05,
    # save_total_limit=3,
#     num_train_epochs=num_train_epochs,
#     predict_with_generate=True,
    gradient_accumulation_steps = 24,
    max_steps=1000,
    lr_scheduler_type="linear",  # Linearly decrease after warmup
#     push_to_hub=True,
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [None]:
!pip install nltk

In [None]:
import nltk
nltk.download("punkt")

from datasets import load_metric
rouge_metric = load_metric("rouge", trust_remote_code=True)

In [None]:
from nltk.tokenize import sent_tokenize

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the scores
    result = {key: value * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

In [16]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [18]:
trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
#     compute_metrics=compute_metrics
)

max_steps is given, it will override any value given in num_train_epochs


In [19]:
trainer.train() 

Currently training with a batch size of: 10
***** Running training *****
  Num examples = 21,730
  Num Epochs = 12
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel, distributed & accumulation) = 240
  Gradient Accumulation steps = 24
  Total optimization steps = 1,000
  Number of trainable parameters = 995,328


Step,Training Loss,Validation Loss
100,3.0304,2.097836
200,2.4163,2.011829
300,2.3369,1.988548
400,2.2898,1.969511
500,2.2543,1.963721
600,2.235,1.94827
700,2.2132,1.942316
800,2.199,1.935334
900,2.1833,1.926417
1000,2.1787,1.928086


***** Running Evaluation *****
  Num examples = 2750
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2750
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2750
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2750
  Batch size = 10
***** Running Evaluation *****
  Num examples = 2750
  Batch size = 10
Saving model checkpoint to ./outputs/tmp-checkpoint-500
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_

TrainOutput(global_step=1000, training_loss=2.3336665344238283, metrics={'train_runtime': 19263.4024, 'train_samples_per_second': 12.459, 'train_steps_per_second': 0.052, 'total_flos': 1.477580515264512e+17, 'train_loss': 2.3336665344238283, 'epoch': 11.04})

In [None]:
# to hugging face
model_name = "IT-General-Data-Summarization"
HUGGING_FACE_USER_NAME = "mou3az"

L_model.push_to_hub(f"{HUGGING_FACE_USER_NAME}/{model_name}", token='')

In [21]:
save_directory = "LORA tunning of facebook-bart"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--facebook--bart-base/snapshots/aadd2ab0ae0c8268c7c9693540e9904811f36177/config.json
Model config BartConfig {
  "_name_or_path": "bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2

('LORA tunning of facebook-bart/tokenizer_config.json',
 'LORA tunning of facebook-bart/special_tokens_map.json',
 'LORA tunning of facebook-bart/vocab.json',
 'LORA tunning of facebook-bart/merges.txt',
 'LORA tunning of facebook-bart/added_tokens.json',
 'LORA tunning of facebook-bart/tokenizer.json')

In [41]:
device = next(model.parameters()).device

# Function to calculate ROUGE scores
def calculate_rouge(predictions, references):
    rouge_results = rouge_metric.compute(predictions=predictions, references=references)
    return rouge_results

# Function to generate predictions
def generate_predictions(model, dataset):
    # Perform inference
    predictions = []
    for example in dataset:
        input_ids = tokenizer(example["text"], return_tensors="pt", padding=True, truncation=True).to(device)
        outputs = model.generate(**input_ids, early_stopping=True, num_beams=7, num_return_sequences=1, max_new_tokens=1024, no_repeat_ngram_size= 3)
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predictions.append(prediction)
    return predictions

# Run inference
predictions = generate_predictions(model, dataset["validation"])

# Compute ROUGE scores
references = [example["summary"] for example in dataset["validation"]]
rouge_scores = calculate_rouge(predictions, references)

In [44]:
rouge_scores

{'rouge1': AggregateScore(low=Score(precision=0.40111303779854607, recall=0.4565258921607348, fmeasure=0.4134774428450369), mid=Score(precision=0.40633874815680177, recall=0.46167383600180967, fmeasure=0.4179207840737956), high=Score(precision=0.41175327050160954, recall=0.4666723737653531, fmeasure=0.4226458234342272)),
 'rouge2': AggregateScore(low=Score(precision=0.17582252129306328, recall=0.19864898571559128, fmeasure=0.18034843739795808), mid=Score(precision=0.1799177561628951, recall=0.2033712470378194, fmeasure=0.18437582253998974), high=Score(precision=0.1847914909653636, recall=0.20802975609737862, fmeasure=0.18880338211459904)),
 'rougeL': AggregateScore(low=Score(precision=0.2687003251090096, recall=0.30795493803091084, fmeasure=0.2776918205456754), mid=Score(precision=0.2730224673220921, recall=0.31254690446842437, fmeasure=0.2815639847993016), high=Score(precision=0.2772787127695215, recall=0.3169361864056118, fmeasure=0.28537912524233394)),
 'rougeLsum': AggregateScore(l

In [None]:
# rouge_1_f1 = 0.433
# rouge_2_f1 = 0.191
# rouge_l_f1 = 0.292
# rouge_lsum_f1 = 0.365