In [1]:
# Mount Google Drive so files written under /content/drive outlive the Colab session.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
%%capture
!pip install datasets transformers
!pip install peft
!pip install accelerate
!pip install bitsandbytes
!pip install rouge_score

In [3]:
import datasets
from datasets import load_metric
import nltk
nltk.download('punkt')
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
from peft import PeftModelForSeq2SeqLM, get_peft_config, get_peft_model
import torch

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load the dataset from Hugging Face

In [4]:
# The dataset already ships with train / test / validation splits,
# so no manual splitting or cross-validation is needed.
DATASET_NAME = "samsum"
samsum_dataset = datasets.load_dataset(DATASET_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading data:   0%|          | 0.00/6.06M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/347k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/335k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [6]:
# Display the splits, their columns and their row counts.
samsum_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

# Preprocessing the dataset

BART is a transformer encoder-decoder (seq2seq) model with a bidirectional (BERT-like) encoder, i.e. it attends to the input in both the forward and backward directions, and an autoregressive (GPT-like) decoder.

In [7]:
# Checkpoint shared by the tokenizer and the model loaded later in the notebook.
model_checkpoint = "facebook/bart-large-cnn"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
def preprocessing(data):
  """Tokenize a batch of SAMSum rows into model inputs and labels.

  Args:
    data: a batch from `datasets.map(..., batched=True)` exposing the
      'dialogue' and 'summary' columns as equally long lists of strings.

  Returns:
    The tokenizer output for the dialogues (`input_ids`, `attention_mask`),
    padded/truncated to 512 tokens, with an extra `labels` entry holding the
    summary token ids padded/truncated to 128 positions. Padded label
    positions are set to -100 so the cross-entropy loss ignores them; any
    metric code must map -100 back to the pad id before decoding.
  """
  input_texts = list(data["dialogue"])
  output_texts = list(data["summary"])
  assert len(input_texts) == len(output_texts) # length should be same.
  # tokenization of the data to convert into desired format
  max_length = 512
  target_length = 128
  # Plain Python lists are returned on purpose: `datasets.map` stores the
  # result in Arrow, so building torch tensors here would be wasted work.
  tokenized_inputs = tokenizer(input_texts, max_length=max_length, padding="max_length", truncation=True)
  tokenized_outputs = tokenizer(output_texts, max_length=target_length, padding="max_length", truncation=True)
  # Use the attention mask (not the token value) to find padding, then replace
  # it with -100. Otherwise most of the loss is spent on predicting <pad>.
  tokenized_inputs["labels"] = [
      [token_id if keep else -100 for token_id, keep in zip(ids, mask)]
      for ids, mask in zip(tokenized_outputs["input_ids"], tokenized_outputs["attention_mask"])
  ]
  return tokenized_inputs
tokenized_data = samsum_dataset.map(preprocessing, batched=True,remove_columns=['id', 'dialogue', 'summary'])

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

# Training
* We could now define the model and fine-tune all of its weights, but given the limited resources a more parameter-efficient approach is a better fit.
* We will use PEFT with a LoRA configuration, which freezes the original model weights and trains two small low-rank matrices for each targeted layer. Once training is finished, the learned low-rank update is added to the original weights.

In [9]:
# Load the pretrained seq2seq model from the same checkpoint as the tokenizer.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [10]:
# Display the module tree; the attention projection names shown here
# (k_proj / v_proj / q_proj) are the ones targeted by the LoRA config below.
model

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50264, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50264, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): La

In [11]:
# LoRA settings: rank-8 adapters on the attention key/value/query projections,
# with no bias terms trained.
config = dict(
    peft_type="LORA",
    task_type="SEQ_2_SEQ_LM",
    inference_mode=False,
    r=8,
    target_modules=["k_proj", "v_proj", "q_proj"],
    lora_alpha=32,
    lora_dropout=0.1,
    fan_in_fan_out=False,
    bias="none",
)

# Turn the plain dict into a PEFT config object and wrap the base model with it.
peft_config = get_peft_config(config)
peft_model = PeftModelForSeq2SeqLM(model, peft_config)

# Report how many parameters remain trainable after wrapping.
peft_model.print_trainable_parameters()

trainable params: 1,769,472 || all params: 408,059,904 || trainable%: 0.4336304504938569


Comparing the trainable parameters with the total number of parameters above, LoRA reduces the number of *trainable* parameters to about 0.43% of the model (the total parameter count does not shrink). We can now use the PEFT model for fine-tuning.
* If the model were too large to fit in memory, we could also use bitsandbytes, which stores the model weights in lower precision, for example 8-bit or 4-bit instead of float16/float32. The quantized weights are converted back to a higher precision when they are used in the forward and backward passes.

* In this case we don't need it.

## Define the ROUGE metrics

In [12]:
# ROUGE scorer used by compute_metrics. `trust_remote_code=True` is passed
# explicitly because the library warns it will become mandatory for loading
# this metric script.
metric = load_metric("rouge", trust_remote_code=True)
def postprocess_text(preds, labels):
    """Strip each text and put every sentence on its own line.

    rougeLSum expects a newline after each sentence, so both the predictions
    and the references are re-joined sentence by sentence.

    Returns:
        A `(preds, labels)` tuple of lists of reformatted strings.
    """
    stripped_preds = [text.strip() for text in preds]
    stripped_labels = [text.strip() for text in labels]

    def _one_sentence_per_line(text):
        return "\n".join(nltk.sent_tokenize(text))

    return (
        [_one_sentence_per_line(text) for text in stripped_preds],
        [_one_sentence_per_line(text) for text in stripped_labels],
    )

def compute_metrics(eval_preds):
    """Compute ROUGE scores (as percentages) and the mean generated length.

    Args:
        eval_preds: `(predictions, labels)` as handed over by the trainer.
            Predictions may be a tuple (first element is used), generated
            token ids, or raw logits with a trailing vocabulary axis.

    Returns:
        Dict mapping each ROUGE key to its score * 100 rounded to 4 decimals,
        plus `gen_len`, the mean number of non-pad tokens per prediction.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    if preds.ndim == 3:
        # Raw logits (batch, seq, vocab): reduce to token ids before decoding.
        preds = preds.argmax(axis=-1)

    pad_id = tokenizer.pad_token_id
    # -100 is the "ignore" sentinel used for masked label positions (and for
    # padding gathered predictions); it is not a real token id, so swap it for
    # the pad id before decoding.
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
    # Depending on the metric implementation each value is either a plain
    # float or an aggregate with low/mid/high scores; report the mid F-measure
    # in the latter case.
    result = {
        k: round((v.mid.fmeasure if hasattr(v, "mid") else v) * 100, 4)
        for k, v in result.items()
    }
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

  metric = load_metric("rouge")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

# Training Arguments.

In [13]:
PATH_TO_STORE_MODEL = "/content/drive/MyDrive/case_study/task_2/summarization/models/"
EPOCH = 1
BATCH_SIZE = 1
LEARNING_RATE = 2e-5
WARMUP_STEPS = 500
do_train = True
do_eval = True
do_predict = True

# Number of optimizer steps for the whole run. Derived from BATCH_SIZE (not a
# hard-coded batch size) so the cadence below stays correct when it changes.
total_steps = int(len(tokenized_data["train"]) / BATCH_SIZE * EPOCH)
# Log / evaluate / checkpoint roughly every 1% of training; never 0 steps.
steps_per_report = max(1, total_steps // 100)

args = Seq2SeqTrainingArguments(
    output_dir=PATH_TO_STORE_MODEL,
    overwrite_output_dir=True,
    do_train=do_train,
    do_eval=do_eval,
    do_predict=do_predict,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    warmup_steps=WARMUP_STEPS,
    num_train_epochs=EPOCH,
    evaluation_strategy="steps",
    save_strategy="steps",
    # save_steps and eval_steps are kept equal so the best checkpoint can be
    # reloaded at the end of training.
    save_steps=steps_per_report,
    logging_steps=steps_per_report,
    eval_steps=steps_per_report,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Evaluate on generated summaries so the metric function receives token
    # ids rather than logits. NOTE(review): generation makes each evaluation
    # pass slower; lower the evaluation frequency if that becomes a problem.
    predict_with_generate=True,
)

In [14]:
# Assemble the trainer from the LoRA-wrapped model, the tokenized train and
# validation splits, the tokenizer, and the ROUGE metric function.
trainer = Seq2SeqTrainer(
    args=args,
    model=peft_model,
    tokenizer=tokenizer,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data["validation"],
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [15]:
# List the GPUs visible to this runtime before starting training.
!nvidia-smi -L

GPU 0: NVIDIA L4 (UUID: GPU-1fb8f9e6-b253-573e-d386-d6c1c370ddba)


In [16]:
# Start fine-tuning; the run recorded below was interrupted manually (KeyboardInterrupt).
trainer.train()

Step,Training Loss,Validation Loss


KeyboardInterrupt: 

What remains is to let the fine-tuning run to completion and then evaluate the model's summaries on the test split.
