In [1]:
# https://github.com/laxmimerit/NLP-Tutorials-with-HuggingFace/blob/main/NLP_with_HuggingFace_Tutorial_4_Summarization.ipynb

In [1]:
import accelerate
import transformers

transformers.__version__, accelerate.__version__

('4.35.2', '0.25.0')

In [1]:
import numpy as np

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from transformers import AdamWeightDecay
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq
from transformers import TrainingArguments, Trainer

from datasets import load_dataset
import torch

from rouge import Rouge
from transformers import get_linear_schedule_with_warmup

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
dataset = load_dataset('cnn_dailymail', '3.0.0', split='train[:15%]')
dataset_test = load_dataset('cnn_dailymail', '3.0.0', split='test[0:100]')

In [3]:
dataset = dataset.train_test_split(test_size=0.2)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 34453
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 8614
    })
})

In [5]:
batch_size = 4
learning_rate = 4e-5
weight_decay = 0.01
epochs = 3
warm_up=500
training_steps = 7500

In [6]:
tokenizerBart = BartTokenizer.from_pretrained('facebook/bart-base')
modelBart = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
modelBart.cuda()

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

In [7]:
def get_feature(batch):
    encodings = tokenizerBart(batch['article'], text_target=batch['highlights'],
                        max_length=1024, truncation=True)

    encodings = {'input_ids': encodings['input_ids'],
               'attention_mask': encodings['attention_mask'],
               'labels': encodings['labels']}

    return encodings

In [8]:
dataset = dataset.map(get_feature, batched=True)

Map:   0%|          | 0/34453 [00:00<?, ? examples/s]

Map:   0%|          | 0/8614 [00:00<?, ? examples/s]

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 34453
    })
    test: Dataset({
        features: ['article', 'highlights', 'id', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 8614
    })
})

In [10]:
columns = ['input_ids', 'labels', 'attention_mask']
dataset.set_format(type='pt', columns=columns)

In [11]:
data_collator = DataCollatorForSeq2Seq(tokenizerBart, model=modelBart, return_tensors="pt")

In [13]:
training_args = TrainingArguments(
    output_dir = 'bart',
    num_train_epochs=epochs,
    warmup_steps = warm_up,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay = weight_decay,
    max_grad_norm=1.0,
    logging_steps = 10,
    evaluation_strategy = 'steps',
    eval_steps=250,
    save_steps=1e6,
    gradient_accumulation_steps=8,
    learning_rate=learning_rate
)

trainer = Trainer(model=modelBart, 
                  args=training_args, 
                  tokenizer=tokenizerBart,
                  data_collator=data_collator,
                  train_dataset = dataset['train'], 
                  eval_dataset = dataset['test'], 
                  )

optimizer = torch.optim.Adam(modelBart.parameters(), lr=learning_rate, weight_decay=weight_decay)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warm_up, num_training_steps=training_steps)
trainer.optimizer = optimizer
trainer.lr_scheduler = scheduler

In [14]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
device

device(type='cuda')

In [16]:
modelBart.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

In [17]:
trainer.train()

Step,Training Loss,Validation Loss
250,2.4988,2.101984
500,2.4341,2.098762
750,2.8751,2.447793
1000,3.6723,3.020176
1250,4.2002,3.645432
1500,4.6616,4.239052
1750,4.9831,4.706543
2000,5.265,4.987002
2250,5.3254,5.141323
2500,5.4438,5.270058


TrainOutput(global_step=3228, training_loss=4.407310738144015, metrics={'train_runtime': 7654.5045, 'train_samples_per_second': 13.503, 'train_steps_per_second': 0.422, 'total_flos': 5.924065631597568e+16, 'train_loss': 4.407310738144015, 'epoch': 3.0})

In [18]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'


In [36]:
def generate_preds(tokenizer, model):
    evaluate = []
    for element in dataset_test['article']:
        tokenized = tokenizer([element], max_length=1024, truncation=True, return_tensors='pt').to('cuda')
        pred = model.generate(**tokenized, max_length=128)
        evaluate.append(pred)
    pred = []
    for article in evaluate:
        new_article = tokenizer.decode(article[0], skip_special_tokens=True)
        pred.append(new_article)
    return pred

In [20]:
def actual_preds(dataset):
    actual = []
    for summary in dataset['highlights']:
        actual.append(summary)
    return actual

In [37]:
pred_Bart = generate_preds(tokenizerBart, modelBart)

In [22]:
actual_Bart = actual_preds(dataset_test)

In [38]:
pred_Bart

['NEW: The Palestinian Palestinian Palestinian Palestinians in the court court.\nHe says he says he was accused of the U.N.S.SS.\nHe was accused accused of Palestinian Palestinian.',
 'NEW: The police say they have been killed in the death.\nHe says he says he was found found in a death.',
 "NEW: The Iranian Iranian Iranian Iran's Iranian Iran.\nHe says he says.\nNEW: Iran's Iran says he was not not not have been been in Iran.",
 'NEW: The hospital,000 people,000,000000 people.\nHe says he says he was killed in the hospital.',
 'NEW: Student Student student student student students students students.\nThe student student says he says.',
 "NEW: New York's new new new school school school.\nHe says he was not not not to be in the school.",
 'NEW: The U.S.SS.N.S,000 people were killed.\nThe death death of the death of death death death.\nHe says he was killed in the death.',
 'NEW: The death of the death death of death.\nHe says he says he was killed.\nThe death of his death death death i

In [39]:
def get_rouge_scores(actual_summary, predicted_summary):
    rouge = Rouge()
    try: 
        if (len(actual_summary) == len(predicted_summary)):
            pass
    except Exception as e:
        print(f"lengths of actual and predictions don't match: {e}")
    score_total1 = 0
    score_total2 = 0
    score_totalL = 0
    for i in range(0, len(actual_summary)):
        scores = rouge.get_scores(predicted_summary[i], actual_summary[i])
        
        score_total1 += scores[0]['rouge-1']['f']
        score_total2 += scores[0]['rouge-2']['f']
        score_totalL += scores[0]['rouge-l']['f']
    return score_total1 / len(actual_summary), score_total2 / len(actual_summary), score_totalL / len(actual_summary)

In [40]:
rouge1Bart, rouge2Bart, rougelBart = get_rouge_scores(actual_Bart,pred_Bart)

In [41]:
print(round(rouge1Bart, 4), round(rouge2Bart, 4), round(rougelBart, 4))

0.1316 0.0162 0.1267


In [26]:
len(pred_Bart)

100

In [27]:
logs = trainer.state.log_history

In [29]:
results = trainer.evaluate()

In [30]:
results

{'eval_loss': 5.525334358215332,
 'eval_runtime': 157.1202,
 'eval_samples_per_second': 54.824,
 'eval_steps_per_second': 13.709,
 'epoch': 3.0}