# NLP Final - BART and BART-pubmed on eLife

In [1]:
import transformers
# !pip install transformers datasets evaluate rouge_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor

from typing import Tuple, List

import random
import math
import os
import time
import json
import numpy as np
import pandas as pd
from collections import Counter

# Seed every random number generator this notebook imports so that repeated
# runs start from the same state.
SEED = 1

random.seed(SEED)        # Python's built-in `random` module
np.random.seed(SEED)     # NumPy's global RNG (was previously left unseeded)
torch.manual_seed(SEED)  # PyTorch's default generator
# cuDNN is switched off and additionally flagged as deterministic.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.deterministic = True

class Placeholder:
    """Sentinel type whose ``DO`` attribute raises as soon as it is read."""

    def _unimplemented(self):
        # Reading the property never returns a value; it always raises.
        raise NotImplementedError("You haven't yet implemented this part of the assignment yet")

    DO = property(_unimplemented)


# Shared instance: evaluating ``TO.DO`` raises NotImplementedError.
TO = Placeholder()


# Use the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

# Echo the environment so the saved cell output records what this run used.
print("Pytorch version is: ", torch.__version__)
print("You are using: ", DEVICE)

  _torch_pytree._register_pytree_node(


Pytorch version is:  2.2.2
You are using:  cuda


In [2]:
# import for model evaluation
import os, sys, json
import textstat
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score
# from alignscore import AlignScore
# from lens.lens_score import LENS
import torch
from summac.model_summac import SummaCConv
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/cpsc452_xc392/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [22]:
# from google.colab import drive
# drive.mount('/content/drive')

In [3]:
# Google Drive location used when running on Colab (kept for reference):
# file_path = '/content/drive/My Drive/final_project/eLife_train.jsonl'
# Training split, given as a path relative to the working directory.
file_path = 'eLife_train.jsonl'
# Held-out split; note the variable says "test" but the file is eLife_val.jsonl.
file_path_test = 'eLife_val.jsonl'

In [7]:
# load data by keyword
# load entire dataset: keyword = None
def keyword_data(file_path,keyword=None):
    """Read a JSON-lines file and return its records, optionally filtered by keyword.

    Args:
        file_path: Path of a UTF-8 text file holding one JSON value per line.
        keyword: When falsy (``None`` or ``""``) every record is returned.
            Otherwise a record is kept only if it contains a ``'keywords'``
            entry and ``keyword in record['keywords']`` is true.

    Returns:
        A list of the decoded JSON values, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    rows_list = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of failing inside json.loads.
                continue
            json_data = json.loads(line)

            if not keyword:
                # No filter requested: keep the record and skip the keyword
                # test. Previously that test still ran, so `None in "<str>"`
                # raised TypeError and a record could be appended twice.
                rows_list.append(json_data)
            elif 'keywords' in json_data and keyword in json_data['keywords']:
                rows_list.append(json_data)
    return rows_list

## Evaluation Metrics

In [8]:
def calc_rouge(preds, refs):
  """Mean ROUGE-1, ROUGE-2 and ROUGE-Lsum F-measures over paired texts.

  Args:
    preds: Sequence of generated summaries.
    refs: Sequence of reference summaries; ``refs[i]`` is paired with ``preds[i]``.

  Returns:
    A 3-tuple ``(rouge1, rouge2, rougeLsum)`` of ``np.mean`` values.
  """
  metric_names = ['rouge1', 'rouge2', 'rougeLsum']
  scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True, split_summaries=True)

  # NOTE(review): the prediction is passed as the first argument and the
  # reference as the second; only the F-measure is read below. Confirm this
  # order against the signature of RougeScorer.score.
  per_pair = []
  for idx, pred in enumerate(preds):
    per_pair.append(scorer.score(pred, refs[idx]))

  averages = tuple(
      np.mean([pair[name].fmeasure for pair in per_pair]) for name in metric_names
  )
  return averages

def calc_bertscore(preds, refs):
  """Mean BERTScore F1 between predictions and references.

  Args:
    preds: List of candidate strings.
    refs: List of reference strings, aligned with ``preds``.

  Returns:
    The ``np.mean`` of the per-pair F1 values.
  """
  # Use the first GPU when one is available and fall back to the CPU otherwise,
  # so the metric no longer fails on machines without CUDA (the device used to
  # be hard-coded to 'cuda:0').
  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
  # Precision and recall are returned as well, but only F1 is reported.
  _, _, F1 = score(preds, refs, lang="en", verbose=True, device=device)
  return np.mean(F1.tolist())

def calc_readability(preds):
  """Average three textstat readability scores over the predictions.

  Args:
    preds: Iterable of text strings to score.

  Returns:
    ``(fkgl, cli, dcrs)``: the ``np.mean`` of the Flesch-Kincaid grade, the
    Coleman-Liau index and the Dale-Chall readability score, in that order.
  """
  measures = (
      textstat.flesch_kincaid_grade,
      textstat.coleman_liau_index,
      textstat.dale_chall_readability_score,
  )
  texts = list(preds)
  fkgl, cli, dcrs = (np.mean([measure(text) for text in texts]) for measure in measures)
  return fkgl, cli, dcrs

def calc_lens(preds, refs, docs):
  """Mean LENS score of the predictions.

  Args:
    preds: List of generated summaries.
    refs: List of reference summaries; each one is wrapped in a one-element list.
    docs: List of source documents; the text before the first newline of each
      document is what gets passed to the metric as the source.

  Returns:
    The ``np.mean`` of the scores returned by ``metric.score``.
  """
  # NOTE(review): the `LENS` import near the top of the notebook is commented
  # out, so that name has to be made available before this function can run.
  checkpoint = "./models/LENS/LENS/checkpoints/epoch=5-step=6102.ckpt"
  metric = LENS(checkpoint, rescale=True)

  first_lines = []
  for doc in docs:
    first_lines.append(doc.split("\n")[0])
  wrapped_refs = [[ref] for ref in refs]

  return np.mean(metric.score(first_lines, preds, wrapped_refs, batch_size=8, gpus=1))

def calc_alignscore(preds, docs):
  """Mean AlignScore of the predictions (claims) against the documents (contexts).

  Args:
    preds: List of generated summaries, passed as ``claims``.
    docs: List of source documents, passed as ``contexts``.

  Returns:
    The ``np.mean`` of the values returned by ``alignscorer.score``.
  """
  # NOTE(review): the `AlignScore` import near the top of the notebook is
  # commented out, so that name has to be made available before this runs.
  scorer_kwargs = dict(
      model='roberta-base',
      batch_size=16,
      device='cuda:0',
      ckpt_path='./models/AlignScore/AlignScore-base.ckpt',
      evaluation_mode='nli_sp',
  )
  alignscorer = AlignScore(**scorer_kwargs)
  per_item = alignscorer.score(contexts=docs, claims=preds)
  return np.mean(per_item)

def cal_summac(preds, docs):
  """Mean SummaC-Conv score of the predictions against their source documents.

  Args:
    preds: List of generated summaries.
    docs: List of source documents, aligned with ``preds``.

  Returns:
    The ``np.mean`` of the ``'scores'`` entry returned by ``model_conv.score``.
  """
  summac_options = {
      "models": ["vitc"],
      "bins": 'percentile',
      "granularity": "sentence",
      "nli_labels": "e",
      "device": "cuda",
      "start_file": "default",
      "agg": "mean",
  }
  model_conv = SummaCConv(**summac_options)
  result = model_conv.score(docs, preds)
  return np.mean(result['scores'])

## BART on whole dataset

In [5]:
from transformers import BartForConditionalGeneration, BartTokenizer
# Checkpoint that both the tokenizer and the model weights are loaded from.
model_name = "facebook/bart-large-xsum"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)
# Move the loaded model onto the device selected at the top of the notebook.
model = model.to(DEVICE)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [9]:
# Locations of the training and validation splits.
file_path, file_path_test = 'eLife_train.jsonl', 'eLife_val.jsonl'

# No keyword is passed, so every record in each file is loaded.
eLife_data, eLife_data_val = (
    pd.DataFrame(keyword_data(split_path)) for split_path in (file_path, file_path_test)
)

In [10]:
def preprocess_and_tokenize(dataframe, max_input_words=512, max_input_length=512,
                            max_target_length=128, mask_label_padding=False):
    """Tokenize articles and lay summaries with the module-level ``tokenizer``.

    Side effect: a new ``'input_text'`` column holding the shortened article
    text is written onto ``dataframe``.

    Args:
        dataframe: DataFrame with ``'article'`` and ``'lay_summary'`` string columns.
        max_input_words: Number of leading whitespace-separated words kept from
            each article before tokenization.
        max_input_length: Token length every article is padded/truncated to.
        max_target_length: Token length every summary is padded/truncated to.
        mask_label_padding: When True, padding positions in the returned labels
            are replaced by -100, the value Hugging Face seq2seq models exclude
            from the loss. Such labels must be mapped back to the pad id before
            they are decoded. Defaults to False, which leaves the pad ids in
            place exactly as before.

    Returns:
        ``(inputs, labels)``: the tokenizer output for the articles and the
        ``input_ids`` tensor of the tokenized summaries.
    """
    # Keep only the leading words of each article (words, not tokens); the
    # tokenizer call below applies the token-level limit.
    dataframe['input_text'] = dataframe['article'].apply(
        lambda text: ' '.join(text.split()[:max_input_words])
    )
    inputs = tokenizer(
        dataframe['input_text'].tolist(),
        max_length=max_input_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    targets = tokenizer(
        dataframe['lay_summary'].tolist(),
        max_length=max_target_length,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
    )
    labels = targets['input_ids']
    if mask_label_padding:
        # Positions where the attention mask is 0 are padding.
        labels = labels.masked_fill(targets['attention_mask'] == 0, -100)
    return inputs, labels

In [11]:
# Tokenize both splits; each call returns (encoded inputs, label ids) and also
# adds an 'input_text' column to the dataframe it is given.
train_inputs, train_labels = preprocess_and_tokenize(eLife_data)
val_inputs, val_labels = preprocess_and_tokenize(eLife_data_val)

In [12]:
from torch.utils.data import Dataset, DataLoader

class MedicineDataset(Dataset):
    """Dataset pairing tokenizer encodings with per-example label tensors."""

    def __init__(self, encodings, labels):
        """Keep references to the inputs.

        Args:
            encodings: Mapping (anything with ``.items()``) from field name to
                an indexable collection of per-example tensors.
            labels: Indexable collection of per-example label tensors; its
                length defines the dataset size.
        """
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from ``labels``."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of detached tensor copies plus ``'labels'``."""
        item = {}
        for key, val in self.encodings.items():
            item[key] = val[idx].clone().detach()
        item['labels'] = self.labels[idx].clone().detach()
        return item

# Wrap the tokenized training and validation splits as datasets.
train_dataset = MedicineDataset(train_inputs, train_labels)
val_dataset = MedicineDataset(val_inputs, val_labels)

# DataLoader
# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [13]:
from transformers import EvalPrediction
from rouge_score import rouge_scorer
import textstat

def compute_metrics(p: EvalPrediction):
    """Decode predictions and labels, then compute relevance and readability metrics.

    Uses the module-level ``tokenizer`` for decoding.

    Args:
        p: Object exposing ``predictions`` (an array of per-token scores, or a
            tuple whose first element is that array) and ``label_ids``.

    Returns:
        Dict with the keys ``rouge1``, ``rouge2``, ``rougeL``, ``bert_score``,
        ``avg_fkgl``, ``avg_cli`` and ``avg_dcrs``.
    """
    predictions = p.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # Pick the highest-scoring token id at every position.
    predicted_ids = predictions.argmax(-1)

    # Labels may carry -100 at positions excluded from the loss. That is not a
    # vocabulary id, so replace it with the pad id before decoding; labels
    # without -100 pass through unchanged.
    label_ids = np.asarray(p.label_ids)
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    decoded_preds = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in predicted_ids
    ]
    decoded_labels = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in label_ids
    ]

    # Relevance scores. The third value returned by calc_rouge (computed there
    # as 'rougeLsum') is reported under the existing key 'rougeL'.
    rouge_results = dict(zip(['rouge1', 'rouge2', 'rougeL'], calc_rouge(decoded_preds, decoded_labels)))
    bert_f1 = calc_bertscore(decoded_preds, decoded_labels)

    # Readability scores
    avg_fkgl, avg_cli, avg_dcrs = calc_readability(decoded_preds)

    return {
        **rouge_results,
        "bert_score": bert_f1,
        "avg_fkgl": avg_fkgl,
        "avg_cli": avg_cli,
        "avg_dcrs": avg_dcrs
    }

In [None]:
from transformers import Trainer, TrainingArguments

# Fine-tuning configuration for the BART model loaded above.
training_args = TrainingArguments(
    output_dir='./results',          # output directory for this run
    num_train_epochs=3,              # Training epochs
    per_device_train_batch_size=8,   # training batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=100,                # learning-rate warmup steps
    weight_decay=0.01,               # weight decay coefficient
    logging_dir='./logs',            # directory for training logs
    logging_steps=10,                # log every 10 steps
    evaluation_strategy="steps",     # evaluate on a step schedule ...
    eval_steps=100,                  # ... every 100 steps
    save_strategy="steps",           # checkpoint on a step schedule ...
    save_steps=500,                  # ... every 500 steps
    fp16=True,                        # Enable mixed precision training
)

# compute_metrics is invoked at every evaluation, i.e. every `eval_steps` steps.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

In [None]:
# Output directory: <base>/model_save_<keyword>, with any spaces in the keyword
# replaced by underscores.
base_model_save_path = "project/LaySumProject/results"
keyword = "bart"
keyword_save_path = os.path.join(
    base_model_save_path,
    "model_save_" + keyword.replace(' ', '_'),
)
os.makedirs(keyword_save_path, exist_ok=True)

# Store the model and its tokenizer side by side in that directory.
model.save_pretrained(keyword_save_path)
tokenizer.save_pretrained(keyword_save_path)

# Predict on the validation dataset and score the raw prediction output.
predict_output = trainer.predict(val_dataset)
metrics = compute_metrics(predict_output)
print(metrics)

In [14]:
# Evaluate the trainer (constructed with eval_dataset=val_dataset) and display the metrics dict.
trainer.evaluate()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.87 seconds, 84.07 sentences/sec


{'eval_loss': 2.264357089996338,
 'eval_rouge1': 0.5782105879938336,
 'eval_rouge2': 0.22440689302668687,
 'eval_rougeL': 0.5416079381523048,
 'eval_bert_score': 0.8693327740514921,
 'eval_avg_fkgl': 9.169294605809128,
 'eval_avg_cli': 10.619585062240665,
 'eval_avg_dcrs': 8.974730290456431,
 'eval_runtime': 9.6736,
 'eval_samples_per_second': 24.913,
 'eval_steps_per_second': 3.205,
 'epoch': 3.0}

## BART-pubmed

In [9]:
# Load model BART pubmed
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# NOTE: this rebinds the global `tokenizer`, which preprocess_and_tokenize and
# compute_metrics both read, so everything from here on uses the tokenizer of
# the PubMed checkpoint.
tokenizer = AutoTokenizer.from_pretrained("mse30/bart-base-finetuned-pubmed")
model_pubmed = AutoModelForSeq2SeqLM.from_pretrained("mse30/bart-base-finetuned-pubmed")

# Same training and validation files as in the previous section.
file_path = 'eLife_train.jsonl'
file_path_test = 'eLife_val.jsonl'

# Reload both splits in full (no keyword filter).
eLife_data=pd.DataFrame(keyword_data(file_path))
eLife_data_val=pd.DataFrame(keyword_data(file_path_test))

# Re-tokenize with the tokenizer bound above.
train_inputs, train_labels = preprocess_and_tokenize(eLife_data)
val_inputs, val_labels = preprocess_and_tokenize(eLife_data_val)

# These replace the datasets built for the first model.
train_dataset = MedicineDataset(train_inputs, train_labels)
val_dataset = MedicineDataset(val_inputs, val_labels)

# train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
# val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [10]:
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments
# Same hyper-parameters as the first training run, now applied to model_pubmed.
training_args = TrainingArguments(
    output_dir='./results',          # output directory for this run (same path as the first run)
    num_train_epochs=3,              # training epochs
    per_device_train_batch_size=8,   # training batch size per device
    per_device_eval_batch_size=8,    # evaluation batch size per device
    warmup_steps=100,                # learning-rate warmup steps
    weight_decay=0.01,               # weight decay coefficient
    logging_dir='./logs',            # directory for training logs
    logging_steps=10,                # log every 10 steps
    evaluation_strategy="steps",     # evaluate on a step schedule ...
    eval_steps=100,                  # ... every 100 steps
    save_strategy="steps",           # checkpoint on a step schedule ...
    save_steps=500,                  # ... every 500 steps
    fp16=True,  # enable mixed precision training
)

# The model is passed positionally here; the remaining arguments are keywords.
trainer = Trainer(
    model_pubmed,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bert Score,Avg Fkgl,Avg Cli,Avg Dcrs
100,3.0428,2.845975,0.510394,0.154672,0.47518,0.84812,7.221992,8.93195,8.201826
200,2.9286,2.69747,0.519499,0.164124,0.480498,0.852327,8.526141,9.681826,8.457676
300,2.8229,2.638843,0.526143,0.170059,0.488964,0.853601,8.103734,9.820913,8.483071
400,2.8799,2.579447,0.535481,0.173346,0.495976,0.855289,8.454772,9.724564,8.449087
500,2.761,2.554208,0.536934,0.17731,0.501212,0.856432,7.692531,9.663154,8.420539
600,2.623,2.540445,0.538741,0.18109,0.500014,0.856913,8.184647,9.455975,8.498589
700,2.5596,2.521262,0.542475,0.183624,0.500809,0.857806,9.042739,9.775021,8.677552
800,2.5776,2.505866,0.546267,0.183392,0.50686,0.858679,8.436515,9.975685,8.640456
900,2.5591,2.500283,0.543899,0.185821,0.504478,0.858701,8.43361,9.813859,8.542739
1000,2.4211,2.490222,0.543913,0.186455,0.506282,0.858994,8.655602,10.080996,8.649212


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.89 seconds, 83.30 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.89 seconds, 83.28 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.93 seconds, 82.38 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.92 seconds, 82.42 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.91 seconds, 82.80 sentences/sec


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.90 seconds, 83.03 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.89 seconds, 83.41 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.90 seconds, 83.22 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.91 seconds, 82.81 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.91 seconds, 82.71 sentences/sec


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.91 seconds, 82.75 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.87 seconds, 83.94 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.93 seconds, 82.36 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.89 seconds, 83.33 sentences/sec


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.90 seconds, 83.11 sentences/sec


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.91 seconds, 82.71 sentences/sec


TrainOutput(global_step=1632, training_loss=2.6181880703159406, metrics={'train_runtime': 266.2892, 'train_samples_per_second': 48.962, 'train_steps_per_second': 6.129, 'total_flos': 3974871971266560.0, 'train_loss': 2.6181880703159406, 'epoch': 3.0})

In [None]:
# Output directory: <base>/model_save_<keyword>, with any spaces in the keyword
# replaced by underscores.
base_model_save_path = "project/LaySumProject/results"
keyword = "bartpubmed"
keyword_save_path = os.path.join(base_model_save_path, f"model_save_{keyword.replace(' ', '_')}")
os.makedirs(keyword_save_path, exist_ok=True)
# Save the PubMed model trained in this section. `model` is the
# facebook/bart-large-xsum network from the earlier section, so saving it here
# stored the wrong weights under the "bartpubmed" directory.
model_pubmed.save_pretrained(keyword_save_path)
tokenizer.save_pretrained(keyword_save_path)

predict_output = trainer.predict(val_dataset)
# compute_metrics takes a single prediction object; the extra `trainer.docs`
# argument passed before is not defined anywhere in this notebook.
metrics = compute_metrics(predict_output)
print(metrics)

In [11]:
# Evaluate the PubMed trainer (constructed with eval_dataset=val_dataset) and display the metrics dict.
trainer.evaluate()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 2.89 seconds, 83.28 sentences/sec


{'eval_loss': 2.4586825370788574,
 'eval_rouge1': 0.5492260573721498,
 'eval_rouge2': 0.19132091134194984,
 'eval_rougeL': 0.5106251910564675,
 'eval_bert_score': 0.8607087429628333,
 'eval_avg_fkgl': 8.707053941908715,
 'eval_avg_cli': 10.129626556016596,
 'eval_avg_dcrs': 8.711784232365146,
 'eval_runtime': 7.5095,
 'eval_samples_per_second': 32.093,
 'eval_steps_per_second': 4.128,
 'epoch': 3.0}