# BioLaySumm Part II and III: Training on subsets and on modified data


Section I: Data Preparations
*   Keywords extraction
*   Tokenization
*   Evaluation

Section II: Training on 6 subsets

Section III: Training on modified dataset (with definition replacement)


In [None]:
!pip install accelerate -U
import transformers
#!pip install transformers datasets evaluate rouge_score
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor

from typing import Tuple, List
import tqdm

import random
import math
import os
import time
import json
import numpy as np
from collections import Counter

# Seed every random number generator this notebook imports so runs are
# repeatable: Python's `random`, NumPy (previously left unseeded) and PyTorch.
SEED = 1

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# cuDNN is switched off entirely and, should it be re-enabled later, is
# restricted to deterministic algorithms.
torch.backends.cudnn.enabled = False
torch.backends.cudnn.deterministic = True

class Placeholder:
    """Marker for unfinished assignment code: reading ``DO`` always raises."""

    @property
    def DO(self):
        """Raise ``NotImplementedError`` so a leftover ``TO.DO`` fails loudly.

        Raises:
            NotImplementedError: always.
        """
        raise NotImplementedError("You haven't implemented this part of the assignment yet")


# Module-level instance so unfinished spots can be written as ``TO.DO``.
TO = Placeholder()


# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Pytorch version is: ", torch.__version__)
print("You are using: ", DEVICE)

Pytorch version is:  2.2.1+cu121
You are using:  cuda


## Preparation

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used below live
# under '/content/drive/My Drive'.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# file_path = '/content/drive/My Drive/final_project/eLife_train.jsonl'
# file_path_test = '/content/drive/My Drive/final_project/eLife_val.jsonl'

# eLife training and validation splits (.jsonl) inside the Drive data folder.
folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path = os.path.join(folder_dir, 'eLife_train.jsonl')
file_path_test = os.path.join(folder_dir, 'eLife_val.jsonl')

In [None]:
# PLOS training and validation splits (.jsonl) from the same data folder.
folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path_plos = os.path.join(folder_dir, 'PLOS_train.jsonl')
file_path_test_plos = os.path.join(folder_dir, 'PLOS_val.jsonl')

In [None]:
import pandas as pd
import json

# Preview the training dataset.
# Maximum number of records to read from the top of the file.
n_lines = 5

data = []

with open(file_path, 'r', encoding='utf-8') as f:  # use file_path_plos to preview PLOS instead
    for _ in range(n_lines):
        line = f.readline().strip()
        if line:
            data.append(json.loads(line))
        else:
            # An empty result means end of file (or a blank line): stop reading.
            break

df = pd.DataFrame(data)
print(df)

                                         lay_summary  \
0  In the USA , more deaths happen in the winter ...   
1  Most people have likely experienced the discom...   
2  The immune system protects an individual from ...   
3  The brain adapts to control our behavior in di...   
4  Cells use motor proteins that to move organell...   

                                             article  \
0  In temperate climates , winter deaths exceed s...   
1  Whether complement dysregulation directly cont...   
2  Variation in the presentation of hereditary im...   
3  Rapid and flexible interpretation of conflicti...   
4  Myosin 5a is a dual-headed molecular motor tha...   

                                            headings  \
0  [Abstract, Introduction, Results, Discussion, ...   
1  [Abstract, Introduction, Results, Discussion, ...   
2  [Abstract, Introduction, Results, Discussion, ...   
3  [Abstract, Introduction, Results, Discussion, ...   
4  [Abstract, Introduction, Results, Discussio

## Keyword extraction (no need to re-run this section in future use)

In [None]:
# eLife
column_name = 'keywords'

keyword_values = []

with open(file_path, 'r', encoding='utf-8') as f:  # use file_path_plos for PLOS
    for line in f:
        # Each line of the file is parsed as one JSON record.
        json_data = json.loads(line.strip())

        # dict.get returns None when the key is missing, which avoids a KeyError.
        value = json_data.get(column_name)

        # Records that lack the column are skipped.
        if value is None:
            continue
        keyword_values.append(value)

# keyword_values now holds the column's value for every record that had one;
# print the first few entries as an example.
print(keyword_values[:5])

# Tally how many records carry each keyword.
keyword_dict = {}

for values in keyword_values:
    for value in values:
        keyword_dict[value] = keyword_dict.get(value, 0) + 1

[['epidemiology and global health'], ['microbiology and infectious disease', 'immunology and inflammation'], ['microbiology and infectious disease', 'immunology and inflammation'], ['neuroscience'], ['structural biology and molecular biophysics']]


In [None]:
import pprint
# Show the per-keyword record counts.
pprint.pprint(keyword_dict)
# The distinct keywords are exactly the keys of the count dict.
unique_keywords = set(keyword_dict)
pprint.pprint(unique_keywords)

{'biochemistry and chemical biology': 505,
 'cancer biology': 207,
 'cell biology': 922,
 'chromosomes and gene expression': 353,
 'computational and systems biology': 298,
 'developmental biology': 553,
 'ecology': 154,
 'epidemiology and global health': 104,
 'evolutionary biology': 253,
 'genetics and genomics': 261,
 'immunology and inflammation': 243,
 'medicine': 96,
 'microbiology and infectious disease': 420,
 'neuroscience': 1240,
 'physics of living systems': 87,
 'plant biology': 158,
 'research communication': 14,
 'short report': 227,
 'stem cells and regenerative medicine': 131,
 'structural biology and molecular biophysics': 480,
 'tools and resources': 205}
{'biochemistry and chemical biology',
 'cancer biology',
 'cell biology',
 'chromosomes and gene expression',
 'computational and systems biology',
 'developmental biology',
 'ecology',
 'epidemiology and global health',
 'evolutionary biology',
 'genetics and genomics',
 'immunology and inflammation',
 'medicine',
 

## Create keyword subsets in JSON (no need to re-run this section in future use)

In [None]:
# Create a JSON file that stores articles and lay summaries grouped by keyword.
def keyword_articles(file_path, save_path):
    """Group a .jsonl dataset by keyword and write the result as one JSON file.

    Every non-blank line of ``file_path`` is parsed as a JSON object. For each
    entry of its ``keywords`` list, the record's ``article`` and
    ``lay_summary`` are appended to that keyword's lists, so a record with
    several keywords is stored under each of them.

    Args:
        file_path: Path of the input .jsonl file (one JSON object per line).
        save_path: Path of the JSON file to write, shaped as
            ``{keyword: {'article': [...], 'lay_summary': [...]}}``.
    """
    keyword_data = {}
    # Read as UTF-8 explicitly, like the other dataset readers in this
    # notebook, rather than relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        print('Converting...')
        for line in tqdm.tqdm(f):
            line = line.strip()
            if not line:
                # Skip blank lines instead of letting json.loads raise on them.
                continue
            entry = json.loads(line)
            keywords = entry.get('keywords', [])  # extract keywords
            article = entry.get('article', '')  # extract article
            lay_summary = entry.get('lay_summary', '')  # extract lay summary
            for keyword in keywords:
                bucket = keyword_data.setdefault(keyword, {'article': [], 'lay_summary': []})
                bucket['article'].append(article)
                bucket['lay_summary'].append(lay_summary)

    with open(save_path, 'w', encoding='utf-8') as f:
        json.dump(keyword_data, f, indent=4)
    print(f'Finished, please check file {save_path}')

In [None]:
# Inputs: the raw eLife splits (.jsonl).
folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path = os.path.join(folder_dir, 'eLife_train.jsonl')
file_path_test = os.path.join(folder_dir, 'eLife_val.jsonl')

# Outputs: the keyword-grouped JSON files written by keyword_articles.
save_path = os.path.join(folder_dir, 'eLife_train_keyword.json')
save_path_test = os.path.join(folder_dir, 'eLife_val_keyword.json')

In [None]:
# Convert the training split into the keyword -> {article, lay_summary} JSON file.
keyword_articles(file_path, save_path)

Converting...


4346it [00:04, 909.44it/s]


Finished, please check file /content/drive/My Drive/NLP Final Project/biolaysumm2024_data/eLife_train_keyword.json


In [None]:
# Same conversion for the validation split.
keyword_articles(file_path_test, save_path_test)

Converting...


241it [00:00, 278.91it/s]


Finished, please check file /content/drive/My Drive/NLP Final Project/biolaysumm2024_data/eLife_val_keyword.json


## Load and Preprocess json

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
# Pretrained checkpoint used for both the tokenizer and the model.
model_name = "facebook/bart-large-xsum"
tokenizer = BartTokenizer.from_pretrained(model_name)  # load the tokenizer
model = BartForConditionalGeneration.from_pretrained(model_name).to(DEVICE)  # load the model and move it to DEVICE

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
# Load model directly.
# NOTE: running this cell rebinds `tokenizer` and `model`, replacing the ones
# loaded from "facebook/bart-large-xsum". Unlike that cell, the model is not
# moved to DEVICE here.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("mse30/bart-base-finetuned-pubmed")
model = AutoModelForSeq2SeqLM.from_pretrained("mse30/bart-base-finetuned-pubmed")

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
# Select one keyword's articles and lay summaries and tokenize them.
def preprocess_and_tokenize2(keyword_data, tokenizer, input_keyword,
                             max_input_length=512, max_target_length=128):
    """Tokenize the articles and lay summaries stored under one keyword.

    Args:
        keyword_data: Mapping ``{keyword: {'article': [...], 'lay_summary': [...]}}``.
        tokenizer: Callable invoked as ``tokenizer(texts, max_length=...,
            padding='max_length', truncation=True, return_tensors="pt")`` whose
            result is indexable by ``'input_ids'``.
        input_keyword: Keyword (str) whose data should be tokenized.
        max_input_length: Limit applied to the articles, both as a word count
            before tokenization and as the tokenizer's ``max_length``.
        max_target_length: Tokenizer ``max_length`` for the lay summaries.

    Returns:
        ``(inputs, labels)``: the tokenizer output for the articles and the
        ``'input_ids'`` of the tokenized lay summaries, or ``(None, None)``
        (after printing a message) when ``input_keyword`` is not in
        ``keyword_data``.
    """
    if input_keyword not in keyword_data:
        print("Input keyword not found in the keyword data.")
        return None, None

    data = keyword_data[input_keyword]
    articles = data.get('article', [])
    lay_summaries = list(data.get('lay_summary', []))

    # Cheap word-level pre-truncation so the tokenizer does not have to process
    # whole articles; the tokenizer still truncates to max_input_length tokens.
    input_texts = [' '.join(article.split()[:max_input_length]) for article in articles]

    # Tokenize the articles (model input).
    inputs = tokenizer(input_texts, max_length=max_input_length, padding='max_length',
                       truncation=True, return_tensors="pt")
    # Tokenize the lay summaries (training targets).
    # NOTE(review): the labels keep the tokenizer's padding ids. Hugging Face
    # seq2seq losses presumably ignore only -100, so padded positions would
    # count towards the loss - confirm. compute_metrics decodes these ids
    # directly, so any switch to -100 must be coordinated with it.
    outputs = tokenizer(lay_summaries, max_length=max_target_length, padding='max_length',
                        truncation=True, return_tensors="pt")
    return inputs, outputs['input_ids']


In [None]:
# Select one keyword's articles and lay summaries as plain text (no tokenization).
def preprocess_2(keyword_data, input_keyword):
    """Return the truncated articles and the lay summaries stored under a keyword.

    Args:
        keyword_data: Mapping ``{keyword: {'article': [...], 'lay_summary': [...]}}``.
        input_keyword: Keyword (str) to look up.

    Returns:
        ``(input_texts, lay_summaries)``: each article cut to its first 512
        whitespace-separated words and re-joined with single spaces, plus a
        list of the lay summaries. ``(None, None)`` is returned, after printing
        a message, when the keyword is not in ``keyword_data``.
    """
    # Guard clause: unknown keyword.
    if input_keyword not in keyword_data:
        print("Input keyword not found in the keyword data.")
        return None, None

    record = keyword_data[input_keyword]

    truncated_articles = []
    for article in record.get('article', []):
        words = article.split()
        truncated_articles.append(' '.join(words[:512]))

    # Lay summaries are passed through unchanged (copied into a new list).
    summaries = list(record.get('lay_summary', []))

    return truncated_articles, summaries

In [None]:
# load json file
import json
import os

# Keyword-grouped JSON files produced by keyword_articles.
folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path = os.path.join(folder_dir, 'eLife_train_keyword.json')
file_path_test  = os.path.join(folder_dir, 'eLife_val_keyword.json')

# choose medicine to test code
input_keyword = 'medicine'
with open(file_path, 'r') as f:
    keyword_data = json.load(f)

# Preprocess and tokenize the training data
train_inputs, train_labels = preprocess_and_tokenize2(keyword_data, tokenizer, input_keyword)

In [None]:
# Load the keyword-grouped validation data.
with open(file_path_test, 'r') as f:
    keyword_data_val = json.load(f)
# Preprocess and tokenize the validation data
val_inputs, val_labels = preprocess_and_tokenize2(keyword_data_val, tokenizer, input_keyword)

In [None]:
# Sanity check: shape of the tokenized training inputs.
print(train_inputs['input_ids'].shape)

torch.Size([96, 512])


## Pretrained model on subsets (oldest version - discarded)

In [None]:
from torch.utils.data import Dataset, DataLoader

class MedicineDataset(Dataset):
    """Dataset that pairs tokenizer encodings with their label tensors.

    ``encodings`` is a mapping whose values are indexed per sample; ``labels``
    is indexed the same way and also determines the dataset length. Indexed
    values must support ``.clone().detach()`` (e.g. torch tensors).
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        # Build the sample from detached copies, so changes to the returned
        # tensors do not write through to the stored data.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        sample['labels'] = self.labels[idx].clone().detach()
        return sample

# Create the Datasets
#train_dataset = MedicineDataset(train_inputs, train_labels)
#val_dataset = MedicineDataset(val_inputs, val_labels)

# Create the DataLoaders
#train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
#val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

In [None]:
!pip install textstat
!pip install bert_score

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/105.1 kB[0m [31m1.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m29.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3


In [None]:
!bash ./get_models.sh


mkdir: cannot create directory ‘models’: File exists
fatal: destination path 'LENS' already exists and is not an empty directory.
Processing ./LENS/lens
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pandas==1.1.5 (from lens-metric==0.1.1)
  Using cached pandas-1.1.5.tar.gz (5.2 MB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0m  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
 35 1177M   35  416M    0     0  37.4M      0  0:00:31  0:00:11  0:00:20 48.1M^C


In [None]:
!pip install -r requirements.txt



In [None]:
import os, sys, json
import textstat
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score
import nltk
#from alignscore import AlignScore
#from lens.lens_score import LENS
import torch
from summac.model_summac import SummaCConv

# Fetch NLTK's 'punkt' tokenizer data (presumably needed for sentence
# splitting by the metrics below - TODO confirm).
nltk.download('punkt')

def calc_rouge(preds, refs):
  """Return the mean ROUGE-1, ROUGE-2 and ROUGE-Lsum F-measures.

  ``preds[i]`` is scored against ``refs[i]`` for every prediction; the three
  returned values are averages over all pairs, in the order
  (rouge1, rouge2, rougeLsum).
  """
  metric_names = ['rouge1', 'rouge2', 'rougeLsum']
  scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True, split_summaries=True)

  # NOTE(review): the prediction is passed as the first argument of score() and
  # the reference as the second; only the F-measure is read below, which is
  # presumably unaffected by the argument order - confirm.
  scores = []
  for idx, pred in enumerate(preds):
    scores.append(scorer.score(pred, refs[idx]))

  rouge1, rouge2, rouge_lsum = (
      np.mean([s[name].fmeasure for s in scores]) for name in metric_names
  )
  return rouge1, rouge2, rouge_lsum

def calc_bertscore(preds, refs):
  """Return the mean BERTScore F1 of ``preds`` against ``refs``.

  Args:
    preds: Candidate texts.
    refs: Reference texts, aligned with ``preds``.
  """
  # Use the first GPU when available; fall back to the CPU instead of failing
  # on machines without CUDA.
  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
  # Only the F1 tensor of the (P, R, F1) result is used.
  _, _, F1 = score(preds, refs, lang="en", verbose=True, device=device)
  return np.mean(F1.tolist())

def calc_readability(preds):
  """Return the mean readability scores of ``preds``.

  The result is ``(fkgl, cli, dcrs)``: the averages of textstat's
  Flesch-Kincaid grade, Coleman-Liau index and Dale-Chall readability score.
  """
  # One pass over preds; each row is (fkgl, cli, dcrs) for one prediction.
  rows = [
      (
          textstat.flesch_kincaid_grade(pred),
          textstat.coleman_liau_index(pred),
          textstat.dale_chall_readability_score(pred),
      )
      for pred in preds
  ]
  fkgl_scores = [row[0] for row in rows]
  cli_scores = [row[1] for row in rows]
  dcrs_scores = [row[2] for row in rows]
  return np.mean(fkgl_scores), np.mean(cli_scores), np.mean(dcrs_scores)

def calc_lens(preds, refs, docs):
  """Return the mean LENS score of ``preds``.

  Args:
    preds: Generated summaries.
    refs: Reference summaries, aligned with ``preds``.
    docs: Source documents; only the text before the first newline of each
      document is passed to the metric.

  Raises:
    ImportError: if the ``lens`` package is not installed.
  """
  # The notebook-level `from lens.lens_score import LENS` is commented out, so
  # the class is imported here; without this the call fails with a NameError.
  from lens.lens_score import LENS

  model_path = "./models/LENS/LENS/checkpoints/epoch=5-step=6102.ckpt"
  metric = LENS(model_path, rescale=True)
  abstracts = [d.split("\n")[0] for d in docs]
  # The metric receives a list of references per prediction.
  wrapped_refs = [[x] for x in refs]

  scores = metric.score(abstracts, preds, wrapped_refs, batch_size=8, gpus=1)
  return np.mean(scores)

def calc_alignscore(preds, docs):
  """Return the mean AlignScore of ``preds`` (claims) against ``docs`` (contexts).

  Raises:
    ImportError: if the ``alignscore`` package is not installed.
  """
  # The notebook-level `from alignscore import AlignScore` is commented out, so
  # the class is imported here; without this the call fails with a NameError.
  from alignscore import AlignScore

  # Fall back to the CPU instead of failing on machines without CUDA.
  device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
  alignscorer = AlignScore(model='roberta-base', batch_size=16, device=device, \
                           ckpt_path='./models/AlignScore/AlignScore-base.ckpt', evaluation_mode='nli_sp')
  return np.mean(alignscorer.score(contexts=docs, claims=preds))

def cal_summac(preds, docs):
  """Return the mean SummaC (SummaCConv) score of ``preds`` against ``docs``.

  Args:
    preds: Generated summaries.
    docs: Source documents, aligned with ``preds``.
  """
  # Fall back to the CPU instead of failing on machines without CUDA.
  device = "cuda" if torch.cuda.is_available() else "cpu"
  # NOTE: a new SummaCConv model is constructed on every call.
  model_conv = SummaCConv(models=["vitc"], bins='percentile', granularity="sentence", nli_labels="e", device=device, start_file="default", agg="mean")
  return np.mean(model_conv.score(docs, preds)['scores'])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
from transformers import EvalPrediction
from rouge_score import rouge_scorer
!pip install textstat
import textstat
from bert_score import score

def compute_metrics(p: EvalPrediction):
    """Decode evaluation outputs and compute ROUGE, BERTScore and readability.

    Args:
        p: Evaluation output. ``p.predictions`` (or its first element when it
            is a tuple) is arg-maxed over its last axis to obtain token ids;
            ``p.label_ids`` holds the reference token ids.

    Returns:
        Dict with keys ``rouge1``, ``rouge2``, ``rougeL`` (the ROUGE-Lsum
        value), ``bert_score``, ``avg_fkgl``, ``avg_cli`` and ``avg_dcrs``.
    """
    predictions = p.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # NOTE(review): these ids are the argmax of the model outputs, not text
    # produced by generate(). With a plain Trainer they are presumably
    # teacher-forced, so the scores may overstate free-running generation
    # quality - confirm (Seq2SeqTrainer with predict_with_generate is the
    # usual alternative).
    predicted_ids = predictions.argmax(-1)

    # -100 marks label positions ignored by the loss and cannot be decoded;
    # map it to the pad token, which skip_special_tokens then drops.
    label_ids = np.asarray(p.label_ids)
    label_ids = np.where(label_ids == -100, tokenizer.pad_token_id, label_ids)

    decoded_preds = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in predicted_ids
    ]
    decoded_labels = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in label_ids
    ]

    rouge1, rouge2, rouge_lsum = calc_rouge(decoded_preds, decoded_labels)
    bertscore_f1 = calc_bertscore(decoded_preds, decoded_labels)
    avg_fkgl, avg_cli, avg_dcrs = calc_readability(decoded_preds)

    # The former `cal_summac(decoded_preds, docs)` call was removed: `docs` is
    # not defined in this function (nor anywhere visible in the notebook) and
    # its result was never returned.

    return {
        "rouge1": rouge1,
        "rouge2": rouge2,
        "rougeL": rouge_lsum,
        "bert_score": bertscore_f1,
        "avg_fkgl": avg_fkgl,
        "avg_cli": avg_cli,
        "avg_dcrs": avg_dcrs
    }



In [None]:
#!pip install accelerate -U
from transformers import BartForConditionalGeneration, Trainer, TrainingArguments

# Shared training configuration, reused by every Trainer created below.
training_args = TrainingArguments(
    output_dir='./results',  # checkpoints are written here
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,  # log every 10 steps
    evaluation_strategy="steps",
    eval_steps=100,  # evaluate every 100 steps
    save_strategy="steps",
    save_steps=100  # save a checkpoint every 100 steps
)


#trainer = Trainer(
#    model=model,
#    args=training_args,
#    train_dataset=train_dataset,
#    eval_dataset=val_dataset,
#    compute_metrics=compute_metrics,
#)

# Train the model
#trainer.train()

In [None]:
#trainer.evaluate()

{'eval_loss': 1.704986572265625,
 'eval_rouge1': 0.6456050898141762,
 'eval_rouge2': 0.30986479548980916,
 'eval_rougeL': 0.5512341911985836,
 'eval_avg_fkgl': 56.59406249999998,
 'eval_fkg_age': 9.890625000000002,
 'eval_runtime': 7.7992,
 'eval_samples_per_second': 12.309,
 'eval_steps_per_second': 1.539,
 'epoch': 3.0}

In [None]:
# The six keyword subsets to train on, mapped to their record counts from the
# eLife training-set keyword tally earlier in the notebook.
keyword_list={'biochemistry and chemical biology': 505,
 'cell biology': 922,
 'developmental biology': 553,
 'microbiology and infectious disease': 420,
 'neuroscience': 1240,
 'structural biology and molecular biophysics': 480}

In [None]:
# Override: running this cell rebinds keyword_list to a single keyword,
# replacing the six-keyword dict defined in the previous cell.
keyword_list={
 'structural biology and molecular biophysics': 480}

In [None]:
# Keyword-grouped eLife JSON files read by the training loop below.
folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path = os.path.join(folder_dir, 'eLife_train_keyword.json')
file_path_test  = os.path.join(folder_dir, 'eLife_val_keyword.json')

In [None]:
# Final evaluation metrics per keyword (a plain dict, despite the name).
results_df = {}

# The keyword-grouped JSON files are the same for every keyword: load them once
# instead of on every iteration.
with open(file_path, 'r', encoding='utf-8') as f:
    keyword_data = json.load(f)
with open(file_path_test, 'r', encoding='utf-8') as f:
    keyword_data_val = json.load(f)

# CPU snapshot of the starting weights. Every keyword is fine-tuned from this
# same starting point; without the reset, each keyword would continue training
# the model already fine-tuned on the previous keywords.
initial_state = {name: tensor.detach().cpu().clone()
                 for name, tensor in model.state_dict().items()}

for keyword in keyword_list:
    # Preprocess and tokenize training and validation data
    train_inputs, train_labels = preprocess_and_tokenize2(keyword_data, tokenizer, keyword)
    val_inputs, val_labels = preprocess_and_tokenize2(keyword_data_val, tokenizer, keyword)
    if train_inputs is None or val_inputs is None:
        # preprocess_and_tokenize2 returns (None, None) for an unknown keyword.
        print(f"Skipping '{keyword}': not present in both the training and validation data.")
        continue

    # Restore the starting weights before fine-tuning on this keyword.
    model.load_state_dict(initial_state)

    # Create the datasets (the Trainer builds its own data loaders from them).
    train_dataset = MedicineDataset(train_inputs, train_labels)
    val_dataset = MedicineDataset(val_inputs, val_labels)

    # Setup the Trainer
    # NOTE(review): every keyword shares training_args.output_dir, so
    # checkpoints of different keywords overwrite each other - consider a
    # per-keyword output directory.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    # Train and evaluate the model
    trainer.train()
    evaluation_results = trainer.evaluate()

    # Store this keyword's evaluation metrics.
    results_df[keyword] = evaluation_results


# Print the collected results
print(results_df)

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bert Score,Avg Fkgl,Avg Cli,Avg Dcrs
100,2.2023,2.419927,0.545991,0.190804,0.500312,0.85974,11.410345,9.468276,9.223103


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.19 seconds, 24.32 sentences/sec


Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.17 seconds, 24.87 sentences/sec


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bert Score,Avg Fkgl,Avg Cli,Avg Dcrs
100,2.2872,2.300878,0.551494,0.198129,0.525444,0.864017,7.304348,9.966957,8.621087
200,1.9798,2.27671,0.56301,0.207419,0.526298,0.865726,9.354348,10.693478,8.987174
300,1.5097,2.333712,0.564232,0.209921,0.527746,0.867705,9.165217,10.950217,9.267391


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.76 seconds, 26.08 sentences/sec


Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.77 seconds, 26.04 sentences/sec


Checkpoint destination directory ./results/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.89 seconds, 24.33 sentences/sec


Checkpoint destination directory ./results/checkpoint-300 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.85 seconds, 24.91 sentences/sec


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bert Score,Avg Fkgl,Avg Cli,Avg Dcrs
100,1.9858,2.515091,0.560467,0.198864,0.520441,0.864211,8.3,9.487619,8.474286
200,1.5058,2.523253,0.564081,0.206558,0.529307,0.865962,9.252381,10.14619,8.777143


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.89 seconds, 23.52 sentences/sec


Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.97 seconds, 21.65 sentences/sec


Checkpoint destination directory ./results/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.01 seconds, 20.77 sentences/sec


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bert Score,Avg Fkgl,Avg Cli,Avg Dcrs
100,2.0788,2.289491,0.573409,0.220781,0.533544,0.866007,9.188462,10.540769,8.807692


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.06 seconds, 24.61 sentences/sec


Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.09 seconds, 23.94 sentences/sec


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bert Score,Avg Fkgl,Avg Cli,Avg Dcrs
100,2.3509,2.376192,0.560746,0.206997,0.527252,0.86363,8.714493,10.205217,8.480145
200,1.9348,2.390561,0.559312,0.213478,0.524819,0.864638,8.850725,10.38942,8.631884
300,1.9271,2.346908,0.565822,0.218142,0.533251,0.865872,8.578261,10.59087,8.603623
400,1.5342,2.420903,0.566701,0.218607,0.531241,0.867273,9.353623,10.790725,8.792029


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.62 seconds, 26.29 sentences/sec


Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.69 seconds, 25.65 sentences/sec


Checkpoint destination directory ./results/checkpoint-200 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.68 seconds, 25.78 sentences/sec


Checkpoint destination directory ./results/checkpoint-300 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.68 seconds, 25.78 sentences/sec


Checkpoint destination directory ./results/checkpoint-400 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.79 seconds, 24.74 sentences/sec


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Bert Score,Avg Fkgl,Avg Cli,Avg Dcrs
100,1.7698,2.346217,0.571823,0.209275,0.529447,0.864376,10.359259,10.21037,9.245926


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.08 seconds, 25.03 sentences/sec


Checkpoint destination directory ./results/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'max_length': 62, 'min_length': 11, 'early_stopping': True, 'num_beams': 6, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.08 seconds, 25.02 sentences/sec
{'biochemistry and chemical biology': {'eval_loss': 2.3999407291412354, 'eval_rouge1': 0.5571657918494475, 'eval_rouge2': 0.19903460081470561, 'eval_rougeL': 0.5215711373603467, 'eval_bert_score': 0.8634871799370338, 'eval_avg_fkgl': 8.76551724137931, 'eval_avg_cli': 10.44344827586207, 'eval_avg_dcrs': 9.15, 'eval_runtime': 9.1633, 'eval_samples_per_second': 3.165, 'eval_steps_per_second': 0.437, 'epoch': 3.0}, 'cell biology': {'eval_loss': 2.296180009841919, 'eval_rouge1': 0.5643682740035066, 'eval_rouge2': 0.21288116857351283, 'eval_rougeL': 0.5270713769652733, 'eval_bert_score': 0.8672683614751567, 'eval_avg_fkgl': 9.697826086956523, 'eval_avg_cli': 11.018043478260868, 'eval_avg_dcrs': 9.298043478260867, 'eval_runtime': 10.2869, 'eval_samples_per_second': 4.472, 'eval_steps_per_second': 0.583, 'epoch': 3.0}, 'developmental biology': {'eval_loss': 2.521914005279541, 'eval_rouge1': 0.5666135677944008, 'eval_rouge2': 0.20661360689117167, 'eval_

In [None]:
import os

# Per-keyword evaluation metrics, keyed by keyword (a plain dict despite the name).
results_df = {}
# Directory under which one fine-tuned model per keyword is saved.
# NOTE(review): this is a relative path, so it resolves against the current
# working directory rather than the drive mounted at /content/drive --
# confirm this is the intended location.
model_save_directory = 'My Drive/nlp final/model'

# The keyword-grouped train/validation files are the same for every keyword,
# so they are read once instead of once per loop iteration.
with open(file_path, 'r') as f:
    keyword_data = json.load(f)
with open(file_path_test, 'r') as f:
    keyword_data_val = json.load(f)

for keyword in keyword_list:
    train_inputs, train_labels = preprocess_and_tokenize2(keyword_data, tokenizer, keyword)
    val_inputs, val_labels = preprocess_and_tokenize2(keyword_data_val, tokenizer, keyword)

    train_dataset = MedicineDataset(train_inputs, train_labels)
    val_dataset = MedicineDataset(val_inputs, val_labels)
    # The Trainer below builds its own data loaders from the datasets; these
    # two are not used in this cell and are kept only in case other cells
    # reference them.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    # NOTE(review): `model` is the same object on every iteration, so each
    # keyword continues training from the weights left by the previous
    # keyword. Reload the base checkpoint here if independent per-keyword
    # models are intended.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    evaluation_results = trainer.evaluate()

    results_df[keyword] = evaluation_results

    # Save the model and tokenizer for this keyword.
    keyword_model_path = os.path.join(model_save_directory, keyword)
    os.makedirs(keyword_model_path, exist_ok=True)  # make sure the directory exists
    model.save_pretrained(keyword_model_path)
    tokenizer.save_pretrained(keyword_model_path)

# Print the collected per-keyword results.
print(results_df)


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss


In [None]:
import pprint
# Pretty-print the per-keyword evaluation results collected in `results_df`.
pprint.pprint(results_df)

{'biochemistry and chemical biology': {'epoch': 3.0,
                                       'eval_avg_cli': 10.44344827586207,
                                       'eval_avg_dcrs': 9.15,
                                       'eval_avg_fkgl': 8.76551724137931,
                                       'eval_bert_score': 0.8634871799370338,
                                       'eval_loss': 2.3999407291412354,
                                       'eval_rouge1': 0.5571657918494475,
                                       'eval_rouge2': 0.19903460081470561,
                                       'eval_rougeL': 0.5215711373603467,
                                       'eval_runtime': 9.1633,
                                       'eval_samples_per_second': 3.165,
                                       'eval_steps_per_second': 0.437},
 'cell biology': {'epoch': 3.0,
                  'eval_avg_cli': 11.018043478260868,
                  'eval_avg_dcrs': 9.298043478260867,
                

## Train on Subsets and Evaluation (New Version)

In [None]:
!pip install accelerate -U



In [None]:
!pip install textstat
!pip install rouge_score
!pip install bert_score
!pip install summac

Collecting textstat
  Downloading textstat-0.7.3-py3-none-any.whl (105 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyphen (from textstat)
  Downloading pyphen-0.15.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.15.0 textstat-0.7.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24933 sha256=a945a1327c7a0b0e4b01a937687f850b06a3948f910312ac91454b58b3909bfb
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9

In [None]:
# Mount Google Drive at /content/drive so files stored there can be accessed by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os, sys, json
import textstat
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score
import nltk
import torch
from summac.model_summac import SummaCConv

nltk.download('punkt')

def calc_rouge(preds, refs):
  """Return the mean ROUGE-1, ROUGE-2 and ROUGE-Lsum F1 scores.

  `preds[i]` is scored against `refs[i]`; the three means are returned as a
  tuple in the order (rouge1, rouge2, rougeLsum).
  """
  metric_names = ['rouge1', 'rouge2', 'rougeLsum']
  scorer = rouge_scorer.RougeScorer(metric_names, use_stemmer=True, split_summaries=True)

  # NOTE(review): the prediction is passed as the first argument of
  # `scorer.score` and the reference as the second -- confirm against the
  # rouge_score API which position is meant to hold the reference.
  pair_scores = []
  for idx, pred in enumerate(preds):
    pair_scores.append(scorer.score(pred, refs[idx]))

  return tuple(
    np.mean([pair[name].fmeasure for pair in pair_scores])
    for name in metric_names
  )

def calc_bertscore(preds, refs, device=None):
  """Return the mean BERTScore F1 of `preds` against `refs`.

  Args:
    preds: candidate texts.
    refs: reference texts, paired with `preds` by position.
    device: device string forwarded to `bert_score.score`. When omitted,
      'cuda:0' is used if CUDA is available (the previous hard-coded
      behaviour) and 'cpu' otherwise, so the metric also runs without a GPU.

  Returns:
    The mean of the per-pair F1 values.
  """
  if device is None:
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
  # Only the F1 component is reported; precision and recall are discarded.
  _precision, _recall, f1 = score(preds, refs, lang="en", verbose=True, device=device)
  return np.mean(f1.tolist())

def calc_readability(preds):
  """Return the mean FKGL, CLI and DCRS readability scores over `preds`.

  The result is a tuple (mean Flesch-Kincaid grade, mean Coleman-Liau index,
  mean Dale-Chall readability score), each computed with `textstat`.
  """
  # One (fkgl, cli, dcrs) triple per prediction, computed in a single pass.
  triples = [
    (
      textstat.flesch_kincaid_grade(text),
      textstat.coleman_liau_index(text),
      textstat.dale_chall_readability_score(text),
    )
    for text in preds
  ]
  mean_fkgl = np.mean([t[0] for t in triples])
  mean_cli = np.mean([t[1] for t in triples])
  mean_dcrs = np.mean([t[2] for t in triples])
  return mean_fkgl, mean_cli, mean_dcrs

def calc_lens(preds, refs, docs):
  """Return the mean LENS score of `preds`.

  The text before the first newline of each document is used as the source
  passed to the metric, and every reference is wrapped in a one-element list.

  NOTE(review): `LENS` is not imported anywhere in the visible code, so
  calling this function as-is would raise NameError -- confirm where it is
  expected to come from.
  """
  checkpoint_path = "./models/LENS/LENS/checkpoints/epoch=5-step=6102.ckpt"
  lens_metric = LENS(checkpoint_path, rescale=True)

  sources = [doc.split("\n")[0] for doc in docs]
  wrapped_refs = [[ref] for ref in refs]

  lens_scores = lens_metric.score(sources, preds, wrapped_refs, batch_size=8, gpus=1)
  return np.mean(lens_scores)

def calc_alignscore(preds, docs):
  """Return the mean AlignScore of `preds` (claims) against `docs` (contexts).

  NOTE(review): `AlignScore` is not imported anywhere in the visible code, so
  calling this function as-is would raise NameError -- confirm where it is
  expected to come from.
  """
  scorer_options = dict(
    model='roberta-base',
    batch_size=16,
    device='cuda:0',
    ckpt_path='./models/AlignScore/AlignScore-base.ckpt',
    evaluation_mode='nli_sp',
  )
  alignscorer = AlignScore(**scorer_options)
  claim_scores = alignscorer.score(contexts=docs, claims=preds)
  return np.mean(claim_scores)

def cal_summac(preds, docs, device=None):
  """Return the mean SummaCConv score of `preds` against `docs`.

  Args:
    preds: generated summaries.
    docs: source documents, passed to the scorer together with `preds`;
      presumably paired with them by position -- TODO confirm.
    device: device string forwarded to `SummaCConv`. When omitted, "cuda" is
      used if CUDA is available (the previous hard-coded behaviour) and
      "cpu" otherwise, so the metric also runs without a GPU.

  Returns:
    The mean of the values under the 'scores' key of the scorer's result.
  """
  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"
  # A fresh SummaCConv model is built on every call.
  model_conv = SummaCConv(models=["vitc"], bins='percentile', granularity="sentence", nli_labels="e", device=device, start_file="default", agg="mean")
  return np.mean(model_conv.score(docs, preds)['scores'])


  _torch_pytree._register_pytree_node(
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### PubMed-fine-tuned BART (mse30/bart-base-finetuned-pubmed)

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the checkpoint; the tokenizer and the model are loaded from the same id.
model_name = "mse30/bart-base-finetuned-pubmed"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
import json
from transformers import Trainer, EvalPrediction, TrainingArguments
from torch.utils.data import DataLoader
import torch
from torch.utils.data import Dataset, DataLoader

class MedicineDataset(Dataset):
    """Dataset pairing tokenizer encodings with label tensors.

    `encodings` is a mapping from field name to an indexable tensor and
    `labels` is an indexable tensor; item `idx` is a dict holding a detached
    copy of every encoding field at `idx` plus the label row under 'labels'.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            # clone().detach() hands out a copy, so callers cannot modify the stored tensors.
            sample[field] = tensor[idx].clone().detach()
        sample['labels'] = self.labels[idx].clone().detach()
        return sample

def preprocess_and_tokenize2(keyword_data, tokenizer, input_keyword):
    """Tokenize the articles and lay summaries stored under one keyword.

    Args:
        keyword_data: mapping from keyword to a dict with 'article' and
            'lay_summary' entries.
        tokenizer: callable invoked once on the articles (max_length=512) and
            once on the summaries (max_length=128), both padded to max length.
        input_keyword: keyword whose entry should be processed.

    Returns:
        (article encodings, summary['input_ids']), or (None, None) after
        printing a message when the keyword is absent.

    NOTE(review): the label ids keep their padding token ids; confirm whether
    padded label positions should be excluded from the training loss.
    """
    if input_keyword not in keyword_data:
        print("Input keyword not found in the keyword data.")
        return None, None

    entry = keyword_data[input_keyword]
    # Keep at most the first 512 whitespace-separated words of each article.
    truncated_articles = [' '.join(text.split()[:512]) for text in entry.get('article', [])]
    summaries = list(entry.get('lay_summary', []))

    inputs = tokenizer(truncated_articles, max_length=512, padding='max_length', truncation=True, return_tensors="pt")
    outputs = tokenizer(summaries, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
    return inputs, outputs['input_ids']

def preprocess_2(keyword_data, input_keyword):
    """Return the truncated articles and lay summaries stored under one keyword.

    Args:
        keyword_data: mapping from keyword to a dict with 'article' and
            'lay_summary' entries.
        input_keyword: keyword whose entry should be returned.

    Returns:
        (input_texts, lay_summaries) as two new lists, where every article is
        cut to its first 512 whitespace-separated words; or (None, None) after
        printing a message when the keyword is absent.
    """
    # Both accumulators must be local: `input_texts` used to be left
    # undefined here, which raised NameError (or silently appended to a
    # same-named global if one happened to exist).
    input_texts = []
    lay_summaries = []

    if input_keyword in keyword_data:
        data = keyword_data[input_keyword]
        articles = data.get('article', [])
        lay_summary = data.get('lay_summary', [])

        preprocessed_articles = [' '.join(article.split()[:512]) for article in articles]
        preprocessed_lay_summaries = lay_summary

        input_texts.extend(preprocessed_articles)
        lay_summaries.extend(preprocessed_lay_summaries)

        return input_texts, lay_summaries
    else:
        print("Input keyword not found in the keyword data.")
        return None, None

class CustomTrainer(Trainer):
    """Trainer that additionally keeps the source documents of one keyword.

    Besides the regular `Trainer` arguments it takes two keyword-only
    arguments, `keyword_data` and `input_keyword`, stores them, and exposes
    the first value returned by `preprocess_2` for that keyword as
    `self.docs`.

    Raises:
        ValueError: if `preprocess_2` returns no documents for the keyword.
    """

    def __init__(self, *args, keyword_data, input_keyword, **kwargs):
        super().__init__(*args, **kwargs)
        self.keyword_data = keyword_data
        self.input_keyword = input_keyword
        # Only the documents are kept; the second returned value is not needed here.
        source_docs, _unused = preprocess_2(keyword_data, input_keyword)
        self.docs = source_docs
        if source_docs is None:
            raise ValueError("Keyword not found or no data available.")

def compute_metrics(p, docs):
    """Decode predictions and labels and score them with every metric.

    Args:
        p: prediction output exposing `predictions` (an array of per-token
            scores, or a tuple whose first element is that array) and
            `label_ids`.
        docs: source documents handed to `cal_summac` together with the
            decoded predictions; presumably one per prediction, in the same
            order -- TODO confirm at the call site.

    Returns:
        dict with the keys 'rouge1', 'rouge2', 'rougeL', 'bert_score',
        'avg_fkgl', 'avg_cli', 'avg_dcrs' and 'summac_score'.
    """
    predictions = p.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Take the highest-scoring token id at every position.
    # NOTE(review): if `p` comes from a plain `Trainer.predict`, these scores
    # are presumably produced with the reference tokens as decoder input
    # rather than by free generation -- confirm that this is the intended
    # evaluation setting.
    predicted_ids = predictions.argmax(-1)

    # Labels may use -100 as the "ignore" marker, which cannot be decoded;
    # map those positions to the pad token so they are dropped as special tokens.
    label_ids = np.asarray(p.label_ids)
    ignored = label_ids == -100
    if ignored.any():
        label_ids = np.where(ignored, tokenizer.pad_token_id, label_ids)

    decoded_preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in predicted_ids]
    decoded_labels = [tokenizer.decode(l, skip_special_tokens=True, clean_up_tokenization_spaces=True) for l in label_ids]

    rouge_results = calc_rouge(decoded_preds, decoded_labels)
    rouge_results = {key: results for key, results in zip(['rouge1' , 'rouge2', 'rougeL'], list(rouge_results))}
    bert_score = calc_bertscore(decoded_preds, decoded_labels)

    avg_fkgl, avg_cli, avg_dcrs = calc_readability(decoded_preds)
    avg_sum = cal_summac(decoded_preds, docs)

    return {
        **rouge_results,
        "bert_score": bert_score,
        "avg_fkgl": avg_fkgl,
        "avg_cli": avg_cli,
        "avg_dcrs": avg_dcrs,
        "summac_score": avg_sum
    }

# Per-keyword metrics, filled in by the loop below.
results_df = {}
# Keywords to train on; only the keys are used in this cell.
# NOTE(review): the integer values look like per-keyword article counts -- confirm.
keyword_list = {
    'biochemistry and chemical biology': 505,
    'cell biology': 922,
    'developmental biology': 553,
    'microbiology and infectious disease': 420,
    'neuroscience': 1240,
    'structural biology and molecular biophysics': 480,
}

folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path = os.path.join(folder_dir, 'eLife_train_keyword.json')
file_path_test  = os.path.join(folder_dir, 'eLife_val_keyword.json')
base_model_save_path = "/content/drive/My Drive/model/"

# The keyword-grouped files are the same for every keyword, so they are read
# once instead of once per loop iteration.
with open(file_path, 'r') as f:
    keyword_data = json.load(f)
with open(file_path_test, 'r') as f:
    keyword_data_val = json.load(f)

for keyword in keyword_list:
    keyword_slug = keyword.replace(' ', '_')

    train_inputs, train_labels = preprocess_and_tokenize2(keyword_data, tokenizer, keyword)
    val_inputs, val_labels = preprocess_and_tokenize2(keyword_data_val, tokenizer, keyword)

    train_dataset = MedicineDataset(train_inputs, train_labels)
    val_dataset = MedicineDataset(val_inputs, val_labels)
    # The trainer below builds its own data loaders from the datasets; these
    # two are not used in this cell and are kept only in case other cells
    # reference them.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    training_args = TrainingArguments(
        # One checkpoint directory per keyword: with a single shared
        # './results' every keyword wrote into the same checkpoint-N folders
        # (the run log warns that the destination already exists).
        output_dir=os.path.join('./results', keyword_slug),
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100
    )

    # NOTE(review): `model` is the same object on every iteration, so each
    # keyword continues training from the weights left by the previous
    # keyword. Reload the base checkpoint here if independent per-keyword
    # models are intended.
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `trainer.docs` is only used to score the validation predictions
        # below, so it has to hold the validation articles; passing the
        # training data paired predictions with unrelated documents.
        keyword_data=keyword_data_val,
        input_keyword=keyword
    )

    trainer.train()
    keyword_save_path = os.path.join(base_model_save_path, f"model_save_{keyword_slug}")
    os.makedirs(keyword_save_path, exist_ok=True)
    model.save_pretrained(keyword_save_path)
    tokenizer.save_pretrained(keyword_save_path)

    predict_output = trainer.predict(val_dataset)
    metrics = compute_metrics(predict_output, trainer.docs)

    results_df[keyword] = metrics

print(results_df)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
import pprint
results_df_pubmed={'biochemistry and chemical biology': {'rouge1': 0.5261942596556795, 'rouge2': 0.16537349974618143, 'rougeL': 0.49146141830803386, 'bert_score': 0.8556824182641918, 'avg_fkgl': 7.886206896551725, 'avg_cli': 9.607586206896553, 'avg_dcrs': 8.86896551724138, 'summac_score': 0.41749052240930756}, 'cell biology': {'rouge1': 0.5427326598274683, 'rouge2': 0.1799154634765325, 'rougeL': 0.5035828109597411, 'bert_score': 0.8612991921279741, 'avg_fkgl': 8.86304347826087, 'avg_cli': 10.472826086956522, 'avg_dcrs': 8.824782608695653, 'summac_score': 0.41746153585288837}, 'developmental biology': {'rouge1': 0.5429999087077958, 'rouge2': 0.18883587409077202, 'rougeL': 0.5038135610969696, 'bert_score': 0.8608331453232538, 'avg_fkgl': 8.233333333333334, 'avg_cli': 9.323809523809523, 'avg_dcrs': 8.53, 'summac_score': 0.45047215478760855}, 'microbiology and infectious disease': {'rouge1': 0.5442487393418923, 'rouge2': 0.18829237594167691, 'rougeL': 0.506254518621505, 'bert_score': 0.8607531006519611, 'avg_fkgl': 8.761538461538462, 'avg_cli': 10.846153846153847, 'avg_dcrs': 8.76153846153846, 'summac_score': 0.3951897563842627}, 'neuroscience': {'rouge1': 0.5539640145270395, 'rouge2': 0.1985209760823631, 'rougeL': 0.5157063730811436, 'bert_score': 0.860499754332114, 'avg_fkgl': 8.657971014492754, 'avg_cli': 10.360000000000003, 'avg_dcrs': 8.503043478260873, 'summac_score': 0.43131061395009357}, 'structural biology and molecular biophysics': {'rouge1': 0.5598335601922456, 'rouge2': 0.19362457961582666, 'rougeL': 0.5175096469194662, 'bert_score': 0.8600004514058431, 'avg_fkgl': 9.492592592592594, 'avg_cli': 10.129999999999999, 'avg_dcrs': 9.184814814814816, 'summac_score': 0.39549706710709465}}
# Pretty-print the `results_df_pubmed` dict literal assigned on the previous line
# (presumably pasted from an earlier run so it can be shown without re-training).
pprint.pprint(results_df_pubmed)


{'biochemistry and chemical biology': {'avg_cli': 9.607586206896553,
                                       'avg_dcrs': 8.86896551724138,
                                       'avg_fkgl': 7.886206896551725,
                                       'bert_score': 0.8556824182641918,
                                       'rouge1': 0.5261942596556795,
                                       'rouge2': 0.16537349974618143,
                                       'rougeL': 0.49146141830803386,
                                       'summac_score': 0.41749052240930756},
 'cell biology': {'avg_cli': 10.472826086956522,
                  'avg_dcrs': 8.824782608695653,
                  'avg_fkgl': 8.86304347826087,
                  'bert_score': 0.8612991921279741,
                  'rouge1': 0.5427326598274683,
                  'rouge2': 0.1799154634765325,
                  'rougeL': 0.5035828109597411,
                  'summac_score': 0.41746153585288837},
 'developmental biology': {'avg_cli

### Original BART (facebook/bart-large-xsum)

In [None]:
from transformers import BartForConditionalGeneration, BartTokenizer
# Checkpoint id shared by the tokenizer and the model below.
model_name = "facebook/bart-large-xsum"
# Note: this rebinds the global `tokenizer`; the model gets its own name, `model_bart`.
tokenizer = BartTokenizer.from_pretrained(model_name)  # load the tokenizer
model_bart = BartForConditionalGeneration.from_pretrained(model_name)  # load the model

In [None]:
import json
from transformers import Trainer, EvalPrediction, TrainingArguments
from torch.utils.data import DataLoader
import torch
from torch.utils.data import Dataset, DataLoader

class MedicineDataset(Dataset):
    """Dataset pairing tokenizer encodings with label tensors.

    `encodings` is a mapping from field name to an indexable tensor and
    `labels` is an indexable tensor; item `idx` is a dict holding a detached
    copy of every encoding field at `idx` plus the label row under 'labels'.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, tensor in self.encodings.items():
            # clone().detach() hands out a copy, so callers cannot modify the stored tensors.
            sample[field] = tensor[idx].clone().detach()
        sample['labels'] = self.labels[idx].clone().detach()
        return sample

def preprocess_and_tokenize2(keyword_data, tokenizer, input_keyword):
    """Tokenize the articles and lay summaries stored under one keyword.

    Args:
        keyword_data: mapping from keyword to a dict with 'article' and
            'lay_summary' entries.
        tokenizer: callable invoked once on the articles (max_length=512) and
            once on the summaries (max_length=128), both padded to max length.
        input_keyword: keyword whose entry should be processed.

    Returns:
        (article encodings, summary['input_ids']), or (None, None) after
        printing a message when the keyword is absent.

    NOTE(review): the label ids keep their padding token ids; confirm whether
    padded label positions should be excluded from the training loss.
    """
    if input_keyword not in keyword_data:
        print("Input keyword not found in the keyword data.")
        return None, None

    entry = keyword_data[input_keyword]
    # Keep at most the first 512 whitespace-separated words of each article.
    truncated_articles = [' '.join(text.split()[:512]) for text in entry.get('article', [])]
    summaries = list(entry.get('lay_summary', []))

    inputs = tokenizer(truncated_articles, max_length=512, padding='max_length', truncation=True, return_tensors="pt")
    outputs = tokenizer(summaries, max_length=128, padding='max_length', truncation=True, return_tensors="pt")
    return inputs, outputs['input_ids']


def preprocess_2(keyword_data, input_keyword):
    """Return the truncated articles and lay summaries stored under one keyword.

    Every article is cut to its first 512 whitespace-separated words. The
    result is a pair of new lists (articles, lay summaries), or (None, None)
    after printing a message when the keyword is absent from `keyword_data`.
    """
    if input_keyword not in keyword_data:
        print("Input keyword not found in the keyword data.")
        return None, None

    entry = keyword_data[input_keyword]
    input_texts = [' '.join(doc.split()[:512]) for doc in entry.get('article', [])]
    lay_summaries = list(entry.get('lay_summary', []))
    return input_texts, lay_summaries
class CustomTrainer(Trainer):
    """Trainer that additionally keeps the source documents of one keyword.

    Besides the regular `Trainer` arguments it takes two keyword-only
    arguments, `keyword_data` and `input_keyword`, stores them, and exposes
    the first value returned by `preprocess_2` for that keyword as
    `self.docs`.

    Raises:
        ValueError: if `preprocess_2` returns no documents for the keyword.
    """

    def __init__(self, *args, keyword_data, input_keyword, **kwargs):
        super().__init__(*args, **kwargs)
        self.keyword_data = keyword_data
        self.input_keyword = input_keyword
        # Only the documents are kept; the second returned value is not needed here.
        source_docs, _unused = preprocess_2(keyword_data, input_keyword)
        self.docs = source_docs
        if source_docs is None:
            raise ValueError("Keyword not found or no data available.")

def compute_metrics(p, docs):
    """Decode predictions and labels and score them with every metric.

    Args:
        p: prediction output exposing `predictions` (an array of per-token
            scores, or a tuple whose first element is that array) and
            `label_ids`.
        docs: source documents handed to `cal_summac` together with the
            decoded predictions; presumably one per prediction, in the same
            order -- TODO confirm at the call site.

    Returns:
        dict with the keys 'rouge1', 'rouge2', 'rougeL', 'bert_score',
        'avg_fkgl', 'avg_cli', 'avg_dcrs' and 'summac_score'.
    """
    predictions = p.predictions
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    # Take the highest-scoring token id at every position.
    # NOTE(review): if `p` comes from a plain `Trainer.predict`, these scores
    # are presumably produced with the reference tokens as decoder input
    # rather than by free generation -- confirm that this is the intended
    # evaluation setting.
    predicted_ids = predictions.argmax(-1)

    # Labels may use -100 as the "ignore" marker, which cannot be decoded;
    # map those positions to the pad token so they are dropped as special tokens.
    label_ids = np.asarray(p.label_ids)
    ignored = label_ids == -100
    if ignored.any():
        label_ids = np.where(ignored, tokenizer.pad_token_id, label_ids)

    decoded_preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in predicted_ids]
    decoded_labels = [tokenizer.decode(l, skip_special_tokens=True, clean_up_tokenization_spaces=True) for l in label_ids]

    rouge_results = calc_rouge(decoded_preds, decoded_labels)
    rouge_results = {key: results for key, results in zip(['rouge1' , 'rouge2', 'rougeL'], list(rouge_results))}
    bert_score = calc_bertscore(decoded_preds, decoded_labels)

    avg_fkgl, avg_cli, avg_dcrs = calc_readability(decoded_preds)
    avg_sum = cal_summac(decoded_preds, docs)

    return {
        **rouge_results,
        "bert_score": bert_score,
        "avg_fkgl": avg_fkgl,
        "avg_cli": avg_cli,
        "avg_dcrs": avg_dcrs,
        "summac_score": avg_sum
    }

# Per-keyword metrics, filled in by the loop below.
results_df = {}
# Keywords to train on; only the keys are used in this cell.
# NOTE(review): the integer values look like per-keyword article counts -- confirm.
keyword_list = {
    'biochemistry and chemical biology': 505,
    'cell biology': 922,
    'developmental biology': 553,
    'microbiology and infectious disease': 420,
    'neuroscience': 1240,
    'structural biology and molecular biophysics': 480,
}

folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path = os.path.join(folder_dir, 'eLife_train_keyword.json')
file_path_test  = os.path.join(folder_dir, 'eLife_val_keyword.json')
base_model_save_path = "/content/drive/My Drive/model/"

# The keyword-grouped files are the same for every keyword, so they are read
# once instead of once per loop iteration.
with open(file_path, 'r') as f:
    keyword_data = json.load(f)
with open(file_path_test, 'r') as f:
    keyword_data_val = json.load(f)

for keyword in keyword_list:
    keyword_slug = keyword.replace(' ', '_')

    train_inputs, train_labels = preprocess_and_tokenize2(keyword_data, tokenizer, keyword)
    val_inputs, val_labels = preprocess_and_tokenize2(keyword_data_val, tokenizer, keyword)

    train_dataset = MedicineDataset(train_inputs, train_labels)
    val_dataset = MedicineDataset(val_inputs, val_labels)
    # The trainer below builds its own data loaders from the datasets; these
    # two are not used in this cell and are kept only in case other cells
    # reference them.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    training_args = TrainingArguments(
        # One checkpoint directory per keyword, so that keywords do not write
        # into each other's checkpoint-N folders under a shared './results'.
        output_dir=os.path.join('./results', keyword_slug),
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100
    )

    # NOTE(review): `model_bart` is the same object on every iteration, so
    # each keyword continues training from the weights left by the previous
    # keyword. Reload the base checkpoint here if independent per-keyword
    # models are intended.
    trainer = CustomTrainer(
        model=model_bart,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        # `trainer.docs` is only used to score the validation predictions
        # below, so it has to hold the validation articles; passing the
        # training data paired predictions with unrelated documents.
        keyword_data=keyword_data_val,
        input_keyword=keyword
    )

    trainer.train()
    # Saving is disabled for this model.
    # NOTE(review): if re-enabled, the save call should use `model_bart`;
    # `model` is not the model trained in this loop.
    #keyword_save_path = os.path.join(base_model_save_path, f"bart_model_save_{keyword.replace(' ', '_')}")
    #os.makedirs(keyword_save_path, exist_ok=True)
    #model.save_pretrained(keyword_save_path)
    #tokenizer.save_pretrained(keyword_save_path)

    predict_output = trainer.predict(val_dataset)
    metrics = compute_metrics(predict_output, trainer.docs)

    results_df[keyword] = metrics

print(results_df)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss
100,2.1907,2.446174


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.21 seconds, 24.06 sentences/sec
<All keys matched successfully>


  _torch_pytree._register_pytree_node(
  histograms = torch.FloatTensor(histograms).to(self.device)
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss
100,2.2758,2.300059
200,1.9915,2.289924
300,1.5711,2.31453


Step,Training Loss,Validation Loss
100,2.2758,2.300059
200,1.9915,2.289924
300,1.5711,2.31453


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/2 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.75 seconds, 26.24 sentences/sec
<All keys matched successfully>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss
100,2.1125,2.424093
200,1.457,2.511576


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.90 seconds, 23.46 sentences/sec
<All keys matched successfully>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss
100,2.0422,2.289065


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.08 seconds, 24.02 sentences/sec
<All keys matched successfully>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss
100,2.3432,2.411906
200,1.9677,2.367114
300,1.8989,2.338944
400,1.5214,2.397351


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 2.51 seconds, 27.49 sentences/sec
<All keys matched successfully>


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss
100,1.6616,2.359049


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.12 seconds, 24.14 sentences/sec
<All keys matched successfully>
{'biochemistry and chemical biology': {'rouge1': 0.5557641271968649, 'rouge2': 0.19634067770562091, 'rougeL': 0.5222785495587291, 'bert_score': 0.8633655802956943, 'avg_fkgl': 8.713793103448277, 'avg_cli': 10.283103448275861, 'avg_dcrs': 9.176206896551722, 'summac_score': 0.3947130564985604}, 'cell biology': {'rouge1': 0.5653088430042418, 'rouge2': 0.2102211665141096, 'rougeL': 0.5272597911261301, 'bert_score': 0.8667732451273047, 'avg_fkgl': 9.52608695652174, 'avg_cli': 11.135217391304348, 'avg_dcrs': 9.23195652173913, 'summac_score': 0.39551061132679816}, 'developmental biology': {'rouge1': 0.5639139983221687, 'rouge2': 0.19947149211717358, 'rougeL': 0.5235325590768227, 'bert_score': 0.8651603772526696, 'avg_fkgl': 8.957142857142856, 'avg_cli': 10.27142857142857, 'avg_dcrs': 8.819047619047618, 'summac_score': 0.40368049485342844}, 'microbiology and infectious disease': {'rouge1': 0.5653336846318665, 'rouge2': 0

In [None]:
import pprint
# Per-keyword evaluation metrics stored as a literal dict and pretty-printed below.
# NOTE(review): the values appear to be pasted from the printed output of the
# preceding training cell, presumably so they can be inspected without re-running
# training -- confirm before relying on them.
results_df_bart={'biochemistry and chemical biology': {'rouge1': 0.5557641271968649, 'rouge2': 0.19634067770562091, 'rougeL': 0.5222785495587291, 'bert_score': 0.8633655802956943, 'avg_fkgl': 8.713793103448277, 'avg_cli': 10.283103448275861, 'avg_dcrs': 9.176206896551722, 'summac_score': 0.3947130564985604}, 'cell biology': {'rouge1': 0.5653088430042418, 'rouge2': 0.2102211665141096, 'rougeL': 0.5272597911261301, 'bert_score': 0.8667732451273047, 'avg_fkgl': 9.52608695652174, 'avg_cli': 11.135217391304348, 'avg_dcrs': 9.23195652173913, 'summac_score': 0.39551061132679816}, 'developmental biology': {'rouge1': 0.5639139983221687, 'rouge2': 0.19947149211717358, 'rougeL': 0.5235325590768227, 'bert_score': 0.8651603772526696, 'avg_fkgl': 8.957142857142856, 'avg_cli': 10.27142857142857, 'avg_dcrs': 8.819047619047618, 'summac_score': 0.40368049485342844}, 'microbiology and infectious disease': {'rouge1': 0.5653336846318665, 'rouge2': 0.22692159254893424, 'rougeL': 0.5277475844561814, 'bert_score': 0.8670703699955573, 'avg_fkgl': 9.63076923076923, 'avg_cli': 11.056153846153846, 'avg_dcrs': 9.18423076923077, 'summac_score': 0.3854228613468317}, 'neuroscience': {'rouge1': 0.5664507153288848, 'rouge2': 0.22145644285891364, 'rougeL': 0.529811544800716, 'bert_score': 0.868166749028192, 'avg_fkgl': 9.514492753623188, 'avg_cli': 10.953768115942028, 'avg_dcrs': 8.823478260869566, 'summac_score': 0.3977303476869196}, 'structural biology and molecular biophysics': {'rouge1': 0.561827839269482, 'rouge2': 0.20020731438030132, 'rougeL': 0.5279639263220679, 'bert_score': 0.8650240302085876, 'avg_fkgl': 9.344444444444443, 'avg_cli': 10.212592592592593, 'avg_dcrs': 9.330740740740742, 'summac_score': 0.3805476349812967}}
pprint.pprint(results_df_bart)

{'biochemistry and chemical biology': {'avg_cli': 10.283103448275861,
                                       'avg_dcrs': 9.176206896551722,
                                       'avg_fkgl': 8.713793103448277,
                                       'bert_score': 0.8633655802956943,
                                       'rouge1': 0.5557641271968649,
                                       'rouge2': 0.19634067770562091,
                                       'rougeL': 0.5222785495587291,
                                       'summac_score': 0.3947130564985604},
 'cell biology': {'avg_cli': 11.135217391304348,
                  'avg_dcrs': 9.23195652173913,
                  'avg_fkgl': 9.52608695652174,
                  'bert_score': 0.8667732451273047,
                  'rouge1': 0.5653088430042418,
                  'rouge2': 0.2102211665141096,
                  'rougeL': 0.5272597911261301,
                  'summac_score': 0.39551061132679816},
 'developmental biology': {'avg_cli'

## Part III: Training on Modified dataset (with definition replacement)

In [None]:
# Load the tokenizer and the seq2seq model from the same Hub checkpoint.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Single source of truth for the checkpoint id so tokenizer and model cannot drift apart.
PUBMED_BART_CHECKPOINT = "mse30/bart-base-finetuned-pubmed"

tokenizer = AutoTokenizer.from_pretrained(PUBMED_BART_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(PUBMED_BART_CHECKPOINT)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [None]:
import json
from transformers import Trainer, EvalPrediction, TrainingArguments
from torch.utils.data import DataLoader
import torch
from torch.utils.data import Dataset, DataLoader

class MedicineDataset(Dataset):
    """Map-style dataset pairing tokenized inputs with target token ids.

    ``encodings`` is a mapping from field name to an indexable of tensors and
    ``labels`` is an indexable of tensors; sample ``i`` is assembled from
    position ``i`` of every field plus position ``i`` of ``labels``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the number of label rows.
        return len(self.labels)

    def __getitem__(self, idx):
        # Return detached copies so that consumers cannot mutate the stored tensors.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = values[idx].clone().detach()
        sample['labels'] = self.labels[idx].clone().detach()
        return sample

def preprocess_and_tokenize2(keyword_data, tokenizer, input_keyword, max_length=600):
    """Tokenize the articles and lay summaries stored under one keyword.

    Args:
        keyword_data: Mapping from keyword to a dict that holds the
            ``'article'`` and ``'lay_summary'`` lists for that keyword.
        tokenizer: Callable invoked as ``tokenizer(texts, padding='longest',
            truncation=True, max_length=..., return_tensors="pt")``.
        input_keyword: Keyword whose entries should be tokenized.
        max_length: Truncation limit passed to the tokenizer for both the
            articles and the summaries. Defaults to 600, the value that used
            to be hard-coded.

    Returns:
        ``(inputs, label_ids)`` where ``inputs`` is the tokenizer output for
        the articles and ``label_ids`` is the ``'input_ids'`` entry of the
        tokenizer output for the summaries. ``(None, None)`` is returned (and
        a message printed) when ``input_keyword`` is not in ``keyword_data``.
    """
    if input_keyword not in keyword_data:
        print("Input keyword not found in the keyword data.")
        return None, None

    data = keyword_data[input_keyword]
    articles = data.get('article', [])
    lay_summaries = list(data.get('lay_summary', []))

    # Both sides share the same tokenizer settings.
    tokenizer_kwargs = {
        'padding': 'longest',
        'truncation': True,
        'max_length': max_length,
        'return_tensors': "pt",
    }
    inputs = tokenizer(articles, **tokenizer_kwargs)
    outputs = tokenizer(lay_summaries, **tokenizer_kwargs)

    # NOTE(review): the label ids are returned exactly as tokenized, padding
    # positions included; nothing here masks them out of the loss -- confirm
    # that this is intended.
    return inputs, outputs['input_ids']


def preprocess_2(keyword_data, input_keyword):
    """Return the raw article and summary texts for one keyword.

    Each article is split on whitespace, cut to its first 1000 words and
    re-joined with single spaces; the lay summaries are returned unchanged.

    Returns:
        ``(articles, lay_summaries)`` as two lists, or ``(None, None)`` after
        printing a message when ``input_keyword`` is not in ``keyword_data``.
    """
    if input_keyword not in keyword_data:
        print("Input keyword not found in the keyword data.")
        return None, None

    entry = keyword_data[input_keyword]

    truncated_articles = []
    for article in entry.get('article', []):
        words = article.split()
        truncated_articles.append(' '.join(words[:1000]))

    summaries = list(entry.get('lay_summary', []))
    return truncated_articles, summaries
class CustomTrainer(Trainer):
    """``Trainer`` subclass that also keeps the source articles of one keyword.

    On top of the regular ``Trainer`` arguments it takes two keyword-only
    arguments, ``keyword_data`` and ``input_keyword``. The articles returned by
    ``preprocess_2`` for that pair are stored on ``self.docs``.

    Raises:
        ValueError: if ``preprocess_2`` yields no documents for the keyword.
    """

    def __init__(self, *args, keyword_data, input_keyword, **kwargs):
        super().__init__(*args, **kwargs)
        self.keyword_data = keyword_data
        self.input_keyword = input_keyword

        # Only the articles are needed here; the summaries are discarded.
        documents, _summaries = preprocess_2(keyword_data, input_keyword)
        self.docs = documents
        if documents is None:
            raise ValueError("Keyword not found or no data available.")

def compute_metrics(p, docs):
    """Score decoded predictions against reference summaries and source docs.

    Args:
        p: Prediction output exposing ``predictions`` (the logits, or a tuple
            whose first element is the logits) and ``label_ids``.
        docs: Source documents passed to ``cal_summac`` together with the
            decoded predictions.

    Returns:
        A dict with the keys ``rouge1``, ``rouge2``, ``rougeL``,
        ``bert_score``, ``avg_fkgl``, ``avg_cli``, ``avg_dcrs`` and
        ``summac_score``.
    """
    import numpy as np  # local import keeps this cell runnable on its own

    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    # Token ids are taken as the arg-max over the last (vocabulary) axis.
    predicted_ids = logits.argmax(-1)

    # -100 marks ignored / padded label positions and is not a valid token id,
    # so it is swapped for the pad token before decoding.
    label_ids = np.asarray(p.label_ids)
    label_ids = np.where(label_ids != -100, label_ids, tokenizer.pad_token_id)

    decoded_preds = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in predicted_ids
    ]
    decoded_labels = [
        tokenizer.decode(ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
        for ids in label_ids
    ]

    # calc_rouge returns the three scores in this order.
    rouge_results = dict(zip(['rouge1', 'rouge2', 'rougeL'], calc_rouge(decoded_preds, decoded_labels)))
    bert_score = calc_bertscore(decoded_preds, decoded_labels)
    avg_fkgl, avg_cli, avg_dcrs = calc_readability(decoded_preds)
    avg_sum = cal_summac(decoded_preds, docs)

    return {
        **rouge_results,
        "bert_score": bert_score,
        "avg_fkgl": avg_fkgl,
        "avg_cli": avg_cli,
        "avg_dcrs": avg_dcrs,
        "summac_score": avg_sum,
    }

# Fine-tune on the modified (definition-replaced) data and evaluate per keyword.
results_df = {}
# Only the keys are iterated below; the integer value is not read by this cell.
# NOTE(review): 480 is presumably the number of training examples -- confirm.
keyword_list = {
    'structural biology and molecular biophysics': 480,
}

folder_dir = '/content/drive/My Drive/NLP Final Project/biolaysumm2024_data'
file_path = os.path.join(folder_dir, 'eLife_train_modified_structural.json')
file_path_test = os.path.join(folder_dir, 'eLife_test_modified_structural.json')
base_model_save_path = "/content/drive/My Drive/model/"

# The JSON files do not depend on the keyword, so they are read once up front.
with open(file_path, 'r') as f:
    keyword_data = json.load(f)
with open(file_path_test, 'r') as f:
    keyword_data_val = json.load(f)

for keyword in keyword_list:
    train_inputs, train_labels = preprocess_and_tokenize2(keyword_data, tokenizer, keyword)
    val_inputs, val_labels = preprocess_and_tokenize2(keyword_data_val, tokenizer, keyword)
    if train_inputs is None or val_inputs is None:
        # Without both splits there is nothing to train or evaluate on.
        print(f"Skipping '{keyword}': keyword missing from the train or validation data.")
        continue

    train_dataset = MedicineDataset(train_inputs, train_labels)
    val_dataset = MedicineDataset(val_inputs, val_labels)
    # These loaders are not passed to the trainer below; they are kept so the
    # names stay available to any later cell.
    train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)

    training_args = TrainingArguments(
        output_dir='./results',
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        warmup_steps=100,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="steps",
        save_steps=100,
    )

    # NOTE(review): the same `model` object is reused on every iteration, so
    # with more than one keyword the fine-tuning would accumulate across keywords.
    trainer = CustomTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        keyword_data=keyword_data,
        input_keyword=keyword,
    )

    trainer.train()
    #keyword_save_path = os.path.join(base_model_save_path, f"bart_model_save_{keyword.replace(' ', '_')}")
    #os.makedirs(keyword_save_path, exist_ok=True)
    #model.save_pretrained(keyword_save_path)
    #tokenizer.save_pretrained(keyword_save_path)

    predict_output = trainer.predict(val_dataset)
    # The predictions come from the validation split, so they are scored against
    # the validation articles; trainer.docs holds the training articles, which
    # are not the sources of these summaries.
    val_docs, _ = preprocess_2(keyword_data_val, keyword)
    metrics = compute_metrics(predict_output, val_docs)

    results_df[keyword] = metrics

print(results_df)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)


Step,Training Loss,Validation Loss
100,2.3426,2.11615


Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 4.45 seconds, 6.07 sentences/sec
<All keys matched successfully>


Downloading tokenizer_config.json:   0%|          | 0.00/217 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(


Downloading model.safetensors:   0%|          | 0.00/235M [00:00<?, ?B/s]

{'structural biology and molecular biophysics': {'rouge1': 0.5887113593708855, 'rouge2': 0.20083853335653024, 'rougeL': 0.5666888347049172, 'bert_score': 0.8414443996217515, 'avg_fkgl': 8.137037037037036, 'avg_cli': 9.618148148148148, 'avg_dcrs': 8.07074074074074, 'summac_score': 0.4095482417830714}}


  histograms = torch.FloatTensor(histograms).to(self.device)


#### Results with and without preprocessing

In [None]:
# Recorded evaluation metrics for the single keyword below.
# NOTE(review): going by the variable name, this run presumably skipped the
# preprocessing step -- confirm which step is meant.
without_preprocess={'structural biology and molecular biophysics': {'rouge1': 0.5380485831326763, 'rouge2': 0.16223120435318295, 'rougeL': 0.4957532566013197, 'bert_score': 0.8543319856678998, 'avg_fkgl': 8.666666666666666, 'avg_cli': 9.924074074074074, 'avg_dcrs': 8.851111111111111, 'summac_score': 0.4140351657514219}}

In [None]:
# Recorded evaluation metrics for the single keyword below.
# NOTE(review): going by the variable name, this run presumably used the
# preprocessing step -- confirm which step is meant and which run produced it.
withprocess={'structural biology and molecular biophysics': {'rouge1': 0.5387937796392054, 'rouge2': 0.16679767944945903, 'rougeL': 0.4985975645522696, 'bert_score': 0.8555719278476857, 'avg_fkgl': 8.903703703703703, 'avg_cli': 10.194444444444445, 'avg_dcrs': 9.131851851851852, 'summac_score': 0.38076089708893385}}