<a href="https://colab.research.google.com/github/AmirMoazzami/266_final_proj/blob/mk%2Fbart-biobart-exploration/pretrained_no_finetune/BioBART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mounting GDrive to save results
# (Colab-only: the summaries and the final CSV are written under /content/drive/MyDrive.)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Install the libraries imported in the next cell (quiet mode; versions are not pinned).
!pip install -q transformers datasets torchsummary

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m48.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m83.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m60.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import shutil
import os
import gc
import time

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset, load_from_disk
import torch
from torch.cuda.amp import autocast
import torchsummary


# Memory optimization for MPS (a ratio of 0.0 lifts the allocator's upper limit)
os.environ.update({'PYTORCH_MPS_HIGH_WATERMARK_RATIO': "0.0"})
print(os.environ.get('PYTORCH_MPS_HIGH_WATERMARK_RATIO'))

# Memory optimization for CUDA: largest block size (in MB) the caching
# allocator is allowed to split.
max_split_size_mb = 256
_alloc_conf = "max_split_size_mb:" + str(max_split_size_mb)
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = _alloc_conf
print("PYTORCH_CUDA_ALLOC_CONF is set to: " + os.environ["PYTORCH_CUDA_ALLOC_CONF"])

# Synchronous kernel launches: easier debugging, slower execution.
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# Pick the compute backend in order of preference: MPS, then CUDA, then CPU.
if torch.backends.mps.is_available():
    _backend, _notice = "mps", 'There is MPS GPU available!'
elif torch.cuda.is_available():
    _backend, _notice = "cuda", 'There is CUDA GPU available!'
else:
    _backend, _notice = "cpu", 'No GPU available, using the CPU instead.'

device = torch.device(_backend)
print(_notice)


# Checkpoints explored in this notebook; each name can be passed to
# get_model_and_tokenizer().
model_names = [
    "GanjinZero/biobart-v2-base",
    "facebook/bart-base",
    "facebook/bart-large-cnn",
]

def get_model_and_tokenizer(model_name: str):
    """Load a pretrained BART checkpoint together with its tokenizer.

    Args:
        model_name: Checkpoint identifier handed to ``from_pretrained``.

    Returns:
        A ``(model, tokenizer)`` tuple. The model has been moved to the
        module-level ``device``; the tokenizer is returned as loaded.
    """
    bart_model = BartForConditionalGeneration.from_pretrained(model_name)
    bart_model = bart_model.to(device)
    bart_tokenizer = BartTokenizer.from_pretrained(model_name)
    return bart_model, bart_tokenizer

def clear_memory(*args):
    """Run garbage collection and release cached accelerator memory.

    A function cannot unbind variables that live in its caller's scope, so
    passing objects here does NOT free them: the caller must drop its own
    references first (``del model`` or rebinding the name) and then call
    ``clear_memory()``.

    Args:
        *args: Accepted for backward compatibility and ignored.

    Returns:
        None.
    """
    # The previous `for arg in args: del arg` only removed the loop variable;
    # it never released the objects themselves, so it has been dropped.
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()

# run once only during development period:
# Loads the "ms2" configuration of allenai/mslr2022 and keeps only the validation split.
dataset = load_dataset(
    "allenai/mslr2022",
    "ms2",
    split='validation',
)  # this takes a long time, ~ 9 mins. Most time spent in "Generating train split"

# # subsetting the first 10 into a small file for quick debugging in the future (no need to pull entire dataset!)
# NOTE(review): as written, the two lines below save whatever `dataset` holds at that
# point; no row selection is shown here -- confirm the subset was taken beforehand.
# dataset.save_to_disk("first_10_train_examples")
# shutil.make_archive('first_10_train_examples', 'zip', 'first_10_train_examples')

# run following for development
# dataset = load_from_disk("first_10_train_examples")

# Show the dataset summary (features and row count) as the cell output.
dataset

0.0
PYTORCH_CUDA_ALLOC_CONF is set to: max_split_size_mb:256
There is CUDA GPU available!


Downloading builder script:   0%|          | 0.00/7.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/264M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14188 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1667 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2021 [00:00<?, ? examples/s]

Dataset({
    features: ['review_id', 'pmid', 'title', 'abstract', 'target', 'background'],
    num_rows: 2021
})

In [None]:
# Unpack the archived debugging subset; the zip must already be present in /content.
!unzip /content/first_10_train_examples.zip -d /content/first_10_train_examples/

Archive:  /content/first_10_train_examples.zip
  inflating: /content/first_10_train_examples/data-00000-of-00001.arrow  
  inflating: /content/first_10_train_examples/state.json  
  inflating: /content/first_10_train_examples/dataset_info.json  


In [None]:
# Checkpoint used by the single-example experiments below; swap in one of the
# commented alternatives to compare models.
model, tokenizer = get_model_and_tokenizer("GanjinZero/biobart-v2-base")
# model, tokenizer = get_model_and_tokenizer("facebook/bart-large-cnn")
# model, tokenizer = get_model_and_tokenizer("facebook/bart-base")

In [None]:
# # old way

# test_idx = 5
# background_text = dataset[test_idx]["background"].replace("\n", " ")
# text1 = dataset[test_idx]["abstract"][0].replace("\n", " ")
# text2 = dataset[test_idx]["abstract"][1].replace("\n", " ")
# instruction = "summarize: BACKGROUND - "
# background_length = len(tokenizer.encode(instruction + background_text))

# # Encoding separately, concatenate, then decode
# inputs1 = tokenizer.encode(instruction + background_text + " ABSTRACT - " + text1, return_tensors="pt", max_length=1024, truncation=True).to(device)
# inputs2 = tokenizer.encode(instruction + background_text + " ABSTRACT - " + text2, return_tensors="pt", max_length=1024, truncation=True).to(device)

# encoded1 = model(inputs1).encoder_last_hidden_state[:, :int(1024 / 2)]  # max position embeddings is 1024 for BART
# encoded2 = model(inputs2).encoder_last_hidden_state[:, :int(1024 / 2)]

# concatenated = torch.cat((encoded1, encoded2), dim=1)
# decoded = model.generate(max_length=300, pad_token_id=1, inputs_embeds=concatenated, decoder_inputs_embeds=concatenated)
# print(decoded.shape)
# tokenizer.decode(decoded[0], skip_special_tokens=True)

torch.Size([1, 300])


'summarize: BACKGROUND - INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery.. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of the potential effect of preoperative carbohydrate loading on insulin resistance, gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. ABSTRACT - The effect on gastric pH and volume, and the effects of this effect on the ability of an effect on a possible effect of a potential effect on an effect of an upcoming event, was found to be, respectively, of -1, -2, -3, and -1.5, -4, -1, -1 and -2.2,-1, and was investigated in this study by the use of a technique of, and hence, of, a, and a, thus, of-1. During and after the period, we were able to detect, and we could not detect, the possibility of, or the outcome of, an effect, of an event, of bei

In [None]:
# different way of "concatenating" -- this is better
#
# Each study abstract is encoded as its own batch row; the per-study encoder
# states are then flattened into one long sequence and handed to generate()
# as a single set of encoder outputs.

test_idx = 5

max_num_refs = 25
test_example = dataset[test_idx]  # fetch the row once instead of once per field
background_text = test_example["background"].replace("\n", " ")
# text1 = dataset[test_idx]["abstract"][1].replace("\n", " ")
# text2 = dataset[test_idx]["abstract"][0].replace("\n", " ")
# texts = [text1, text2]

texts = [text.replace("\n", " ") for text in test_example["abstract"]][:max_num_refs]
print(f"Number of references: {len(texts)}")

instruction = "summarize conclusion:"
text_with_preamble = [instruction + "<s>STUDY - " + text + "<s>BACKGROUND: " + background_text for text in texts]
inputs = tokenizer.batch_encode_plus(text_with_preamble, return_tensors="pt", max_length=1024, truncation=True, padding=True).to(device)

# Encoding separately, concatenate, then decode.
# Inference only: without no_grad the encoder keeps the autograd graph (and all
# intermediate activations) alive for every reference, wasting accelerator memory.
# NOTE(review): only input_ids are passed, so padded positions are encoded like
# real tokens; left unchanged to keep the generated text comparable.
with torch.no_grad():
    encoder_outputs = model.model.encoder(inputs["input_ids"])
    encoded = encoder_outputs[0]
    # (num_refs, seq_len, hidden) -> (1, num_refs * seq_len, hidden)
    encoder_outputs["last_hidden_state"] = encoded.reshape(1, -1, encoded.size(-1))

    decoded = model.generate(max_length=512, pad_token_id=1, encoder_outputs=encoder_outputs)
print(decoded.shape)
summary_output_text = tokenizer.decode(decoded[0], skip_special_tokens=True)

# print with simple wrapping
print("\n".join(summary_output_text.split(". ")))

Number of references: 18
torch.Size([1, 313])
summarize conclusion:STUDY - Background : Postoperative hyperglycemia is associated with postoperative insulin resistance
We studied the effect of preoperative carbohydrate loading
The effect of this treatment on postoperative recovery
The study was conducted to determine the role of postoperative carbohydrate loading on postoperative insulin sensitivity
PATIENTS AND METHODS Insulin resistance and glucose turnover ( [ 6, 6,(2)H(2)]-D-glucose ) were measured using a carbohydrate-rich drink ( 400 ml )
The patients undergoing a hepatectomy for the treatment of a hepatic neoplasm were r and omly assigned to receive either a placebo drink or carbohydrate ( 12.6g/100ml ) drink ( CHOD )
Patients were classified into two sequential groups : the control group included 968 patients treated with sliding-scale-guided intermittent subcutaneous insulin injections ( SQI ), and the study group included 31 patients who underwent surgery
The visual analog sc

In [None]:
# Show the reference abstract at index 8, split on ". " so each sentence starts a new line.
print("\n".join(texts[8].split(". ")))

Background and objective We studied the effect of three different fasting protocol s on preoperative discomfort and glucose and insulin levels 
Methods Two hundred and ten ASA I – III patients undergoing general or gastrointestinal surgery were r and omly assigned to three groups : overnight intravenous 5 % glucose infusion ( 1000 ml ) , carbohydrate-rich drink ( 400 ml ) at 6–7 a.m
, or overnight fasting 
The subjective feelings of thirst , hunger , mouth dryness , weakness , tiredness , anxiety , headache and pain of each patient were question ed preoperatively using a visual analogue scale 
Serum glucose and insulin levels were measured at predetermined time points preoperatively 
Results During the waiting period before surgery , the carbohydrate-rich drink group was less hungry than the fasting group ( P = 0.011 ) 
No other differences were seen in visual analogue scale scores among the study groups 
Trend analysis showed increasing thirst , mouth dryness and anxiety in the intrav

In [None]:
# search in texts: print the first reference that contains the query string
query = "postoperative"
first_match = next((candidate for candidate in texts if query in candidate), None)
if first_match is not None:
    print(first_match)

BACKGROUND AND AIMS Preoperative intake of a clear carbohydrate-rich drink reduces  insulin resistance after surgery . In this study , we evaluated whether this could be related to increased insulin sensitivity at the onset of surgery . Furthermore , we aim ed to establish the optimal dose-regimen . METHODS Six healthy volunteers underwent hyperinsulinaemic ( 0.8 mU/kg/min ) , normoglycaemic ( 4.5 mmol/l ) clamps and indirect calorimetry on four occasions in a crossover-r and omised order ;  after overnight fasting ( CC ) , after a single evening dose ( 800 ml ) of the drink ( LC ) , after a single morning dose ( 400 ml , CL ) and after intake of the drink in the evening and in the morning before the clamp ( LL ) . Data are presented as mean+/-SD . Statistical analysis was performed using the Student 's t-test and ANOVA . RESULTS Insulin sensitivity was higher in CL and LL ( 9.2+/-1.5 and 9.3+/-1.9 mg/kg/min , respectively ) compared to CC and LC ( 6.1+/-1.6 and 6.6+/-1.9 mg/kg/min , P

In [None]:
# Shape of the flattened encoder states: all batch rows merged into one sequence axis.
encoded.reshape(1, -1, encoded.size(-1)).shape

torch.Size([1, 14760, 768])

In [None]:
# Show the review background that was appended to every reference, one sentence per line.
print("\n".join(background_text.split(". ")))

INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state , causes insulin resistance and may delay recovery 
Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects 
A systematic review was undertaken to analyse the effect of preoperative carbohydrate loading on insulin resistance , gastric emptying , gastric acidity , patient wellbeing , immunity and nutrition following surgery .


In [None]:
# Drop the notebook's own references first: memory can only be reclaimed once
# nothing refers to the model and tensors any more. pop() with a default keeps
# the cell safe to re-run.
for _name in ("model", "tokenizer", "inputs", "encoder_outputs", "encoded", "decoded"):
    globals().pop(_name, None)
clear_memory()

In [None]:
# The dataset's target summary for the same review, for comparison with the generated text.
print(dataset[test_idx]["target"])

Preoperative carbohydrate drinks significantly improved insulin resistance and indices of patient comfort following surgery , especially hunger , thirst , malaise , anxiety and nausea .
No definite conclusions could be made regarding preservation of muscle mass .
Following ingestion of carbohydrate drinks , no adverse events such as apparent or proven aspiration during or after surgery were reported .
Administration of oral carbohydrate drinks before surgery is probably safe and may have a positive influence on a wide range of perioperative markers of clinical outcome .


Operationalizing our summarizer! Loop through our dataset:

In [None]:
# Collect garbage and release cached CUDA memory before the full run.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Loop through our dataset
#
# For every review: encode each study abstract as its own batch row, flatten
# the encoder states into one long sequence, generate a single summary from it
# and write the text to <save_dir>/<review_id>.txt. Reviews that already have
# a file are skipped, so the cell can be re-run after an interruption.

max_num_refs = 18  # adjusted down based on memory limitations, trial and error :')
# save_dir = "/content/drive/MyDrive/266: NLP/266 final project/Baselines/biobart/validation"  # on UCB email
save_dir = "/content/drive/MyDrive/266 final project/Baselines/biobart/validation"  # on personal gmail
verbose = False
instruction = "Retrieve concise conclusion without background:"
torch.cuda.empty_cache()
model, tokenizer = get_model_and_tokenizer("GanjinZero/biobart-v2-base")

os.makedirs(save_dir, exist_ok=True)

start_time = time.time()

# Resume marker: every row up to and including this review_id is skipped
# without touching the disk. Set it to None to start from the first row.
last_stopping_point = "16539827"
past_stopping_point = last_stopping_point is None

for idx in range(dataset.num_rows):

    row = dataset[idx]  # decode the row once instead of once per field
    review_id = row["review_id"]
    if not past_stopping_point:
        if review_id == last_stopping_point:
            past_stopping_point = True
        continue

    # Leftover from an iteration that was interrupted before its own cleanup.
    if "encoder_outputs" in locals():
        del encoder_outputs
    gc.collect()
    torch.cuda.empty_cache()

    report_token = "*"
    output_path = os.path.join(save_dir, f"{review_id}.txt")

    # if summary already exists, skip
    if (skip := os.path.exists(output_path)):
        if verbose:
            print(f"Summary already exists for {review_id}, skipping...")
        report_token = "-"

    # Progress: one token per row, "|" every 10 rows, elapsed time every 100.
    if verbose:
        print(f"Index: {idx}")
    else:
        if (idx + 1) % 100 == 0:
            print(report_token, end="")
            print(f" ~ Time elapsed: {(time.time() - start_time) / 60:.2f} min")
        elif (idx + 1) % 10 == 0:
            print(report_token+"|", end="")
        else:
            print(report_token, end="")

    if skip:
        continue

    background_text = row["background"].replace("\n", " ")
    texts = [text.replace("\n", " ") for text in row["abstract"][:max_num_refs]]
    if verbose:
        print(f"Number of references: {len(texts)}")

    text_with_preamble = [instruction + "<s> " + text + "<s>BACKGROUND: " + background_text for text in texts]
    inputs = tokenizer.batch_encode_plus(text_with_preamble, return_tensors="pt", max_length=1024, truncation=True, padding=True).to(device)

    # Encoding separately, concatenate, then decode.
    # Inference only: without no_grad the encoder keeps the autograd graph (and
    # every intermediate activation) alive for all references.
    with torch.no_grad():
        encoder_outputs = model.model.encoder(
            inputs["input_ids"],
            # attention_mask=inputs["attention_mask"],  # this for some reason had a negative effect: summaries get truncated mid-sentence!
        )
        # (num_refs, seq_len, hidden) -> (1, num_refs * seq_len, hidden)
        encoder_outputs["last_hidden_state"] = encoder_outputs[0].reshape(1, -1, encoder_outputs[0].size(-1))

        with autocast():
            decoded = model.generate(max_length=512, pad_token_id=1, encoder_outputs=encoder_outputs)

    if verbose:
        print(decoded.shape)
    summary_output_text = tokenizer.decode(decoded[0], skip_special_tokens=True)

    # print with simple wrapping
    if verbose:
        print("\n".join(summary_output_text.split(". ")))

    # Save to disk -- create simple text file with review_id as name.
    # Explicit UTF-8: the abstracts contain non-ASCII characters.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(summary_output_text)

    del inputs
    del encoder_outputs
    del decoded
    gc.collect()
    torch.cuda.empty_cache()

if not past_stopping_point:
    # Without this, a wrong or stale marker skips the whole dataset silently.
    print(f"\nWARNING: resume marker {last_stopping_point!r} was not found in the dataset; no rows were processed.")

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/666M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/892k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

**|**********|**********|**********|**********|**********|**********|**********|********** ~ Time elapsed: 13.75 min
**********|**********|**********|**********|**********|**********|**********|**********|**********|********** ~ Time elapsed: 30.01 min
**********|**********|**********|**********|**********|**********|**********|**********|**********|********** ~ Time elapsed: 45.86 min
**********|**********|**********|**********|**********|**********|**********|**********|**********|********** ~ Time elapsed: 62.37 min
**********|**********|**********|**********|**********|**********|**********|**********|**********|********** ~ Time elapsed: 77.96 min
**********|**********|**********|**********|**********|**********|**********|**********|**********|********** ~ Time elapsed: 93.64 min
**********|**********|**********|**********|**********|**********|**********|**********|**********|********** ~ Time elapsed: 109.70 min
**********|**********|*

In [None]:
# Display the encoder module tree (embedding sizes, number and shape of layers).
model.model.encoder

BartEncoder(
  (embed_tokens): Embedding(85401, 768, padding_idx=1)
  (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
  (layers): ModuleList(
    (0-5): 6 x BartEncoderLayer(
      (self_attn): BartAttention(
        (k_proj): Linear(in_features=768, out_features=768, bias=True)
        (v_proj): Linear(in_features=768, out_features=768, bias=True)
        (q_proj): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (activation_fn): GELUActivation()
      (fc1): Linear(in_features=768, out_features=3072, bias=True)
      (fc2): Linear(in_features=3072, out_features=768, bias=True)
      (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    )
  )
  (layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)

Now that generation is done, concatenate all of the per-review summaries into a single, convenient CSV with the review ID and the candidate (generated summary).

In [18]:
import pandas as pd
import os

# save_dir = "/content/drive/MyDrive/266: NLP/266 final project/Baselines/biobart/validation"  # on UCB email
save_dir = "/content/drive/MyDrive/266 final project/Baselines/biobart/validation"  # on personal gmail

# Instruction text used when prompting; stripped when a summary starts with it.
prefix = "Retrieve concise conclusion without background:"

# Collect plain records and build the frame once at the end; growing a
# DataFrame with pd.concat inside the loop copies every row on each iteration.
records = []

# sorted() makes the CSV row order reproducible; os.listdir() order is arbitrary.
for file_count, filename in enumerate(sorted(os.listdir(save_dir))):
    if filename.endswith(".txt"):
        with open(os.path.join(save_dir, filename), "r", encoding="utf-8") as f:
            summary = f.read()

        if summary.startswith(prefix):
            summary = summary[len(prefix):]

        # The file name without ".txt" is the review ID.
        records.append({"review_id": filename[:-4], "candidate": summary})

    if file_count % 100 == 0:
        print(f"File count: {file_count}")

# Passing `columns` keeps the header present even when no summaries were found.
df = pd.DataFrame(records, columns=["review_id", "candidate"])
df.to_csv(os.path.join(save_dir, "biobart_validation.csv"), index=False)

File count: 0
File count: 100
File count: 200
File count: 300
File count: 400
File count: 500
File count: 600
File count: 700
File count: 800
File count: 900
File count: 1000
File count: 1100
File count: 1200
File count: 1300
File count: 1400
File count: 1500
File count: 1600
File count: 1700
File count: 1800
File count: 1900
File count: 2000


## Appendix dump
### Other older approaches (not relevant anymore)

In [None]:
# Older approach: put the background and two abstracts into one prompt, truncated to 1024 tokens.
# NOTE(review): text1/text2 are only assigned in commented-out lines in the cells
# visible here, so this cell needs them defined first.
inputs_combined = tokenizer.encode("summarize: " + background_text + " " + text1 + " " + text2, return_tensors="pt", max_length=1024, truncation=True).to(device)

# Generate at most 300 tokens and show the decoded text.
decoded_combined = model.generate(inputs_combined, max_length=300, pad_token_id=1)
print(decoded_combined.shape)
tokenizer.decode(decoded_combined[0], skip_special_tokens=True)

torch.Size([1, 300])


'summarize: INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of preoperative carbohydrate loading on insulin resistance. gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. The effect on gastric pH and volume of 0, 6 and 10 ml · kg−1, of apple juice given 2.5 hours before surgery to children aged five to ten years was investigated in this prospect i ve, r and omized, single-blind study. Gastric contents were aspirated after induction of anaesthesia, and the volume measured. The pH of the gastric aspirate was then assessed using pH paper. Neither gastric volume nor pH immediately following the induction of general anaesthesia were significantly different among the three groups. Gast gastric volumes after 0,6 and 10 m

In [None]:
# Older approach: background plus a single abstract in one labelled prompt.
# NOTE(review): text1 is only assigned in commented-out lines in the cells visible here.
inputs_solo = tokenizer.encode("summarize: BACKGROUND - " + background_text + " ABSTRACT - " + text1, return_tensors="pt", max_length=1024, truncation=True).to(device)

# Generate at most 300 tokens and show the decoded text.
decoded_solo = model.generate(inputs_solo, max_length=300, pad_token_id=1)
tokenizer.decode(decoded_solo[0], skip_special_tokens=True)

'summarize: BACKGROUND - INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of preoperative carbohydrate loading on insulin resistance. gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. ABSTRACT - The effect on gastric pH and volume of 0, 6 and 10 ml · kg−1, of apple juice given 2.5 hours before surgery to children aged five to ten years was investigated in this prospect i ve, r and omized, single-blind study. Gastric contents were aspirated after induction of anaesthesia, and the volume measured. The pH of the gastric aspirate was then assessed using pH paper. Neither gastric volume nor pH immediately following the induction of general anaesthesia were significantly different among the three groups. Gastral volumes

In [None]:
# Compare the prompt length with the generated length (both in tokens).
print(inputs_solo.shape)
print(decoded_solo.shape)

torch.Size([1, 818])
torch.Size([1, 300])


In [None]:
# NOTE(review): `decoded` is whatever an earlier cell last assigned; the output
# recorded below appears to come from the commented-out "old way" cell -- confirm.
print(decoded.shape)
tokenizer.decode(decoded[0], skip_special_tokens=True)

torch.Size([1, 300])


'summarize: BACKGROUND - INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery.. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of the potential effect of preoperative carbohydrate loading on insulin resistance, gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. ABSTRACT - The effect on gastric pH and volume, and the effects of this effect on the ability of an effect on a possible effect of a potential effect on an effect of an upcoming event, was found to be, respectively, of -1, -2, -3, and -1.5, -4, -1, -1 and -2.2,-1, and was investigated in this study by the use of a technique of, and hence, of, a, and a, thus, of-1. During and after the period, we were able to detect, and we could not detect, the possibility of, or the outcome of, an effect, of an event, of bei

In [None]:
# Drop the notebook's own references first: memory can only be reclaimed once
# nothing refers to the tensors any more. pop() with a default keeps the cell
# safe to re-run.
for _name in ("inputs_combined", "decoded_combined", "inputs_solo", "decoded_solo"):
    globals().pop(_name, None)
clear_memory()

In [None]:
def summarize(text, model, tokenizer, **generate_args):
    """Generate a summary for one piece of text.

    Args:
        text: Source text; it is prefixed with "summarize: " before encoding.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``encode`` and ``decode``.
        **generate_args: Forwarded unchanged to ``model.generate``.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    prompt = "summarize: " + text
    # Prompts longer than 1024 tokens are truncated.
    input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=1024, truncation=True)
    generated = model.generate(input_ids.to(device), **generate_args)
    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)

# Summarize the first example only: the loop below breaks after one iteration.
summaries = []
for example in dataset:
    text = example["abstract"][1]  # single abstract (index 1, i.e. the second one listed)
    # text = "; ".join([f"ARTICLE {e + 1}: {txt}" for e, txt in enumerate(example["abstract"])])  # concatenate everything into one

    # test text from news article:
    # text = """ATLANTIC CITY, N.J. — Danish energy developer Orsted said Tuesday night it is scrapping two large offshore wind-power projects off the coast of New Jersey, adding uncertainty to a nascent industry the Biden administration and many state governments are counting on to help transition away from the burning of planet-warming fossil fuels. The company said it is canceling its Ocean Wind I and II projects in southern New Jersey, citing supply-chain issues and rising interest rates."""
    summary = summarize(text, model, tokenizer, max_length=200, min_length=20, length_penalty=1.0, num_beams=10, early_stopping=True)  # guesses for generate args
    summaries.append(summary)
    break

# Print every collected summary with a 1-based label.
for i, summary in enumerate(summaries):
    print(f"Summary {i+1}:\n{summary}\n")

Summary 1:
The GFP-tagged ADSCs were identified in the lungs and differentiated into endothelial-like cells. Two weeks post-MCT administration, the ADSCs group received 1 × 106 ADSCs via the external jugular vein. Compared to PAH rats, mean pulmonary arterial pressure was decreased in rats at 1, 2, and 3 weeks after ADSCs-treatment.



In [None]:
# Preview of the "concatenate everything" variant: all abstracts of the last loop example, numbered and joined.
"; ".join([f"ARTICLE {e + 1}: {txt}" for e, txt in enumerate(example["abstract"])])

'ARTICLE 1: Although transplantation of adult bone marrow mesenchymal stem cells ( BM-MSCs ) holds promise in the treatment for pulmonary arterial hypertension ( PAH ) , the poor survival and differentiation potential of adult BM-MSCs have limited their therapeutic efficiency . Here , we compared the therapeutic efficacy of human embryonic stem cell-derived MSCs ( hESC-MSCs ) with adult BM-MSCs for the treatment of PAH in an animal model . One week following monocrotaline (MCT)-induced PAH , mice were r and omly assigned to receive phosphate-buffered saline ( MCT group ) ; 3.0 × 106 human BM-derived MSCs ( BM-MSCs group ) or 3.0 × 106 hESC-derived MSCs ( hESC-MSCs group ) via tail vein injection . At 3 weeks posttransplantation , the right ventricular systolic pressure ( RVSP ) , degree of RV hypertrophy , and medial wall thickening of pulmonary arteries were lower= , and pulmonary capillary density was higher in the hESC-MSC group as compared with BM-MSC and MCT groups ( all p < 0.05 

In [None]:
# The abstract that was summarized above (row 0, abstract index 1), one sentence per line.
print(dataset[0]["abstract"][1].replace(". ", ". \n"))

Abstract We investigated the effect of adipose-derived stem cells ( ADSCs ) transplantation effects on structural remodeling and pulmonary artery pressure in  monocrotaline (MCT)-induced pulmonary hypertensive rats . 
In the first experiment , 32 male Sprague-Dawley ( SD ) rats were r and omly divided into four groups ( n = 8/group ) : 3 ADSCs treated groups and normal control ( Ctrl ) . 
ADSCs were administered through the left jugular vein at 105 , 106 and 107 cells , respectively , and a cell density of 106cells/ml was shown to be optimal . 
The GFP-tagged ADSCs were identified in the lungs and differentiated into endothelial-like cells . 
In the second experiment , 96 male SD rats were r and omly divided into three groups ( n = 32/group ) : Ctrl , MCT-induced pulmonary arterial hypertension ( PAH ) , and PAH treated with ADSCs ( ADSCs ) . 
Two weeks post-MCT administration , the ADSCs group received 1 × 106 ADSCs via the external jugular vein . 
Compared to PAH rats , mean pulmonar