<a href="https://colab.research.google.com/github/AmirMoazzami/266_final_proj/blob/mk%2Fbart-biobart-exploration/pretrained_no_finetune/BioBART.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install torchsummary

In [1]:
import shutil
import os
import gc

from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM, BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset, load_from_disk
import torch


os.environ['PYTORCH_MPS_HIGH_WATERMARK_RATIO'] = "0.0"
print(os.getenv('PYTORCH_MPS_HIGH_WATERMARK_RATIO'))

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# use MPS instead
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print('There is MPS GPU available!')

elif torch.cuda.is_available():
    device = torch.device("cuda")
    print('There is CUDA GPU available!')

else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")


model_names = [
    "GanjinZero/biobart-v2-base",
    "facebook/bart-base",
    "facebook/bart-large-cnn",
]

def get_model_and_tokenizer(model_name: str):
    model = BartForConditionalGeneration.from_pretrained(model_name).to(device)
    tokenizer = BartTokenizer.from_pretrained(model_name)
    return model, tokenizer

def clear_memory(*args):
    """After inference, delete the variables to free up memory"""
    for arg in args:
        del arg
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()

# # run once only during development period:
# dataset = load_dataset(
#     "allenai/mslr2022",
#     "ms2",
#     split='train[:10]',  # use only for setting up/debugging
# )  # this takes a long time, ~ 6 mins. Most time spent in train split

# # subsetting the first 10 into a small file for quick debugging in the future (no need to pull entire dataset!)
# dataset.save_to_disk("first_10_train_examples")
# shutil.make_archive('first_10_train_examples', 'zip', 'first_10_train_examples')

# run following all the time
dataset = load_from_disk("first_10_train_examples")
dataset

0.0
There is MPS GPU available!


Dataset({
    features: ['review_id', 'pmid', 'title', 'abstract', 'target', 'background'],
    num_rows: 10
})

In [2]:
model, tokenizer = get_model_and_tokenizer("GanjinZero/biobart-v2-base")
# model, tokenizer = get_model_and_tokenizer("facebook/bart-large-cnn")
# model, tokenizer = get_model_and_tokenizer("facebook/bart-base")

In [14]:
# # old way

# test_idx = 5
# background_text = dataset[test_idx]["background"].replace("\n", " ")
# text1 = dataset[test_idx]["abstract"][0].replace("\n", " ")
# text2 = dataset[test_idx]["abstract"][1].replace("\n", " ")
# instruction = "summarize: BACKGROUND - "
# background_length = len(tokenizer.encode(instruction + background_text))

# # Encoding separately, concatenate, then decode
# inputs1 = tokenizer.encode(instruction + background_text + " ABSTRACT - " + text1, return_tensors="pt", max_length=1024, truncation=True).to(device)
# inputs2 = tokenizer.encode(instruction + background_text + " ABSTRACT - " + text2, return_tensors="pt", max_length=1024, truncation=True).to(device)

# encoded1 = model(inputs1).encoder_last_hidden_state[:, :int(1024 / 2)]  # max position embeddings is 1024 for BART
# encoded2 = model(inputs2).encoder_last_hidden_state[:, :int(1024 / 2)]

# concatenated = torch.cat((encoded1, encoded2), dim=1)
# decoded = model.generate(max_length=300, pad_token_id=1, inputs_embeds=concatenated, decoder_inputs_embeds=concatenated)
# print(decoded.shape)
# tokenizer.decode(decoded[0], skip_special_tokens=True)

torch.Size([1, 300])


'summarize: BACKGROUND - INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery.. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of the potential effect of preoperative carbohydrate loading on insulin resistance, gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. ABSTRACT - The effect on gastric pH and volume, and the effects of this effect on the ability of an effect on a possible effect of a potential effect on an effect of an upcoming event, was found to be, respectively, of -1, -2, -3, and -1.5, -4, -1, -1 and -2.2,-1, and was investigated in this study by the use of a technique of, and hence, of, a, and a, thus, of-1. During and after the period, we were able to detect, and we could not detect, the possibility of, or the outcome of, an effect, of an event, of bei

In [3]:
# different way of "concatenating" -- this is better

test_idx = 5

max_num_refs = 25
background_text = dataset[test_idx]["background"].replace("\n", " ")
# text1 = dataset[test_idx]["abstract"][1].replace("\n", " ")
# text2 = dataset[test_idx]["abstract"][0].replace("\n", " ")
# texts = [text1, text2]

texts = [text.replace("\n", " ") for text in dataset[test_idx]["abstract"]][:max_num_refs]
print(f"Number of references: {len(texts)}")

instruction = "summarize conclusion:"
text_with_preamble = [instruction + "<s>STUDY - " + text + "<s>BACKGROUND: " + background_text for text in texts]
inputs = tokenizer.batch_encode_plus(text_with_preamble, return_tensors="pt", max_length=1024, truncation=True, padding=True).to(device)

# Encoding separately, concatenate, then decode
encoder_outputs = model.model.encoder(inputs["input_ids"])
encoded = encoder_outputs[0]
encoder_outputs["last_hidden_state"] = encoded.reshape(1, -1, encoded.size(-1))

decoded = model.generate(max_length=512, pad_token_id=1, encoder_outputs=encoder_outputs)
print(decoded.shape)
summary_output_text = tokenizer.decode(decoded[0], skip_special_tokens=True)

# print with simple wrapping
print("\n".join(summary_output_text.split(". ")))

Number of references: 18
torch.Size([1, 444])
summarize results:STUDY - Background : Postoperative glucose metabolism is associated with postoperative insulin resistance
We previously showed that the effect of preoperative carbohydrate loading ( CHO ) on postoperative recovery
The effect of this study was investigated in 80 healthy children of ages five to ten years undergoing surgery
The aim of the present study was to investigate whether oral carbohydrate or carbohydrate with peptide drinks preoperatively could reduce the catabolic state, causes insulin resistance and may delay recovery
RESULTS The patients undergoing surgery were double-blinded and were given either a placebo drink or carbohydrate ( 12.6g/100ml ) drink ( CHOD )
The postoperative blood glucose level and the total insulin requirement for normoglycemic control during the 16 h following surgery were determined
The visual analog scale scores in a control group were not different between groups
During the waiting period b

In [7]:
print("\n".join(texts[8].split(". ")))

Background and objective We studied the effect of three different fasting protocol s on preoperative discomfort and glucose and insulin levels 
Methods Two hundred and ten ASA I – III patients undergoing general or gastrointestinal surgery were r and omly assigned to three groups : overnight intravenous 5 % glucose infusion ( 1000 ml ) , carbohydrate-rich drink ( 400 ml ) at 6–7 a.m
, or overnight fasting 
The subjective feelings of thirst , hunger , mouth dryness , weakness , tiredness , anxiety , headache and pain of each patient were question ed preoperatively using a visual analogue scale 
Serum glucose and insulin levels were measured at predetermined time points preoperatively 
Results During the waiting period before surgery , the carbohydrate-rich drink group was less hungry than the fasting group ( P = 0.011 ) 
No other differences were seen in visual analogue scale scores among the study groups 
Trend analysis showed increasing thirst , mouth dryness and anxiety in the intrav

In [12]:
# search in texts
query = "postoperative"
for text in texts:
    if query in text:
        print(text)
        break

BACKGROUND AND AIMS Preoperative intake of a clear carbohydrate-rich drink reduces  insulin resistance after surgery . In this study , we evaluated whether this could be related to increased insulin sensitivity at the onset of surgery . Furthermore , we aim ed to establish the optimal dose-regimen . METHODS Six healthy volunteers underwent hyperinsulinaemic ( 0.8 mU/kg/min ) , normoglycaemic ( 4.5 mmol/l ) clamps and indirect calorimetry on four occasions in a crossover-r and omised order ;  after overnight fasting ( CC ) , after a single evening dose ( 800 ml ) of the drink ( LC ) , after a single morning dose ( 400 ml , CL ) and after intake of the drink in the evening and in the morning before the clamp ( LL ) . Data are presented as mean+/-SD . Statistical analysis was performed using the Student 's t-test and ANOVA . RESULTS Insulin sensitivity was higher in CL and LL ( 9.2+/-1.5 and 9.3+/-1.9 mg/kg/min , respectively ) compared to CC and LC ( 6.1+/-1.6 and 6.6+/-1.9 mg/kg/min , P

In [4]:
encoded.reshape(1, -1, encoded.size(-1)).shape

torch.Size([1, 14760, 768])

In [9]:
print("\n".join(background_text.split(". ")))

INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state , causes insulin resistance and may delay recovery 
Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects 
A systematic review was undertaken to analyse the effect of preoperative carbohydrate loading on insulin resistance , gastric emptying , gastric acidity , patient wellbeing , immunity and nutrition following surgery .


In [8]:
clear_memory(model, tokenizer, inputs, encoder_outputs, encoded, decoded)

In [19]:
print(dataset[test_idx]["target"])

Preoperative carbohydrate drinks significantly improved insulin resistance and indices of patient comfort following surgery , especially hunger , thirst , malaise , anxiety and nausea .
No definite conclusions could be made regarding preservation of muscle mass .
Following ingestion of carbohydrate drinks , no adverse events such as apparent or proven aspiration during or after surgery were reported .
Administration of oral carbohydrate drinks before surgery is probably safe and may have a positive influence on a wide range of perioperative markers of clinical outcome .


Operationalizing our summarizer! Loop through our dataset:

In [16]:
dataset[0]["review_id"]

'30760312'

In [18]:
# Loop through our dataset

max_num_refs = 25
save_dir = "first_10_conclusion_1"
verbose = True
instruction = "Retrieve concise conclusion without background:"

if not os.path.exists(save_dir):
    os.makedirs(save_dir)

for idx in range(dataset.num_rows):
    if verbose:
        print(f"Index: {idx}")
    model, tokenizer = get_model_and_tokenizer("GanjinZero/biobart-v2-base")

    background_text = dataset[idx]["background"].replace("\n", " ")
    texts = [text.replace("\n", " ") for text in dataset[idx]["abstract"]][:max_num_refs]
    print(f"Number of references: {len(texts)}")

    text_with_preamble = [instruction + "<s> " + text + "<s>BACKGROUND: " + background_text for text in texts]
    inputs = tokenizer.batch_encode_plus(text_with_preamble, return_tensors="pt", max_length=1024, truncation=True, padding=True).to(device)

    # Encoding separately, concatenate, then decode
    encoder_outputs = model.model.encoder(inputs["input_ids"])
    encoded = encoder_outputs[0]
    encoder_outputs["last_hidden_state"] = encoded.reshape(1, -1, encoded.size(-1))

    decoded = model.generate(max_length=512, pad_token_id=1, encoder_outputs=encoder_outputs)
    if verbose:
        print(decoded.shape)
    summary_output_text = tokenizer.decode(decoded[0], skip_special_tokens=True)

    # print with simple wrapping
    if verbose:
        print("\n".join(summary_output_text.split(". ")))

    # Save to disk -- create simple text file with review_id as name
    review_id = dataset[idx]["review_id"]
    with open(os.path.join(save_dir, f"{review_id}.txt"), "w") as f:
        f.write(summary_output_text)

    clear_memory(inputs, encoder_outputs, encoded, decoded)

Index: 0
Number of references: 15
torch.Size([1, 417])
Retrieve concise conclusion without background: BACKGROUND Current therapies for pulmonary arterial hypertension ( PAH ) are not yet common practice, but awareness of the merits of conducting such SRs is steadily increasing
As animal intervention studies differ from r and omized clinical trials
The aim of this study was to further investigate the long-term effect of BMSCs on pulmonary hypertension, we compared the early effect of oral and intramuscular injection of MSCs
The study was a double-blind, placebo-controlled study, the pulmonary vascular wall, and pulmonary hemodynamics.
The right-sided heart catheterization and 6-MWD test were performed at baseline and at the time of 12 wk after cell infusion
The primary end point was the change from baseline to week 12 in the distance walked in six minutes
The change in mean pulmonary-artery pressure and World Health Organization ( WHO ) functional class and the incidence of clinical wo

Other older approaches

In [24]:
inputs_combined = tokenizer.encode("summarize: " + background_text + " " + text1 + " " + text2, return_tensors="pt", max_length=1024, truncation=True).to(device)

decoded_combined = model.generate(inputs_combined, max_length=300, pad_token_id=1)
print(decoded_combined.shape)
tokenizer.decode(decoded_combined[0], skip_special_tokens=True)

torch.Size([1, 300])


'summarize: INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of preoperative carbohydrate loading on insulin resistance. gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. The effect on gastric pH and volume of 0, 6 and 10 ml · kg−1, of apple juice given 2.5 hours before surgery to children aged five to ten years was investigated in this prospect i ve, r and omized, single-blind study. Gastric contents were aspirated after induction of anaesthesia, and the volume measured. The pH of the gastric aspirate was then assessed using pH paper. Neither gastric volume nor pH immediately following the induction of general anaesthesia were significantly different among the three groups. Gast gastric volumes after 0,6 and 10 m

In [25]:
inputs_solo = tokenizer.encode("summarize: BACKGROUND - " + background_text + " ABSTRACT - " + text1, return_tensors="pt", max_length=1024, truncation=True).to(device)

decoded_solo = model.generate(inputs_solo, max_length=300, pad_token_id=1)
tokenizer.decode(decoded_solo[0], skip_special_tokens=True)

'summarize: BACKGROUND - INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of preoperative carbohydrate loading on insulin resistance. gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. ABSTRACT - The effect on gastric pH and volume of 0, 6 and 10 ml · kg−1, of apple juice given 2.5 hours before surgery to children aged five to ten years was investigated in this prospect i ve, r and omized, single-blind study. Gastric contents were aspirated after induction of anaesthesia, and the volume measured. The pH of the gastric aspirate was then assessed using pH paper. Neither gastric volume nor pH immediately following the induction of general anaesthesia were significantly different among the three groups. Gastral volumes

In [26]:
print(inputs_solo.shape)
print(decoded_solo.shape)

torch.Size([1, 818])
torch.Size([1, 300])


In [27]:
print(decoded.shape)
tokenizer.decode(decoded[0], skip_special_tokens=True)

torch.Size([1, 300])


'summarize: BACKGROUND - INTRODUCTION Surgical stress in the presence of fasting worsens the catabolic state, causes insulin resistance and may delay recovery.. Carbohydrate rich drinks given preoperatively may ameliorate these deleterious effects. A systematic review was undertaken to analyse the effect of the potential effect of preoperative carbohydrate loading on insulin resistance, gastric emptying, gastric acidity, patient wellbeing, immunity and nutrition following surgery. ABSTRACT - The effect on gastric pH and volume, and the effects of this effect on the ability of an effect on a possible effect of a potential effect on an effect of an upcoming event, was found to be, respectively, of -1, -2, -3, and -1.5, -4, -1, -1 and -2.2,-1, and was investigated in this study by the use of a technique of, and hence, of, a, and a, thus, of-1. During and after the period, we were able to detect, and we could not detect, the possibility of, or the outcome of, an effect, of an event, of bei

In [29]:
clear_memory(inputs_combined, decoded_combined, inputs_solo, decoded_solo)

In [33]:
def summarize(text, model, tokenizer, **generate_args):
    inputs = tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=1024, truncation=True).to(device)
    summary_ids = model.generate(inputs, **generate_args)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Summarize the first 5 examples
summaries = []
for example in dataset:
    text = example["abstract"][1]  # single abstract
    # text = "; ".join([f"ARTICLE {e + 1}: {txt}" for e, txt in enumerate(example["abstract"])])  # concatenate everything into one

    # test text from news article:
    # text = """ATLANTIC CITY, N.J. — Danish energy developer Orsted said Tuesday night it is scrapping two large offshore wind-power projects off the coast of New Jersey, adding uncertainty to a nascent industry the Biden administration and many state governments are counting on to help transition away from the burning of planet-warming fossil fuels. The company said it is canceling its Ocean Wind I and II projects in southern New Jersey, citing supply-chain issues and rising interest rates."""
    summary = summarize(text, model, tokenizer, max_length=200, min_length=20, length_penalty=1.0, num_beams=10, early_stopping=True)  # guesses for generate args
    summaries.append(summary)
    break

for i, summary in enumerate(summaries):
    print(f"Summary {i+1}:\n{summary}\n")

Summary 1:
The GFP-tagged ADSCs were identified in the lungs and differentiated into endothelial-like cells. Two weeks post-MCT administration, the ADSCs group received 1 × 106 ADSCs via the external jugular vein. Compared to PAH rats, mean pulmonary arterial pressure was decreased in rats at 1, 2, and 3 weeks after ADSCs-treatment.



In [29]:
"; ".join([f"ARTICLE {e + 1}: {txt}" for e, txt in enumerate(example["abstract"])])

'ARTICLE 1: Although transplantation of adult bone marrow mesenchymal stem cells ( BM-MSCs ) holds promise in the treatment for pulmonary arterial hypertension ( PAH ) , the poor survival and differentiation potential of adult BM-MSCs have limited their therapeutic efficiency . Here , we compared the therapeutic efficacy of human embryonic stem cell-derived MSCs ( hESC-MSCs ) with adult BM-MSCs for the treatment of PAH in an animal model . One week following monocrotaline (MCT)-induced PAH , mice were r and omly assigned to receive phosphate-buffered saline ( MCT group ) ; 3.0 × 106 human BM-derived MSCs ( BM-MSCs group ) or 3.0 × 106 hESC-derived MSCs ( hESC-MSCs group ) via tail vein injection . At 3 weeks posttransplantation , the right ventricular systolic pressure ( RVSP ) , degree of RV hypertrophy , and medial wall thickening of pulmonary arteries were lower= , and pulmonary capillary density was higher in the hESC-MSC group as compared with BM-MSC and MCT groups ( all p < 0.05 

In [32]:
print(dataset[0]["abstract"][1].replace(". ", ". \n"))

Abstract We investigated the effect of adipose-derived stem cells ( ADSCs ) transplantation effects on structural remodeling and pulmonary artery pressure in  monocrotaline (MCT)-induced pulmonary hypertensive rats . 
In the first experiment , 32 male Sprague-Dawley ( SD ) rats were r and omly divided into four groups ( n = 8/group ) : 3 ADSCs treated groups and normal control ( Ctrl ) . 
ADSCs were administered through the left jugular vein at 105 , 106 and 107 cells , respectively , and a cell density of 106cells/ml was shown to be optimal . 
The GFP-tagged ADSCs were identified in the lungs and differentiated into endothelial-like cells . 
In the second experiment , 96 male SD rats were r and omly divided into three groups ( n = 32/group ) : Ctrl , MCT-induced pulmonary arterial hypertension ( PAH ) , and PAH treated with ADSCs ( ADSCs ) . 
Two weeks post-MCT administration , the ADSCs group received 1 × 106 ADSCs via the external jugular vein . 
Compared to PAH rats , mean pulmonar