In [None]:
#Download and Extract the Dataset:
!wget https://ai2-s2-mslr.s3.us-west-2.amazonaws.com/mslr_data.tar.gz
!tar -xvf mslr_data.tar.gz

#Delete the Cochrane dataset and any other unwanted files:
!rm -r mslr_data/cochrane/
!rm mslr_data.tar.gz*

#Move the ms2 directory up one level and remove the parent mslr_data directory:
!mv mslr_data/ms2 ./
!rm -r mslr_data/
!rm -r sample_data/

In [None]:
###

In [1]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import matplotlib.pyplot as plt

# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.9.0


In [None]:
##########

In [2]:
#import 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration, AutoConfig
from rouge_score import rouge_scorer
import evaluate
from pprint import pprint  # Makes longer output readable without horizontal scrolling




In [3]:
# Load the data
train_inputs = pd.read_csv('ms2/train-inputs.csv')
train_targets = pd.read_csv('ms2/train-targets.csv')

# Merge the two dataframes on the 'ReviewID' column to create a new DataFrame 'df'
df = pd.merge(train_inputs, train_targets, on='ReviewID', how='inner')

# Display the first few rows to check the result
df.head()

Unnamed: 0,Unnamed: 0_x,ReviewID,PMID,Title,Abstract,Unnamed: 0_y,Target,Background
0,0,30760312,22776744,Improved Cell Survival and Paracrine Capacity ...,Although transplantation of adult bone marrow ...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
1,1,30760312,25271670,Adipose-derived stem cells attenuate pulmonary...,Abstract We investigated the effect of adipose...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
2,2,30760312,3493740,Effect of bone marrow mesenchymal stem cells o...,The aim of the present study was to investigat...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
3,3,30760312,1863023,Survival in patients with primary pulmonary hy...,OBJECTIVE To characterize mortality in persons...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
4,4,30760312,16291984,Sildenafil citrate therapy for pulmonary arter...,BACKGROUND Sildenafil inhibits phosphodiestera...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...


In [None]:
# Exploratory Data Analysis on the new DataFrame 'df'
# Calculate text lengths for relevant columns
df['Title Length'] = df['Title'].str.len()
df['Abstract Length'] = df['Abstract'].str.len()
df['Target Length'] = df['Target'].str.len()
df['Background Length'] = df['Background'].str.len()

# Plot the distribution of text lengths for Title and Abstract columns
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(df['Title Length'], kde=True).set_title('Distribution of Title Text Lengths')
plt.subplot(1, 2, 2)
sns.histplot(df['Abstract Length'], kde=True).set_title('Distribution of Abstract Text Lengths')
plt.tight_layout()
plt.show()

# Plot the distribution of text lengths for Target and Background columns
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(df['Target Length'], kde=True).set_title('Distribution of Target Text Lengths')
plt.subplot(1, 2, 2)
sns.histplot(df['Background Length'], kde=True).set_title('Distribution of Background Text Lengths')
plt.tight_layout()
plt.show()

In [None]:
####

In [None]:
df.head()

#**Pegasus**

In [None]:
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
import pandas as pd
from datasets import load_metric

# Assuming df is your DataFrame loaded with all the necessary columns.

# Group by ReviewID and concatenate all Abstracts
grouped_df = df.groupby('ReviewID').agg({
    'Abstract': ' '.join,
    'Target': 'first'  # Assuming all Targets per ReviewID are the same.
}).reset_index()

# Initialize the Pegasus tokenizer and model
ptokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
pmodel = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")


In [None]:
# Tokenize the inputs
inputs = ptokenizer(grouped_df['Abstract'][:20].tolist(), max_length=1024, 
                    truncation=True, padding="max_length", return_tensors="tf")
# inputs[:2]

In [None]:
# Generate summaries with the model
summary_ids = pmodel.generate(inputs["input_ids"],
                              num_beams=2,  # You can tune this
                              no_repeat_ngram_size=2,  # You can tune this
                              min_length=5,  # You can tune this
                              max_length=50,  # You can tune this
                              early_stopping=True)


In [None]:
summary_ids.shape

In [None]:
# Decode the summaries
summaries = ptokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

# Add summaries to the grouped DataFrame
grouped_df['Generated_Summary'] = summaries


#**Pegasus in PYTORCH**

In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import pandas as pd
from datasets import load_metric

# Assuming df is your DataFrame loaded with all the necessary columns.

# Group by ReviewID and concatenate all Abstracts
grouped_df = df.groupby('ReviewID').agg({
    'Abstract': ' '.join,
    'Target': 'first'  # Assuming all Targets per ReviewID are the same.
}).reset_index()

# Initialize the Pegasus tokenizer and model
ptokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
pmodel = PegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail")

# Tokenize the inputs
inputs = ptokenizer(grouped_df['Abstract'][:50].tolist(), max_length=1024, 
                    truncation=True, padding="max_length", return_tensors="pt")  # Note return_tensors="pt" for PyTorch


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Generate summaries with the model
summary_ids = pmodel.generate(inputs["input_ids"],
                              num_beams=2,  # You can tune this
                              no_repeat_ngram_size=2,  # You can tune this
                              min_length=5,  # You can tune this
                              max_length=50,  # You can tune this
                              early_stopping=True)

In [5]:
summary_ids.shape

torch.Size([50, 50])

In [6]:
summaries = ptokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
summaries[:10]

['The effectiveness of betamethasone in reducing respiratory distress syndrome was limited to premature infants delivered to mothers with intact fetal membranes .<n>The time between administration of the glucocorticoid and delivery did not significantly affect the incidence of RDS in this study',
 'Indomethacin is a rapidly effective , non-steroidal .<n>anti-inflammatory agent and is an alternative to predni-steroid therapy for the treatment of bronchiolar inflammation and swelling.',
 'One hundred children aged 3 to 12 years admitted for strabismus surgery were enrolled in a double-blind study .<n>The incidence of vomiting and the occurrence of the oculocardiac reflexes was recorded by a gastric tube at the',
 'A double-blind , r and om selection comparison was made of the therapeutic effects in acute herpes zoster of 40 % idoxuridine ( IDU ) dissolved in dimethyl sulphoxide ( DMSO)<n>The effect',
 'A multicenter retrospective audit of carotid endarterectomies performed during 1981 wa

In [None]:

# grouped_df['Generated_Summary'] = summaries


***PEGASUS  in Pytorch 2***

In [7]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import pandas as pd

# Load your DataFrame 'df' with all necessary columns
# df = pd.read_csv('path_to_your_csv.csv') # Uncomment and set path to CSV

# Group by ReviewID and concatenate all Abstracts
grouped_df = df.groupby('ReviewID').agg({
    'Abstract': ' '.join,
    'Target': 'first'  # Assuming all Targets per ReviewID are the same.
}).reset_index()

# Initialize the Pegasus tokenizer and model
ptokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
pmodel = PegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
grouped_df.shape

(14188, 3)

In [26]:
grouped_df

Unnamed: 0,ReviewID,Abstract,Target
0,2137711,A double-blind study was design ed to investig...,"Data from 12 controlled trials , involving ove..."
1,5338265,A DOUBLE-BLIND study on the effect of betameth...,"Studies in the United States,9 Canada,10 and t..."
2,7577280,Low doses ( 0.05 mg/kg ) of intravenously admi...,Metoclopramide 0.15 and 0.25 mg kg-1 was signi...
3,7779475,A double-blind r and om selection comparison w...,Firm recommendations for clinical practice are...
4,7847648,A multicenter retrospective audit of carotid ...,Real-time B-mode ultrasonographic imaging allo...
...,...,...,...
14183,32428983,Decreasing the population and activation of in...,PUFA administration may not differ when compar...
14184,32470201,Background Earlier diagnosis followed by multi...,Diabetes-related morbidity and health-related ...
14185,32484259,CONTEXT Post hoc analysis of a previous trial ...,Given the limited data from economic evaluatio...
14186,32495338,BACKGROUND Although there is r and omized evid...,"Based on long-term follow-up , RP compared wit..."


In [36]:
# Replace 'your_review_id' with the actual ReviewID you're looking for
your_review_id = 30760312

# Select the row where 'ReviewID' matches 'your_review_id'
target_text = df.loc[df['ReviewID'] == your_review_id, 'Target'].values[0]

print(target_text)

Conclusions SC therapy is effective for PAH in pre clinical studies .
These results may help to st and ardise pre clinical animal studies and provide a theoretical basis for clinical trial design in the future .


In [28]:
# Tokenize the inputs
inputs = ptokenizer(grouped_df['Abstract'][:50].tolist(), max_length=1024, 
                    truncation=True, padding="max_length", return_tensors="pt")  # Note return_tensors="pt" for PyTorch


In [37]:
# Define the batch size
batch_size = 5

# Generate summaries in batches of 5
summaries = []
for i in range(0, len(inputs['input_ids']), batch_size):
    input_ids_batch = inputs['input_ids'][i:i+batch_size]
    summary_ids = model.generate(input_ids_batch,
                                 num_beams=2,
                                 no_repeat_ngram_size=2,
                                 min_length=5,
                                 max_length=200,
                                 early_stopping=True)
    batch_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    summaries.extend(batch_summaries)

NameError: name 'review_ids_batch' is not defined

In [38]:
# Initialize the results list
results = []

# Generate summaries in batches of 5
for i in range(0, len(inputs['input_ids']), batch_size):
    input_ids_batch = inputs['input_ids'][i:i+batch_size]
    review_ids_batch = grouped_df['ReviewID'][i:i+batch_size].tolist()  # Get the corresponding ReviewIDs
    summary_ids = model.generate(input_ids_batch,
                                 num_beams=2,
                                 no_repeat_ngram_size=2,
                                 min_length=5,
                                 max_length=50,
                                 early_stopping=True)
    batch_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    # Append each summary with its ReviewID to the results list
    for review_id, summary in zip(review_ids_batch, batch_summaries):
        results.append({'ReviewID': review_id, 'Summary': summary})

# Convert the results to a DataFrame
summaries_df = pd.DataFrame(results)


In [30]:
# len(summaries)

50

In [39]:
summaries_df.head()

Unnamed: 0,ReviewID,Summary
0,2137711,The effectiveness of betamethasone in reducing...
1,5338265,"Indomethacin is a rapidly effective, non-stero..."
2,7577280,One hundred children aged 3 to 12 years admitt...
3,7779475,"A double-blind, r and om selection comparison ..."
4,7847648,A multicenter retrospective audit of carotid e...


In [40]:
# Merge the two DataFrames on the ReviewID
comparison_df = pd.merge(grouped_df[['ReviewID', 'Target']], summaries_df, on='ReviewID', how='inner')

# Now you have a DataFrame with the ReviewID, the target summaries, and the generated summaries
comparison_df.head()

Unnamed: 0,ReviewID,Target,Summary
0,2137711,"Data from 12 controlled trials , involving ove...",The effectiveness of betamethasone in reducing...
1,5338265,"Studies in the United States,9 Canada,10 and t...","Indomethacin is a rapidly effective, non-stero..."
2,7577280,Metoclopramide 0.15 and 0.25 mg kg-1 was signi...,One hundred children aged 3 to 12 years admitt...
3,7779475,Firm recommendations for clinical practice are...,"A double-blind, r and om selection comparison ..."
4,7847648,Real-time B-mode ultrasonographic imaging allo...,A multicenter retrospective audit of carotid e...


In [46]:
first_row = comparison_df.iloc[5]
print(f"ReviewID: {first_row['ReviewID']}")
print(f"Target Summary: {first_row['Target']}")
print(" ")
print(f"Generated Summary: {first_row['Summary']}")


ReviewID: 8671108
Target Summary: Based on evidence from r and omized trials , tricyclic anti-depressants appear to be the only agents of proven benefit for established postherpetic neuralgia
 
Generated Summary: We studied the effects of topical capsaicin on postherpetic neuralgia.<n>We compared amitriptyline with the relatively selective blocker desipramine in 38 patients with painful diabetic neuropathy and compared fluoxetine with placebo in 46 patients.


In [51]:
import pandas as pd
from bert_score import score
import logging

# Assuming 'comparison_df' is your DataFrame with 'Target' and 'Summary' columns.
# comparison_df = pd.read_csv('your_file.csv')  # If you're loading from a file, for example.

# Convert DataFrame to dictionaries
targets = {str(row['ReviewID']): {'target': row['Target']} for index, row in comparison_df.iterrows()}
generated = {str(row['ReviewID']): row['Summary'] for index, row in comparison_df.iterrows()}

# Define the calculate_bertscore function (as you have provided)
def calculate_bertscore(targets, generated, model_type="roberta-large"):
    logging.info(f"Computing BERTscore...")
    docids = list(targets.keys())
    target_texts = [targets[docid]['target'] for docid in docids]
    generated_texts = [generated.get(docid, '') for docid in docids]

    # BERTscore
    bs_ps, bs_rs, bs_fs = score(generated_texts, target_texts, model_type=model_type)
    return {
        "bs_ps": bs_ps,
        "bs_rs": bs_rs,
        "bs_fs": bs_fs
    }

# Calculate BERT scores
bert_scores = calculate_bertscore(targets, generated)

# bert_scores now contains the BERT precision, recall, and F1 scores for each summary


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [52]:
bert_scores

{'bs_ps': tensor([0.8466, 0.7782, 0.8352, 0.7814, 0.8199, 0.8273, 0.8169, 0.8160, 0.8046,
         0.8087, 0.8053, 0.8179, 0.8383, 0.7955, 0.8173, 0.8342, 0.8419, 0.7998,
         0.8401, 0.8036, 0.8349, 0.8551, 0.8168, 0.8619, 0.8354, 0.8192, 0.8223,
         0.8083, 0.8280, 0.8538, 0.8161, 0.8453, 0.7790, 0.8040, 0.8226, 0.8805,
         0.8056, 0.8030, 0.8267, 0.8256, 0.8205, 0.8468, 0.8211, 0.8130, 0.8210,
         0.8553, 0.7961, 0.8317, 0.8283, 0.8472]),
 'bs_rs': tensor([0.8270, 0.8072, 0.8140, 0.8328, 0.8199, 0.8582, 0.7957, 0.8782, 0.7987,
         0.8325, 0.7794, 0.8237, 0.8513, 0.8199, 0.8218, 0.8010, 0.8344, 0.8183,
         0.8516, 0.8310, 0.8479, 0.8119, 0.8219, 0.7995, 0.8463, 0.8134, 0.7967,
         0.8345, 0.8460, 0.8240, 0.8371, 0.8260, 0.8057, 0.8239, 0.8072, 0.8742,
         0.7505, 0.7862, 0.8500, 0.7876, 0.8047, 0.8284, 0.8330, 0.8326, 0.8072,
         0.7809, 0.8343, 0.8126, 0.8320, 0.8425]),
 'bs_fs': tensor([0.8367, 0.7924, 0.8244, 0.8063, 0.8199, 0.8425, 0.80

In [53]:
import pandas as pd
import logging
from ms2.models.utils import rouge_scores  # Ensure ms2 is installed or available in your environment
from transformers import AutoTokenizer

# Assuming 'comparison_df' is your DataFrame with 'Target' and 'Summary' columns.
# comparison_df = pd.read_csv('your_file.csv')  # If you're loading from a file, for example.

# Convert DataFrame to dictionaries
targets = {str(row['ReviewID']): {'target': row['Target']} for index, row in comparison_df.iterrows()}
generated = {str(row['ReviewID']): row['Summary'] for index, row in comparison_df.iterrows()}

# Define the calculate_rouge function
def calculate_rouge(targets, generated):
    logging.info(f"Computing ROUGE scores...")
    docids = list(targets.keys())
    target_texts = [[targets[docid]['target']] for docid in docids]
    generated_texts = [[generated.get(docid, '')] for docid in docids]

    # Initialize tokenizer from the 'facebook/bart-base' model
    tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')

    # Calculate ROUGE scores using the provided 'rouge_scores' function from the 'ms2.models.utils' module
    rouge_results = rouge_scores(generated_texts, target_texts, tokenizer, use_aggregator=True)
    return rouge_results

# Calculate ROUGE scores
rouge_results = calculate_rouge(targets, generated)

# rouge_results will contain the ROUGE scores for each summary



ModuleNotFoundError: No module named 'ms2.models'