In [None]:
#Download and Extract the Dataset:
!wget https://ai2-s2-mslr.s3.us-west-2.amazonaws.com/mslr_data.tar.gz
!tar -xvf mslr_data.tar.gz

#Delete the Cochrane dataset and any other unwanted files:
!rm -r mslr_data/cochrane/
!rm mslr_data.tar.gz*

#Move the ms2 directory up one level and remove the parent mslr_data directory:
!mv mslr_data/ms2 ./
!rm -r mslr_data/
!rm -r sample_data/

In [None]:
###

In [None]:
import numpy as np
import pandas as pd
import sklearn
import tensorflow as tf
import matplotlib.pyplot as plt

# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")

In [None]:
##########

In [1]:
#import 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration, AutoConfig
from rouge_score import rouge_scorer
import evaluate
from pprint import pprint  # Makes longer output readable without horizontal scrolling




In [2]:
# Load the data
train_inputs = pd.read_csv('ms2/train-inputs.csv')
train_targets = pd.read_csv('ms2/train-targets.csv')

# Merge the two dataframes on the 'ReviewID' column to create a new DataFrame 'df'
df = pd.merge(train_inputs, train_targets, on='ReviewID', how='inner')

# Display the first few rows to check the result
df.head()

Unnamed: 0,Unnamed: 0_x,ReviewID,PMID,Title,Abstract,Unnamed: 0_y,Target,Background
0,0,30760312,22776744,Improved Cell Survival and Paracrine Capacity ...,Although transplantation of adult bone marrow ...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
1,1,30760312,25271670,Adipose-derived stem cells attenuate pulmonary...,Abstract We investigated the effect of adipose...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
2,2,30760312,3493740,Effect of bone marrow mesenchymal stem cells o...,The aim of the present study was to investigat...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
3,3,30760312,1863023,Survival in patients with primary pulmonary hy...,OBJECTIVE To characterize mortality in persons...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...
4,4,30760312,16291984,Sildenafil citrate therapy for pulmonary arter...,BACKGROUND Sildenafil inhibits phosphodiestera...,0,Conclusions SC therapy is effective for PAH in...,Background Despite significant progress in dru...


In [None]:
# Exploratory Data Analysis on the new DataFrame 'df'
# Calculate text lengths for relevant columns
df['Title Length'] = df['Title'].str.len()
df['Abstract Length'] = df['Abstract'].str.len()
df['Target Length'] = df['Target'].str.len()
df['Background Length'] = df['Background'].str.len()

# Plot the distribution of text lengths for Title and Abstract columns
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(df['Title Length'], kde=True).set_title('Distribution of Title Text Lengths')
plt.subplot(1, 2, 2)
sns.histplot(df['Abstract Length'], kde=True).set_title('Distribution of Abstract Text Lengths')
plt.tight_layout()
plt.show()

# Plot the distribution of text lengths for Target and Background columns
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
sns.histplot(df['Target Length'], kde=True).set_title('Distribution of Target Text Lengths')
plt.subplot(1, 2, 2)
sns.histplot(df['Background Length'], kde=True).set_title('Distribution of Background Text Lengths')
plt.tight_layout()
plt.show()

In [None]:
####

In [None]:
df.head()

#**Pegasus**

In [None]:
from transformers import PegasusTokenizer, TFPegasusForConditionalGeneration
import pandas as pd
from datasets import load_metric

# Assuming df is your DataFrame loaded with all the necessary columns.

# Group by ReviewID and concatenate all Abstracts
grouped_df = df.groupby('ReviewID').agg({
    'Abstract': ' '.join,
    'Target': 'first'  # Assuming all Targets per ReviewID are the same.
}).reset_index()

# Initialize the Pegasus tokenizer and model
ptokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
pmodel = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")


In [None]:
# Tokenize the inputs
inputs = ptokenizer(grouped_df['Abstract'][:20].tolist(), max_length=1024, 
                    truncation=True, padding="max_length", return_tensors="tf")
# inputs[:2]

In [None]:
# Generate summaries with the model
summary_ids = pmodel.generate(inputs["input_ids"],
                              num_beams=2,  # You can tune this
                              no_repeat_ngram_size=2,  # You can tune this
                              min_length=5,  # You can tune this
                              max_length=50,  # You can tune this
                              early_stopping=True)


In [None]:
summary_ids.shape

In [None]:
# Decode the summaries
summaries = ptokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)

# Add summaries to the grouped DataFrame
grouped_df['Generated_Summary'] = summaries


#**Pegasus in PYTORCH**

In [3]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration
import pandas as pd
from datasets import load_metric

# Assuming df is your DataFrame loaded with all the necessary columns.

# Group by ReviewID and concatenate all Abstracts
grouped_df = df.groupby('ReviewID').agg({
    'Abstract': ' '.join,
    'Target': 'first'  # Assuming all Targets per ReviewID are the same.
}).reset_index()

# Initialize the Pegasus tokenizer and model
ptokenizer = PegasusTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
pmodel = PegasusForConditionalGeneration.from_pretrained("google/pegasus-cnn_dailymail")

# Tokenize the inputs
inputs = ptokenizer(grouped_df['Abstract'][:50].tolist(), max_length=1024, 
                    truncation=True, padding="max_length", return_tensors="pt")  # Note return_tensors="pt" for PyTorch


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Generate summaries with the model
summary_ids = pmodel.generate(inputs["input_ids"],
                              num_beams=2,  # You can tune this
                              no_repeat_ngram_size=2,  # You can tune this
                              min_length=5,  # You can tune this
                              max_length=50,  # You can tune this
                              early_stopping=True)

In [5]:
summary_ids.shape

torch.Size([50, 50])

In [6]:
summaries = ptokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
summaries[:10]

['The effectiveness of betamethasone in reducing respiratory distress syndrome was limited to premature infants delivered to mothers with intact fetal membranes .<n>The time between administration of the glucocorticoid and delivery did not significantly affect the incidence of RDS in this study',
 'Indomethacin is a rapidly effective , non-steroidal .<n>anti-inflammatory agent and is an alternative to predni-steroid therapy for the treatment of bronchiolar inflammation and swelling.',
 'One hundred children aged 3 to 12 years admitted for strabismus surgery were enrolled in a double-blind study .<n>The incidence of vomiting and the occurrence of the oculocardiac reflexes was recorded by a gastric tube at the',
 'A double-blind , r and om selection comparison was made of the therapeutic effects in acute herpes zoster of 40 % idoxuridine ( IDU ) dissolved in dimethyl sulphoxide ( DMSO)<n>The effect',
 'A multicenter retrospective audit of carotid endarterectomies performed during 1981 wa

In [None]:

# grouped_df['Generated_Summary'] = summaries
