In [44]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
import evaluate
import numpy as np

from pprint import pprint
from datasets import load_dataset
from sklearn.cluster import KMeans

from transformers import PegasusTokenizer, PegasusForConditionalGeneration
from transformers import BertTokenizer, BertModel
from nltk.tokenize import sent_tokenize

# sure the GPUs are working 
import tensorflow as tf

# Check for TensorFlow GPU access
print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# See TensorFlow version
print(f"TensorFlow version: {tf.__version__}")


# # Load the dataset
# dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')

# # Convert the dataset to a Pandas DataFrame
# df = dataset.to_pandas()


# Load the data- test inputs
df = pd.read_csv('mslr2022_validation.csv')


df = df[['review_id', 'abstract']]

#save the first 10 rows for practice 
df= df[5:10]

# df.head()

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.9.0


In [45]:
df.head(15)

Unnamed: 0,review_id,abstract
5,16801507,['We have recently demonstrated that glucocort...
6,11676811,['To compare the use of r and omized controls...
7,12719681,"[""BACKGROUND There is some evidence that quali..."
8,26830881,['Abstract This long-term extension of an 8-we...
9,28611377,['Aim Anticoagulation prophylaxis for stroke i...


In [50]:
# Split the abstracts into lists of sentences
df['split_abstracts'] = df['abstract'].apply(lambda x: x.split('\n '))
df['split_abstracts']

5    [['We have recently demonstrated that glucocor...
6    [['To compare the use of  r and omized control...
7    [["BACKGROUND There is some evidence that qual...
8    [['Abstract This long-term extension of an 8-w...
9    [['Aim Anticoagulation prophylaxis for stroke ...
Name: split_abstracts, dtype: object

***Extractive step: BERT, K means***

In [52]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import torch
from transformers import BertTokenizer, BertModel

# Assuming df is your actual DataFrame with 'review_id' and 'abstract' columns

# Split the concatenated abstracts into lists of abstracts
df['split_abstracts'] = df['abstract'].apply(lambda x: x.split('\n '))

# Initialize BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def bert_sentence_embeddings(sentences):
    embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

def select_top_sentences(sentences, embeddings, n_sentences=2):
    if len(sentences) < n_sentences:
        return ' '.join(sentences)
    kmeans = KMeans(n_clusters=n_sentences, n_init=10)
    kmeans.fit(embeddings)
    top_sentence_indices = np.argmin(
        np.linalg.norm(embeddings[:, np.newaxis] - kmeans.cluster_centers_, axis=2), axis=0)
    top_sentences = [sentences[index] for index in top_sentence_indices]
    return ' '.join(top_sentences)

# Process each abstract and create summaries
summaries = {}
for _, row in df.iterrows():
    review_id = row['review_id']
    split_abstracts = row['split_abstracts']
    combined_summary = ''
    
    for abstract in split_abstracts:
        # Split each abstract into sentences
        sentences = abstract.split('. ')  # Assuming sentences in abstracts are separated by '. '
        # Generate embeddings for each sentence
        embeddings = bert_sentence_embeddings(sentences)
        # Select the top sentences from these embeddings
        summary = select_top_sentences(sentences, embeddings)
        # Combine the summaries from each abstract
        combined_summary += summary + ' '
    
    # Add the combined summary for the current review_id to the summaries dictionary
    summaries[review_id] = combined_summary.strip()

# Print the summaries for each review_id
for review_id, summary in summaries.items():
    print(f"Review ID: {review_id}\nSummary: {summary}\n")


Review ID: 16801507
Summary: OPG was significantly decreased in group A ( P < 0.001 ) , while no significant change was seen in group B As vitamin K2 ( menatetrenone ) has been used for the treatment of osteoporosis , the present study was carried out to evaluate the effect of vitamin K2 on GC-induced bone loss  There were no significant differences between the groups in the incidence of serious adverse events  METHODS To evaluate the efficacy of strontium ranelate in preventing vertebral fractures in a phase 3 trial , we r and omly assigned 1649 postmenopausal women with osteoporosis ( low bone mineral density ) and at least one vertebral fracture to receive 2 g of oral strontium ranelate per day or placebo for three years  During 1 yr of calcium/vitamin D2 treatment , ucOC decreased ( P < 0.05 ) , especially in those with the initially increased values ( from 2.22 + /- 0.35 to 1.41 + /- 0.29 ng/ml , P < 0.005 ) contrasting with an increase in the placebo group ( P < 0.05 )  In conclu

In [58]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# Load pre-trained Pegasus tokenizer and model
tokenizer_pegasus = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
model_pegasus = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

# Dictionary to store the final summaries
final_summaries = {}

# Iterate over each summary in the summaries dictionary
for review_id, extractive_summary in summaries.items():
    # Prepare the input for the model
    inputs = tokenizer_pegasus(
        extractive_summary, 
        truncation=True, 
        padding="longest", 
        return_tensors="pt", 
        max_length=512
    )

    # Generate the summary with Pegasus
    try:
        summary_ids = model_pegasus.generate(
            inputs['input_ids'], 
            num_beams=4,
            min_length=50,
            max_length=200,             
            length_penalty=2.0, 
            early_stopping=True
        )
        
        # Decode the generated IDs to text
        pegasus_summary = tokenizer_pegasus.decode(
            summary_ids[0], 
            skip_special_tokens=True
        )
        
        # Store the summary in the final summaries dictionary
        final_summaries[review_id] = pegasus_summary

    except IndexError as e:
        print(f"Error processing review_id {review_id}: {e}")
        final_summaries[review_id] = ""

# Display the final summaries
for review_id, summary in final_summaries.items():
    print(f"Review ID: {review_id}\nAbstractive Summary: {summary}\n")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Review ID: 16801507
Abstractive Summary: The aim of this study was to investigate the effect of vitamin K2 on osteoporotic bone turnover (OPG) in postmenopausal women with osteoporosis and at least one vertebral fracture after three years of treatment with strontium ranelate in a phase 3 trial.

Review ID: 11676811
Abstractive Summary: The effect of exercise on low back pain has been investigated in a series of RCTs and human clinical trials (HCTs) of the same therapy in a hospital setting, with the aim of comparing the effects of two different exercise programs on low back pain.

Review ID: 12719681
Abstractive Summary: The purpose of this study was to investigate the relationship between patient distress and quality of life (QOL) in patients with chronic obstructive pulmonary disease (COPD) and the use of re-admission sources such as nebulisers and asthma inhalers.

Review ID: 26830881
Abstractive Summary: Here are some of the latest studies on anxiety disorders that have been publis