In [48]:
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import torch
# import evaluate
# import numpy as np

# from pprint import pprint
# from datasets import load_dataset
# from sklearn.cluster import KMeans

# from transformers import PegasusTokenizer, PegasusForConditionalGeneration
# from transformers import BertTokenizer, BertModel
# from nltk.tokenize import sent_tokenize

# # sure the GPUs are working 
# import tensorflow as tf

# # Check for TensorFlow GPU access
# print(f"TensorFlow has access to the following devices:\n{tf.config.list_physical_devices()}")

# # See TensorFlow version
# print(f"TensorFlow version: {tf.__version__}")


# # # Load the dataset
# dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')

# # # Convert the dataset to a Pandas DataFrame
# # df = dataset.to_pandas()


# # # Load the data- test inputs
# # df = pd.read_csv('mslr2022_validation.csv')


# # df = df[['review_id', 'abstract']]

# # #save the first 10 rows for practice 
# # df= df[10:15]

# # df.head()

# #save to csv
# # df.to_csv('testing.csv', index=True)

# #cut down to the first 5 for demonstration 
# dataset = dataset[:5]
# dataset

TensorFlow has access to the following devices:
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
TensorFlow version: 2.9.0


{'review_id': ['28514886', '18842808', '24297836', '32367221', '25038833'],
 'pmid': [['15870317',
   '20863418',
   '17991656',
   '15585783',
   '20032496',
   '24233255',
   '11964956',
   '7503180',
   '3113290',
   '11978748',
   '15699689',
   '4274545',
   '16978805',
   '17460489',
   '15343178',
   '20639792',
   '16801182',
   '12604972',
   '17433577',
   '19561548',
   '17325558',
   '19879595'],
  ['7872224',
   '15614200',
   '8247594',
   '17161227',
   '15955465',
   '1324483',
   '16391591',
   '12569112',
   '6096282',
   '18041436',
   '1313163'],
  ['16055524', '21536612', '2700574', '3332564'],
  ['11685356',
   '17261567',
   '22928432',
   '26578718',
   '29741911',
   '19633231',
   '27209621',
   '19956928',
   '28477270',
   '29776815',
   '25514139',
   '17293471',
   '21059327',
   '15466722',
   '23632778',
   '28273424',
   '21663722',
   '12860546',
   '24630956',
   '22886496',
   '19684298'],
  ['16687205',
   '20597708',
   '22982689',
   '15643227',
 

***Extractive step: BERT, K means***

In [2]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd


# Load the dataset and cut down to the first 5 for demonstration
dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
dataset = dataset.select(range(3))  # Use select to create a subset

# Initialize BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def bert_sentence_embeddings(sentences):
    embeddings = []
    for sentence in sentences:
        inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True)
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return np.array(embeddings)

def select_top_sentences(sentences, embeddings, n_sentences=5):
    if len(sentences) < n_sentences:
        return ' '.join(sentences)
    kmeans = KMeans(n_clusters=n_sentences, n_init=10)
    kmeans.fit(embeddings)
    top_sentence_indices = np.argmin(
        np.linalg.norm(embeddings[:, np.newaxis] - kmeans.cluster_centers_, axis=2), axis=0)
    top_sentences = [sentences[index] for index in top_sentence_indices]
    return ' '.join(top_sentences)

def process_row(row):
    review_id = row['review_id']
    abstract_list = row['abstract'] 

    combined_summary = ''

    for abstract in abstract_list:
        # Check if the abstract is a string; if not, join it into a single string
        abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

        # Split abstract into sentences
        sentences = abstract_text.split('. ')
        # Generate embeddings for each sentence
        embeddings = bert_sentence_embeddings(sentences)
        # Select the top sentences from these embeddings
        summary = select_top_sentences(sentences, embeddings)
        
#         # Print the top sentences for debugging
#         print("Top sentences for this abstract:", summary)
        
        
        # Combine the summaries from each abstract
        combined_summary += summary + ' '

    return {"review_id": review_id, "summary": combined_summary.strip()}

# Apply the function to each element of the dataset
summaries_dataset = dataset.map(process_row)

# Convert to pandas DataFrame
df = pd.DataFrame(summaries_dataset)
df = df[['review_id', 'summary']]
# Save to CSV
csv_file_path = 'test.csv'  # Update with your desired file path
df.to_csv(csv_file_path, index=True)

print(f"Saved summaries to {csv_file_path}")

Map:   0%|          | 0/3 [00:00<?, ? examples/s]

Saved summaries to test.csv


***abstractive step***

In [3]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
from datasets import load_dataset

# Load pre-trained Pegasus tokenizer and model
tokenizer_pegasus = PegasusTokenizer.from_pretrained('google/pegasus-xsum')
model_pegasus = PegasusForConditionalGeneration.from_pretrained('google/pegasus-xsum')

# Dictionary to store the final summaries
final_summaries = {}

# Iterate over each summary in the summaries dataset
for row in summaries_dataset:
    review_id = row['review_id']
    extractive_summary = row['summary']

    # Prepare the input for the model
    inputs = tokenizer_pegasus(
        extractive_summary, 
        truncation=True, 
        padding="longest", 
        return_tensors="pt", 
        max_length=512
    )

    # Generate the summary with Pegasus
    try:
        summary_ids = model_pegasus.generate(
            inputs['input_ids'], 
            num_beams=4,
            min_length=50,
            max_length=200,             
            length_penalty=2.0, 
            early_stopping=True
        )
        
        # Decode the generated IDs to text
        pegasus_summary = tokenizer_pegasus.decode(
            summary_ids[0], 
            skip_special_tokens=True
        )
        
        # Store the summary in the final summaries dictionary
        final_summaries[review_id] = pegasus_summary

    except IndexError as e:
        print(f"Error processing review_id {review_id}: {e}")
        final_summaries[review_id] = ""

# Display the final summaries
for review_id, summary in final_summaries.items():
    print(f"Review ID: {review_id}\nAbstractive Summary: {summary}\n")


Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Review ID: 28514886
Abstractive Summary: The effect of supplementing an infant formula with Galacigosaccharides (GOS) on the gut microbiota and the fetal immune system has been investigated in a double-blind, placebo-controlled study of 113 neonates, all of whom were breast-fed.

Review ID: 18842808
Abstractive Summary: The aim of the present study was to determine the effect of adding active fibre to a balanced 1200 kcal diet in healthy adults and children, during a five week observation period, and to determine the effect of the substance on markers of cardiovascular disease and weight loss.

Review ID: 24297836
Abstractive Summary: The effect of short-term hypoxia on leukocyte lactate concentration (LF(SBP) ) during rapid ascent to 4,559-m above sea level was investigated in a group of mountaineers who had previously suffered from acute mountain sickness ( AMS ).

