In [1]:
import torch
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

# Show which PyTorch build this kernel is running.
print(f"PyTorch version: {torch.__version__}")

# MPS (Metal Performance Shaders) is Apple's GPU backend: report whether this
# PyTorch build includes it and whether it can be used right now.
print(f"Is MPS (Metal Performance Shader) built? {torch.backends.mps.is_built()}")
print(f"Is MPS available? {torch.backends.mps.is_available()}")

# Choose the device name: MPS when available, CPU otherwise.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using device: {device}")

PyTorch version: 2.1.0
Is MPS (Metal Performance Shader) built? True
Is MPS available? True
Using device: mps


In [2]:
import torch
# Smoke test: allocate a one-element tensor on the MPS device and print it,
# or report that the MPS backend is not available.
if not torch.backends.mps.is_available():
    print("MPS device not found.")
else:
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print(x)

tensor([1.], device='mps:0')


**Extractive: BERT**

In [3]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd
import time

# Run on Apple's MPS backend when PyTorch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the full validation split of the "ms2" configuration of allenai/mslr2022.
dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
# For a quick smoke test, uncomment the line below to keep only the first 10 rows.
# dataset = dataset.select(range(10))

# Load the bert-base-uncased tokenizer and encoder; the encoder is moved to `device`.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').to(device)

def bert_sentence_embeddings(sentences):
    """Embed each sentence with the notebook-level `tokenizer` and `model`.

    Every sentence is encoded on its own (inputs truncated to 512 tokens) and
    run through the model without gradient tracking. `last_hidden_state` is
    averaged over dim 1 (the token axis for Hugging Face BERT outputs), then
    squeezed, moved to the CPU and converted to NumPy.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A NumPy array built from the per-sentence vectors, in input order.
        An empty input yields an empty array.
    """
    def _embed(text):
        # Tokenize on the host, then move the encoded inputs to the model's device.
        encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Pool over dim 1 and hand the result back as a CPU NumPy vector.
        return hidden_states.mean(dim=1).squeeze().cpu().numpy()

    return np.array([_embed(text) for text in sentences])

def select_top_sentences(sentences, embeddings, n_sentences=2, random_state=0):
    """Build an extractive summary from the sentences closest to K-Means centroids.

    The embeddings are clustered into ``n_sentences`` groups and, for every
    centroid, the sentence with the smallest Euclidean distance to it is kept.

    Args:
        sentences: Sequence of sentence strings.
        embeddings: 2-D NumPy array with one row per entry of ``sentences``
            (row ``i`` is the embedding of ``sentences[i]``).
        n_sentences: Number of clusters, i.e. the maximum number of sentences
            in the summary.
        random_state: Seed forwarded to ``KMeans`` so repeated runs select the
            same sentences. Pass ``None`` for unseeded clustering.

    Returns:
        The selected sentences, in their original order and without repeats,
        joined by single spaces. When fewer than ``n_sentences`` sentences are
        supplied, all of them are returned without clustering.
    """
    if len(sentences) < n_sentences:
        return ' '.join(sentences)

    kmeans = KMeans(n_clusters=n_sentences, n_init=10, random_state=random_state)
    kmeans.fit(embeddings)

    # distances[i, k] is the distance from sentence i to centroid k.
    distances = np.linalg.norm(
        embeddings[:, np.newaxis] - kmeans.cluster_centers_, axis=2)
    nearest_per_centroid = np.argmin(distances, axis=0)

    # Cluster labels carry no meaning and two centroids may share the same
    # nearest sentence, so emit each chosen sentence once, in document order.
    ordered_unique = sorted(set(nearest_per_centroid.tolist()))
    return ' '.join(sentences[index] for index in ordered_unique)

def process_row(row):
    """Summarize every abstract of one dataset row and merge the results.

    Each abstract is split into sentences, embedded with
    ``bert_sentence_embeddings`` and reduced with ``select_top_sentences``;
    the per-abstract summaries are then joined with single spaces.

    Args:
        row: Mapping with a ``'review_id'`` entry and an ``'abstract'`` entry.
            ``'abstract'`` is iterated; each item is either a string or a list
            of strings (a list is joined with spaces first).

    Returns:
        A dict with the row's ``"review_id"`` and the combined ``"summary"``.
    """
    summaries = []

    for abstract in row['abstract']:
        # An abstract may arrive as a list of text pieces; flatten it to one string.
        abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

        # Splitting on '. ' removes the period from every fragment but the last,
        # so put it back to keep sentence boundaries in the summary. Blank
        # fragments carry no content and are dropped.
        fragments = abstract_text.split('. ')
        last_position = len(fragments) - 1
        sentences = [
            fragment if position == last_position else fragment + '.'
            for position, fragment in enumerate(fragments)
            if fragment.strip()
        ]
        if not sentences:
            # Nothing to summarize; also avoids a pointless model call.
            continue

        embeddings = bert_sentence_embeddings(sentences)
        summaries.append(select_top_sentences(sentences, embeddings))

    return {"review_id": row['review_id'], "summary": ' '.join(summaries).strip()}

# Run process_row over every row of the dataset.
summaries_dataset = dataset.map(process_row)

# Build a pandas DataFrame holding only the identifier and the generated summary.
df = pd.DataFrame(summaries_dataset)[['review_id', 'summary']]

# Write the table to CSV; the DataFrame index is included as a column.
csv_file_path = 'BERT_extractive_prediction.csv'
df.to_csv(csv_file_path, index=True)

print(f"Saved summaries to {csv_file_path}")

Map:   0%|          | 0/2021 [00:00<?, ? examples/s]

Saved summaries to BERT_extractive_prediction.csv


**Extractive: SciBERT**

In [4]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd
import time

# Run on Apple's MPS backend when PyTorch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the full validation split of the "ms2" configuration of allenai/mslr2022.
dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
# For a quick smoke test, uncomment the line below to keep only the first 5 rows.
# dataset = dataset.select(range(5))

# Load the SciBERT (scivocab, uncased) tokenizer and encoder; the encoder is moved to `device`.
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = BertModel.from_pretrained('allenai/scibert_scivocab_uncased').to(device)

def bert_sentence_embeddings(sentences):
    """Embed each sentence with the notebook-level `tokenizer` and `model`.

    Every sentence is encoded on its own (inputs truncated to 512 tokens) and
    run through the model without gradient tracking. `last_hidden_state` is
    averaged over dim 1 (the token axis for Hugging Face BERT outputs), then
    squeezed, moved to the CPU and converted to NumPy.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A NumPy array built from the per-sentence vectors, in input order.
        An empty input yields an empty array.
    """
    def _embed(text):
        # Tokenize on the host, then move the encoded inputs to the model's device.
        encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Pool over dim 1 and hand the result back as a CPU NumPy vector.
        return hidden_states.mean(dim=1).squeeze().cpu().numpy()

    return np.array([_embed(text) for text in sentences])

def select_top_sentences(sentences, embeddings, n_sentences=2, random_state=0):
    """Build an extractive summary from the sentences closest to K-Means centroids.

    The embeddings are clustered into ``n_sentences`` groups and, for every
    centroid, the sentence with the smallest Euclidean distance to it is kept.

    Args:
        sentences: Sequence of sentence strings.
        embeddings: 2-D NumPy array with one row per entry of ``sentences``
            (row ``i`` is the embedding of ``sentences[i]``).
        n_sentences: Number of clusters, i.e. the maximum number of sentences
            in the summary.
        random_state: Seed forwarded to ``KMeans`` so repeated runs select the
            same sentences. Pass ``None`` for unseeded clustering.

    Returns:
        The selected sentences, in their original order and without repeats,
        joined by single spaces. When fewer than ``n_sentences`` sentences are
        supplied, all of them are returned without clustering.
    """
    if len(sentences) < n_sentences:
        return ' '.join(sentences)

    kmeans = KMeans(n_clusters=n_sentences, n_init=10, random_state=random_state)
    kmeans.fit(embeddings)

    # distances[i, k] is the distance from sentence i to centroid k.
    distances = np.linalg.norm(
        embeddings[:, np.newaxis] - kmeans.cluster_centers_, axis=2)
    nearest_per_centroid = np.argmin(distances, axis=0)

    # Cluster labels carry no meaning and two centroids may share the same
    # nearest sentence, so emit each chosen sentence once, in document order.
    ordered_unique = sorted(set(nearest_per_centroid.tolist()))
    return ' '.join(sentences[index] for index in ordered_unique)

def process_row(row):
    """Summarize every abstract of one dataset row and merge the results.

    Each abstract is split into sentences, embedded with
    ``bert_sentence_embeddings`` and reduced with ``select_top_sentences``;
    the per-abstract summaries are then joined with single spaces.

    Args:
        row: Mapping with a ``'review_id'`` entry and an ``'abstract'`` entry.
            ``'abstract'`` is iterated; each item is either a string or a list
            of strings (a list is joined with spaces first).

    Returns:
        A dict with the row's ``"review_id"`` and the combined ``"summary"``.
    """
    summaries = []

    for abstract in row['abstract']:
        # An abstract may arrive as a list of text pieces; flatten it to one string.
        abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

        # Splitting on '. ' removes the period from every fragment but the last,
        # so put it back to keep sentence boundaries in the summary. Blank
        # fragments carry no content and are dropped.
        fragments = abstract_text.split('. ')
        last_position = len(fragments) - 1
        sentences = [
            fragment if position == last_position else fragment + '.'
            for position, fragment in enumerate(fragments)
            if fragment.strip()
        ]
        if not sentences:
            # Nothing to summarize; also avoids a pointless model call.
            continue

        embeddings = bert_sentence_embeddings(sentences)
        summaries.append(select_top_sentences(sentences, embeddings))

    return {"review_id": row['review_id'], "summary": ' '.join(summaries).strip()}

# Run process_row over every row of the dataset.
summaries_dataset = dataset.map(process_row)

# Build a pandas DataFrame holding only the identifier and the generated summary.
df = pd.DataFrame(summaries_dataset)[['review_id', 'summary']]

# Write the table to CSV; the DataFrame index is included as a column.
csv_file_path = 'SciBERT_extractive_prediction.csv'
df.to_csv(csv_file_path, index=True)

print(f"Saved summaries to {csv_file_path}")

Map:   0%|          | 0/2021 [00:00<?, ? examples/s]

Saved summaries to SciBERT_extractive_prediction.csv


**Extractive: BioBERT**

In [5]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd
import time

# Run on Apple's MPS backend when PyTorch reports it as available; otherwise use the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

# Load the full validation split of the "ms2" configuration of allenai/mslr2022.
dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
# For a quick smoke test, uncomment the line below to keep only the first 5 rows.
# dataset = dataset.select(range(5))

# Load the BioBERT (base, cased, v1.1) tokenizer and encoder; the encoder is moved to `device`.
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1').to(device)


def bert_sentence_embeddings(sentences):
    """Embed each sentence with the notebook-level `tokenizer` and `model`.

    Every sentence is encoded on its own (inputs truncated to 512 tokens) and
    run through the model without gradient tracking. `last_hidden_state` is
    averaged over dim 1 (the token axis for Hugging Face BERT outputs), then
    squeezed, moved to the CPU and converted to NumPy.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A NumPy array built from the per-sentence vectors, in input order.
        An empty input yields an empty array.
    """
    def _embed(text):
        # Tokenize on the host, then move the encoded inputs to the model's device.
        encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Pool over dim 1 and hand the result back as a CPU NumPy vector.
        return hidden_states.mean(dim=1).squeeze().cpu().numpy()

    return np.array([_embed(text) for text in sentences])

def select_top_sentences(sentences, embeddings, n_sentences=2, random_state=0):
    """Build an extractive summary from the sentences closest to K-Means centroids.

    The embeddings are clustered into ``n_sentences`` groups and, for every
    centroid, the sentence with the smallest Euclidean distance to it is kept.

    Args:
        sentences: Sequence of sentence strings.
        embeddings: 2-D NumPy array with one row per entry of ``sentences``
            (row ``i`` is the embedding of ``sentences[i]``).
        n_sentences: Number of clusters, i.e. the maximum number of sentences
            in the summary.
        random_state: Seed forwarded to ``KMeans`` so repeated runs select the
            same sentences. Pass ``None`` for unseeded clustering.

    Returns:
        The selected sentences, in their original order and without repeats,
        joined by single spaces. When fewer than ``n_sentences`` sentences are
        supplied, all of them are returned without clustering.
    """
    if len(sentences) < n_sentences:
        return ' '.join(sentences)

    kmeans = KMeans(n_clusters=n_sentences, n_init=10, random_state=random_state)
    kmeans.fit(embeddings)

    # distances[i, k] is the distance from sentence i to centroid k.
    distances = np.linalg.norm(
        embeddings[:, np.newaxis] - kmeans.cluster_centers_, axis=2)
    nearest_per_centroid = np.argmin(distances, axis=0)

    # Cluster labels carry no meaning and two centroids may share the same
    # nearest sentence, so emit each chosen sentence once, in document order.
    ordered_unique = sorted(set(nearest_per_centroid.tolist()))
    return ' '.join(sentences[index] for index in ordered_unique)

def process_row(row):
    """Summarize every abstract of one dataset row and merge the results.

    Each abstract is split into sentences, embedded with
    ``bert_sentence_embeddings`` and reduced with ``select_top_sentences``;
    the per-abstract summaries are then joined with single spaces.

    Args:
        row: Mapping with a ``'review_id'`` entry and an ``'abstract'`` entry.
            ``'abstract'`` is iterated; each item is either a string or a list
            of strings (a list is joined with spaces first).

    Returns:
        A dict with the row's ``"review_id"`` and the combined ``"summary"``.
    """
    summaries = []

    for abstract in row['abstract']:
        # An abstract may arrive as a list of text pieces; flatten it to one string.
        abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

        # Splitting on '. ' removes the period from every fragment but the last,
        # so put it back to keep sentence boundaries in the summary. Blank
        # fragments carry no content and are dropped.
        fragments = abstract_text.split('. ')
        last_position = len(fragments) - 1
        sentences = [
            fragment if position == last_position else fragment + '.'
            for position, fragment in enumerate(fragments)
            if fragment.strip()
        ]
        if not sentences:
            # Nothing to summarize; also avoids a pointless model call.
            continue

        embeddings = bert_sentence_embeddings(sentences)
        summaries.append(select_top_sentences(sentences, embeddings))

    return {"review_id": row['review_id'], "summary": ' '.join(summaries).strip()}

# Run process_row over every row of the dataset.
summaries_dataset = dataset.map(process_row)

# Build a pandas DataFrame holding only the identifier and the generated summary.
df = pd.DataFrame(summaries_dataset)[['review_id', 'summary']]

# Write the table to CSV; the DataFrame index is included as a column.
csv_file_path = 'BioBERT_extractive_prediction.csv'
df.to_csv(csv_file_path, index=True)

print(f"Saved summaries to {csv_file_path}")

Map:   0%|          | 0/2021 [00:00<?, ? examples/s]

Saved summaries to BioBERT_extractive_prediction.csv
