**Extractive: BERT_K_means**

In [1]:
# from transformers import BertTokenizer, BertModel
# import torch
# import numpy as np
# from sklearn.cluster import KMeans
# from datasets import load_dataset
# import pandas as pd

# import spacy
# # Load the SpaCy model
# nlp = spacy.load("en_core_web_sm")


# # Check if MPS is available and set the device accordingly
# if torch.cuda.is_available():
#     device = torch.device("cuda")
#     print("Using CUDA (GPU) device.")
# elif torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print("Using MPS (Metal Performance Shaders) device.")
# else:
#     device = torch.device("cpu")
#     print("GPU not available. Using CPU.")


# # Load the dataset and cut down 
# dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
# # Use select to create a subset
# # dataset = dataset.select(range(20,30))  


# # Initialize BERT and move the model to the device
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased').to(device)

# def bert_sentence_embeddings(sentences):
#     embeddings = []
#     for sentence in sentences:
#         # Move inputs to the device
#         inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True).to(device)
#         with torch.no_grad():
#             outputs = model(**inputs)
#         # Move outputs back to CPU
#         embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy())
#     return np.array(embeddings)

# #this is fixed : each selected sentence is from a different cluster
# def select_top_sentences(sentences, embeddings, n_sentences=5):
#     if len(sentences) <= n_sentences:
#         return ' '.join(sentences)

#     kmeans = KMeans(n_clusters=n_sentences, n_init=10)
#     kmeans.fit(embeddings)

#     # Initialize a set to store indices of selected sentences
#     selected_indices = set()
#     top_sentences = []

#     for i in range(n_sentences):
#         # Calculate distances of all sentences from the i-th centroid
#         distances = np.linalg.norm(embeddings - kmeans.cluster_centers_[i], axis=1)

#         # Sort the sentences by their distance from the centroid
#         sorted_indices = np.argsort(distances)

#         # Find the closest sentence that hasn't been selected yet
#         for index in sorted_indices:
#             if index not in selected_indices:
#                 selected_indices.add(index)
#                 top_sentences.append(sentences[index])
#                 break

#     return ' '.join(top_sentences)


# def process_row(row):
#     review_id = row['review_id']
#     abstract_list = row['abstract']
#     combined_summary = ''

#     for abstract in abstract_list:
#         # Check if the abstract is a string; if not, join it into a single string
#         abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

#         # Use SpaCy for sentence segmentation
#         doc = nlp(abstract_text)
#         sentences = [sent.text.strip() for sent in doc.sents]

#         # Generate embeddings for each sentence
#         embeddings = bert_sentence_embeddings(sentences)
#         # Select the top sentences from these embeddings
#         summary = select_top_sentences(sentences, embeddings)

#         # Combine the summaries from each abstract
#         combined_summary += summary + ' '

#     return {"review_id": review_id, "summary": combined_summary.strip()}

# # Apply the function to each element of the dataset
# summaries_dataset = dataset.map(process_row)


# # Convert to pandas DataFrame
# df = pd.DataFrame(summaries_dataset)
# df = df[['review_id', 'summary']]
# # Save to CSV
# csv_file_path = 'BERT_Kmeans_extractive.csv'  # Update with your desired file path
# df.to_csv(csv_file_path, index=True)

# print(f"Saved summaries to {csv_file_path}")

**Extractive: SciBERT_K_means**

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd

import spacy
# Load the spaCy `en_core_web_sm` pipeline once at module level; `process_row`
# calls it to split each abstract into sentences.
nlp = spacy.load("en_core_web_sm")


# Pick the compute device: CUDA if present, otherwise Apple's MPS backend,
# otherwise the CPU.
# `torch.backends.mps` is looked up defensively so that PyTorch builds that
# predate the MPS backend fall through to the CPU branch instead of raising
# AttributeError.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA (GPU) device.")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders) device.")
else:
    device = torch.device("cpu")
    print("GPU not available. Using CPU.")

    

# Validation split of the "ms2" configuration of allenai/mslr2022.
dataset = load_dataset(path="allenai/mslr2022", name="ms2", split='validation')
# For a quick trial run, uncomment to work on a small slice instead:
# dataset = dataset.select(range(20,30))


# SciBERT tokenizer and encoder; the encoder is moved to the selected device.
tokenizer = BertTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
model = BertModel.from_pretrained('allenai/scibert_scivocab_uncased')
model = model.to(device)

def bert_sentence_embeddings(sentences):
    """Return one mean-pooled embedding per sentence as a NumPy array.

    Each sentence is tokenized on its own (truncated to at most 512 tokens),
    passed through the module-level ``model`` with gradients disabled, and
    reduced to a single vector by averaging ``last_hidden_state`` over
    ``dim=1``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        ``np.array`` built from the per-sentence vectors, in input order.
    """
    def embed(text):
        # The encoded inputs have to live on the same device as the model.
        encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Pool, drop singleton dimensions, and bring the result back to the CPU.
        return hidden_states.mean(dim=1).squeeze().cpu().numpy()

    return np.array([embed(sentence) for sentence in sentences])

# Each selected sentence is taken from a different K-means cluster.
def select_top_sentences(sentences, embeddings, n_sentences=5, random_state=None):
    """Pick one representative sentence per K-means cluster and join them.

    The embeddings are clustered into ``n_sentences`` groups. For each
    cluster, the member sentence closest (Euclidean distance) to that
    cluster's centroid is selected. Candidates are restricted to the
    cluster's own members: searching over all sentences would let a sentence
    from a neighbouring cluster be picked whenever the centroid's nearest
    sentence had already been used, so two picks could share a cluster.

    Args:
        sentences: Sequence of sentence strings.
        embeddings: Array with one row per sentence, aligned with
            ``sentences``.
        n_sentences: Number of clusters, and therefore of selected sentences.
        random_state: Forwarded to ``KMeans``. Pass an int for reproducible
            clustering; the default ``None`` keeps the randomized
            initialization.

    Returns:
        The selected sentences joined by single spaces, ordered by cluster
        index. When there are at most ``n_sentences`` sentences, no
        clustering is done and all sentences are returned joined in their
        original order.
    """
    if len(sentences) <= n_sentences:
        return ' '.join(sentences)

    kmeans = KMeans(n_clusters=n_sentences, n_init=10, random_state=random_state)
    kmeans.fit(embeddings)
    labels = kmeans.labels_

    # True for every sentence that has not been selected yet.
    available = np.ones(len(sentences), dtype=bool)
    top_sentences = []

    for cluster_id, centroid in enumerate(kmeans.cluster_centers_):
        distances = np.linalg.norm(embeddings - centroid, axis=1)

        candidates = available & (labels == cluster_id)
        if not candidates.any():
            # A cluster can end up without usable members (e.g. many duplicate
            # embeddings); fall back to the nearest unused sentence overall so
            # the summary still contains n_sentences sentences.
            candidates = available

        # Mask non-candidates with +inf so argmin only considers candidates.
        best = int(np.argmin(np.where(candidates, distances, np.inf)))
        available[best] = False
        top_sentences.append(sentences[best])

    return ' '.join(top_sentences)


def process_row(row):
    """Build an extractive summary for one dataset row.

    Every abstract in ``row['abstract']`` is split into sentences with the
    module-level spaCy pipeline, embedded, and summarized independently with
    ``select_top_sentences``; the per-abstract summaries are then joined with
    single spaces.

    Args:
        row: Mapping with a ``'review_id'`` entry and an ``'abstract'`` entry.
            ``'abstract'`` is iterated as a collection of abstracts, each
            either a string or a list of strings (joined with spaces).

    Returns:
        Dict with the row's ``"review_id"`` and the combined ``"summary"``.
    """
    summaries = []

    for abstract in row['abstract']:
        # A missing abstract cannot be segmented; skip it instead of failing
        # inside the spaCy pipeline.
        if abstract is None:
            continue

        # An abstract given as a list of strings is flattened into one text.
        abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

        # Sentence segmentation. Spans that are empty after stripping
        # (e.g. whitespace-only spans) are dropped so they are neither
        # embedded nor able to take up a slot in the summary.
        doc = nlp(abstract_text)
        sentences = [text for text in (sent.text.strip() for sent in doc.sents) if text]
        if not sentences:
            # Nothing to summarize; also avoids stray double spaces below.
            continue

        embeddings = bert_sentence_embeddings(sentences)
        summaries.append(select_top_sentences(sentences, embeddings))

    return {"review_id": row['review_id'], "summary": ' '.join(summaries).strip()}

# Run the summarizer over every row of the dataset.
summaries_dataset = dataset.map(process_row)


# Keep only the review id and its summary, then write them out as CSV
# (the DataFrame index is written as well).
csv_file_path = 'SciBERT_Kmeans_extractive.csv'  # Update with your desired file path
df = pd.DataFrame(summaries_dataset).loc[:, ['review_id', 'summary']]
df.to_csv(csv_file_path, index=True)

print(f"Saved summaries to {csv_file_path}")

Using MPS (Metal Performance Shaders) device.


Map:   0%|          | 0/2021 [00:00<?, ? examples/s]

**Extractive: BioBERT_K_means**

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd

import spacy
# Load the spaCy `en_core_web_sm` pipeline once at module level; `process_row`
# calls it to split each abstract into sentences.
nlp = spacy.load("en_core_web_sm")


# Pick the compute device: CUDA if present, otherwise Apple's MPS backend,
# otherwise the CPU.
# `torch.backends.mps` is looked up defensively so that PyTorch builds that
# predate the MPS backend fall through to the CPU branch instead of raising
# AttributeError.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA (GPU) device.")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders) device.")
else:
    device = torch.device("cpu")
    print("GPU not available. Using CPU.")


# Validation split of the "ms2" configuration of allenai/mslr2022.
dataset = load_dataset(path="allenai/mslr2022", name="ms2", split='validation')
# For a quick trial run, uncomment to work on a small slice instead:
# dataset = dataset.select(range(20,30))


# BioBERT tokenizer and encoder; the encoder is moved to the selected device.
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = model.to(device)

def bert_sentence_embeddings(sentences):
    """Return one mean-pooled embedding per sentence as a NumPy array.

    Each sentence is tokenized on its own (truncated to at most 512 tokens),
    passed through the module-level ``model`` with gradients disabled, and
    reduced to a single vector by averaging ``last_hidden_state`` over
    ``dim=1``.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        ``np.array`` built from the per-sentence vectors, in input order.
    """
    def embed(text):
        # The encoded inputs have to live on the same device as the model.
        encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True).to(device)
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state
        # Pool, drop singleton dimensions, and bring the result back to the CPU.
        return hidden_states.mean(dim=1).squeeze().cpu().numpy()

    return np.array([embed(sentence) for sentence in sentences])

# Each selected sentence is taken from a different K-means cluster.
def select_top_sentences(sentences, embeddings, n_sentences=5, random_state=None):
    """Pick one representative sentence per K-means cluster and join them.

    The embeddings are clustered into ``n_sentences`` groups. For each
    cluster, the member sentence closest (Euclidean distance) to that
    cluster's centroid is selected. Candidates are restricted to the
    cluster's own members: searching over all sentences would let a sentence
    from a neighbouring cluster be picked whenever the centroid's nearest
    sentence had already been used, so two picks could share a cluster.

    Args:
        sentences: Sequence of sentence strings.
        embeddings: Array with one row per sentence, aligned with
            ``sentences``.
        n_sentences: Number of clusters, and therefore of selected sentences.
        random_state: Forwarded to ``KMeans``. Pass an int for reproducible
            clustering; the default ``None`` keeps the randomized
            initialization.

    Returns:
        The selected sentences joined by single spaces, ordered by cluster
        index. When there are at most ``n_sentences`` sentences, no
        clustering is done and all sentences are returned joined in their
        original order.
    """
    if len(sentences) <= n_sentences:
        return ' '.join(sentences)

    kmeans = KMeans(n_clusters=n_sentences, n_init=10, random_state=random_state)
    kmeans.fit(embeddings)
    labels = kmeans.labels_

    # True for every sentence that has not been selected yet.
    available = np.ones(len(sentences), dtype=bool)
    top_sentences = []

    for cluster_id, centroid in enumerate(kmeans.cluster_centers_):
        distances = np.linalg.norm(embeddings - centroid, axis=1)

        candidates = available & (labels == cluster_id)
        if not candidates.any():
            # A cluster can end up without usable members (e.g. many duplicate
            # embeddings); fall back to the nearest unused sentence overall so
            # the summary still contains n_sentences sentences.
            candidates = available

        # Mask non-candidates with +inf so argmin only considers candidates.
        best = int(np.argmin(np.where(candidates, distances, np.inf)))
        available[best] = False
        top_sentences.append(sentences[best])

    return ' '.join(top_sentences)


def process_row(row):
    """Build an extractive summary for one dataset row.

    Every abstract in ``row['abstract']`` is split into sentences with the
    module-level spaCy pipeline, embedded, and summarized independently with
    ``select_top_sentences``; the per-abstract summaries are then joined with
    single spaces.

    Args:
        row: Mapping with a ``'review_id'`` entry and an ``'abstract'`` entry.
            ``'abstract'`` is iterated as a collection of abstracts, each
            either a string or a list of strings (joined with spaces).

    Returns:
        Dict with the row's ``"review_id"`` and the combined ``"summary"``.
    """
    summaries = []

    for abstract in row['abstract']:
        # A missing abstract cannot be segmented; skip it instead of failing
        # inside the spaCy pipeline.
        if abstract is None:
            continue

        # An abstract given as a list of strings is flattened into one text.
        abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

        # Sentence segmentation. Spans that are empty after stripping
        # (e.g. whitespace-only spans) are dropped so they are neither
        # embedded nor able to take up a slot in the summary.
        doc = nlp(abstract_text)
        sentences = [text for text in (sent.text.strip() for sent in doc.sents) if text]
        if not sentences:
            # Nothing to summarize; also avoids stray double spaces below.
            continue

        embeddings = bert_sentence_embeddings(sentences)
        summaries.append(select_top_sentences(sentences, embeddings))

    return {"review_id": row['review_id'], "summary": ' '.join(summaries).strip()}

# Run the summarizer over every row of the dataset.
summaries_dataset = dataset.map(process_row)


# Keep only the review id and its summary, then write them out as CSV
# (the DataFrame index is written as well).
csv_file_path = 'BioBERT_Kmeans_extractive.csv'  # Update with your desired file path
df = pd.DataFrame(summaries_dataset).loc[:, ['review_id', 'summary']]
df.to_csv(csv_file_path, index=True)

print(f"Saved summaries to {csv_file_path}")