***Extractive step: BioBERT, HDBSCAN***

Major changes: BioBERT is used for the sentence embeddings, and sentence segmentation is done with spaCy.

In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
import hdbscan
import umap
from datasets import load_dataset
import pandas as pd

import spacy
# Load the small English spaCy pipeline (used below for sentence segmentation)
nlp = spacy.load("en_core_web_sm")


# Prefer the MPS backend when PyTorch reports it as available; otherwise use the CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
if device.type == "mps":
    print("Using MPS (Metal Performance Shaders) device.")
else:
    print("MPS not available. Using CPU.")

# Load the validation split of the "ms2" configuration of allenai/mslr2022
dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
# Uncomment to run on a small subset of rows instead of the whole split
# dataset = dataset.select(range(20,30))  


# Initialize the BioBERT tokenizer and model; the model is moved to the selected device
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = model.to(device)

def bert_sentence_embeddings(sentences):
    """
    Embeds each sentence with the module-level BioBERT tokenizer and model.

    Every sentence is tokenized on its own (truncated to at most 512 tokens),
    run through the model with gradient tracking disabled, and its last hidden
    state is averaged over dim=1 to obtain one vector per sentence.

    :param sentences: Iterable of sentence strings.
    :return: NumPy array built from the per-sentence vectors, in input order
             (an empty array when no sentences are given).
    """
    vectors = []
    for text in sentences:
        # Tokenize a single sentence and place the tensors on the model's device
        encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
        encoded = encoded.to(device)

        # Inference only: no gradients are needed
        with torch.no_grad():
            hidden_states = model(**encoded).last_hidden_state

        # Mean-pool over dim=1, drop singleton dimensions, and bring the result back to the CPU
        pooled = hidden_states.mean(dim=1).squeeze()
        vectors.append(pooled.cpu().numpy())
    return np.array(vectors)

def reduce_dimensions(embeddings, n_components=2):
    """
    Projects the given embeddings into a lower-dimensional space with UMAP.

    The neighbourhood size is ``embeddings.shape[0] - 1`` capped at 15, and is
    never set below 2 (so for very small inputs it is not smaller than the
    number of rows).

    :param embeddings: High-dimensional data to be reduced; must expose ``shape``.
    :param n_components: The dimension of the space to embed into. Default is 2.
    :return: The result of ``UMAP.fit_transform`` on the embeddings.
    """
    sample_count = embeddings.shape[0]
    capped = min(sample_count - 1, 15)
    n_neighbors = capped if capped > 2 else 2

    return umap.UMAP(n_neighbors=n_neighbors, n_components=n_components).fit_transform(embeddings)



# def select_top_sentences(sentences, embeddings, n_sentences=5):
#     # If there are fewer sentences than the desired number, return them all
#     if len(sentences) < n_sentences:
#         return ' '.join(sentences)

#     # Set the minimum cluster size for HDBSCAN
#     # Ensure it is not larger than the number of sentences and at least 2
#     cluster_size = max(4, min(len(sentences), len(sentences) // n_sentences))

#     # Initialize HDBSCAN with the determined minimum cluster size
#     clusterer = hdbscan.HDBSCAN(min_cluster_size=cluster_size, gen_min_span_tree=True)

#     # Fit the HDBSCAN clusterer to the embeddings
#     cluster_labels = clusterer.fit_predict(embeddings)

#     # Handle cases where meaningful clusters are not found
#     if len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0) < 2:
#         # If less than two clusters (excluding noise), return the first n_sentences
#         return ' '.join(sentences[:n_sentences])

#     # Calculate the centroids of the clusters
#     unique_labels = set(cluster_labels)
#     unique_labels.discard(-1)  # Remove the noise label, if present
#     centroids = [embeddings[cluster_labels == label].mean(axis=0) for label in unique_labels]

#     # Find the closest sentence to each centroid
#     top_sentence_indices = []
#     for centroid in centroids:
#         distances = np.linalg.norm(embeddings - centroid, axis=1)
#         top_sentence_indices.append(np.argmin(distances))

#     # Remove duplicate indices while preserving order
#     top_sentence_indices = list(dict.fromkeys(top_sentence_indices))

#     # Select the sentences corresponding to the top indices
#     top_sentences = [sentences[index] for index in top_sentence_indices]

#     # Return the top sentences joined into a single string
#     return ' '.join(top_sentences)

def select_top_sentences(sentences, embeddings, n_sentences=5):
    """
    Builds an extractive summary by clustering sentence embeddings with HDBSCAN.

    The embeddings are clustered and, for every cluster (noise excluded), the
    not-yet-chosen sentence closest to the cluster centroid is selected. The
    number of sentences on this path therefore equals the number of clusters
    found, which is not necessarily ``n_sentences``.

    :param sentences: List of sentence strings.
    :param embeddings: NumPy array with one embedding row per sentence, in the
                       same order as ``sentences``.
    :param n_sentences: Target summary size. It is the threshold below which
                        all sentences are returned, it drives the minimum
                        cluster size, and it is the size of the fallback.
                        Values <= 0 yield an empty summary.
    :return: The selected sentences joined with single spaces.
    """
    # Nothing requested; this also avoids a ZeroDivisionError in the
    # cluster-size computation below when n_sentences == 0
    if n_sentences <= 0:
        return ''

    # Return all sentences if there are no more than the desired number
    if len(sentences) <= n_sentences:
        return ' '.join(sentences)

    # Minimum cluster size scales with the input length but is never below 4
    cluster_size = max(4, len(sentences) // n_sentences)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=cluster_size, gen_min_span_tree=True)

    # Fit the HDBSCAN clusterer to the embeddings
    cluster_labels = clusterer.fit_predict(embeddings)

    # Real cluster labels in sorted order; -1 is skipped as the noise label
    cluster_ids = [label for label in np.unique(cluster_labels) if label != -1]

    # With fewer than two clusters there is nothing meaningful to pick from,
    # so fall back to the leading sentences
    if len(cluster_ids) < 2:
        return ' '.join(sentences[:n_sentences])

    selected_indices = set()
    top_sentences = []

    # Select the closest sentence to each centroid without repeating sentences
    for label in cluster_ids:
        centroid = embeddings[cluster_labels == label].mean(axis=0)
        distances = np.linalg.norm(embeddings - centroid, axis=1)

        # Walk candidates from nearest to farthest until an unused one is found
        for index in np.argsort(distances):
            if index not in selected_indices:
                selected_indices.add(index)
                top_sentences.append(sentences[index])
                break

    # Return the top sentences joined into a single string
    return ' '.join(top_sentences)


def process_row(row):
    """
    Produces the extractive summary for one dataset row.

    Each entry of ``row['abstract']`` (a string, or a list of strings that is
    joined with spaces) is split into sentences with spaCy, embedded, and
    reduced to its selected sentences; the per-abstract results are then
    concatenated with single spaces.

    :param row: Mapping with at least the keys ``'review_id'`` and ``'abstract'``.
    :return: Dict with the row's ``review_id`` and the combined ``summary`` text.
    """
    review_id = row['review_id']

    per_abstract = []
    for abstract in row['abstract']:
        text = abstract if not isinstance(abstract, list) else ' '.join(abstract)

        # Sentence segmentation is delegated to the spaCy pipeline
        sentences = [span.text.strip() for span in nlp(text).sents]

        vectors = bert_sentence_embeddings(sentences)
        per_abstract.append(select_top_sentences(sentences, vectors))

    # Joining and stripping yields the same text as appending "summary + ' '" per abstract
    return {"review_id": review_id, "summary": ' '.join(per_abstract).strip()}

# Run the extractive step on every row of the dataset; process_row returns the
# row's review_id together with its combined extractive "summary" text
summaries_dataset = dataset.map(process_row)


# # Saving the dataset
# import pickle
# with open('summaries_dataset.pkl', 'wb') as file:
#     pickle.dump(summaries_dataset, file)

# # Convert to pandas DataFrame
# df = pd.DataFrame(summaries_dataset)
# df = df[['review_id', 'summary']]
# csv_file_path = 'test.csv'  # Update with your desired file path
# df.to_csv(csv_file_path, index=True)

# print(f"Saved summaries to {csv_file_path}")


***Abstractive step: LongT5***

In [None]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration
import pandas as pd
import torch
from torch.cuda.amp import autocast
from datasets import load_dataset


# Select the compute device: MPS when PyTorch reports it as available, otherwise the CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
if device.type == "mps":
    print("Using MPS (Metal Performance Shaders) device.")
else:
    print("MPS not available. Using CPU.")


# Load the LongT5 tokenizer and model from the same checkpoint
model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)
longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use)
longt5_model = longt5_model.to(device)

# Number of extractive summaries passed to the model per generate() call
batch_size = 1

# Collected output rows: one {'review_id', 'Summary'} dict per input row
results = []

# Materialize the extractive-step dataset as a list of row dicts so it can be sliced
data_list = list(summaries_dataset)

# Generate summaries in batches
for i in range(0, len(data_list), batch_size):
    batch = data_list[i:i + batch_size]
    input_texts = [row['summary'] for row in batch]
    review_ids_batch = [row['review_id'] for row in batch]

    # Tokenize the whole batch, padding to its longest member and truncating
    # anything beyond 16384 tokens
    inputs = longt5_tokenizer(
        input_texts,
        truncation=True,
        padding="longest",
        return_tensors="pt",
        max_length=16384
    ).to(device)

    try:
        summary_ids = longt5_model.generate(
            input_ids=inputs['input_ids'],
            # Pass the tokenizer's padding mask explicitly instead of relying on
            # generate() to infer it; this matters as soon as batch_size > 1
            # and shorter inputs in a batch are padded
            attention_mask=inputs['attention_mask'],
            num_beams=2,
            no_repeat_ngram_size=2,
            min_length=10,
            max_length=512,
            early_stopping=True
        ).to('cpu')

        batch_summaries = longt5_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    except Exception as e:
        # Best effort: report the failure and keep the rows, with empty
        # summaries, so every review_id still appears in the output
        print(f"Error in batch starting at index {i}: {e}")
        batch_summaries = [""] * len(review_ids_batch)

    # Append each summary with its review_id to the results list
    results.extend(
        {'review_id': review_id, 'Summary': summary}
        for review_id, summary in zip(review_ids_batch, batch_summaries)
    )

# Convert the results to a DataFrame
summaries_df_val = pd.DataFrame(results)

# Save the DataFrame to a CSV file (the DataFrame index is written as the first column)
output_file_val = 'Summary_BioBERT_HDBSCAN_UMAP_Long_T5_prediction.csv'
summaries_df_val.to_csv(output_file_val, index=True)
print(f"Saved summaries to {output_file_val}")