***Extractive step: BioBERT, K means***

pipeline:
I use BioBERT for sentence embeddings.

I use spaCy for sentence segmentation.



In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd

import spacy
# SpaCy English pipeline; process_row uses it to split abstracts into sentences
nlp = spacy.load("en_core_web_sm")


# Pick the torch device: Apple's MPS backend when present, the CPU otherwise
mps_ready = torch.backends.mps.is_available()
device = torch.device("mps" if mps_ready else "cpu")
if mps_ready:
    print("Using MPS (Metal Performance Shaders) device.")
else:
    print("MPS not available. Using CPU.")


# Validation split of the "ms2" configuration of allenai/mslr2022
dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
# Use select to create a subset
# dataset = dataset.select(range(20,30))


# BioBERT tokenizer and encoder; the encoder is moved onto the chosen device
biobert_checkpoint = 'dmis-lab/biobert-base-cased-v1.1'
tokenizer = BertTokenizer.from_pretrained(biobert_checkpoint)
model = BertModel.from_pretrained(biobert_checkpoint).to(device)


def bert_sentence_embeddings(sentences):
    """Embed every sentence by mean-pooling BioBERT's last hidden state.

    Each sentence is tokenized on its own (truncated to 512 tokens), run
    through the module-level ``model`` on ``device`` with gradients disabled,
    and its ``last_hidden_state`` is averaged over dim 1 (presumably the
    token axis).

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A NumPy array built from the per-sentence vectors, in input order.
        An empty input yields an empty array.
    """
    def embed_one(text):
        # Tokenize a single sentence and place the tensors on the model's device
        encoded = tokenizer(text, return_tensors='pt', max_length=512, truncation=True).to(device)
        with torch.no_grad():
            hidden = model(**encoded).last_hidden_state
        # Average over dim 1, drop singleton dims, then convert to NumPy on the CPU
        pooled = hidden.mean(dim=1).squeeze()
        return pooled.cpu().numpy()

    return np.array([embed_one(text) for text in sentences])

# Picks one sentence per K-means centroid and never returns the same sentence twice
def select_top_sentences(sentences, embeddings, n_sentences=5, random_state=None):
    """Build an extractive summary from the sentences nearest to K-means centroids.

    The embeddings are clustered into ``n_sentences`` groups. For every
    centroid, the closest sentence that has not been picked yet is added to
    the summary. Note that this guarantees distinct sentences, not that each
    one is a member of a different cluster: the nearest unused sentence to a
    centroid may be assigned to another cluster.

    Args:
        sentences: List of sentence strings.
        embeddings: Array with one embedding row per sentence, aligned with
            ``sentences``. Not used when no clustering is needed.
        n_sentences: Number of sentences to keep (also the number of clusters).
        random_state: Optional seed forwarded to ``KMeans`` so that repeated
            runs select the same sentences. ``None`` keeps the unseeded
            behaviour.

    Returns:
        The selected sentences joined by single spaces, ordered by centroid
        index. All sentences (in their original order) when there are at most
        ``n_sentences`` of them, and ``''`` when ``n_sentences`` is not positive.
    """
    # A non-positive budget means "no sentences"; KMeans would reject n_clusters <= 0
    if n_sentences <= 0:
        return ''

    # Nothing to choose between: keep everything
    if len(sentences) <= n_sentences:
        return ' '.join(sentences)

    kmeans = KMeans(n_clusters=n_sentences, n_init=10, random_state=random_state)
    kmeans.fit(embeddings)

    selected_indices = set()
    top_sentences = []

    for centroid in kmeans.cluster_centers_:
        # Euclidean distance from every sentence embedding to this centroid
        distances = np.linalg.norm(embeddings - centroid, axis=1)

        # Walk the sentences from nearest to farthest and take the first unused one
        for index in np.argsort(distances):
            index = int(index)
            if index not in selected_indices:
                selected_indices.add(index)
                top_sentences.append(sentences[index])
                break

    return ' '.join(top_sentences)


def process_row(row):
    """Produce the extractive summary for one dataset row.

    Every entry of ``row['abstract']`` is summarized independently: it is
    split into sentences with SpaCy, the sentences are embedded, and the top
    sentences are selected. The per-abstract summaries are then joined with
    single spaces.

    Args:
        row: Mapping with a ``'review_id'`` value and an ``'abstract'``
            iterable whose items are strings or lists of strings.

    Returns:
        A dict with the row's ``review_id`` and the combined ``summary``.
    """
    review_id = row['review_id']
    per_abstract = []

    for entry in row['abstract']:
        # A list-valued abstract is flattened into one space-separated string
        text = entry if not isinstance(entry, list) else ' '.join(entry)

        # Sentence segmentation with SpaCy, trimming surrounding whitespace
        sents = [span.text.strip() for span in nlp(text).sents]

        # Embed the sentences and keep the most representative ones
        vectors = bert_sentence_embeddings(sents)
        per_abstract.append(select_top_sentences(sents, vectors))

    return {"review_id": review_id, "summary": ' '.join(per_abstract).strip()}

# Run the extractive summarizer over every row of the dataset
summaries_dataset = dataset.map(process_row)

# Persist the mapped dataset so a later cell can reload it without recomputing
import pickle
with open('summaries_dataset.pkl', 'wb') as pkl_out:
    pickle.dump(summaries_dataset, pkl_out)

# # Convert to pandas DataFrame
# df = pd.DataFrame(summaries_dataset)
# df = df[['review_id', 'summary']]
# # Save to CSV
# csv_file_path = 'test.csv'  # Update with your desired file path
# df.to_csv(csv_file_path, index=True)

# print(f"Saved summaries to {csv_file_path}")

***abstractive step: Long T5***

This version runs without batching (the cell below is kept commented out for reference).

In [None]:
# import gc
# from transformers import AutoTokenizer, LongT5ForConditionalGeneration
# import pandas as pd
# import torch
# from torch.cuda.amp import autocast
# from datasets import load_dataset


# # Set the device to MPS if available, else CPU
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# # Load LongT5 Model and Tokenizer
# model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
# longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
# longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)


# # Dictionary to store the final summaries
# final_summaries = {}

# # Iterate over each summary in the summaries dataset
# for row in summaries_dataset:
#     review_id = row['review_id']
#     extractive_summary = row['summary']

#     # Prepare the input for the model
#     inputs = longt5_tokenizer(
#         extractive_summary, 
#         truncation=True, 
#         padding="longest", 
#         return_tensors="pt", 
#         max_length=16384
#     ).to(device)

#     # Generate the summary with LongT5
#     try:
#         summary_ids = longt5_model.generate(
#             inputs['input_ids'], 
#             num_beams=4,
#             min_length=50,
#             max_length=400,             
#             length_penalty=2.0, 
#             early_stopping=True
#         )
        
#         # Decode the generated IDs to text
#         longt5_summary = longt5_tokenizer.decode(
#             summary_ids[0], 
#             skip_special_tokens=True
#         )
        
#         # Store the summary in the final summaries dictionary
#         final_summaries[review_id] = longt5_summary

#     except IndexError as e:
#         print(f"Error processing review_id {review_id}: {e}")
#         final_summaries[review_id] = ""

# # Display the final summaries
# for review_id, summary in final_summaries.items():
#     print(f"Review ID: {review_id}\nAbstractive Summary: {summary}\n")


In [None]:
import pickle

# Saving the dataset
# with open('summaries_dataset.pkl', 'wb') as file:
#     pickle.dump(summaries_dataset, file)

# Reload the extractive summaries that were pickled earlier.
# NOTE: unpickling can execute arbitrary code, so only open files you created yourself.
with open('summaries_dataset.pkl', 'rb') as pkl_in:
    summaries_dataset = pickle.load(pkl_in)


batching version 1

In [None]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration
import pandas as pd
import torch
from torch.cuda.amp import autocast
from datasets import load_dataset


# Set the device to MPS if available, else CPU
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# Load LongT5 Model and Tokenizer
model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)

# Number of extractive summaries sent through the model per generate() call
batch_size = 1

# One {'review_id', 'Summary'} dict per input row, in dataset order
results = []

# Materialize the dataset as a list of row dicts so it can be sliced into batches
data_list = list(summaries_dataset)

# Generate abstractive summaries batch by batch
for i in range(0, len(data_list), batch_size):
    batch = data_list[i:i + batch_size]
    input_texts = [row['summary'] for row in batch]
    review_ids_batch = [row['review_id'] for row in batch]

    # Shorter inputs are padded up to the longest one in the batch
    inputs = longt5_tokenizer(
        input_texts,
        truncation=True,
        padding="longest",
        return_tensors="pt",
        max_length=16384
    ).to(device)

    try:
        summary_ids = longt5_model.generate(
            input_ids=inputs['input_ids'],
            # Pass the padding mask explicitly so padded positions are ignored
            # when batch_size > 1, rather than relying on generate() to infer it
            attention_mask=inputs['attention_mask'],
            num_beams=2,
            no_repeat_ngram_size=2,
            min_length=10,
            max_length=512,
            early_stopping=True
        ).to('cpu')

        batch_summaries = longt5_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

        # Pair each generated summary with the review_id of the row it came from
        results.extend(
            {'review_id': review_id, 'Summary': summary}
            for review_id, summary in zip(review_ids_batch, batch_summaries)
        )

    except Exception as e:
        # Best effort: report the failing batch and keep one (empty) row per review
        # so the output still contains every review_id
        print(f"Error in batch starting at index {i}: {e}")
        results.extend({'review_id': review_id, 'Summary': ""} for review_id in review_ids_batch)

# Convert the results to a DataFrame
summaries_df_val = pd.DataFrame(results)

# Save the DataFrame to a CSV file (the DataFrame index is written as the first column)
output_file_val = 'summary_BioBERT_K_Means_Long_T5_prediction.csv'
summaries_df_val.to_csv(output_file_val, index=True)
print(f"Saved summaries to {output_file_val}")

#batching: using a method that saves summaries and clears memory

#batching using huggingface module, map

In [None]:
# import pandas as pd
# import torch
# from transformers import AutoTokenizer, LongT5ForConditionalGeneration
# from datasets import load_dataset

# # Check if MPS is available and set the device accordingly
# if torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print("Using MPS (Metal Performance Shaders) device.")
# else:
#     device = torch.device("cpu")
#     print("MPS not available. Using CPU.")

# # Load LongT5 Model and Tokenizer
# model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
# longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
# longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)

# # Load your dataset here (assuming 'summaries_dataset' is your dataset name)
# # summaries_dataset = load_dataset('your_dataset_name')

# # Define the batch processing function
# def process_batch(batch):
#     input_texts = batch['summary']
#     review_ids_batch = batch['review_id']

#     inputs = longt5_tokenizer(
#         input_texts,
#         truncation=True,
#         padding="longest",
#         return_tensors="pt",
#         max_length=16384
#     ).to(device)

#     try:
#         summary_ids = longt5_model.generate(
#             inputs['input_ids'],
#             num_beams=4,
#             min_length=50,
#             max_length=512,
#             early_stopping=True
#         ).to('cpu')
        
#         batch_summaries = longt5_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

#         # Return processed results
#         return {'review_id': review_ids_batch, 'Abstractive Summary': batch_summaries}

#     except Exception as e:
#         print(f"Error in processing batch: {e}")
#         # Returning empty summaries in case of error
#         return {'review_id': review_ids_batch, 'Abstractive Summary': ['' for _ in review_ids_batch]}

# # Apply the function to the entire dataset using map
# batch_size = 1  # Set your batch size
# processed_dataset = summaries_dataset.map(process_batch, batched=True, batch_size=batch_size)

# # Convert the processed results to a pandas DataFrame
# summaries_df = pd.DataFrame.from_dict({
#     'review_id': processed_dataset['review_id'],
#     'Abstractive Summary': processed_dataset['Abstractive Summary']
# })

# # Save the DataFrame to a CSV file
# output_file = 'summary_predictions.csv'
# summaries_df.to_csv(output_file, index=False)
# print(f"Saved summaries to {output_file}")


In [None]:
# Build a DataFrame from the mapped dataset, keeping only the id and summary columns
df = pd.DataFrame(summaries_dataset)[['review_id', 'summary']]

# Write it out, including the DataFrame index as the first CSV column
csv_file_path = 'BioBERT_K_Means_extractive.csv'  # Update with your desired file path
df.to_csv(csv_file_path, index=True)

print(f"Saved summaries to {csv_file_path}")