***Extractive step: BioBERT, K means***

pipeline:
I use BioBERT for embdeddings

I use Spacy for sentence segmentation 



In [None]:
from transformers import BertTokenizer, BertModel
import torch
import numpy as np
from sklearn.cluster import KMeans
from datasets import load_dataset
import pandas as pd

import spacy
# Load the SpaCy model
nlp = spacy.load("en_core_web_sm")


# Check if MPS is available and set the device accordingly
if torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders) device.")
else:
    device = torch.device("cpu")
    print("MPS not available. Using CPU.")
    

# Load the dataset and cut down 
dataset = load_dataset("allenai/mslr2022", "ms2", split='validation')
# Use select to create a subset
# dataset = dataset.select(range(20,30))  


# Initialize the BioBERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
model = BertModel.from_pretrained('dmis-lab/biobert-base-cased-v1.1').to(device)


def bert_sentence_embeddings(sentences):
    embeddings = []
    for sentence in sentences:
        # Move inputs to the device
        inputs = tokenizer(sentence, return_tensors='pt', max_length=512, truncation=True).to(device)
        with torch.no_grad():
            outputs = model(**inputs)
        # Move outputs back to CPU
        embeddings.append(outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy())
    return np.array(embeddings)

#this is fixed : each selected sentence is from a different cluster
def select_top_sentences(sentences, embeddings, n_sentences=5):
    if len(sentences) <= n_sentences:
        return ' '.join(sentences)

    kmeans = KMeans(n_clusters=n_sentences, n_init=10)
    kmeans.fit(embeddings)

    # Initialize a set to store indices of selected sentences
    selected_indices = set()
    top_sentences = []

    for i in range(n_sentences):
        # Calculate distances of all sentences from the i-th centroid
        distances = np.linalg.norm(embeddings - kmeans.cluster_centers_[i], axis=1)

        # Sort the sentences by their distance from the centroid
        sorted_indices = np.argsort(distances)

        # Find the closest sentence that hasn't been selected yet
        for index in sorted_indices:
            if index not in selected_indices:
                selected_indices.add(index)
                top_sentences.append(sentences[index])
                break

    return ' '.join(top_sentences)


def process_row(row):
    review_id = row['review_id']
    abstract_list = row['abstract']
    combined_summary = ''

    for abstract in abstract_list:
        # Check if the abstract is a string; if not, join it into a single string
        abstract_text = ' '.join(abstract) if isinstance(abstract, list) else abstract

        # Use SpaCy for sentence segmentation
        doc = nlp(abstract_text)
        sentences = [sent.text.strip() for sent in doc.sents]

        # Generate embeddings for each sentence
        embeddings = bert_sentence_embeddings(sentences)
        # Select the top sentences from these embeddings
        summary = select_top_sentences(sentences, embeddings)

        # Combine the summaries from each abstract
        combined_summary += summary + ' '

    return {"review_id": review_id, "summary": combined_summary.strip()}

# Apply the function to each element of the dataset
summaries_dataset = dataset.map(process_row)

# Saving the dataset
import pickle
with open('summaries_dataset.pkl', 'wb') as file:
    pickle.dump(summaries_dataset, file)

# # Convert to pandas DataFrame
# df = pd.DataFrame(summaries_dataset)
# df = df[['review_id', 'summary']]
# # Save to CSV
# csv_file_path = 'test.csv'  # Update with your desired file path
# df.to_csv(csv_file_path, index=True)

# print(f"Saved summaries to {csv_file_path}")

***abstractive step: Long T5***

this is without batching

In [None]:
# import gc
# from transformers import AutoTokenizer, LongT5ForConditionalGeneration
# import pandas as pd
# import torch
# from torch.cuda.amp import autocast
# from datasets import load_dataset


# # Set the device to MPS if available, else CPU
# device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

# # Load LongT5 Model and Tokenizer
# model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
# longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
# longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)


# # Dictionary to store the final summaries
# final_summaries = {}

# # Iterate over each summary in the summaries dataset
# for row in summaries_dataset:
#     review_id = row['review_id']
#     extractive_summary = row['summary']

#     # Prepare the input for the model
#     inputs = longt5_tokenizer(
#         extractive_summary, 
#         truncation=True, 
#         padding="longest", 
#         return_tensors="pt", 
#         max_length=16384
#     ).to(device)

#     # Generate the summary with LongT5
#     try:
#         summary_ids = longt5_model.generate(
#             inputs['input_ids'], 
#             num_beams=4,
#             min_length=50,
#             max_length=400,             
#             length_penalty=2.0, 
#             early_stopping=True
#         )
        
#         # Decode the generated IDs to text
#         longt5_summary = longt5_tokenizer.decode(
#             summary_ids[0], 
#             skip_special_tokens=True
#         )
        
#         # Store the summary in the final summaries dictionary
#         final_summaries[review_id] = longt5_summary

#     except IndexError as e:
#         print(f"Error processing review_id {review_id}: {e}")
#         final_summaries[review_id] = ""

# # Display the final summaries
# for review_id, summary in final_summaries.items():
#     print(f"Review ID: {review_id}\nAbstractive Summary: {summary}\n")


In [None]:
# import pickle

# # Saving the dataset
# # with open('summaries_dataset.pkl', 'wb') as file:
# #     pickle.dump(summaries_dataset, file)

# # Later, you can load the dataset
# with open('summaries_dataset.pkl', 'rb') as file:
#     summaries_dataset = pickle.load(file)


batching version 1

In [None]:
# from transformers import AutoTokenizer, LongT5ForConditionalGeneration
# import pandas as pd
# import torch
# from torch.cuda.amp import autocast
# from datasets import load_dataset


# # Check if MPS is available and set the device accordingly
# if torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print("Using MPS (Metal Performance Shaders) device.")
# else:
#     device = torch.device("cpu")
#     print("MPS not available. Using CPU.")

# # Load LongT5 Model and Tokenizer
# model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
# longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
# longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)

# # Define the batch size
# batch_size = 1

# # Initialize the results list
# results = []

# # Convert the dataset to a list of dictionaries if not already
# data_list = list(summaries_dataset)

# # Generate summaries in batches
# for i in range(0, len(data_list), batch_size):
#     batch = data_list[i:i + batch_size]
#     input_texts = [row['summary'] for row in batch]
#     review_ids_batch = [row['review_id'] for row in batch]

#     inputs = longt5_tokenizer(
#         input_texts,
#         truncation=True,
#         padding="longest",
#         return_tensors="pt",
#         max_length=16384
#     ).to(device)

#     try:
#         summary_ids = longt5_model.generate(
#             inputs['input_ids'],
#             num_beams=2,
#             no_repeat_ngram_size=2,
#             min_length=10,
#             max_length=512,
#             early_stopping=True
#         ).to('cpu')
        
#         batch_summaries = longt5_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

#         # Append each summary with its ReviewID to the results list
#         for review_id, summary in zip(review_ids_batch, batch_summaries):
#             results.append({'review_id': review_id, 'Summary': summary})

#     except Exception as e:
#         print(f"Error in batch starting at index {i}: {e}")
#         for review_id in review_ids_batch:
#             results.append({'review_id': review_id, 'Summary': ""})

# # Convert the results to a DataFrame
# summaries_df_val = pd.DataFrame(results)

# # Save the DataFrame to a CSV file
# output_file_val = 'summary_BioBERT_K_Means_Long_T5_prediction.csv'
# summaries_df_val.to_csv(output_file_val, index=True)
# print(f"Saved summaries to {output_file_val}")

#batching version 2: using a method that saves summaries and clears memory

In [None]:
# from transformers import AutoTokenizer, LongT5ForConditionalGeneration
# import pandas as pd
# import torch
# import os
# import gc  # For garbage collection
# import pickle


# # Check for device availability and set accordingly
# if torch.cuda.is_available():
#     device = torch.device("cuda")
#     print("Using CUDA device.")
# elif torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print("Using MPS (Metal Performance Shaders) device.")
# else:
#     device = torch.device("cpu")
#     print("CUDA and MPS not available. Using CPU.")


# # Load LongT5 Model and Tokenizer
# model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
# longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
# longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)

# # Define the batch size and save directory
# batch_size = 1
# save_dir = 'summaries'  # Directory to save individual summary files
# os.makedirs(save_dir, exist_ok=True)

# # Initialize the results list
# results = []



# #loading the .pkl file if needed 
# # you can load the dataset
# with open('summaries_dataset.pkl', 'rb') as file:
#     summaries_dataset = pickle.load(file)



# # Convert the dataset to a list of dictionaries if not already
# data_list = list(summaries_dataset)

# #Select a subset of the dataset if needed
# data_list = data_list[1000:1020]


# # Generate summaries in batches
# for i in range(0, len(data_list), batch_size):
#     batch = data_list[i:i + batch_size]
#     input_texts = [row['summary'] for row in batch]
#     review_ids_batch = [row['review_id'] for row in batch]

#     inputs = longt5_tokenizer(
#         input_texts,
#         truncation=True,
#         padding="longest",
#         return_tensors="pt",
#         max_length=16384
#     ).to(device)

#     try:
#         summary_ids = longt5_model.generate(
#             inputs['input_ids'],
#             num_beams=2,
#             no_repeat_ngram_size=2,
#             min_length=10,
#             max_length=512,
#             early_stopping=True
#         ).to('cpu')
        
#         batch_summaries = longt5_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

#         # Save summaries and append to results
#         for review_id, summary in zip(review_ids_batch, batch_summaries):
#             with open(os.path.join(save_dir, f"{review_id}.txt"), "w") as f:
#                 f.write(summary)
#             results.append({'review_id': review_id, 'Summary': summary})

#     except Exception as e:
#         print(f"Error in batch starting at index {i}: {e}")
#         for review_id in review_ids_batch:
#             results.append({'review_id': review_id, 'Summary': ""})

#     # Clear memory
#     del inputs, summary_ids
#     if torch.cuda.is_available():
#         torch.cuda.empty_cache()
#     elif torch.backends.mps.is_available():
#         torch.mps.empty_cache()
#     gc.collect()

# # Convert the results to a DataFrame
# summaries_df_val = pd.DataFrame(results)

# # Define the destination folder
# output_folder = 'summaries'

# # Define the CSV file name within the folder
# output_file_val = os.path.join(output_folder, 'summary_BioBERT_K_Means_Long_T5_prediction.csv')
# # Save the DataFrame to the CSV file
# summaries_df_val.to_csv(output_file_val, index=True)
# print(f"Saved summaries to {output_file_val}")


#version 3: instead of saving to a txt file, append to a csv file 

In [1]:
from transformers import AutoTokenizer, LongT5ForConditionalGeneration
import pandas as pd
import torch
import os
import gc  # For garbage collection
import pickle

# Check for device availability and set accordingly
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device.")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders) device.")
else:
    device = torch.device("cpu")
    print("CUDA and MPS not available. Using CPU.")

# Load LongT5 Model and Tokenizer
model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # Fine-tuned for summarization
longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)

# Load the dataset from a pickle file
with open('summaries_dataset.pkl', 'rb') as file:
    summaries_dataset = pickle.load(file)

# Convert the dataset to a list
data_list = list(summaries_dataset)

# # Select a subset of the dataset for processing
# data_list = data_list[1240:1255]

# Define the destination folder and CSV file name
output_folder = 'summaries'
os.makedirs(output_folder, exist_ok=True)
output_file_val = os.path.join(output_folder, 'summary_BioBERT_K_Means_Long_T5_prediction.csv')

# Check if the output file already exists, create it with headers if not
if not os.path.exists(output_file_val):
    pd.DataFrame(columns=['review_id', 'Summary']).to_csv(output_file_val, index=False)

# Define the batch size
batch_size = 1

# Generate summaries in batches
for i in range(0, len(data_list), batch_size):
    batch = data_list[i:i + batch_size]
    input_texts = [row['summary'] for row in batch]
    review_ids_batch = [row['review_id'] for row in batch]

    # Tokenize the input texts
    inputs = longt5_tokenizer(
        input_texts,
        truncation=True,
        padding="longest",
        return_tensors="pt",
        max_length=16384
    ).to(device)

    try:
        # Generate summaries
        summary_ids = longt5_model.generate(
            inputs['input_ids'],
            num_beams=2,
            no_repeat_ngram_size=2,
            min_length=10,
            max_length=512,
            early_stopping=True
        ).to('cpu')

        # Decode the summaries
        batch_summaries = longt5_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

        # Append summaries directly to the CSV file
        with open(output_file_val, 'a') as f:
            pd.DataFrame({'review_id': review_ids_batch, 'Summary': batch_summaries}).to_csv(f, header=False, index=False)

    except Exception as e:
        print(f"Error in batch starting at index {i}: {e}")

    print(f"Index {i} done!")

    # Clear memory
    if 'inputs' in locals():
        del inputs
    if 'summary_ids' in locals():
        del summary_ids
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    elif torch.backends.mps.is_available():
        torch.mps.empty_cache()
    gc.collect()

print(f"Saved summaries to {output_file_val}. Abstractive summarization portion done!")

Using MPS (Metal Performance Shaders) device.


  x = nn.functional.pad(x, pad=pad, mode="constant", value=pad_value)


Index 0 done!
Index 1 done!
Index 2 done!
Index 3 done!
Index 4 done!
Index 5 done!
Index 6 done!
Index 7 done!
Index 8 done!
Index 9 done!
Index 10 done!
Index 11 done!
Index 12 done!
Index 13 done!
Index 14 done!
Index 15 done!
Index 16 done!
Index 17 done!
Index 18 done!
Index 19 done!
Index 20 done!
Index 21 done!
Index 22 done!
Index 23 done!
Index 24 done!
Index 25 done!
Index 26 done!
Index 27 done!
Index 28 done!
Index 29 done!
Index 30 done!
Index 31 done!
Index 32 done!
Index 33 done!
Index 34 done!
Index 35 done!
Index 36 done!
Index 37 done!
Index 38 done!
Index 39 done!
Index 40 done!
Index 41 done!
Index 42 done!
Index 43 done!
Index 44 done!
Index 45 done!
Index 46 done!
Index 47 done!
Index 48 done!
Index 49 done!
Index 50 done!
Index 51 done!
Index 52 done!
Index 53 done!
Index 54 done!
Index 55 done!
Index 56 done!
Index 57 done!
Index 58 done!
Index 59 done!
Index 60 done!
Index 61 done!
Index 62 done!
Index 63 done!
Index 64 done!
Index 65 done!
Index 66 done!
Index

Index 519 done!
Index 520 done!
Index 521 done!
Index 522 done!
Index 523 done!
Index 524 done!
Index 525 done!
Index 526 done!
Index 527 done!
Index 528 done!
Index 529 done!
Index 530 done!
Index 531 done!
Index 532 done!
Index 533 done!
Index 534 done!
Index 535 done!
Index 536 done!
Index 537 done!
Index 538 done!
Index 539 done!
Index 540 done!
Index 541 done!
Index 542 done!
Index 543 done!
Index 544 done!
Index 545 done!
Index 546 done!
Index 547 done!
Index 548 done!
Index 549 done!
Index 550 done!
Index 551 done!
Index 552 done!
Index 553 done!
Index 554 done!
Index 555 done!
Index 556 done!
Index 557 done!
Index 558 done!
Index 559 done!
Index 560 done!
Index 561 done!
Index 562 done!
Index 563 done!
Index 564 done!
Index 565 done!
Index 566 done!
Index 567 done!
Index 568 done!
Index 569 done!
Index 570 done!
Index 571 done!
Index 572 done!
Index 573 done!
Index 574 done!
Index 575 done!
Index 576 done!
Index 577 done!
Index 578 done!
Index 579 done!
Index 580 done!
Index 58

Index 1030 done!
Index 1031 done!
Index 1032 done!
Index 1033 done!
Index 1034 done!
Index 1035 done!
Index 1036 done!
Index 1037 done!
Index 1038 done!
Index 1039 done!
Index 1040 done!
Index 1041 done!
Index 1042 done!
Index 1043 done!
Index 1044 done!
Index 1045 done!
Index 1046 done!
Index 1047 done!
Index 1048 done!
Index 1049 done!
Index 1050 done!
Index 1051 done!
Index 1052 done!
Index 1053 done!
Index 1054 done!
Index 1055 done!
Index 1056 done!
Index 1057 done!
Index 1058 done!
Index 1059 done!
Index 1060 done!
Index 1061 done!
Index 1062 done!
Index 1063 done!
Index 1064 done!
Index 1065 done!
Index 1066 done!
Index 1067 done!
Index 1068 done!
Index 1069 done!
Index 1070 done!
Index 1071 done!
Index 1072 done!
Index 1073 done!
Index 1074 done!
Index 1075 done!
Index 1076 done!
Index 1077 done!
Index 1078 done!
Index 1079 done!
Index 1080 done!
Index 1081 done!
Index 1082 done!
Index 1083 done!
Index 1084 done!
Index 1085 done!
Index 1086 done!
Index 1087 done!
Index 1088 don

NameError: name 'summary_ids' is not defined

#batching using huggingface module, map

In [None]:
# import pandas as pd
# import torch
# from transformers import AutoTokenizer, LongT5ForConditionalGeneration
# from datasets import load_dataset

# # Check if MPS is available and set the device accordingly
# if torch.backends.mps.is_available():
#     device = torch.device("mps")
#     print("Using MPS (Metal Performance Shaders) device.")
# else:
#     device = torch.device("cpu")
#     print("MPS not available. Using CPU.")

# # Load LongT5 Model and Tokenizer
# model_to_use = "pszemraj/long-t5-tglobal-base-16384-book-summary"  # fine-tuned for summarization
# longt5_model = LongT5ForConditionalGeneration.from_pretrained(model_to_use).to(device)
# longt5_tokenizer = AutoTokenizer.from_pretrained(model_to_use)

# # Load your dataset here (assuming 'summaries_dataset' is your dataset name)
# # summaries_dataset = load_dataset('your_dataset_name')

# # Define the batch processing function
# def process_batch(batch):
#     input_texts = batch['summary']
#     review_ids_batch = batch['review_id']

#     inputs = longt5_tokenizer(
#         input_texts,
#         truncation=True,
#         padding="longest",
#         return_tensors="pt",
#         max_length=16384
#     ).to(device)

#     try:
#         summary_ids = longt5_model.generate(
#             inputs['input_ids'],
#             num_beams=4,
#             min_length=50,
#             max_length=512,
#             early_stopping=True
#         ).to('cpu')
        
#         batch_summaries = longt5_tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

#         # Return processed results
#         return {'review_id': review_ids_batch, 'Abstractive Summary': batch_summaries}

#     except Exception as e:
#         print(f"Error in processing batch: {e}")
#         # Returning empty summaries in case of error
#         return {'review_id': review_ids_batch, 'Abstractive Summary': ['' for _ in review_ids_batch]}

# # Apply the function to the entire dataset using map
# batch_size = 1  # Set your batch size
# processed_dataset = summaries_dataset.map(process_batch, batched=True, batch_size=batch_size)

# # Convert the processed results to a pandas DataFrame
# summaries_df = pd.DataFrame.from_dict({
#     'review_id': processed_dataset['review_id'],
#     'Abstractive Summary': processed_dataset['Abstractive Summary']
# })

# # Save the DataFrame to a CSV file
# output_file = 'summary_predictions.csv'
# summaries_df.to_csv(output_file, index=False)
# print(f"Saved summaries to {output_file}")
