# In [None]:
import pandas as pd
import torch
import gc  # For garbage collection
import pickle
import csv
from pprint import pprint  # Makes output readable without horizontal scrolling

# PyTorch and Transformers imports
from transformers import PegasusTokenizer, PegasusForConditionalGeneration

# Dataset import
from datasets import load_dataset

# Fetch the validation split of the MS2 configuration of the MSLR2022 corpus.
dataset = load_dataset(
    path="allenai/mslr2022",
    name="ms2",
    split="validation",
)
# For quick experiments, narrow the split to a handful of rows, e.g.:
# dataset = dataset.select(range(20, 30))


# Check for device availability and set accordingly.
# Preference order: CUDA GPU, then Apple MPS, then CPU.
# `torch.backends.mps` is missing on PyTorch builds that predate the MPS
# backend, so it is looked up defensively instead of raising AttributeError.
_mps_backend = getattr(torch.backends, "mps", None)

if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using CUDA device.")
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device("mps")
    print("Using MPS (Metal Performance Shaders) device.")
else:
    device = torch.device("cpu")
    print("CUDA and MPS not available. Using CPU.")
    
    
# Load the pretrained Pegasus tokenizer, then the matching seq2seq model,
# and place the model on the device selected above.
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
model = model.to(device)

# Tokenize every abstract up front (assuming df_dev is previously defined).
# Each sequence is truncated or padded to exactly 1024 tokens and returned
# as PyTorch tensors, which are then moved to the selected device.
inputs = tokenizer(
    df_dev['abstract'].tolist(),
    max_length=1024,
    truncation=True,
    padding="max_length",
    return_tensors="pt",
)
inputs = inputs.to(device)

# Define the batch size and initialize the results list.
# Each entry of `results` is a dict with the keys 'review_id' and 'Summary'.
batch_size = 10
results = []

# Generate summaries in batches of `batch_size` rows.
for i in range(0, len(inputs['input_ids']), batch_size):
    input_ids_batch = inputs['input_ids'][i:i+batch_size]
    # The inputs were padded to a fixed length, so the attention mask is
    # passed explicitly; otherwise generate() has to guess which positions
    # are padding.
    attention_mask_batch = inputs['attention_mask'][i:i+batch_size]
    review_ids_batch = df_dev['review_id'][i:i+batch_size].tolist()

    summary_ids = model.generate(input_ids_batch,
                                 attention_mask=attention_mask_batch,
                                 num_beams=2,
                                 no_repeat_ngram_size=2,
                                 min_length=10,
                                 max_length=512,
                                 early_stopping=True)
    batch_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)

    # Pair each generated summary with the review it belongs to.
    results.extend(
        {'review_id': review_id, 'Summary': summary}
        for review_id, summary in zip(review_ids_batch, batch_summaries)
    )

# Write the collected summaries to disk as CSV.
output_file_val = 'val-prediction.csv'
summaries_df_val = pd.DataFrame(results)
# index=True writes the DataFrame's row index as the first CSV column.
summaries_df_val.to_csv(path_or_buf=output_file_val, index=True)
print(f"Saved summaries to {output_file_val}")
