In [None]:
import os

from google.colab import drive

# Mount Google Drive; the model folder below lives on the mounted drive.
drive.mount('/content/gdrive', force_remount=True)

model_folder = "/content/gdrive/MyDrive/fine-tuned-pegasus-ariv"

# Create the folder (and any missing parents) if it does not exist.
# os.makedirs raises on failure instead of printing a shell error and moving on.
os.makedirs(model_folder, exist_ok=True)

In [None]:
# Print the NVIDIA driver / GPU status report for the current runtime.
!nvidia-smi

In [None]:
!pip install transformers[sentencepiece] datasets sacrebleu rouge_score py7zr -q

In [None]:
# import libraries

from transformers import pipeline, set_seed, AutoModelForSeq2SeqLM, AutoTokenizer
# from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import pandas as pd
from datasets import load_dataset, load_metric
import nltk
from nltk.tokenize import sent_tokenize
import matplotlib.pyplot as plt
import torch
from tqdm import tqdm
from datasets import load_dataset
# Download NLTK's "punkt" resource (presumably what sent_tokenize needs — confirm).
nltk.download("punkt")

In [None]:
# Load the pretrained checkpoint and its tokenizer

pretrained_model = "google/pegasus-cnn_dailymail"

# Use the GPU when torch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)

# Instantiate the seq2seq model first, then move it to the chosen device.
pegasus_pretrained = AutoModelForSeq2SeqLM.from_pretrained(pretrained_model)
pegasus_pretrained = pegasus_pretrained.to(device)

In [None]:
# Load the 'arxiv' configuration of the 'scientific_papers' dataset.
technical_articles_dataset = load_dataset('scientific_papers', 'arxiv')


In [None]:
# Show the container type first, then the dataset's own printed summary.
for dataset_view in (type(technical_articles_dataset), technical_articles_dataset):
    print(dataset_view)

In [None]:
# Reduce dataset size
# Every split is shrunk by the same factor, so the initial
# train/validation/test ratio is maintained.
SUBSAMPLE_SEED = 420   # fixed seed: the same subset is drawn on every run
try_train_size = 7000  # requested number of training examples

train_rows = technical_articles_dataset['train'].num_rows
validation_rows = technical_articles_dataset['validation'].num_rows
test_rows = technical_articles_dataset['test'].num_rows

# Informational only: the sizing below does not read this value.
total_initial_size = train_rows + validation_rows + test_rows

# Cap the request at what the train split actually holds; selecting a range
# longer than the split would fail.
train_size = min(try_train_size, train_rows)
new_validation_size = int(validation_rows * train_size / train_rows)
new_test_size = int(test_rows * train_size / train_rows)

# Shuffle the datasets
shuffled_train = technical_articles_dataset['train'].shuffle(seed=SUBSAMPLE_SEED)
shuffled_validation = technical_articles_dataset['validation'].shuffle(seed=SUBSAMPLE_SEED)
shuffled_test = technical_articles_dataset['test'].shuffle(seed=SUBSAMPLE_SEED)

# Select the new dataset sizes
reduced_train = shuffled_train.select(range(train_size))
reduced_validation = shuffled_validation.select(range(new_validation_size))
reduced_test = shuffled_test.select(range(new_test_size))

# Update dataset with the reduced version (later cells read the splits from
# technical_articles_dataset, so the replacement happens in place).
technical_articles_dataset['train'] = reduced_train
technical_articles_dataset['validation'] = reduced_validation
technical_articles_dataset['test'] = reduced_test

print(type(technical_articles_dataset))
print(technical_articles_dataset)

In [None]:
def chunk_creation(dataset, batch_size):
    """Lazily yield consecutive slices of `dataset`, each at most `batch_size` long.

    The last slice is shorter when len(dataset) is not a multiple of batch_size;
    an empty `dataset` yields nothing.
    """
    chunk_starts = range(0, len(dataset), batch_size)
    yield from (dataset[start : start + batch_size] for start in chunk_starts)

def compute_rouge_scores(data, rouge_reference, model, tokenizer, batch_size=16, device=device, column_text="article", column_summary="abstract"):
    """Generate a summary for every text in `data` and score it against its reference.

    Args:
        data: mapping indexed by `column_text` and `column_summary`; each entry
            is a sliceable sequence of strings.
        rouge_reference: metric object exposing `add_batch(predictions=..., references=...)`
            and `compute()`.
        model: model exposing `generate()`. Only the input tensors are moved to
            `device`; the model itself is not moved.
        tokenizer: tokenizer used both to encode the texts and decode the output.
        batch_size: number of texts generated per forward pass.
        device: device the input tensors are moved to.
        column_text: key of the source texts in `data`.
        column_summary: key of the reference summaries in `data`.

    Returns:
        Whatever `rouge_reference.compute()` returns.
    """
    # divide the data set into chunks of size = "batch_size"
    batches_of_articles = list(chunk_creation(data[column_text], batch_size))
    batches_of_targets = list(chunk_creation(data[column_summary], batch_size))

    # generate() runs in whatever mode the module is currently in. A model that
    # has just been trained is still in training mode, so dropout would stay
    # active and corrupt the summaries. Force eval mode and restore it afterwards.
    was_training = model.training
    model.eval()
    try:
        # for each chunk: generate the summaries and hand them to the metric
        for article_batch, target_batch in tqdm(zip(batches_of_articles, batches_of_targets), total=len(batches_of_articles)):

            # inputs are truncated / padded to 1024 tokens
            model_input = tokenizer(article_batch, max_length=1024,  truncation=True, padding="max_length", return_tensors="pt")
            # beam search (8 beams), summaries capped at 128 tokens
            model_output = model.generate(input_ids=model_input["input_ids"].to(device), attention_mask=model_input["attention_mask"].to(device), length_penalty=0.8, num_beams=8, max_length=128)
            summaries = [tokenizer.decode(s, skip_special_tokens=True, clean_up_tokenization_spaces=True) for s in model_output]
            # replace the literal "<n>" markers left in the decoded text with spaces
            summaries = [d.replace("<n>", " ") for d in summaries]
            rouge_reference.add_batch(predictions=summaries, references=target_batch)
    finally:
        model.train(was_training)

    # calculate rouge scores over everything added above
    rouge_scores = rouge_reference.compute()
    return rouge_scores

In [None]:
# ROUGE of pegasus_pretrained (as it stands at this point) on the first 5 test docs
rouge_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
rouge_metric = load_metric('rouge')

score1 = compute_rouge_scores(
    technical_articles_dataset['test'][:5],
    rouge_metric,
    pegasus_pretrained,
    tokenizer,
    batch_size=8,
    column_text='article',
    column_summary='abstract',
)

# Keep only the mid f-measure of each ROUGE variant.
rouge_dict = {rn: score1[rn].mid.fmeasure for rn in rouge_names}

pd.DataFrame(rouge_dict, index = ['pegasus'])

In [None]:
def generate_features(data_input):
    """Tokenize a batch of examples into model inputs and labels.

    Reads the 'article' and 'abstract' entries of `data_input` and encodes them
    with the module-level `tokenizer`: articles are truncated to 1024 tokens,
    abstracts to 128.

    Returns:
        dict with 'input_ids' and 'attention_mask' taken from the article
        encoding and 'labels' taken from the abstract's input ids.
    """
    article_encodings = tokenizer(data_input['article'], max_length=1024, truncation=True)

    # NOTE(review): as_target_tokenizer() is reported as deprecated in recent
    # transformers releases in favour of the `text_target=` argument — confirm
    # against the installed version before changing it.
    with tokenizer.as_target_tokenizer():
        abstract_encodings = tokenizer(data_input['abstract'], max_length=128, truncation=True)

    features = {}
    features['input_ids'] = article_encodings['input_ids']
    features['attention_mask'] = article_encodings['attention_mask']
    features['labels'] = abstract_encodings['input_ids']
    return features
 
# Run generate_features over the dataset in batched mode.
technical_articles_dataset_pt = technical_articles_dataset.map(
    function=generate_features,
    batched=True,
)




In [None]:
from transformers import DataCollatorForSeq2Seq

# Collator passed to the Trainer; built from the tokenizer and the model.
seq2seq_data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=pegasus_pretrained,
)

In [None]:
from transformers import TrainingArguments, Trainer

model_folder = "/content/gdrive/MyDrive/fine-tuned-pegasus-ariv"

NUM_TRAIN_EPOCHS = 1
TRAIN_BATCH_SIZE = 1
GRAD_ACCUM_STEPS = 16

# Number of optimizer updates in the whole run. With a small train split
# (e.g. 7000 examples -> 437 updates) the previously hard-coded
# warmup_steps=500 and eval_steps=500 were both larger than the run itself:
# the learning rate never finished warming up and evaluation never fired.
# NOTE(review): assumes a single training device — confirm on multi-GPU runtimes.
updates_per_epoch = max(1, len(technical_articles_dataset_pt["train"]) // (TRAIN_BATCH_SIZE * GRAD_ACCUM_STEPS))
total_updates = updates_per_epoch * NUM_TRAIN_EPOCHS

# Warm up over ~10% of the run and evaluate ~4 times, never exceeding the
# original 500-step settings on longer runs.
warmup_steps = min(500, total_updates // 10)
eval_steps = min(500, max(1, total_updates // 4))

trainer_args = TrainingArguments(
    output_dir=model_folder,
    num_train_epochs=NUM_TRAIN_EPOCHS,
    warmup_steps=warmup_steps,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    logging_steps=10,
    evaluation_strategy='steps',
    eval_steps=eval_steps,
    # An integer far beyond the run length: no intermediate checkpoints are
    # written (the model is saved explicitly after training).
    save_steps=1_000_000,
    gradient_accumulation_steps=GRAD_ACCUM_STEPS,
)

trainer = Trainer(model=pegasus_pretrained, args=trainer_args,
                  tokenizer=tokenizer, data_collator=seq2seq_data_collator,
                  train_dataset=technical_articles_dataset_pt["train"],
                  eval_dataset=technical_articles_dataset_pt["validation"])

trainer.train()

In [None]:
# Write the trainer's model and the tokenizer to the same folder

model_folder = "/content/gdrive/MyDrive/fine-tuned-pegasus-ariv"

trainer.model.save_pretrained(save_directory=model_folder)
tokenizer.save_pretrained(save_directory=model_folder)

In [None]:
# Evaluate the performance of fine-tuned model

# Training can leave the model in training mode, and generate() does not change
# that. Switch to eval mode so dropout is off while the summaries are generated.
trainer.model.eval()

rougue_scores = compute_rouge_scores(
    technical_articles_dataset['test'],
    rouge_metric,
    trainer.model,
    tokenizer,
    batch_size=2,
    column_text='article',
    column_summary='abstract',
)

# Keep only the mid f-measure of each ROUGE variant.
rouge_dict = {rn: rougue_scores[rn].mid.fmeasure for rn in rouge_names}
pd.DataFrame(rouge_dict, index=['pegasus'])



In [None]:
# Index of the test example to inspect (an earlier note here read "save: 7,").
num = 4
sample = technical_articles_dataset["test"][num]  # fetch the row once
sample_text = sample["article"]
reference = sample["abstract"]

# Load the fine-tuned model and tokenizer
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, PegasusConfig, TrainingArguments, Trainer, pipeline
import textwrap

model_folder = "/content/gdrive/MyDrive/fine-tuned-pegasus-ariv"
model = PegasusForConditionalGeneration.from_pretrained(model_folder)
tokenizer = PegasusTokenizer.from_pretrained(model_folder)


pipe = pipeline("summarization", model=model, tokenizer=tokenizer)
# truncation=True cuts the input at the tokenizer's maximum length in *tokens*.
# The previous [:1024] slice kept only the first 1024 *characters*, so the
# model saw far less text here than generate_features gives it during training.
pipe_out = pipe(sample_text, truncation=True)

print("Article:")
print(sample_text)

print("\nReference Summary:")
print(reference)

print("\nModel Summary:")
# Replace the literal "<n>" markers with spaces, as compute_rouge_scores does.
summary = pipe_out[0]["summary_text"].replace("<n>", " ")
wrapped_summary = textwrap.fill(summary, width=80)
print(wrapped_summary)

In [None]:
import textwrap

# Display the pipeline's summary wrapped at 80 columns. The literal "<n>"
# markers are replaced with spaces, as compute_rouge_scores does.
summary = pipe_out[0]["summary_text"].replace("<n>", " ")
wrapped_summary = textwrap.fill(summary, width=80)

print("Summary:")
print(wrapped_summary)
