# Load the trained model

In [4]:
# Use the GPU when one is available; later cells move tensors and models to `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"

# (Re)mount Google Drive so the checkpoint file below is reachable.
drive.mount('/content/drive', force_remount=True)

# Checkpoint path: <project folder>/<model name>.pth
model_to_load_in = "t5_100k_trained"
load_path = f"/content/drive/MyDrive/CSC413_Final_Project/{model_to_load_in}.pth"

# Read the saved weights onto `device` and copy them into the existing `model`.
# Kept as the cell's last expression so the key-matching report is displayed.
model.load_state_dict(
    torch.load(load_path, map_location=torch.device(device))
)

Mounted at /content/drive


<All keys matched successfully>

# Run the trained and baseline models on the test set (`df_val`)

In [5]:
import torch
from transformers import T5TokenizerFast

# Pull the raw texts out of the DataFrame as plain Python lists.
reviews = df_val['review'].tolist()
summaries = df_val['summary'].tolist()

# Reviews and summaries are tokenized with identical settings
# (note: the summaries also use MAX_REVIEW_LENGTH as their truncation limit).
tokenizer_kwargs = dict(padding=True, truncation=True, return_tensors="pt", max_length=MAX_REVIEW_LENGTH)
tokenized_reviews = tokenizer(reviews, **tokenizer_kwargs)
tokenized_summaries = tokenizer(summaries, **tokenizer_kwargs)

# Keep only the token-id tensors and place them on the target device.
review_tensors = tokenized_reviews['input_ids'].to(device)
summary_tensors = tokenized_summaries['input_ids'].to(device)

In [6]:
# Accumulators filled batch by batch by the evaluation loop below.
trained_generated_summaries, baseline_generated_summaries = [], []
target_summaries, input_reviews = [], []

def _generate_first_sentences(seq2seq_model, input_ids, attention_mask):
    """Generate one summary per input row and keep only its first sentence.

    Args:
        seq2seq_model: Object exposing a ``generate(input_ids=..., attention_mask=..., ...)``
            method that returns one sequence of token ids per input row.
        input_ids: Batch of tokenized inputs passed to ``generate``.
        attention_mask: Mask passed to ``generate`` alongside ``input_ids``.

    Returns:
        list[str]: One string per generated sequence. The string is empty when
        the decoded text contains no sentence.
    """
    # NOTE: do_sample=True makes generation stochastic; results vary between
    # runs unless the random seed is fixed before calling this function.
    generated = seq2seq_model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        num_beams=4,
        do_sample=True,
        min_length=1,
        max_length=10,
    )

    first_sentences = []
    for token_ids in generated:
        text = tokenizer.decode(token_ids, skip_special_tokens=True).strip()
        sentences = nltk.sent_tokenize(text)
        # An empty generation yields no sentences; keep an empty string so the
        # outputs stay aligned with the inputs instead of raising IndexError.
        first_sentences.append(sentences[0] if sentences else "")
    return first_sentences


def process_batch(reviews, summaries, model, baseline_model):
    """Summarize one batch with both models and decode the reference summaries.

    Args:
        reviews: Batch of review token ids. Padding positions are detected by
            comparing against ``tokenizer.pad_token_id``.
        summaries: Batch of reference-summary token ids.
        model: The trained model.
        baseline_model: The baseline model to compare against.

    Returns:
        tuple[list[str], list[str], list[str]]: ``(trained, baseline, actual)``
        summaries, each holding one string per row of the batch.
    """
    # Both models receive the same inputs, so the padding mask is built once.
    attention_mask = reviews.ne(tokenizer.pad_token_id)

    trained_predicted_summaries = _generate_first_sentences(model, reviews, attention_mask)
    baseline_predicted_summaries = _generate_first_sentences(baseline_model, reviews, attention_mask)

    actual_summaries = [tokenizer.decode(summary, skip_special_tokens=True) for summary in summaries]

    return trained_predicted_summaries, baseline_predicted_summaries, actual_summaries

# Number of reviews sent through both models per process_batch() call.
batch_size = 128

# Fresh copy loaded from `model_checkpoint`; passed to process_batch as the baseline.
originalModel = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)

# Put both models on the target device and in inference mode, so that layers
# such as dropout are disabled while generating summaries.
model.to(device)
originalModel.to(device)
model.eval()
originalModel.eval()

# Process in batches
for i in range(0, len(review_tensors), batch_size):
    review_batch = review_tensors[i:i+batch_size].to(device)
    summary_batch = summary_tensors[i:i+batch_size].to(device)

    trained_preds, baseline_preds, actuals = process_batch(review_batch, summary_batch, model, originalModel)

    trained_generated_summaries.extend(trained_preds)
    baseline_generated_summaries.extend(baseline_preds)
    target_summaries.extend(actuals)
    # Decode the inputs back to text so each review can be shown next to its summaries.
    input_reviews.extend([tokenizer.decode(review, skip_special_tokens=True) for review in review_batch])

# At this point:
# - trained_generated_summaries contains summaries generated by model
# - baseline_generated_summaries contains summaries generated by the baseline model
# - target_summaries contains the actual summaries
# - input_reviews contains the input reviews



In [7]:
# Keep only the examples on which the trained and baseline models produced
# different summaries; `count` is the number of examples kept.
count = 0

trained_generated_summaries_trimmed = []
baseline_generated_summaries_trimmed = []
target_summaries_trimmed = []
input_reviews_trimmed = []
indices = []  # positions (in the untrimmed lists) of the kept examples

for i in range(len(trained_generated_summaries)):
    if trained_generated_summaries[i] != baseline_generated_summaries[i]:
        trained_generated_summaries_trimmed.append(trained_generated_summaries[i])
        baseline_generated_summaries_trimmed.append(baseline_generated_summaries[i])
        target_summaries_trimmed.append(target_summaries[i])
        # All four trimmed lists must grow together: they become the columns
        # of a single DataFrame and therefore need equal lengths.
        input_reviews_trimmed.append(input_reviews[i])
        indices.append(i)
        count += 1
print(count)

2837


# Save to CSV

In [8]:
import pandas as pd

# Column name -> values; one row per example kept by the trimming step.
comparison_columns = {
    'Trained Model Summaries': trained_generated_summaries_trimmed,
    'Baseline Model Summaries': baseline_generated_summaries_trimmed,
    'Target Summaries': target_summaries_trimmed,
    'Input Reviews': input_reviews_trimmed
}
df = pd.DataFrame(comparison_columns)

# Persist the comparison table without the index column.
csv_file = '/content/summaries_comparison.csv'
df.to_csv(csv_file, index=False)

# Trigger a browser download of the CSV (Google Colab only).
from google.colab import files
files.download(csv_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Calculate NLP Scores

In [13]:
!pip install rouge-score nltk

import nltk
from rouge_score import rouge_scorer
import pandas as pd

nltk.download('punkt')

def calculate_bleu_score(references, hypotheses, weights=(0.5, 0.5), smoothing_function=None):
    """Return the mean sentence-level BLEU over paired references and hypotheses.

    Each text is split on whitespace, and every hypothesis is scored against
    its single reference.

    Args:
        references: Iterable of reference strings.
        hypotheses: Iterable of generated strings, paired positionally with
            ``references``.
        weights: N-gram weights forwarded to ``sentence_bleu``. The default
            ``(0.5, 0.5)`` scores unigrams and bigrams equally.
        smoothing_function: Optional smoothing callable forwarded to
            ``sentence_bleu`` (e.g. a method of
            ``nltk.translate.bleu_score.SmoothingFunction()``). ``None``
            keeps NLTK's unsmoothed behaviour.

    Returns:
        float: Average BLEU score, or ``0.0`` when there are no pairs to score.
    """
    pairs = list(zip(references, hypotheses))
    # Guard against division by zero on empty input.
    if not pairs:
        return 0.0

    sentence_bleu = nltk.translate.bleu_score.sentence_bleu
    bleu_scores = [
        sentence_bleu(
            [ref.split()],
            hyp.split(),
            weights=weights,
            smoothing_function=smoothing_function,
        )
        for ref, hyp in pairs
    ]
    return sum(bleu_scores) / len(bleu_scores)

def calculate_rouge_scores(references, hypotheses):
    """Return the mean ROUGE-1, ROUGE-2 and ROUGE-L F-measures over paired texts.

    Args:
        references: Iterable of reference strings.
        hypotheses: Iterable of generated strings, paired positionally with
            ``references``.

    Returns:
        dict[str, float]: Average F-measure keyed by ``'rouge1'``, ``'rouge2'``
        and ``'rougeL'``. Every value is ``0.0`` when there are no pairs to score.
    """
    rouge_types = ['rouge1', 'rouge2', 'rougeL']

    pairs = list(zip(references, hypotheses))
    # Guard against division by zero on empty input.
    if not pairs:
        return {key: 0.0 for key in rouge_types}

    scorer = rouge_scorer.RougeScorer(rouge_types, use_stemmer=True)
    # The reference is passed first and the hypothesis second.
    scores = [scorer.score(ref, hyp) for ref, hyp in pairs]
    avg_scores = {
        key: sum(score[key].fmeasure for score in scores) / len(scores)
        for key in rouge_types
    }
    return avg_scores

# Score each model's generated summaries against the same target summaries.
trained_bleu, baseline_bleu = (
    calculate_bleu_score(target_summaries, generated)
    for generated in (trained_generated_summaries, baseline_generated_summaries)
)

trained_rouge, baseline_rouge = (
    calculate_rouge_scores(target_summaries, generated)
    for generated in (trained_generated_summaries, baseline_generated_summaries)
)




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


# Print table

In [16]:

# Read the per-metric averages directly from the ROUGE result dicts, so this
# cell does not depend on separately defined per-metric variables.
rouge_keys = ['rouge1', 'rouge2', 'rougeL']

results_df = pd.DataFrame({
    'Metric': ['BLEU', 'ROUGE-1', 'ROUGE-2', 'ROUGE-L'],
    'Trained Model': [trained_bleu] + [trained_rouge[key] for key in rouge_keys],
    'Baseline Model': [baseline_bleu] + [baseline_rouge[key] for key in rouge_keys]
})

# Display the table
print(results_df)

    Metric  Trained Model  Baseline Model
0     BLEU       0.228966        0.018471
1  ROUGE-1       0.416104        0.024515
2  ROUGE-2       0.251672        0.011014
3  ROUGE-L       0.268903        0.017273
