In [None]:
#Import the libraries

import nltk
import numpy as np
import os
import matplotlib.pyplot as plt
import pandas as pd
import torch

from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
from transformers import AutoTokenizer

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook also runs on machines without CUDA. The token tensors created in the
# metrics loop are moved to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Sentence-tokenizer data required by nltk.sent_tokenize. Newer NLTK releases
# load the 'punkt_tab' resource instead of the pickled 'punkt' models, so both
# are fetched to keep the notebook runnable across NLTK versions.
nltk.download('punkt')
nltk.download('punkt_tab')

# Sentence-embedding model used by similarity_score to encode the sentences.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Tokenizer loaded from the 'gpt2-large' checkpoint. It is used to tokenize the
# original and generated texts and to decode the prompt tokens back into text.
model_name = 'gpt2-large'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
def similarity_score(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Both inputs are encoded with the module-level sentence-embedding `model`
    and the resulting tensors are compared with `util.cos_sim`.

    Args:
        text1: First text to encode.
        text2: Second text to encode.

    Returns:
        The value returned by `util.cos_sim` for the two embeddings (cosine
        similarity lies between -1 and 1). The caller in this notebook reads
        it as a scalar via `.cpu().item()`.
    """
    # Encode in argument order: text1 first, then text2.
    first_embedding, second_embedding = (
        model.encode(text, convert_to_tensor=True) for text in (text1, text2)
    )
    return util.cos_sim(first_embedding, second_embedding)

# Approximate (embedding-based) comparison of the original and generated texts
# at the sentence where the prompt ends.
def approximated_matching(original_text, generated_text, original_tokens, prompt_length):
    """Compare the original and generated sentence at the prompt boundary.

    The first `prompt_length` token ids of `original_tokens` are decoded back
    into text and split into sentences. With k such sentences, the k-th
    sentence of `original_text` and the k-th sentence of `generated_text` are
    scored with `similarity_score`.

    Args:
        original_text: The original text.
        generated_text: The generated text.
        original_tokens: Tokenizer output for the original text; only
            `original_tokens["input_ids"][0]` is read.
        prompt_length: Number of leading tokens that form the prompt.

    Returns:
        A tuple `(score, diff, original_sentence, generated_sentence)` where
        `score` is the result of `similarity_score` for the two sentences and
        `diff` is the token count of the first k generated sentences (joined
        with single spaces and re-tokenized) minus `prompt_length`.
    """
    generated_parts = nltk.sent_tokenize(generated_text)

    # Turn the prompt token ids back into text.
    prompt_ids = original_tokens["input_ids"][0][:prompt_length]
    prompt_text = tokenizer.decode(prompt_ids)

    # k = number of sentences in the prompt; index k-1 selects the k-th
    # sentence of each text.
    # NOTE(review): the indexing assumes both texts contain at least k sentences.
    prompt_sentence_count = len(nltk.sent_tokenize(prompt_text))
    boundary_index = prompt_sentence_count - 1
    original_sentence = nltk.sent_tokenize(original_text)[boundary_index]
    generated_sentence = generated_parts[boundary_index]

    score = similarity_score(original_sentence, generated_sentence)

    # Length difference: tokens in the first k generated sentences vs. the prompt.
    leading_generated_text = ' '.join(generated_parts[:prompt_sentence_count])
    leading_generated_ids = tokenizer(leading_generated_text, return_tensors="pt")["input_ids"][0]
    diff = len(leading_generated_ids) - prompt_length

    return score, diff, original_sentence, generated_sentence

def exact_matching(original_tokens, generated_tokens, prompt_length):
    """Count how many tokens after the prompt match exactly.

    Starting at position `prompt_length`, the token ids of the original and
    the generated sequence are compared position by position until the first
    mismatch or until the shorter sequence ends.

    Args:
        original_tokens: Mapping whose `['input_ids'][0]` is the token-id
            sequence of the original text.
        generated_tokens: Mapping whose `['input_ids'][0]` is the token-id
            sequence of the generated text.
        prompt_length: Index of the first token after the prompt.

    Returns:
        int: Number of consecutive identical tokens following the prompt;
        0 when either sequence has no tokens beyond the prompt.
    """
    original_ids = original_tokens['input_ids'][0]
    generated_ids = generated_tokens['input_ids'][0]

    # Never read past the end of the shorter sequence: without this bound the
    # scan raises IndexError when the sequences agree up to that end.
    limit = min(len(original_ids), len(generated_ids))

    pointer = 0
    while (prompt_length + pointer < limit
           and original_ids[prompt_length + pointer] == generated_ids[prompt_length + pointer]):
        pointer += 1
    return pointer

# If the decoding algorithm is greedy search or beam search, the generated text might be repetitive.
# Hence, we wrote this function to check whether the generated text is repetitive. If so, we will remove that sample.
# The function slides a window of the specified size over the generated text and looks for the window's content later in the text.
# It returns True if there is any repetition and False otherwise.
def has_repetition(generated_text, window_size=10):
    """Tell whether any window of the text occurs again later in the text.

    A window of `window_size` characters is slid over `generated_text` one
    character at a time; for each position the window's content is searched
    for in the part of the text that follows the window.

    Args:
        generated_text: Text to inspect.
        window_size: Width of the sliding window in characters.

    Returns:
        True if some window's content reappears after the window, else False.
    """
    # A window is only examined while text remains after it, i.e. while
    # start + window_size < len(generated_text).
    start_count = len(generated_text) - window_size
    return any(
        generated_text[start:start + window_size] in generated_text[start + window_size:]
        for start in range(start_count)
    )

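This cell reads in the pre-generated text for several models (one CSV file per run in `./generated_text_2/`) and compares each generated text with its original text. For every sample it computes the similarity score, the number of exactly matching tokens after the prompt, the length difference, and a repetition flag, and it saves these per-sample metrics to a CSV file in `./metrics/`. Note that the cell itself neither skips samples nor prints an average similarity score; samples that do not meet the criteria (e.g. repetitive ones) can be filtered out, and the average computed per model, from the saved metrics.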

In [None]:
# Input directory with the generated-text CSV files and output directory for
# the per-sample metrics.
generated_text_path = "./generated_text_2/"
metrics_path = "./metrics/"
# sorted() makes the processing order reproducible; os.listdir returns entries
# in arbitrary order.
generated_text_files = sorted(f for f in os.listdir(generated_text_path) if f.endswith(".csv"))
prompt_length = 70
# Column holding the generated text; derived from prompt_length so the two
# cannot drift apart.
generated_column = f"promptLength{prompt_length}_numBeams1"

# Create the output directory up front so that writing the metrics does not
# fail after the (slow) computation has finished.
os.makedirs(metrics_path, exist_ok=True)

for generated_text_file in generated_text_files:
    print("---------------------------------")
    print(f"Processing {generated_text_file}")

    generated_df = pd.read_csv(os.path.join(generated_text_path, generated_text_file))

    # One list per metric column, filled in row order.
    similarity_scores = []
    length_diffs = []
    exact_matches = []
    original_sentences = []
    generated_sentences = []
    repetitions = []

    text_pairs = zip(generated_df['text'], generated_df[generated_column])
    for (original_text, generated_text) in tqdm(text_pairs, total=len(generated_df.index)):
        original_tokens = tokenizer(original_text, return_tensors="pt").to(device)
        generated_tokens = tokenizer(generated_text, return_tensors="pt").to(device)

        sim_score, length_diff, original_sentence, generated_sentence = approximated_matching(
            original_text, generated_text, original_tokens, prompt_length
        )
        exact_match = exact_matching(original_tokens, generated_tokens, prompt_length)
        # Repetition is checked on the compared generated sentence only.
        has_rep = has_repetition(generated_sentence)

        similarity_scores.append(sim_score.cpu().item())
        exact_matches.append(exact_match)
        length_diffs.append(length_diff)
        original_sentences.append(original_sentence)
        generated_sentences.append(generated_sentence)
        repetitions.append(has_rep)

    generated_df['similarity_score'] = similarity_scores
    generated_df['exact_match'] = exact_matches
    generated_df['length_diff'] = length_diffs
    generated_df['original_sentence'] = original_sentences
    generated_df['generated_sentence'] = generated_sentences
    generated_df['has_repetition'] = repetitions

    generated_df.to_csv(os.path.join(metrics_path, f"metrics_{generated_text_file}"), index=False)
