# In [5]:
import pandas as pd
from transformers import GPT2Tokenizer, GPT2Model
import torch

# Load the GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 has no padding token by default, so register one for padded encoding
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2Model.from_pretrained(model_name)

# The added '[PAD]' token extends the tokenizer's vocabulary. Grow the model's
# embedding matrix to the same size so the pad token id is a valid index;
# without this, any input that actually contains padding fails in the
# embedding lookup.
model.resize_token_embeddings(len(tokenizer))

# Helper functions for computing text similarity


def calculate_similarity(text1, text2):
    """Return the cosine similarity between GPT-2 embeddings of two texts.

    Each text is encoded on its own with the module-level ``tokenizer``,
    passed through the module-level ``model``, and reduced to one vector by
    averaging the last hidden state over its tokens. The two vectors are then
    compared with cosine similarity.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity as a Python float, or ``nan`` when either
        argument is not a non-empty string (for example a missing CSV cell).
    """
    # Missing cells arrive from pandas as float NaN, and an empty string
    # yields no tokens to embed; report "no score" rather than crash.
    for text in (text1, text2):
        if not isinstance(text, str) or not text:
            return float("nan")

    def _embed(text):
        """Return one mean-pooled embedding vector for a single text."""
        # Encode the text by itself. Encoding both texts as one joined
        # sequence and indexing positions 0 and 1 would compare only the
        # first two tokens of that sequence, not the two texts.
        inputs = tokenizer(text, truncation=True, return_tensors="pt")

        # Inference only: no gradients are needed.
        with torch.no_grad():
            hidden = model(**inputs).last_hidden_state

        # Drop the batch dimension, then average over the token dimension.
        return hidden.squeeze(0).mean(dim=0)

    vector1 = _embed(text1)
    vector2 = _embed(text2)

    # Calculate the cosine similarity between the two pooled embeddings
    similarity = torch.nn.functional.cosine_similarity(vector1, vector2, dim=0)

    return similarity.item()

def calculate_similarity_csv(csv_file, column1, column2):
    """Score every row of a CSV file by comparing two of its columns.

    Args:
        csv_file: Path of the CSV file to read with ``pd.read_csv``.
        column1: Name of the column supplying the first text of each pair.
        column2: Name of the column supplying the second text of each pair.

    Returns:
        The loaded DataFrame with an added ``similarity`` column holding the
        value returned by ``calculate_similarity`` for each row.
    """
    frame = pd.read_csv(csv_file)

    # Pull both columns out as plain lists so they can be walked in lockstep
    left_texts = frame[column1].tolist()
    right_texts = frame[column2].tolist()

    # One score per row, in row order
    frame['similarity'] = [
        calculate_similarity(left, right)
        for left, right in zip(left_texts, right_texts)
    ]

    return frame

# Input CSV and the pair of columns whose texts are compared row by row
csv_file = "D:/Thesis/Processed Data/1.Summary/GPT-test-summary.csv"
column1, column2 = 'Statement', 'summary'

# Destination for the copy of the data that includes the similarity column
output_csv = 'D:/Thesis/Processed Data/tt-test-summary-similarity-gpt2.csv'

# Score every row, then write the result without the DataFrame index
similarity_df = calculate_similarity_csv(csv_file, column1, column2)
similarity_df.to_csv(output_csv, index=False)
