In [1]:
from transformers import AutoTokenizer, AutoModel

# Load the BioBERT model and tokenizer.
# Both are resolved from the same checkpoint name and kept as module-level
# globals: `calculate_similarity` reads `tokenizer` and `model` directly
# rather than receiving them as arguments.
model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)


In [7]:
import torch

def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is encoded on its own (as one row of a batch of two), its token
    embeddings are averaged over the non-padding positions, and the two
    resulting vectors are compared.

    The previous implementation encoded the texts as a single sentence *pair*
    and then compared rows 0 and 1 of the token matrix, i.e. the first two
    tokens of the joint sequence, not the two texts.

    Args:
        text1: First text (str).
        text2: Second text (str).

    Returns:
        float: Cosine similarity of the two mean-pooled embeddings.

    Note:
        Relies on the module-level ``tokenizer`` and ``model``. With
        ``truncation=True`` each text is truncated independently to the
        tokenizer's maximum length.
    """
    # A list of two strings yields a batch of two independent sequences;
    # padding=True pads the shorter one so they can be stacked.
    inputs = tokenizer([text1, text2], padding=True, truncation=True, return_tensors="pt")

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        outputs = model(**inputs)

    token_embeddings = outputs.last_hidden_state  # (2, seq_len, hidden)

    # Mean-pool over real tokens only; padding positions have mask == 0.
    mask = inputs["attention_mask"].unsqueeze(-1).to(token_embeddings.dtype)
    summed = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)  # guard against division by zero
    text_embeddings = summed / token_counts  # (2, hidden)

    similarity = torch.nn.functional.cosine_similarity(
        text_embeddings[0], text_embeddings[1], dim=0
    )

    return similarity.item()


In [19]:
import pandas as pd

def calculate_similarity_csv(csv_file, column1, column2):
    """Score each row of a CSV by the similarity of two of its text columns.

    Args:
        csv_file: Path (or any object accepted by ``pd.read_csv``) of the CSV.
        column1: Name of the column holding the first text of each pair.
        column2: Name of the column holding the second text of each pair.

    Returns:
        pandas.DataFrame: The CSV contents (without the ``rogue_score`` column,
        if it was present) plus a ``similarity`` column. Rows where either
        text is missing get ``NaN`` as their similarity.

    Raises:
        KeyError: If ``column1`` or ``column2`` is not a column of the CSV.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)

    # The score column is not needed in the output; errors="ignore" keeps the
    # function usable on files that never had it.
    df = df.drop(columns="rogue_score", errors="ignore")

    # Fail early with a clear message (same exception type as plain indexing).
    missing = [col for col in (column1, column2) if col not in df.columns]
    if missing:
        raise KeyError(f"Column(s) not found in CSV: {missing}")

    # Calculate similarity for each pair of texts
    similarities = []
    for text1, text2 in zip(df[column1], df[column2]):
        if pd.isna(text1) or pd.isna(text2):
            # Empty CSV cells are read as NaN, which the tokenizer cannot
            # encode; record "no score" instead of crashing mid-file.
            similarities.append(float("nan"))
        else:
            similarities.append(calculate_similarity(str(text1), str(text2)))

    # Add the similarities to the DataFrame
    df['similarity'] = similarities

    return df


In [20]:
# Input CSV and the two text columns that are compared row by row.
# NOTE(review): absolute, machine-specific path.
csv_file = 'D:/Thesis/Processed Data/GPT-test-summary.csv'
column1, column2 = 'Statement', 'summary'


In [22]:
# Destination of the scored table
output_csv = 'D:/Thesis/Processed Data/test-summary-similarity-bert.csv'

# Score every (column1, column2) pair found in the input CSV
similarity_df = calculate_similarity_csv(
    csv_file=csv_file,
    column1=column1,
    column2=column2,
)

# Write the result without the DataFrame index
similarity_df.to_csv(output_csv, index=False)
