In [13]:
from transformers import RobertaModel, RobertaTokenizer

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaModel.from_pretrained('roberta-base')


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def process_text(text):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True)
    
    # Forward pass through RoBERTa model
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract the last hidden state (corresponding to [CLS] token)
    last_hidden_state = outputs.last_hidden_state
    
    # Compute mean pooling of the last hidden state
    pooled_output = torch.mean(last_hidden_state, dim=1).squeeze()
    
    return pooled_output

def compute_similarity(file1_path, file2_path):
    # Read contents of file1
    with open(file1_path, 'r', encoding='utf-8') as file:
        text1 = file.read().strip()
    
    # Read contents of file2
    with open(file2_path, 'r', encoding='utf-8') as file:
        text2 = file.read().strip()
    
    # Process both texts to obtain embeddings
    embeddings1 = process_text(text1)
    embeddings2 = process_text(text2)
    
    # Convert embeddings to numpy arrays and compute cosine similarity
    similarity_score = cosine_similarity([embeddings1.numpy()], [embeddings2.numpy()])[0][0]
    
    return similarity_score


In [15]:
# File paths for input texts
file1_path = 'NYC_cleaned_penal_codes.csv'
file2_path = 'Cali_cleaned_penal_codes.csv'

# Compute similarity between texts in files
similarity_score = compute_similarity(file1_path, file2_path)
print("Similarity Score:", similarity_score)

Similarity Score: 0.99632895
