# Evaluation

In [None]:
pip install transformers

In [None]:
pip install torch

In [None]:
import pandas as pd
import spacy
from transformers import AutoTokenizer, AutoModel
import torch

# Load the dataset from its CSV export.
DATASET_CSV = "/Users/dionnespaltman/Desktop/Luiss/Machine Learning/Project/stopword_removal_dataframe.csv"
df = pd.read_csv(DATASET_CSV)

# Load the multilingual LaBSE checkpoint: the tokenizer and the encoder must
# come from the same checkpoint, so the name is shared.
LABSE_CHECKPOINT = "sentence-transformers/LaBSE"
tokenizer = AutoTokenizer.from_pretrained(LABSE_CHECKPOINT)
model = AutoModel.from_pretrained(LABSE_CHECKPOINT)

def get_embedding(text):
    """Embed *text* with the loaded encoder using attention-masked mean pooling.

    Args:
        text: The input handed to the tokenizer. The notebook passes a single
            string; a list of strings is also pooled correctly because
            padding positions are excluded from the mean.

    Returns:
        A ``torch.Tensor`` holding one pooled vector per input text (the token
        axis, dim 1, of ``last_hidden_state`` is averaged away).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        output = model(**inputs)
    hidden = output.last_hidden_state
    # Weight every token by its attention mask so that padding tokens (added
    # when texts of different lengths share a batch) do not dilute the mean.
    # For a single text the mask is all ones, so this equals a plain mean.
    mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
    token_sum = (hidden * mask).sum(dim=1)
    # clamp guards against a division by zero for a fully masked row.
    token_count = mask.sum(dim=1).clamp(min=1)
    return token_sum / token_count

def _embed_or_none(value):
    """Return the embedding of ``str(value)``, or None when *value* is null."""
    if pd.notnull(value):
        return get_embedding(str(value))
    return None


# Add embeddings: one embedding column per processed-language column.
for _text_col, _embed_col in (
    ("processed_language1", "embed1"),
    ("processed_language2", "embed2"),
):
    df[_embed_col] = df[_text_col].apply(_embed_or_none)


In [6]:
# Persist the current frame as CSV, without the row index.
# NOTE(review): if df still carries the tensor-valued embed1/embed2 columns,
# CSV keeps only their text form — confirm the file is not meant to be
# reloaded as tensors.
EMBEDDING_CSV = "/Users/danielebiggi/Desktop/Data Science/Machine learning/rosettastone2/Daniele_notebooks/embedding.csv"
df.to_csv(EMBEDDING_CSV, index=False)

In [2]:
import pandas as pd
import pickle
# Load the pickled object; the following cell treats it as a dictionary.
# NOTE(review): unpickling can execute arbitrary code — only load pickle files
# that come from a trusted source.
SENTENCE_EMBEDDING_PKL = "/Users/danielebiggi/Downloads/sentence_to_embedding.pkl"
df = pd.read_pickle(SENTENCE_EMBEDDING_PKL)

In [None]:
# Convert the dictionary back to a DataFrame: with orient='index' every key
# becomes a row label and its value fills that row.
df = pd.DataFrame.from_dict(df, orient='index')
# Write the row labels as well. The dictionary keys live only in the index, so
# exporting with index=False would drop them and leave rows of values that can
# no longer be matched to the key they belong to.
# NOTE(review): the "sentence" header assumes the keys are sentences, as the
# file name suggests — confirm.
df.to_csv(
    "/Users/danielebiggi/Desktop/Data Science/Machine learning/rosettastone2/Daniele_notebooks/sentence_to_embedding.csv",
    index=True,
    index_label="sentence",
)

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Tokenizer and generation model are loaded from the same paraphraser checkpoint.
T5_PARAPHRASER_CHECKPOINT = "ramsrigouthamg/t5_paraphraser"
t5_tokenizer = T5Tokenizer.from_pretrained(T5_PARAPHRASER_CHECKPOINT)
t5_model = T5ForConditionalGeneration.from_pretrained(T5_PARAPHRASER_CHECKPOINT)

def paraphrase(text):
    """Return one paraphrase of *text* generated by the loaded T5 model.

    The text is wrapped in the ``paraphrase: ... </s>`` prompt, tokenized as a
    batch of one, and a single sequence is generated with ``max_length=60``.
    The first generated sequence is decoded with special tokens stripped.
    """
    prompt = f"paraphrase: {text} </s>"
    encoded = t5_tokenizer([prompt], return_tensors="pt", truncation=True, padding=True)
    generated = t5_model.generate(**encoded, max_length=60, num_return_sequences=1)
    first_sequence = generated[0]
    return t5_tokenizer.decode(first_sequence, skip_special_tokens=True)

def _paraphrase_if_en(sentence):
    """Paraphrase *sentence* when it is non-null and contains "en"; otherwise return it unchanged."""
    if pd.notnull(sentence) and "en" in sentence:
        return paraphrase(sentence)
    return sentence


# Apply to one language
# NOTE(review): `"en" in sentence` is a substring test on the sentence text
# itself, not a check of a language code — confirm this is the intended filter.
df["sentence2_paraphrased"] = df["sentence2"].apply(_paraphrase_if_en)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def _is_missing(value):
    """Return True when *value* is an absent-embedding placeholder (None or a float NaN)."""
    # value != value is only true for NaN; the isinstance check keeps tensors
    # and arrays out of that comparison.
    return value is None or (isinstance(value, float) and value != value)


def _pair_similarity(row):
    """Return the cosine similarity of a row's two embeddings, or NaN if either is absent."""
    first, second = row["embed1"], row["embed2"]
    if _is_missing(first) or _is_missing(second):
        return float("nan")
    # cosine_similarity returns a matrix of pairwise scores; [0][0] takes its
    # first entry, the score for this single pair.
    return cosine_similarity(first, second)[0][0]


# Rows whose text was null have no embedding; they get NaN here instead of
# raising and aborting the whole column.
df["similarity_score"] = df.apply(_pair_similarity, axis=1)


In [None]:
import torch.nn as nn

class SiameseNetwork(nn.Module):
    """Score a pair of embeddings with a small feed-forward head.

    The two inputs are concatenated along dim 1 and passed through
    ``Linear(embed_size * 2, 512) -> ReLU -> Linear(512, 1)``.
    """

    def __init__(self, embed_size):
        """Build the scoring head for inputs with ``embed_size`` features each."""
        super().__init__()
        hidden_layer = nn.Linear(embed_size * 2, 512)
        activation = nn.ReLU()
        output_layer = nn.Linear(512, 1)
        # Kept as a Sequential named `fc` so parameter names stay fc.0.* / fc.2.*.
        self.fc = nn.Sequential(hidden_layer, activation, output_layer)

    def forward(self, x1, x2):
        """Return the head's output for the pair ``(x1, x2)`` joined along dim 1."""
        pair = torch.cat([x1, x2], dim=1)
        score = self.fc(pair)
        return score


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt

# Keep only rows that have both a human score and a model score: the sklearn
# metrics below raise on NaN input, so one incomplete row would otherwise
# abort the whole evaluation.
scored = df[["score", "similarity_score"]].dropna()
skipped = len(df) - len(scored)
if skipped:
    # Report the omission so the metrics are not mistaken for full coverage.
    print(f"Skipped {skipped} rows with a missing score")

mse = mean_squared_error(scored["score"], scored["similarity_score"])
r2 = r2_score(scored["score"], scored["similarity_score"])

print(f"MSE: {mse:.3f}, R²: {r2:.3f}")
# Scatter of the human score (x) against the model's similarity score (y).
plt.scatter(scored["score"], scored["similarity_score"])
plt.xlabel("Human Score")
plt.ylabel("Model Score")
plt.title("Similarity Score Correlation")
plt.show()
