In [None]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from sentence_transformers import SentenceTransformer, InputExample, losses
from torch.utils.data import DataLoader

# Read the tab-separated train and validation files and drop incomplete rows,
# since every example needs both sentences and a score.
train_df = pd.read_csv('/content/train.csv', sep='\t').dropna()
val_df = pd.read_csv('/content/dev.csv', sep='\t').dropna()


def _build_examples(df):
    """Convert a sentence-pair DataFrame into a list of ``InputExample`` objects.

    Args:
        df: DataFrame with ``sentence1``, ``sentence2`` and ``score`` columns.

    Returns:
        One ``InputExample`` per row, with the two sentences as ``texts`` and
        the score as a float ``label``.
    """
    # Cast the label to float: an integer-typed score column would otherwise be
    # collated into an integer tensor, which a regression-style loss may reject.
    # NOTE(review): CosineSimilarityLoss regresses cosine similarity onto the
    # label, so scores are presumably already scaled to [0, 1] - confirm.
    return [
        InputExample(texts=[sentence1, sentence2], label=float(score))
        for sentence1, sentence2, score in zip(df['sentence1'], df['sentence2'], df['score'])
    ]


train_examples = _build_examples(train_df)
val_examples = _build_examples(val_df)
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Training setup: one example per batch, cosine-similarity regression loss.
train_batch_size = 1
train_loss = losses.CosineSimilarityLoss(model=model)

# Only the training loader is shuffled; the validation loader keeps file order.
train_dataloader = DataLoader(
    train_examples,
    batch_size=train_batch_size,
    shuffle=True,
)
val_dataloader = DataLoader(
    val_examples,
    batch_size=train_batch_size,
    shuffle=False,
)

# Fine-tune the model on the single (dataloader, loss) objective for 10 epochs.
model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=10)

# Evaluate on the validation set: predict a similarity score for every
# sentence pair and correlate the predictions with the gold scores.
model.eval()

# Work from the raw examples rather than a DataLoader: the default collate
# function cannot batch InputExample objects, and model.encode does its own
# batching and runs without gradient tracking.
first_sentences = [example.texts[0] for example in val_examples]
second_sentences = [example.texts[1] for example in val_examples]
true_labels = [float(example.label) for example in val_examples]

first_embeddings = model.encode(first_sentences, convert_to_tensor=True)
second_embeddings = model.encode(second_sentences, convert_to_tensor=True)

# The prediction for a pair is the cosine similarity of its two embeddings
# (one scalar per pair), which is what CosineSimilarityLoss was trained on.
predictions = torch.nn.functional.cosine_similarity(
    first_embeddings, second_embeddings, dim=1
).cpu().tolist()

correlation_coefficient = pearsonr(predictions, true_labels)[0]
print(f"Pearson Correlation on Validation Set: {correlation_coefficient}")

# Plotting Losses
# NOTE(review): nothing in this script populates `training_loss_history`, and
# it is not a documented SentenceTransformer attribute, so look it up
# defensively instead of crashing with AttributeError after a long training run.
loss_history = getattr(model, 'training_loss_history', None)
if loss_history is not None and len(loss_history) > 0:
    plt.plot(range(1, len(loss_history) + 1), loss_history, label='Train Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.title('Training Loss')
    plt.legend()
    plt.show()
else:
    print("No training loss history is available on the model; skipping loss plot.")
