In [None]:
!pip install -U scikit-learn
!pip install -U sentence-transformers

In [None]:
# Make Google Drive visible inside the Colab filesystem; the data files and
# the saved model read by this notebook are addressed through this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Load the test dataset:

In [None]:
import csv

import pandas as pd

TEST_PATH = "/content/drive/MyDrive/Mini-projects/MSRP/test.tsv"

# Read the tab-separated test split.
# NOTE(review): with pandas' default quoting, a sentence containing an
# unbalanced `"` opens a quoted field that swallows the following tabs and
# newlines. Rows get merged and the last column ("#2 String") ends up missing,
# and `on_bad_lines='skip'` hides the damage. QUOTE_NONE makes the parser treat
# `"` as ordinary text. Confirm the row count against the source file after
# re-running.
test_raw = pd.read_csv(TEST_PATH, on_bad_lines='skip', sep="\t", quoting=csv.QUOTE_NONE)
print(f"Total test samples : {test_raw.shape[0]}")

print("Number of missing values")
print(test_raw.isnull().sum())

# Drop incomplete rows into a new frame rather than mutating in place, so the
# cell gives the same result however many times it is re-run.
test_df = test_raw.dropna(axis=0)

Total test samples : 1639
Number of missing values
Quality      0
#1 ID        0
#2 ID        0
#1 String    0
#2 String    9
dtype: int64


Load the dev dataset:

In [None]:
import csv

import pandas as pd

DEV_PATH = "/content/drive/MyDrive/Mini-projects/MSRP/dev.tsv"

# Read the tab-separated dev split.
# NOTE(review): with pandas' default quoting, a sentence containing an
# unbalanced `"` opens a quoted field that swallows the following tabs and
# newlines. Rows get merged and the last column ("#2 String") ends up missing,
# and `on_bad_lines='skip'` hides the damage. QUOTE_NONE makes the parser treat
# `"` as ordinary text. Confirm the row count against the source file after
# re-running.
dev_raw = pd.read_csv(DEV_PATH, on_bad_lines='skip', sep="\t", quoting=csv.QUOTE_NONE)
print(f"Total dev samples: {dev_raw.shape[0]}")

print("Number of missing values")
print(dev_raw.isnull().sum())

# Drop incomplete rows into a new frame rather than mutating in place, so the
# cell gives the same result however many times it is re-run.
dev_df = dev_raw.dropna(axis=0)

Total dev samples: 480
Number of missing values
Quality      0
#1 ID        0
#2 ID        0
#1 String    0
#2 String    4
dtype: int64


Load the best model (the one with the highest F1 score) saved during training:



In [None]:
from sentence_transformers import CrossEncoder

# Location on the mounted Drive of the cross-encoder saved during training.
MODEL_DIR = "/content/drive/MyDrive/Mini-projects/output/training_paraphrases_cross-encoder-stsb-roberta-base"

# Restore the saved cross-encoder; the evaluation cells below call
# `model.predict` on sentence pairs and threshold the returned scores.
model = CrossEncoder(MODEL_DIR, num_labels=1)

Evaluate the model on the dev dataset: compute the accuracy, F1 score and average precision, together with the decision thresholds that give the best accuracy and the best F1 score.



In [None]:
import numpy as np
from sentence_transformers.evaluation import BinaryClassificationEvaluator
from sklearn.metrics import average_precision_score

# calculate final dev scores:
# Take the two sentence columns and the label column straight from the frame
# instead of walking it row by row (twice) with iterrows().
sentence_pairs = dev_df[["#1 String", "#2 String"]].to_numpy().tolist()
labels = dev_df["Quality"].astype(int).to_numpy()
pred_scores = model.predict(sentence_pairs, convert_to_numpy=True, show_progress_bar=True)

# Find the score thresholds that maximize accuracy and F1 on the dev set.
# `f1_threshold` is reused later to binarize the test-set scores.
acc, acc_threshold = BinaryClassificationEvaluator.find_best_acc_and_threshold(
    pred_scores, labels, high_score_more_similar=True
)
f1, precision, recall, f1_threshold = BinaryClassificationEvaluator.find_best_f1_and_threshold(
    pred_scores, labels, high_score_more_similar=True
)
# Average precision is computed from the raw scores, so it needs no threshold.
ap = average_precision_score(labels, pred_scores)

print(f"Accuracy = {acc}")
print(f"F1 score = {f1}")
print(f"Average Precision = {ap}")

Batches:   0%|          | 0/15 [00:00<?, ?it/s]

Accuracy = 0.9222689075630253
F1 score = 0.9450222882615156
Average Precision = 0.972630005944577


Use the *F1 threshold* found on the dev dataset to calculate the accuracy and F1 score of the model on the test dataset.

In [None]:
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, average_precision_score

# calculate test scores:
# Take the two sentence columns and the label column straight from the frame
# instead of walking it row by row (twice) with iterrows().
sentence_pairs = test_df[["#1 String", "#2 String"]].to_numpy().tolist()
labels = test_df["Quality"].astype(int).to_numpy()
pred_scores = model.predict(sentence_pairs, convert_to_numpy=True, show_progress_bar=True)
# Average precision is computed from the raw scores, so it needs no threshold.
ap = average_precision_score(labels, pred_scores)

# Binarize the scores with the threshold chosen on the dev set (`f1_threshold`
# comes from the dev-evaluation cell); a score strictly above it is a positive.
pred_labels = (pred_scores > f1_threshold).astype(int)
acc = accuracy_score(labels, pred_labels)
f1 = f1_score(labels, pred_labels, average="binary")

print(f"Accuracy = {acc}")
print(f"F1 score = {f1}")
print(f"Average Precision = {ap}")

Batches:   0%|          | 0/51 [00:00<?, ?it/s]

Accuracy = 0.8852760736196319
F1 score = 0.915269596737653
Average Precision = 0.9493976114261478
