In [1]:
import torch
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np

# Load the sentence-embedding model once at module level so that
# calculate_scores() can reuse the same instance on every call
# (a lightweight transformer model is used for efficiency).
bert_model = SentenceTransformer('all-MiniLM-L6-v2')  # Small & fast

def calculate_scores(reference, response, threshold=0.7):
    """
    Score a chatbot response against a reference via embedding similarity.

    Both texts are encoded with the module-level ``bert_model``; the cosine
    similarity of the two embeddings is thresholded into a single binary
    prediction, which is compared against a ground-truth label fixed at 1.

    Note:
        Because exactly one sample is scored and its true label is always 1,
        precision, recall and F1 are all 1.0 when the similarity reaches
        ``threshold`` and all 0.0 otherwise. The raw similarity (returned as
        the fourth value) carries the actual graded signal.

    Args:
        reference (str): The ground truth reference response.
        response (str): The chatbot's generated response.
        threshold (float): Minimum cosine similarity (inclusive) for the
            response to count as a match.

    Returns:
        tuple: ``(precision, recall, f1, similarity_score)`` where the first
        three are the sklearn metrics for the single thresholded prediction
        and ``similarity_score`` is the cosine similarity as a Python float.
    """

    # Convert both sentences into embeddings (tensors, so cos-sim runs in torch)
    ref_embedding = bert_model.encode(reference, convert_to_tensor=True)
    resp_embedding = bert_model.encode(response, convert_to_tensor=True)

    # Cosine similarity comes back as a 1x1 tensor; .item() extracts the float
    similarity_score = util.pytorch_cos_sim(ref_embedding, resp_embedding).item()

    # Single-sample binary classification: the reference is always a valid
    # response, so the true label is 1; the prediction is the thresholded match.
    y_true = [1]
    y_pred = [1 if similarity_score >= threshold else 0]

    # zero_division=0: when the prediction is 0 there are no predicted
    # positives, so precision is 0/0. sklearn would otherwise emit an
    # UndefinedMetricWarning and fall back to 0.0; request that 0.0 explicitly.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)

    return precision, recall, f1, similarity_score

# Sample pair: the expected answer and what the chatbot actually produced
reference_response = "I am a virtual assistant. How can I help you?"
chatbot_response = "I'm a chatbot. What do you need assistance with?"

# Score the chatbot's answer against the reference (default threshold)
precision, recall, f1, similarity = calculate_scores(
    reference=reference_response,
    response=chatbot_response,
)

# Report every metric, one per line, rounded to two decimals
print(
    f"Cosine Similarity: {similarity:.2f}",
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-Score: {f1:.2f}",
    sep="\n",
)


  from tqdm.autonotebook import tqdm, trange


Cosine Similarity: 0.55
Precision: 0.00
Recall: 0.00
F1-Score: 0.00


  _warn_prf(average, modifier, msg_start, len(result))
