In [2]:
# Evaluation Metrics
# 1. Exact Match (EM)
# Strict comparison of normalized predictions and ground truth (lowercase, trimmed spaces)
# 2. F1 Score
# Word-level overlap measure combining precision and recall of tokenized answers
# 3. BERTScore
# Contextual similarity using roberta-large embeddings (layer 17, batch size 32)
# 4. Bi-Encoder Score
# Cosine similarity between answer embeddings using SentenceTransformers (all-MiniLM-L6-v2)
# 5. Semantic Answer Similarity (SAS)
# Domain-adapted BERT model fine-tuned on restaurant reviews using MLM, measuring semantic equivalence between answers

In [3]:
import pandas as pd
import re

def normalize_text(text):
    """
    Normalize text by:
    1. Converting to lowercase
    2. Removing extra whitespace
    3. Removing punctuation
    4. Standardizing special characters
    """
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation and special characters, keeping only alphanumeric and spaces
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # Replace multiple spaces with single space and trim
    text = ' '.join(text.split())

    return text

def calculate_exact_match(row):
    """
    Calculate Exact Match score (0 or 1) for a single pair of answers
    """
    actual = normalize_text(row['actual_answer'])
    predicted = normalize_text(row['predicted_answer'])
    return 1 if actual == predicted else 0

def evaluate_exact_match(df):
    """
    Calculate Exact Match scores for the entire dataset
    Returns:
    - DataFrame with EM scores
    - Overall EM accuracy
    """
    # Create copy of dataframe to avoid modifying original
    df_eval = df.copy()

    # Calculate EM score for each row
    df_eval['exact_match_score'] = df_eval.apply(calculate_exact_match, axis=1)

    # Calculate overall accuracy
    overall_em = df_eval['exact_match_score'].mean()

    return df_eval, overall_em


In [4]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import string
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from collections import Counter

class F1Evaluator:
    def __init__(self):
        # Download required NLTK data
        try:
            nltk.download('punkt', quiet=True)
        except:
            pass

    def preprocess_text(self, text):
        """
        Preprocess text by:
        1. Converting to lowercase
        2. Removing punctuation
        3. Tokenizing into words
        """
        # Convert to lowercase
        text = text.lower()

        # Remove punctuation
        text = text.translate(str.maketrans("", "", string.punctuation))

        # Tokenize
        tokens = word_tokenize(text)

        # Remove empty tokens
        tokens = [token for token in tokens if token.strip()]

        return tokens

    def calculate_f1_score(self, actual, predicted):
        """
        Calculate F1 score between actual and predicted answers
        """
        # Tokenize both answers
        actual_tokens = self.preprocess_text(actual)
        predicted_tokens = self.preprocess_text(predicted)

        # Convert to Counter objects for easy intersection
        actual_counter = Counter(actual_tokens)
        predicted_counter = Counter(predicted_tokens)

        # Calculate intersection
        intersection = sum((actual_counter & predicted_counter).values())

        # Calculate precision and recall
        precision = intersection / len(predicted_tokens) if predicted_tokens else 0
        recall = intersection / len(actual_tokens) if actual_tokens else 0

        # Calculate F1 score
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return {
            'f1_score': f1,
            'precision': precision,
            'recall': recall,
            'actual_tokens': len(actual_tokens),
            'predicted_tokens': len(predicted_tokens),
            'matching_tokens': intersection
        }

def evaluate_qa_f1(df):
    """
    Evaluate QA pairs using F1 score
    """
    evaluator = F1Evaluator()

    # Calculate scores for each row
    results = []
    for _, row in df.iterrows():
        score_dict = evaluator.calculate_f1_score(row['actual_answer'], row['predicted_answer'])
        results.append(score_dict)

    # Add scores to DataFrame
    df_scored = df.copy()
    for metric in ['f1_score', 'precision', 'recall', 'matching_tokens']:
        df_scored[metric] = [r[metric] for r in results]

    # Calculate overall scores
    overall_scores = {
        'f1_score': np.mean([r['f1_score'] for r in results]),
        'precision': np.mean([r['precision'] for r in results]),
        'recall': np.mean([r['recall'] for r in results])
    }

    return df_scored, overall_scores

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [5]:
import torch
from transformers import RobertaTokenizer, RobertaModel
import numpy as np
from tqdm import tqdm
import pandas as pd

class BERTScoreEvaluator:
    def __init__(self, model_name='roberta-large', layer=17, batch_size=32):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = RobertaTokenizer.from_pretrained(model_name)
        self.model = RobertaModel.from_pretrained(model_name).to(self.device)
        self.layer = layer
        self.batch_size = batch_size

    def get_embeddings(self, text_batch):
        """Extract embeddings from specified layer for a batch of text"""
        # Tokenize and move to device
        inputs = self.tokenizer(text_batch,
                              padding=True,
                              truncation=True,
                              return_tensors="pt").to(self.device)

        # Get model outputs
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # Get embeddings from specified layer
        layer_embeddings = outputs.hidden_states[self.layer]

        # Create attention mask for proper averaging
        attention_mask = inputs['attention_mask'].unsqueeze(-1)

        # Apply mask and compute mean
        masked_embeddings = layer_embeddings * attention_mask
        sum_embeddings = torch.sum(masked_embeddings, dim=1)
        sum_mask = torch.sum(attention_mask, dim=1)
        embeddings = sum_embeddings / sum_mask

        return embeddings.cpu().numpy()

    def calculate_bertscore(self, references, candidates):
        """Calculate BERTScore for batches of text"""
        scores = []

        # Process in batches
        for i in range(0, len(references), self.batch_size):
            batch_refs = references[i:i + self.batch_size]
            batch_cands = candidates[i:i + self.batch_size]

            # Get embeddings
            ref_embeddings = self.get_embeddings(batch_refs)
            cand_embeddings = self.get_embeddings(batch_cands)

            # Normalize embeddings
            ref_embeddings = ref_embeddings / np.linalg.norm(ref_embeddings, axis=1)[:, None]
            cand_embeddings = cand_embeddings / np.linalg.norm(cand_embeddings, axis=1)[:, None]

            # Calculate cosine similarity
            batch_scores = np.sum(ref_embeddings * cand_embeddings, axis=1)
            scores.extend(batch_scores)

        return np.array(scores)

def evaluate_qa_bertscore(df):
    """
    Evaluate QA pairs using BERTScore
    """
    # Initialize evaluator
    evaluator = BERTScoreEvaluator()

    # Get lists of answers
    actual_answers = df['actual_answer'].tolist()
    predicted_answers = df['predicted_answer'].tolist()

    # Calculate scores
    print("Calculating BERTScores...")
    bertscores = evaluator.calculate_bertscore(actual_answers, predicted_answers)

    # Add scores to DataFrame
    df_scored = df.copy()
    df_scored['bertscore'] = bertscores

    # Calculate overall score
    overall_score = np.mean(bertscores)

    return df_scored, overall_score

In [6]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

class BiEncoderEvaluator:
    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize the Bi-Encoder evaluator with specified model
        """
        self.model = SentenceTransformer(model_name)

    def calculate_similarity(self, actual_answers, predicted_answers):
        """
        Calculate cosine similarity between actual and predicted answer embeddings
        """
        # Generate embeddings for both sets of answers
        actual_embeddings = self.model.encode(actual_answers)
        predicted_embeddings = self.model.encode(predicted_answers)

        # Calculate cosine similarity for each pair
        similarities = [cosine_similarity(
            actual_emb.reshape(1, -1),
            pred_emb.reshape(1, -1)
        )[0][0] for actual_emb, pred_emb in zip(actual_embeddings, predicted_embeddings)]

        return similarities

def evaluate_qa_biencoder(df):
    """
    Evaluate QA pairs using Bi-Encoder score
    """
    # Initialize evaluator
    evaluator = BiEncoderEvaluator()

    # Get lists of answers
    actual_answers = df['actual_answer'].tolist()
    predicted_answers = df['predicted_answer'].tolist()

    # Calculate similarities
    scores = evaluator.calculate_similarity(actual_answers, predicted_answers)

    # Add scores to DataFrame
    df_scored = df.copy()
    df_scored['biencoder_score'] = scores

    # Calculate overall score
    overall_score = np.mean(scores)

    return df_scored, overall_score

In [7]:
import torch
from transformers import (
    BertTokenizer,
    BertForMaskedLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)
from torch.utils.data import Dataset
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from typing import List, Dict
from tqdm import tqdm
import os


class RestaurantReviewDataset(Dataset):
    def __init__(self, texts: List[str], tokenizer: BertTokenizer):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)

    def __getitem__(self, idx) -> Dict:
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self) -> int:
        return len(self.encodings.input_ids)

class BERTSimilarityEvaluator:
    def __init__(self, model_name: str = "bert-base-uncased"):
        """Initialize with base BERT model"""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = BertTokenizer.from_pretrained(model_name)
        self.model = BertForMaskedLM.from_pretrained(model_name)
        self.model.to(self.device)

    def save_model(self, save_path: str) -> None:
        """
        Save the model and tokenizer to the specified path

        Args:
            save_path: Directory path where model should be saved
        """
        # Create directory if it doesn't exist
        os.makedirs(save_path, exist_ok=True)

        # Save model state
        model_path = os.path.join(save_path, 'model_state.pt')
        torch.save({
            'model_state_dict': self.model.state_dict(),
            'device': str(self.device)
        }, model_path)

        # Save tokenizer
        self.tokenizer.save_pretrained(save_path)

        print(f"Model and tokenizer saved to {save_path}")

    @classmethod
    def load_model(cls, load_path: str) -> 'BERTSimilarityEvaluator':
        """
        Load a saved model and tokenizer

        Args:
            load_path: Directory path where model is saved

        Returns:
            BERTSimilarityEvaluator: Instance with loaded model and tokenizer
        """
        # Initialize with default model
        instance = cls()

        # Load tokenizer
        instance.tokenizer = BertTokenizer.from_pretrained(load_path)

        # Load model state
        model_path = os.path.join(load_path, 'model_state.pt')
        checkpoint = torch.load(model_path, map_location=instance.device)
        instance.model.load_state_dict(checkpoint['model_state_dict'])
        instance.model.to(instance.device)

        print(f"Model and tokenizer loaded from {load_path}")
        return instance

    def fine_tune_mlm(self,
                     texts: List[str],
                     output_dir: str = "./bert_restaurant",
                     num_epochs: int = 3,
                     batch_size: int = 24):
        """Fine-tune BERT using MLM on domain-specific texts"""

        # Prepare dataset
        dataset = RestaurantReviewDataset(texts, self.tokenizer)

        # Create data collator for MLM
        data_collator = DataCollatorForLanguageModeling(
            tokenizer=self.tokenizer,
            mlm=True,
            mlm_probability=0.15
        )

        # Define training arguments
        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            save_steps=1000,
            save_total_limit=2,
            logging_steps=100,
            learning_rate=2e-5,
        )

        # Initialize trainer
        trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=dataset,
            data_collator=data_collator,
        )

        # Train the model
        trainer.train()

        # Save model
        self.model.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

    def get_embeddings(self, text: str) -> np.ndarray:
        """Get embeddings for a text using the fine-tuned model"""
        # Tokenize text
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            max_length=512,
            padding=True
        ).to(self.device)

        # Get BERT output
        self.model.eval()
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

            # Get the last hidden state
            last_hidden_state = outputs.hidden_states[-1]

            # Mean pooling
            attention_mask = inputs['attention_mask']
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
            embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

        return embeddings.cpu().numpy()

    def compute_similarity(self, text1: str, text2: str) -> float:
        """Compute similarity between two texts"""
        emb1 = self.get_embeddings(text1)
        emb2 = self.get_embeddings(text2)
        return cosine_similarity(emb1, emb2)[0][0]

    def evaluate_batch(self,
                      ground_truths: List[str],
                      predictions: List[str],
                      threshold: float = 0.8) -> Dict:
        """Evaluate batch of predictions against ground truths"""
        similarities = []
        matches = []

        for gt, pred in tqdm(zip(ground_truths, predictions), total=len(ground_truths)):
            sim = self.compute_similarity(gt, pred)
            similarities.append(sim)
            matches.append(sim >= threshold)

        return {
            'mean_similarity': np.mean(similarities),
            'std_similarity': np.std(similarities),
            'min_similarity': np.min(similarities),
            'max_similarity': np.max(similarities),
            'matching_percentage': np.mean(matches) * 100,
            'similarities': similarities
        }


In [8]:
import pandas as pd
df = pd.read_csv("/content/qa_2_lla_results_final.csv")

In [9]:
qa_df = df[["question", "answer", "predicted_answer"]]

In [10]:
qa_df.columns = ["question", "actual_answer", "predicted_answer"]

In [11]:
qa_df

Unnamed: 0,question,actual_answer,predicted_answer
0,Does Adelphia Restaurant & Events serve Egypti...,Adelphia Restaurant & Events does not offer eg...,Adelphia Restaurant & Events does not offer eg...
1,How attentive are the staff at Adelphia Restau...,"Based on the reviews, the staff at Adelphia Re...","Based on the reviews, the staff at Adelphia Re..."
2,Does Adelphia Restaurant & Events have vegan o...,Adelphia Restaurant & Events may not accommoda...,Adelphia Restaurant & Events may not accommoda...
3,What fire-roasted specialties does Adelphia Re...,They offer fire-roasted preparations of crab c...,"They offer fire-roasted wings, pizzas, and var..."
4,Does Adelphia Restaurant & Events have spa ser...,Adelphia Restaurant & Events does not offer be...,Adelphia Restaurant & Events does not offer be...
...,...,...,...
3090,What is the noise level at Yard House?,The noise level is loud,Average
3091,How does Yard House celebrate special occasions?,Reviews mention birthday celebrations though n...,They offer special packages and decorations fo...
3092,What do customers say about appetizer pricing?,Happy hour appetizer prices are well-received ...,Mixed reviews on appetizer pricing
3093,Which spicy dishes are recommended at Yard House?,Reviews mention Nashville hot chicken spicy ja...,Spicy Poke Nachos and Korean Fried Wings are s...


In [12]:
 df_with_scores, overall_accuracy = evaluate_exact_match(qa_df)

In [13]:
overall_accuracy

0.34507269789983847

In [14]:
df_with_scores, overall_scores = evaluate_qa_f1(qa_df)

In [15]:
overall_scores

{'f1_score': 0.6435596920052596,
 'precision': 0.6663953924216646,
 'recall': 0.6409449210733891}

In [16]:
df_with_scores, overall_bertscore = evaluate_qa_bertscore(qa_df)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Calculating BERTScores...


In [17]:
overall_bertscore

0.99285716

In [18]:
df_with_scores, overall_score = evaluate_qa_biencoder(qa_df)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [19]:
overall_score

0.83779156

In [23]:
# Load the model
loaded_evaluator = BERTSimilarityEvaluator.load_model("/content/saved_restaurant_model")


# Batch evaluation example
ground_truths = qa_df['actual_answer'].tolist()
predictions = qa_df['predicted_answer'].tolist()

results = loaded_evaluator.evaluate_batch(ground_truths, predictions)
print(results)
print("\nBatch Evaluation Results:")
for metric, value in results.items():
    if metric != 'similarities':
        print(f"{metric}: {value:.3f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  checkpoint = torch.load(model_path, map_location=instance.device)


Model and tokenizer loaded from /content/saved_restaurant_model


100%|██████████| 3095/3095 [01:24<00:00, 36.42it/s]


{'mean_similarity': 0.93392456, 'std_similarity': 0.08203296, 'min_similarity': 0.33991194, 'max_similarity': 1.0000005, 'matching_percentage': 95.18578352180937, 'similarities': [0.9999999, 0.9477155, 0.9999999, 0.87979966, 0.9999999, 0.9004575, 0.88304275, 0.92202115, 0.878147, 0.94665194, 1.0000001, 1.0000002, 0.9519767, 0.92229676, 0.96153253, 0.8860356, 0.89633083, 0.9606828, 0.9103503, 1.0000001, 0.99999994, 0.9664182, 0.9223119, 1.0000002, 1.0, 0.87258255, 1.0, 0.9189347, 1.0, 1.0, 1.0, 0.87248546, 0.92517626, 1.0, 0.9391342, 0.9323503, 1.0000002, 1.0, 0.88277555, 0.9669061, 0.91431296, 1.0, 0.962914, 0.84076655, 0.9357563, 0.90706694, 1.0, 0.9999999, 0.96593523, 0.9171272, 0.9112084, 0.9999998, 0.94810903, 0.93965733, 0.8570425, 0.9999999, 0.9522877, 0.93327165, 0.8983, 0.96335566, 0.9237033, 0.9659075, 0.8927859, 0.9729564, 0.906666, 1.0000001, 0.93155754, 0.8498397, 0.9999998, 0.9727782, 0.9271128, 1.0000001, 0.9115354, 0.8839322, 0.92591566, 0.90794265, 0.9999999, 0.8580235,