In [3]:
## Import necessary libraries
import os
import json
import deprecation
import numpy as np
from typing import List, Dict, Callable
from nltk import ngrams
from src.reasoning.reasoning_utils import get_majority_vote_answer

ModuleNotFoundError: No module named 'src'

In [4]:
@deprecation.deprecated(deprecated_in="1.0", details="Use the compute_majority_vote_accuracy function instead")
def compute_majority_vote_accuracy_trivial(qa_generations: List[object]) -> float:
    """Compute the majority vote accuracy trivially for the QA generation output

    Args:
        qa_generations (List[object]): List of question-answer generation outputs 

    Returns:
        float: Majority vote accuracy
    """
    # Compute the majority vote accuracy
    majority_vote_accuracy = sum([1 for qa in qa_generations if qa['ground_truth_answer'] == qa['majority_vote_answer']]) / len(qa_generations)
    # Return the majority vote accuracy
    return majority_vote_accuracy

def compute_majority_vote_accuracy(qa_generations: List[object], subsample_size: int | None = None) -> float:
    """Compute the majority vote accuracy for the QA generation output

    Args:
        qa_generations (List[object]): List of question-answer generation outputs 

    Returns:
        float: Majority vote accuracy
    """
    qa_correct = 0
    for qa in qa_generations:
        # Subsample the generated sequences
        sentences = qa['generated_sequences'][:subsample_size]
        # Compute the majority vote answer for the QA generation
        qa_majority_vote_accuracy = 1 if get_majority_vote_answer(sentences, "GSM8K") == qa['ground_truth_answer'] else 0
        qa_correct += qa_majority_vote_accuracy
    # Return the majority vote accuracy
    return qa_correct / len(qa_generations)

def compute_n_gram_diversity(qa_generations: List[object], subsample_size: int | None = None) -> float:
    """Compute the n-gram diversity for the QA generation output

    Args:
        qa_generations (List[object]): List of question-answer generation outputs

    Returns:
        float: n-gram diversity for 1-4 grams
    """
    n_gram_diversity = 0.0
    # Compute the n-gram diversity for 1-4 grams
    for qa in qa_generations:
        # Subsample the generated sequences
        sentences = qa['generated_sequences'][:subsample_size]
        # Compute the n-gram diversity for the QA generation
        qa_n_gram_diversity = distinct_n_gram_helper(sentences)
        n_gram_diversity += qa_n_gram_diversity
    # Return the n-gram diversity
    return n_gram_diversity / len(qa_generations)

def distinct_n_gram_helper(sentences: List[str]):
    """Compute distinct-n for n in [1,4] a list of sentences
    Args:
        sentences (List[str]): a list of sentences
    
    Returns:
        float: distinct-n score for n in [1,4]
    """
    # Initialize the distinct-n score
    distinct_n = 0.0
    # Compute the distinct-n score for n in [1,4]
    for n in range(1, 5):
        corpus_n_grams = [*[ngrams(sentence, n) for sentence in sentences]]
        distinct_n_grams = set(corpus_n_grams)
        distinct_n += len(distinct_n_grams) / (len(corpus_n_grams) + np.finfo(float).eps)
    # Return the distinct-n score
    return distinct_n / 4


TypeError: unsupported operand type(s) for |: 'type' and 'NoneType'

In [None]:
# Lists of datasets, strategies, and models 
DATASETS = ["gsm8k"]
STRATEGIES = ["baseline", "greedy"]
MODELS = ["gpt2-large","gemma-2b", "gemma-7b"]

# Mapping of metrics to their corresponding functions
METRICS = {
    "accuracy": compute_majority_vote_accuracy,
    "n-gram-diversity": compute_n_gram_diversity,
}

# Mapping of (strategy, dataset, model) to their corresponding output file 
OUTPUT_FILE_TEMPLATE_FN: Callable[[str, str, str], str] = (
    lambda strategy_name, dataset_name, model_name: f"../../results/{strategy_name}__{dataset_name}__{model_name}__output.json"
)

# List of sample sizes to subsample from the generated sequences
SUBSAMPLE_SIZES = [5, 10, 20, 30, 40]

In [None]:
def compute_metrics(qa_generations: object) -> Dict[str, float]:
    """
    Compute the metrics for subsamples of the QA generation output
    """
    # Compute the metrics for the QA generation output
    metrics = {}
    for subsample_size in SUBSAMPLE_SIZES:
        metrics[subsample_size] = {}
        for metric_name in METRICS:
            metrics[subsample_size][metric_name] = METRICS[metric_name](qa_generations, subsample_size)
    # Return the metrics
    return metrics