In [7]:
# IMPORTS

import pandas as pd
from datasets import load_dataset
import ast
from tqdm import tqdm
import numpy as np

In [8]:
# Load the results file (presumably the GPT-4o predictions, judging by the filename).
# Later cells read its 'question' and 'Result' columns.
results_path = 'gpt_4o_semeval_results.xlsx'
input_df = pd.read_excel(results_path)

In [9]:
# Load the questions: the "qa" configuration, "train" split, of cardiffnlp/databench.
all_qa = load_dataset(
    "cardiffnlp/databench",
    name="qa",
    split="train",
)

In [10]:
# Keep only the columns needed to join with the results and to evaluate them.
df = pd.DataFrame(all_qa)[['question', 'answer', 'type']]

In [12]:
# Attach the ground-truth 'answer' and 'type' to every result row.
# The left join keeps each row of `input_df` even when its question has no match.
# NOTE(review): if a question text occurs more than once in `df`, this join
# duplicates result rows -- confirm that 'question' is unique.
merged_df = input_df.merge(df, how='left', on='question')
merged_df[['answer', 'Result', 'type']]

Unnamed: 0,answer,Result,type
0,"['girl youll be a woman soon', 'papa dont prea...","['im telling you now', 'the seventh son', 'im ...",list[category]
1,"[509.0, 503.0, 497.0, 451.0, 440.0]","['116', '19', '8', '7', '4']",list[number]
2,"['Eastern Time (US & Canada)', 'Central Time (...","['Eastern Time (US & Canada)', 'Central Time (...",list[category]
3,F,'F',category
4,"[0, 12, 10, 11]","[0, 12, 10, 11]",list[number]
...,...,...,...
195,frog,frog,category
196,CW,[None],category
197,United,United,category
198,"[1, 1]","[1, 1]",list[number]


In [None]:
# Cleaning our results before evaluation

# Enable the `progress_apply` method that the conversion step below calls on the DataFrame.
tqdm.pandas()

# Function to convert string representations of lists to actual lists
def convert_to_list(value):
    """Parse a Python-literal string with ``ast.literal_eval``.

    Args:
        value: Text expected to hold a literal such as ``"[1, 2, 3]"``.

    Returns:
        The evaluated literal (whatever type it turns out to be), or a string
        starting with ``"error in conversion: "`` when parsing raises
        ``ValueError`` or ``SyntaxError``.
    """
    try:
        parsed = ast.literal_eval(value)
    except (ValueError, SyntaxError) as e:
        # Errors are reported as a marker string instead of being raised.
        return f"error in conversion: {str(e)}"
    else:
        return parsed

# Function to map values to specified types and output the error message if conversion fails
# Spellings accepted as booleans when the cell holds text (compared case-insensitively).
_TRUE_TOKENS = frozenset({"true", "yes", "1"})
_FALSE_TOKENS = frozenset({"false", "no", "0"})


def _parse_boolean(value):
    """Interpret ``value`` as a boolean; raise ``ValueError`` for unrecognised text."""
    if isinstance(value, str):
        # bool("False") is True, so text must be matched against known spellings.
        token = value.strip(" \t\r\n\"").lower()
        if token in _TRUE_TOKENS:
            return True
        if token in _FALSE_TOKENS:
            return False
        raise ValueError(f"Cannot interpret {value!r} as boolean")
    return bool(value)


def map_to_type(row, column):
    """Coerce ``row[column]`` to the Python type named by ``row['type']``.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) holding ``column`` and ``'type'``.
        column: Name of the field whose value should be converted.

    Returns:
        * ``np.nan`` when ``row['type']`` is missing.
        * A list for ``list[number]`` / ``list[category]``, a ``float`` for
          ``number``, a ``bool`` for ``boolean``, a ``str`` for ``category``.
        * A string starting with ``"error in conversion: "`` when the value is
          missing or cannot be converted to the expected type.
    """
    value = row[column]
    expected_type = row['type']

    # If type is NaN, return np.nan
    if pd.isna(expected_type):
        return np.nan

    # Only scalars can be "missing". pd.isna() on a list returns an array whose
    # truth value is ambiguous, which would raise before the list branch below.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return "error in conversion: value is NaN"

    # For list types, handle differently.
    if expected_type.startswith("list"):
        try:
            if isinstance(value, list):
                converted_value = value
            elif isinstance(value, str):
                converted_value = convert_to_list(value)
                # If conversion returns an error message, raise an exception
                if isinstance(converted_value, str) and converted_value.startswith("error in conversion"):
                    raise ValueError(converted_value)
            else:
                raise ValueError("Value is not a list or a string representing a list")

            if expected_type == "list[number]":
                if isinstance(converted_value, list) and all(isinstance(i, (int, float)) for i in converted_value):
                    return converted_value
                else:
                    raise ValueError("List elements are not all numbers")
            elif expected_type == "list[category]":
                if isinstance(converted_value, list) and all(isinstance(i, str) for i in converted_value):
                    return converted_value
                else:
                    raise ValueError("List elements are not all strings")
            else:
                raise ValueError("Unknown list type")
        except Exception as e:
            return f"error in conversion: {str(e)}"

    # For non-list types, first remove extraneous single quotes if the value is a string.
    if isinstance(value, str):
        value = value.replace("'", "")

    try:
        if expected_type == "number":
            return float(value)
        elif expected_type == "boolean":
            return _parse_boolean(value)
        elif expected_type == "category":
            return str(value)
        else:
            raise ValueError("Unknown non-list type")
    except Exception as e:
        return f"error in conversion: {str(e)}"

# Convert both the ground-truth 'answer' and the model 'Result' row by row,
# storing the outcome in 'converted_answer' and 'converted_Result'.
for source_column in ('answer', 'Result'):
    merged_df[f'converted_{source_column}'] = merged_df.progress_apply(
        lambda row: map_to_type(row, source_column),
        axis=1,
    )

100%|██████████| 200/200 [00:00<00:00, 5248.92it/s]


In [None]:
# Map failed conversions to NaN, as they do not satisfy the type condition.
# Match the exact marker prefix used for conversion failures: a plain
# `"error" in x` test would also wipe legitimate category values that merely
# contain the word "error".
CONVERSION_ERROR_PREFIX = "error in conversion"


def _null_if_conversion_error(cell):
    """Return NaN for conversion-error marker strings, otherwise the cell unchanged."""
    is_error_marker = isinstance(cell, str) and cell.startswith(CONVERSION_ERROR_PREFIX)
    return np.nan if is_error_marker else cell


for converted_column in ("converted_answer", "converted_Result"):
    merged_df[converted_column] = merged_df[converted_column].apply(_null_if_conversion_error)

In [62]:
from typing import Callable, List, Union, Optional
from datasets import Dataset
from tqdm import tqdm
import pandas as pd


class Evaluator:
    """Score responses against ground-truth answers, one semantic type at a time.

    Attributes:
        compare: Callable ``(value, truth, semantic) -> bool`` used for scoring;
            defaults to :meth:`default_compare`.
        qa: Question/answer collection indexed by column name (``"answer"``,
            ``"sample_answer"``, ``"type"``); required by :meth:`eval`.
        evals: Per-response outcomes of the most recent :meth:`eval` call.
    """

    def __init__(
        self,
        compare: Optional[Callable] = None,
        qa: Optional["Dataset"] = None,
        batch_size: int = 10,
        **kwargs,
    ):
        """Store the comparison function and the QA collection.

        ``batch_size`` and ``**kwargs`` are accepted but not used by this class.
        """
        self.compare = compare if compare else self.default_compare
        # qa is not loaded automatically; it must be supplied before calling eval().
        self.qa = qa

    @staticmethod
    def _rounded_number(text) -> float:
        """Keep only digits, '.' and '-' from ``str(text)`` and round to 2 decimals."""
        cleaned = ''.join(char for char in str(text) if char.isdigit() or char in ['.', '-'])
        return round(float(cleaned), 2)

    def default_compare(self, value, truth, semantic):
        """Return whether ``value`` matches ``truth`` under the given semantic type.

        Args:
            value: Predicted answer (any object; compared through ``str``).
            truth: Expected answer.
            semantic: One of ``"boolean"``, ``"category"``, ``"number"``,
                ``"list[category]"``, ``"list[number]"`` (surrounding whitespace ignored).

        Raises:
            Exception: If ``semantic`` is not one of the supported types.
        """
        STRIP_CHARS = "[]'\" "
        semantic = semantic.strip()
        if semantic == "boolean":
            return str(value).strip(STRIP_CHARS).lower() == str(truth).strip(STRIP_CHARS).lower()
        elif semantic == "category":
            if value is None and truth is None:
                return True
            if value is None or truth is None:
                return False

            value_str = str(value).strip(STRIP_CHARS)
            truth_str = str(truth).strip(STRIP_CHARS)
            if value_str == truth_str:
                return True

            # Different text may still denote the same calendar date.
            try:
                value_date = pd.to_datetime(value_str).date()
                truth_date = pd.to_datetime(truth_str).date()
                return value_date == truth_date
            except (ValueError, TypeError):
                if not value_str and not truth_str:
                    return True
                return value_str == truth_str
        elif semantic == "number":
            try:
                return self._rounded_number(value) == self._rounded_number(truth)
            except Exception:
                # Unparseable numbers count as a mismatch.
                return False
        elif semantic == "list[category]":
            try:
                value_list = [item.strip(STRIP_CHARS) for item in str(value).strip('[]').split(',')]
                truth_list = [item.strip(STRIP_CHARS) for item in str(truth).strip('[]').split(',')]
                if len(value_list) != len(truth_list):
                    return False

                # Attempt to parse each item as a date
                try:
                    value_dates = [pd.to_datetime(item).date() for item in value_list]
                    truth_dates = [pd.to_datetime(item).date() for item in truth_list]
                    return set(value_dates) == set(truth_dates)
                except (ValueError, TypeError):
                    # If parsing as dates fails, compare as strings
                    return set(value_list) == set(truth_list)
            except Exception:
                return False
        elif semantic == "list[number]":
            try:
                value_list = sorted(
                    self._rounded_number(v) for v in str(value).strip('[]').split(',') if v.strip()
                )
                truth_list = sorted(
                    self._rounded_number(t) for t in str(truth).strip('[]').split(',') if t.strip()
                )

                if len(value_list) != len(truth_list):
                    return False

                # NOTE(review): comparison is by set, so equal-length lists that
                # differ only in how often a value repeats are treated as equal.
                return set(value_list) == set(truth_list)
            except Exception:
                return False
        else:
            raise Exception(f"Semantic not supported: {semantic}")

    def eval(
        self,
        responses: Union[List[str], str],
        lite: bool = False,
    ) -> float:
        """Compute accuracy of ``responses`` against the stored QA collection.

        Args:
            responses: List of responses, or a path to a text file with one
                response per line.
            lite: When true, score against ``qa["sample_answer"]`` instead of
                ``qa["answer"]``.

        Returns:
            Fraction of truths matched by their response; ``0.0`` when there
            are no truths. Per-item outcomes are stored in ``self.evals``.

        Raises:
            ValueError: If no QA collection was provided to the constructor.
        """
        if self.qa is None:
            raise ValueError("Evaluator.qa is not set; pass `qa=` when constructing the Evaluator")

        if isinstance(responses, str):
            with open(responses, "r") as f:
                responses = f.read().splitlines()

        truths = self.qa["answer"] if not lite else self.qa["sample_answer"]
        total = len(truths)
        # Each response is compared exactly once; the outcome feeds both the
        # per-item record and the running count.
        evals = [
            self.compare(response, truth, semantic)
            for response, truth, semantic in tqdm(zip(responses, truths, self.qa["type"]), total=total)
        ]
        self.evals = evals
        if total == 0:
            return 0.0
        correct = sum(1 for outcome in evals if outcome)
        return correct / total

In [72]:
def evaluate_dataframe(
    df: pd.DataFrame,
    evaluator: "Evaluator",
    semantic: str,
    type_column: Optional[str] = None,
) -> float:
    """
    Score the 'converted_Result' column against 'converted_answer' using the Evaluator.

    Args:
        df (pd.DataFrame): Frame containing 'converted_Result' (predictions) and
            'converted_answer' (ground truth). An 'evaluation' column holding
            the per-row outcome is added to it in place.
        evaluator (Evaluator): Object exposing ``compare(value, truth, semantic)``.
        semantic (str): The default semantic type for all comparisons.
        type_column (Optional[str]): Name of a column holding a per-row semantic
            type. When given, a row's non-empty string value in that column
            overrides ``semantic``; otherwise ``semantic`` is used.

    Returns:
        float: The accuracy of predictions (``0.0`` for an empty frame).
    """

    def _is_missing(cell) -> bool:
        # pd.isna() returns an array for list cells, so only scalars are tested.
        return pd.api.types.is_scalar(cell) and bool(pd.isna(cell))

    correct = 0
    evals = []

    for _, row in df.iterrows():
        predicted = row["converted_Result"]
        actual = row["converted_answer"]

        # A failed conversion on either side is never a correct prediction.
        # Without this guard, NaN vs NaN compares equal as the text "nan".
        if _is_missing(predicted) or _is_missing(actual):
            evals.append(False)
            continue

        row_semantic = semantic
        if type_column is not None:
            declared = row[type_column]
            if isinstance(declared, str) and declared.strip():
                row_semantic = declared

        result = evaluator.compare(predicted, actual, row_semantic)
        evals.append(result)
        if result:
            correct += 1

    df["evaluation"] = evals  # Store per-row evaluation results in the DataFrame
    return correct / len(df) if len(df) else 0.0

In [None]:
# Example: every row is compared with the 'category' semantic type,
# using an Evaluator with its default comparison function.
evaluator = Evaluator()
default_type = "category"

accuracy = evaluate_dataframe(df=merged_df, evaluator=evaluator, semantic=default_type)

print(f"Accuracy: {accuracy:.2f}")