In [None]:
%load_ext autoreload
%autoreload 2
from loguru import logger
import sys

### Check the result for each candidate

In [None]:
from evaluation_utils import ActivityEvaluator
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np

In [None]:
# Where the generated candidates and the reference data are stored.
folder = "results"
ground_truth_folder = "data"

# Model directory names (as they appear under `folder`) ...
llms = [
    "gpt-4o-mini",
    "gpt-4o",
    "Meta-Llama-3.1-8B-Instruct",
    "Meta-Llama-3.1-70B-Instruct",
]
# ... and the shorter labels shown on the plot, listed in the same order.
llm_labels = [
    "gpt-4o-mini",
    "gpt-4o",
    "Llama3.1-8b",
    "Llama3.1-70b",
]

# Dataset under evaluation and the generation indices 1..10 to load.
dataset = "paged"
num_generations = range(1, 11)

In [None]:
# Single evaluator instance reused by every scoring cell below.
# NOTE(review): the first argument is passed as an empty string; its meaning is
# defined in evaluation_utils.ActivityEvaluator and is not visible here — confirm.
evaluator = ActivityEvaluator("", dataset)

In [None]:
def _group_mean_f1(f1_scores, consistency_flags, consistent):
    """Average the F1 scores of one consistency group.

    Args:
        f1_scores: Indexable sequence of per-candidate F1 scores.
        consistency_flags: Indexable sequence of per-candidate flags, read by
            truthiness and indexed in parallel with ``f1_scores``.
        consistent: ``True`` to average the candidates whose flag is truthy,
            ``False`` to average the remaining ones.

    Returns:
        The mean of the selected scores, or ``None`` when no candidate falls
        into the requested group (``np.mean`` of an empty list would return
        NaN and emit a RuntimeWarning).
    """
    selected = [
        f1_scores[i]
        for i in range(len(f1_scores))
        if bool(consistency_flags[i]) == consistent
    ]
    return np.mean(selected) if selected else None


# For every model, collect one mean F1 per generation for the consistent
# ("con") and the inconsistent ("incon") candidates.
llm_results = {}
for llm in llms:
    results = {
        "con": [],
        "incon": []
    }
    folder_path = f"{folder}/{llm}"
    for num_generation in tqdm(num_generations):
        # Candidates are stored in the column named "0" of each results file.
        df = pd.read_csv(f"{folder_path}/{dataset}/results_{num_generation}.csv")[
            "0"
        ].tolist()
        evaluation_results = evaluator.evaluate_solutions(df, return_value="all")
        f1_scores = evaluation_results["f1"]
        consistency_flags = evaluation_results["consistency"]

        for group, consistent in (("con", True), ("incon", False)):
            group_mean = _group_mean_f1(f1_scores, consistency_flags, consistent)
            if group_mean is None:
                # An empty group has no defined mean; skip it rather than
                # storing NaN in the lists consumed by the later cells.
                logger.warning(
                    f"{llm}, generation {num_generation}: no '{group}' candidates; "
                    "skipping this data point"
                )
                continue
            results[group].append(group_mean)
    llm_results[llm] = results

In [None]:
# Score the AbsCon output of every model; only the file for generation 10 is read.
num_generation = 10
abscon_result = {}
for llm in tqdm(llms):
    folder_path = f"{folder}/{llm}"
    abscon_csv = f"{folder_path}/{dataset}/results_abscon_{num_generation}.csv"
    abscon_solutions = pd.read_csv(abscon_csv)["0"].tolist()
    abscon_scores = evaluator.evaluate_solutions(abscon_solutions, return_value="avg")
    # The single F1 value is wrapped in a list so it has the same shape as the
    # per-generation lists built for the other categories.
    abscon_result[llm] = {"abscon": [abscon_scores["f1"]]}

In [None]:
import matplotlib.pyplot as plt
import numpy as np

In [None]:
## Plot the difference between consistent and non-consistent candidates
# Regroup the scores by category: each category holds one list of F1 values
# per model, in the order of `llms`.
llm_results_categorization = {
    category: [] for category in ("abscon", "consistent", "inconsistent")
}

for llm in llms:
    per_generation = llm_results[llm]
    llm_results_categorization["abscon"].append(abscon_result[llm]["abscon"])
    llm_results_categorization["consistent"].append(per_generation["con"])
    llm_results_categorization["inconsistent"].append(per_generation["incon"])


In [None]:
# Show the AbsCon entry of the last model. The model is selected explicitly
# rather than through a loop variable left over from an earlier cell.
abscon_result[llms[-1]]["abscon"]

In [None]:
# Every model gets a slot 3.0 units wide on the x-axis; within a slot the three
# categories are drawn at offsets -0.8, 0 and +0.8 with boxes 0.6 wide.
abscon_results = llm_results_categorization["abscon"]
abscon_positions = np.arange(len(abscon_results)) * 3.0 - 0.8
abscon_plot = plt.boxplot(abscon_results, positions=abscon_positions, widths=0.6)

consistent_results = llm_results_categorization["consistent"]
consistent_positions = np.arange(len(consistent_results)) * 3.0
consistent_plot = plt.boxplot(
    consistent_results, positions=consistent_positions, widths=0.6
)

inconsistent_results = llm_results_categorization["inconsistent"]
inconsistent_positions = np.arange(len(inconsistent_results)) * 3.0 + 0.8
inconsistent_plot = plt.boxplot(
    inconsistent_results, positions=inconsistent_positions, widths=0.6
)

def define_box_properties(plot_name, color_code, label):
    """Colour one group of boxes and register it in the legend.

    Args:
        plot_name: Mapping returned by ``plt.boxplot``; every value in it is
            passed to ``plt.setp``.
        color_code: Colour applied to every value of ``plot_name`` and to the
            legend entry.
        label: Text shown in the legend for this group.
    """
    for artists in plot_name.values():
        plt.setp(artists, color=color_code)

    # An empty line carries the colour and label so the legend has an entry
    # for this group.
    plt.plot([], c=color_code, label=label)
    plt.legend()

# Give each category its own colour and legend entry.
define_box_properties(abscon_plot, '#F5B841', 'AbsCon')
define_box_properties(consistent_plot, '#067BC2', 'Consistent')
define_box_properties(inconsistent_plot, '#E84855', 'Inconsistent')

# One tick per model, placed every 3 units to match the box positions.
plt.xticks(np.arange(0, len(llm_labels) * 3, 3), llm_labels)
plt.xlim(-2, len(llm_labels)*2.7)

plt.ylabel("f1 score")
# Derive the title from the configured dataset name so the figure cannot be
# mislabelled when `dataset` changes ("paged" -> "Paged").
plt.title(dataset.capitalize())
# Optional export; keep it before plt.show() if enabled.
# plt.savefig(f"{dataset.capitalize()}.png", dpi=300)
plt.show()

In [None]:
# Display labels used for the x-axis ticks of the box plot.
print(llm_labels)

In [None]:
from cliffs_delta import cliffs_delta, lookup_size
from scipy.stats import ranksums

In [None]:
# One-sided rank-sum test per model: are the consistent candidates' F1 scores
# greater than the inconsistent ones?
statistical_test_results = [
    ranksums(consistent_result, inconsistent_result, alternative="greater")
    for consistent_result, inconsistent_result in zip(
        consistent_results, inconsistent_results
    )
]


In [None]:
# One rank-sum result per model, in the order of `llms`.
statistical_test_results

In [None]:
# Thresholds handed to lookup_size() together with the mean Cliff's delta.
dull = {'small': 0.147, 'medium': 0.33, 'large': 0.474} # effect sizes from (Hess and Kromrey, 2004)

In [None]:
# Cliff's delta per model (consistent vs. inconsistent); only the first element
# returned by cliffs_delta() is kept.
cliffs_values = [
    cliffs_delta(consistent_result, inconsistent_result)[0]
    for consistent_result, inconsistent_result in zip(
        consistent_results, inconsistent_results
    )
]
mean_cliff = np.mean(cliffs_values)

In [None]:
# Cliff's delta per model, in the order of `llms`.
cliffs_values

In [None]:
# Mean of the per-model Cliff's delta values.
mean_cliff

In [None]:
# Look up the size label for the averaged delta using the thresholds in `dull`.
lookup_size(mean_cliff, dull)