In [None]:
%load_ext autoreload
%autoreload 2
from loguru import logger
import sys

In [None]:
# Remove the handlers registered so far, then log to stderr at INFO level and above.
logger.remove()
logger.add(sys.stderr, level="INFO")

In [None]:
from evaluation_utils import TaxonomyEvaluator
import pandas as pd
import numpy as np

In [None]:
# Configuration for the main results table.
dataset = "ccs"
num_generation = 10  # passed to the evaluator and used in the result file names

# Root folders for result files and ground-truth CSVs.
folder = "results"
ground_truth_folder = "data"

# Approaches and models being compared.
approaches = ["greedy", "mv", "abscon"]
llms = [
    "gpt-4o-mini",
    "gpt-4o",
    "Meta-Llama-3.1-8B-Instruct",
    "Meta-Llama-3.1-70B-Instruct",
]

In [None]:
# Build one summary row per approach, holding every metric for every LLM.
# The ground-truth file depends only on the dataset, so resolve it once.
ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

results = []
for approach in approaches:
    result = {"approach": approach}
    for llm in llms:
        folder_path = f"{folder}/{llm}"
        if approach == "greedy":
            # No results CSV is loaded for the greedy baseline:
            # evaluate_abstraction does not take a DataFrame.
            evaluator = TaxonomyEvaluator(
                folder_path,
                dataset,
                ground_truth_path,
                num_generation,
                evaluate_greedy=True,
            )
            metrics = evaluator.evaluate_abstraction(
                num_generation, concretization_method="mv", dataset=dataset
            )
        else:
            evaluator = TaxonomyEvaluator(
                folder_path, dataset, ground_truth_path, num_generation
            )
            df = pd.read_csv(
                f"{folder_path}/{dataset}/results_{approach}_{num_generation}.csv"
            )
            metrics = evaluator.evaluate_taxonomies(df, dataset)

        # Columns are named "<metric>_<llm>" so all LLMs share the approach's row.
        for metric_name in ["precision", "recall", "f1", "consistency"]:
            result[f"{metric_name}_{llm}"] = metrics[metric_name]
    results.append(result)

In [None]:
# Tabulate the per-approach rows, express every numeric column as a percentage,
# and print the LaTeX table body (no index column, no header row).
results_df = pd.DataFrame(results)
numeric_columns = results_df.select_dtypes(include=['number']).columns
results_df[numeric_columns] = results_df[numeric_columns] * 100
# results_df = results_df[["approach", "precision", "recall", "f1", "consistency"]]
# results_df.columns = ["approach", "P", "R", "F1", "Con"]
print(results_df.round(2).to_latex(index=False, header=False))

In [None]:
# Index the table by approach name so rows can be selected with .loc; results_df itself is left unchanged.
results_df_index = results_df.set_index("approach")

In [None]:
# Column-wise gain of abscon over greedy (positive means abscon scores higher).
abscon_row = results_df_index.loc["abscon", :]
greedy_row = results_df_index.loc["greedy", :]
diff = abscon_row - greedy_row

# Keep only the recall columns (one per LLM) and summarise their spread.
recall_diff = [delta for column, delta in diff.items() if "recall" in column]
print(f"Min improvement: {min(recall_diff)}, max improvement: {max(recall_diff)}, average improvement: {np.mean(recall_diff)}")

In [None]:
# Same summary for the F1 columns of the abscon-minus-greedy difference.
f1_diff = [delta for column, delta in diff.items() if "f1" in column]
print(f"Min improvement: {min(f1_diff)}, max improvement: {max(f1_diff)}, average improvement: {np.mean(f1_diff)}")

## Plots

In [None]:
from tqdm.notebook import tqdm

In [None]:
# Configuration for the "metric vs. number of candidates" plots.
dataset = "ccs"
num_generations = range(1, 21)  # generation counts to evaluate, 1 through 20
metrics = ["f1", "consistency"]

folder = "results"
ground_truth_folder = "data"

approaches = ["mv", "greedy", "abscon"]
llms = ["meta-llama-3-1-70b-instruct-20241203161536"]  # ["Meta-Llama-3.1-70B-Instruct"]

In [None]:
def get_result(ground_truth_folder, dataset, llms, num_generation, approaches, folder):
    """Evaluate every approach/LLM pair for one dataset and generation count.

    Args:
        ground_truth_folder: Directory containing ``<dataset>.csv``.
        dataset: Dataset name; used for the ground-truth file name, the results
            sub-folder and the evaluator calls.
        llms: Names of the per-LLM sub-folders under ``folder``.
        num_generation: Generation count passed to the evaluator and used in
            the ``results_<approach>_<num_generation>.csv`` file name.
        approaches: Approach names. ``"greedy"`` is scored with
            ``evaluate_abstraction``; any other approach is scored with
            ``evaluate_taxonomies`` on its results CSV.
        folder: Root directory holding one sub-folder per LLM.

    Returns:
        Nested dict ``{approach: {llm: metrics}}``, where ``metrics`` is the
        value returned by the evaluator.
    """
    ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

    def evaluate(approach, llm):
        # Score a single (approach, llm) pair.
        folder_path = f"{folder}/{llm}"
        if approach != "greedy":
            evaluator = TaxonomyEvaluator(
                folder_path, dataset, ground_truth_path, num_generation
            )
            df = pd.read_csv(
                f"{folder_path}/{dataset}/results_{approach}_{num_generation}.csv"
            )
            return evaluator.evaluate_taxonomies(df, dataset)

        evaluator = TaxonomyEvaluator(
            folder_path,
            dataset,
            ground_truth_path,
            num_generation,
            evaluate_greedy=True,
        )
        return evaluator.evaluate_abstraction(
            num_generation, concretization_method="mv", dataset=dataset
        )

    return {
        approach: {llm: evaluate(approach, llm) for llm in llms}
        for approach in approaches
    }

In [None]:
# One get_result() snapshot per generation count, in the order of num_generations.
results = []
for num_generation in tqdm(num_generations):
    result = get_result(
        ground_truth_folder=ground_truth_folder,
        dataset=dataset,
        llms=llms,
        num_generation=num_generation,
        approaches=approaches,
        folder=folder,
    )
    results.append(result)

In [None]:
# Add two reference series to every snapshot: evaluate_individual aggregated
# with max ("max") and with np.median ("median").
# Resolve the ground-truth path here instead of relying on the value left
# behind by an earlier cell, so it always matches the current `dataset`.
ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

for i in tqdm(range(len(results))):
    results[i]["max"] = {}
    results[i]["median"] = {}
    for llm in llms:
        folder_path = f"{folder}/{llm}"
        evaluator = TaxonomyEvaluator(
            folder_path,
            dataset,
            ground_truth_path,
            num_generations=len(list(num_generations)),
        )

        # Snapshot i is evaluated with i + 1 as the first argument
        # (presumably the number of candidates - TODO confirm).
        results[i]["max"][llm] = evaluator.evaluate_individual(i + 1, dataset, aggregator=max)
        results[i]["median"][llm] = evaluator.evaluate_individual(i + 1, dataset, aggregator=np.median)

In [None]:
import matplotlib.pyplot as plt
import scienceplots
import matplotlib
plt.style.use(['science', "ieee"])

# models = ["llama_8b", "llama_70b", "gpt_4o_mini", "gpt_4o"]
models = ["meta-llama-3-1-70b-instruct-20241203161536"]  # ["Meta-Llama-3.1-70B-Instruct"]
model_names = ["Llama3.1 70b"]
metrics = ["f1", "consistency"]

# One row per plotted series: (result key, legend label, line style, marker, RGB 0-255).
# The lists derived below are parallel: entry k of each describes the same series.
series_styles = [
    ("mv", "MV", "-", "*", (33, 25, 24)),
    ("abscon", "AbsCon", "-", ".", (195, 56, 40)),
    ("max", "Best", "--", "^", (71, 133, 90)),
    ("median", "Median", "--", "v", (71, 133, 90)),
    ("greedy", "Direct", "-", "", (231, 189, 57)),
]
approaches = [style[0] for style in series_styles]
approach_names = [style[1] for style in series_styles]
lines = [style[2] for style in series_styles]
markers = [style[3] for style in series_styles]
# Scale the 0-255 channels to the 0-1 floats used when plotting.
colors = [[channel / 255 for channel in style[4]] for style in series_styles]

In [None]:
# F1 against number of candidates: one line per approach, saved as CCS.png.
plt.figure(figsize=(4,1.5))
metric = "f1"
x = num_generations
f1_values = []  # mv and abscon F1 series, collected for the correlation test
for i, llm in enumerate(models):
    for j, approach in enumerate(approaches):
        values = [snapshot[approach][llm][metric] for snapshot in results]
        if approach in ("mv", "abscon"):
            f1_values += values
        plt.plot(
            x,
            values,
            color=colors[j],
            linestyle=lines[j],
            label=approach_names[j],
            marker=markers[j],
        )
plt.legend(shadow=True, ncol=2)
plt.title("CCS")
plt.ylabel("F1")
plt.xlabel("Candidates")
plt.savefig("CCS.png", dpi=300)
plt.show()

In [None]:
# Consistency against number of candidates for mv, abscon and greedy.
plt.figure(figsize=(4,2.25))
metric = "consistency"
# Keep the subset under its own name: overwriting `approaches` would
# desynchronise it from the parallel style lists (colors, lines, ...) and
# change what the F1 cell draws if it is re-run.
consistency_approaches = ["mv", "abscon", "greedy"]
x = num_generations
consistency_values = []  # mv and abscon series, collected for the correlation test
for llm in models:
    for approach in consistency_approaches:
        # Look the style up by the approach's position in the full list so
        # each approach keeps the colour and line style it has in the F1 figure.
        style_index = approaches.index(approach)
        values = [data[approach][llm][metric] for data in results]
        if approach != "greedy":
            consistency_values.extend(values)
        plt.plot(x, values, color=colors[style_index], linestyle=lines[style_index])
plt.ylabel("Consistency")
plt.xlabel("Candidates")
plt.show()

In [None]:
from scipy.stats import spearmanr

# One-sided Spearman rank correlation between the collected F1 and consistency
# values; alternative="greater" tests for a positive correlation.
spearmanr(f1_values, consistency_values, alternative="greater")

In [None]:
from scipy.stats import spearmanr

# NOTE(review): this cell repeats the previous cell verbatim; one of the two can probably be dropped.
spearmanr(f1_values, consistency_values, alternative="greater")

## RQ3: Impact of temperature

In [None]:
def transform_results(results):
    """Flatten per-temperature results into one flat record per temperature.

    Args:
        results: Nested mapping
            ``{temperature: {approach: {llm: {metric: value}}}}``.

    Returns:
        A list with one dict per temperature, in iteration order. Each dict
        has a ``"temperature"`` entry plus one ``"<metric>_<llm>"`` entry per
        metric and LLM. The approach is not part of the key, so when several
        approaches are present the one iterated last wins.
    """

    def flatten(temperature, by_approach):
        # Collapse the approach/llm/metric levels into a single flat record.
        record = {"temperature": temperature}
        for by_llm in by_approach.values():
            for llm, llm_metrics in by_llm.items():
                record.update(
                    {f"{metric}_{llm}": value for metric, value in llm_metrics.items()}
                )
        return record

    return [
        flatten(temperature, by_approach)
        for temperature, by_approach in results.items()
    ]

In [None]:
# Configuration for the temperature study.
temperatures = ["0.2", "0.5", "0.7", "1"]  # used as sub-folder names, hence strings
num_generation = 10

folder = "results"
ground_truth_folder = "data"

approaches = ["abscon"]
llms = ["Meta-Llama-3.1-70B-Instruct", "gpt-4o-mini"]

In [None]:
# Evaluate the configured approaches on WordNet for each temperature setting,
# reading results from <folder>/temperature/<temperature>.
dataset = "wordnet"

results_by_temperature = {}
for temperature in temperatures:
    results_by_temperature[temperature] = get_result(
        ground_truth_folder,
        dataset,
        llms,
        num_generation,
        approaches,
        folder=f"{folder}/temperature/{temperature}",
    )
# Flatten to one record per temperature so it can be loaded into a DataFrame.
temperature_results = transform_results(results_by_temperature)

In [None]:
# LaTeX table body: F1 and consistency (as percentages) per temperature and LLM.
table_columns = [
    "temperature",
    "f1_Meta-Llama-3.1-70B-Instruct",
    "consistency_Meta-Llama-3.1-70B-Instruct",
    "f1_gpt-4o-mini",
    "consistency_gpt-4o-mini",
]

df = pd.DataFrame(temperature_results)
numeric_columns = df.select_dtypes(include=["number"]).columns
df[numeric_columns] = df[numeric_columns] * 100
df = df[table_columns]

print(df.round(2).to_latex(index=False, header=False))

In [None]:
# Evaluate the configured approaches on CCS for each temperature setting,
# reading results from <folder>/temperature/<temperature>.
dataset = "ccs"

results_by_temperature = {}
for temperature in temperatures:
    results_by_temperature[temperature] = get_result(
        ground_truth_folder,
        dataset,
        llms,
        num_generation,
        approaches,
        folder=f"{folder}/temperature/{temperature}",
    )
# Flatten to one record per temperature so it can be loaded into a DataFrame.
temperature_results = transform_results(results_by_temperature)

In [None]:
# LaTeX table body: F1 and consistency (as percentages) per temperature and LLM.
table_columns = [
    "temperature",
    "f1_Meta-Llama-3.1-70B-Instruct",
    "consistency_Meta-Llama-3.1-70B-Instruct",
    "f1_gpt-4o-mini",
    "consistency_gpt-4o-mini",
]

df = pd.DataFrame(temperature_results)
numeric_columns = df.select_dtypes(include=["number"]).columns
df[numeric_columns] = df[numeric_columns] * 100
df = df[table_columns]

print(df.round(2).to_latex(index=False, header=False))

## Other figures

In [None]:
# x positions 1..10; plot_metric reads this module-level name for its x-axis and ticks.
x = range(1, 11)


def plot_metric(metric, row, col, index, results, type, legend=True, title=True):
    """Draw one metric for every model into one cell of a subplot grid.

    Reads the module-level ``x``, ``models``, ``model_names`` and ``lines``.

    Args:
        metric: Key looked up in each per-step result dict; also used in the title.
        row: Number of rows in the subplot grid.
        col: Number of columns in the subplot grid.
        index: Position of this subplot in the grid, passed to ``plt.subplot``.
        results: Mapping from model name to a sequence of dicts containing
            ``metric`` (presumably one entry per value of ``x`` - TODO confirm).
        type: Unused; kept so existing calls keep working. Shadows the builtin.
        legend: Whether to draw a legend.
        title: Whether to draw the title.
    """
    plt.subplot(row, col, index)
    for i, model in enumerate(models):
        plt.plot(
            x,
            [data[metric] for data in results[model]],
            lines[i],
            label=model_names[i],
            linewidth=2,
        )
    # Label every x position. The labels are derived from x rather than a fixed
    # 1..10 list so they stay valid if x is rebound to another range, and the
    # ticks are set once instead of once per model.
    plt.xticks(x, list(x))
    if legend:
        plt.legend()
    if title:
        plt.title(f"{metric.title()} v.s. Number of Candidates")

In [None]:
# Global matplotlib font settings applied through plt.rc.
# NOTE(review): 'normal' is not one of matplotlib's generic font families
# (serif, sans-serif, cursive, fantasy, monospace); unless a font with that
# name is installed matplotlib will fall back to a default font - confirm the
# intended family.
font = {'family' : 'normal',
        'size'   : 12}

plt.rc('font', **font)

In [None]:
# 2x2 grid saved as results.png: consistency in the left column, F1 in the right;
# top row drawn from result_changes, bottom row from result_changes_mv.
# NOTE(review): result_changes and result_changes_mv are not assigned in any
# cell visible in this notebook - confirm where they come from, otherwise this
# cell fails on a fresh kernel.
plt.figure(figsize=(8,4.5))
plot_metric("consistency", 2, 2, 1, result_changes, "Constraints", legend=True)
plot_metric("f1", 2, 2, 2, result_changes, "Constraints", legend=False)
plot_metric("consistency", 2, 2, 3, result_changes_mv, "Majority Voting", legend=False, title=False)
plot_metric("f1", 2, 2, 4, result_changes_mv, "Majority Voting", legend=False, title=False)

plt.savefig("results.png", dpi=300)
plt.show()

In [None]:
# One figure per metric, one line per model, drawn from result_changes.
# NOTE(review): this rebinds the module-level x that plot_metric also reads.
# NOTE(review): result_changes is not assigned in any cell visible in this
# notebook - confirm where it comes from.
x = range(1, 6)

for metric in metrics:
    for model in models:
        plt.plot(x, [data[metric] for data in result_changes[model]], label=model)
    plt.legend()
    plt.title(f"{metric} change w.r.t. samples (constraint)")
    plt.show()


In [None]:
# One figure per metric, one line per model, drawn from result_changes_mv.
# NOTE(review): this rebinds the module-level x that plot_metric also reads.
# NOTE(review): result_changes_mv is not assigned in any cell visible in this
# notebook - confirm where it comes from.
x = range(1, 6)

for metric in metrics:
    for model in models:
        plt.plot(x, [data[metric] for data in result_changes_mv[model]])
    plt.title(f"{metric} change w.r.t. samples (voting)")
    plt.show()


## Evaluation of DeepSeek

In [None]:
# Greedy evaluation of a single model's results on WordNet with one generation.
# NOTE(review): the section heading mentions DeepSeek but `llm` points at the
# gpt-4o-mini results folder - confirm which model is intended.
dataset = "wordnet"
num_generation = 1

folder = "results"
llm = "gpt-4o-mini/"
folder_path = f"{folder}/{llm}"

ground_truth_folder = "data"
ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

evaluator = TaxonomyEvaluator(
    folder_path, dataset, ground_truth_path, num_generation, evaluate_greedy=True
)
metrics = evaluator.evaluate_abstraction(
    num_generation,
    concretization_method="mv",
    dataset=dataset,
)