In [None]:
%load_ext autoreload
%autoreload 2
from loguru import logger
import sys
from tqdm.notebook import tqdm
# Drop the handlers registered on the loguru logger so far and attach a single
# stderr sink that only emits messages at INFO level and above.
logger.remove()
logger.add(sys.stderr, level="INFO")
import os

In [None]:
# Copy the proxy address from the `https_proxyOPT` environment variable into
# `https_proxy`. os.environ only accepts strings, so the None that os.getenv
# returns for an unset variable must not be assigned (it raises TypeError).
https_proxy_opt = os.getenv("https_proxyOPT")
if https_proxy_opt is not None:
    os.environ["https_proxy"] = https_proxy_opt

## Evaluate results

In [None]:
import pandas as pd
from evaluation_utils import ActivityEvaluator
import numpy as np

In [None]:
def get_result(ground_truth_folder, dataset, llms, num_generation, approaches, folder):
    """Collect evaluation metrics for every approach/model combination.

    Args:
        ground_truth_folder: Folder containing the ground-truth file
            ``<dataset>.csv``.
        dataset: Dataset name; used for the ground-truth file name and for the
            per-model result sub-folder.
        llms: Model names; each one has its results under ``<folder>/<llm>``.
        num_generation: Candidate count encoded in the result file name
            ``results_<approach>_<num_generation>.csv``.
        approaches: Approach names. ``"greedy"`` is scored with
            ``evaluate_greedy_result``; any other approach is scored from
            column ``"0"`` of its result CSV with ``evaluate_solutions``.
        folder: Root folder of the model outputs.

    Returns:
        A list with one dict per approach, in the order of ``approaches``.
        Each dict holds ``"approach"`` plus ``"<metric>_<llm>"`` entries for
        the metrics precision, recall, f1 and consistency.
    """
    metric_names = ("precision", "recall", "f1", "consistency")
    # The ground-truth file does not depend on approach or model: build the
    # path once and hand it to every evaluator, as the plotting cells do.
    # NOTE(review): assumes ActivityEvaluator takes the ground-truth path as
    # its third positional argument - confirm against evaluation_utils.
    ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

    results = []
    for approach in approaches:
        result = {"approach": approach}
        for llm in llms:
            folder_path = f"{folder}/{llm}"
            evaluator = ActivityEvaluator(folder_path, dataset, ground_truth_path)
            if approach == "greedy":
                metrics = evaluator.evaluate_greedy_result()
            else:
                solutions_path = (
                    f"{folder_path}/{dataset}/results_{approach}_{num_generation}.csv"
                )
                solutions = pd.read_csv(solutions_path)["0"].tolist()
                metrics = evaluator.evaluate_solutions(solutions)

            for metric_name in metric_names:
                result[f"{metric_name}_{llm}"] = metrics[metric_name]
        results.append(result)

    return results

In [None]:
# Locations of the ground-truth files and of the model outputs.
ground_truth_folder = "data"
folder = "results"

# Experiment selection: dataset, candidate count, models and approaches.
dataset = "paged"
num_generation = 10
llms = [
    "gpt-4o-mini",
    "gpt-4o",
    "Meta-Llama-3.1-8B-Instruct",
    "Meta-Llama-3.1-70B-Instruct",
]
approaches = ["greedy", "mv", "abscon"]

In [None]:
# Score every configured approach for every configured model.
results = get_result(
    ground_truth_folder=ground_truth_folder,
    dataset=dataset,
    llms=llms,
    num_generation=num_generation,
    approaches=approaches,
    folder=folder,
)

In [None]:
# Markdown table of the F1 score of each approach (one row per approach, one
# column per model), with all numeric metrics scaled by 100.
results_df = pd.DataFrame(results)
results_df[results_df.select_dtypes(include=['number']).columns] *= 100
# get_result names its columns "<metric>_<llm>", so derive the F1 columns from
# `llms` instead of repeating the model names by hand.
results_df = results_df[[f"f1_{llm}" for llm in llms]]
print(results_df.round(2).to_markdown(index=False))

In [None]:
# F1 difference between AbsCon and greedy per model. The rows of results_df
# follow the order of `approaches`, so look the row labels up by approach name
# rather than hard-coding positions 2 and 0.
results_df.loc[approaches.index("abscon")] - results_df.loc[approaches.index("greedy")]

In [None]:
# Full metric table (all numeric columns scaled by 100) as headerless LaTeX rows.
results_df = pd.DataFrame(results)
numeric_columns = results_df.select_dtypes(include=["number"]).columns
results_df[numeric_columns] = results_df[numeric_columns] * 100
latex_rows = results_df.round(2).to_latex(index=False, header=False)
print(latex_rows)

In [None]:
# Same table keyed by approach name, so rows can be selected with .loc["<approach>"].
results_df_index = results_df.set_index("approach")

In [None]:
# Column-wise difference between the AbsCon row and the greedy row, then the
# spread of that difference over all recall columns.
diff = results_df_index.loc["abscon"] - results_df_index.loc["greedy"]
recall_diff = diff.filter(like="recall").tolist()
print(f"Min improvement: {min(recall_diff)}, max improvement: {max(recall_diff)}, average improvement: {np.mean(recall_diff)}")

In [None]:
# Spread of the AbsCon-minus-greedy difference over all F1 columns.
f1_diff = diff.filter(like="f1").tolist()
print(f"Min improvement: {min(f1_diff)}, max improvement: {max(f1_diff)}, average improvement: {np.mean(f1_diff)}")

## Plotting

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from evaluation_utils import ActivityEvaluator
import numpy as np

In [None]:
# Locations of the ground-truth files and of the model outputs.
ground_truth_folder = "data"
folder = "results"

# The candidate sweep is run for a single model on a single dataset.
dataset = "paged"
llms = ["Meta-Llama-3.1-70B-Instruct"]
approaches = ["mv", "greedy", "abscon"]

# Candidate counts 1..20 (the range end is exclusive).
num_generations = range(1, 21)

In [None]:
# Evaluate every approach/model pair once per candidate count. The k-th entry
# of `results` belongs to the k-th value of `num_generations` and has the
# shape {approach: {llm: metrics}}.
results = []
ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

for num_generation in tqdm(num_generations):
    per_approach = {}
    for approach in approaches:
        per_approach[approach] = {}
        for llm in llms:
            folder_path = f"{folder}/{llm}"
            evaluator = ActivityEvaluator(folder_path, dataset, ground_truth_path)
            if approach == "greedy":
                metrics = evaluator.evaluate_greedy_result()
            else:
                # Aggregated solutions are stored in column "0" of the result file.
                solutions = pd.read_csv(
                    f"{folder_path}/{dataset}/results_{approach}_{num_generation}.csv"
                )["0"].tolist()
                metrics = evaluator.evaluate_solutions(solutions)
            per_approach[approach][llm] = metrics
    results.append(per_approach)

In [None]:
# Add "max" and "median" entries to every element of `results`, obtained from
# evaluate_individual with the matching aggregator.
# The ground-truth file is shared by all models; define it here rather than
# relying on the value left behind by an earlier cell.
ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

for llm in llms:
    # Build the evaluator from this model's own result folder. Reusing the
    # `folder_path` leaked from the previous loop would always point at the
    # last model in `llms`.
    folder_path = f"{folder}/{llm}"
    evaluator = ActivityEvaluator(
        folder_path, dataset, ground_truth_path
    )

    for i in tqdm(range(len(results))):
        # setdefault keeps the entries already stored for other models;
        # re-creating the dicts per model would discard them.
        results[i].setdefault("max", {})[llm] = evaluator.evaluate_individual(
            i + 1, dataset=dataset, aggregator=max
        )
        results[i].setdefault("median", {})[llm] = evaluator.evaluate_individual(
            i + 1, dataset=dataset, aggregator=np.median
        )

In [None]:
import matplotlib.pyplot as plt
# Not referenced by name; presumably imported so that the "science" and "ieee"
# styles used below are registered - confirm against the scienceplots docs.
import scienceplots

plt.style.use(['science', "ieee"])

models = ["Meta-Llama-3.1-70B-Instruct"]
model_names = ["Llama3.1 70b"]
metrics = ["f1", "consistency"]

# One row per plotted series:
# (key in the result dicts, display name, line style, marker, RGB in 0-255).
series_styles = [
    ("mv", "MV", "-", "*", (33, 25, 24)),
    ("abscon", "AbsCon", "-", ".", (195, 56, 40)),
    ("max", "Best", "--", "^", (71, 133, 90)),
    ("median", "Median", "--", "v", (71, 133, 90)),
    ("greedy", "Direct", "-", "", (231, 189, 57)),
]
approaches = [row[0] for row in series_styles]
approach_names = [row[1] for row in series_styles]
lines = [row[2] for row in series_styles]
markers = [row[3] for row in series_styles]
# Scale the 0-255 channel values down to the 0-1 range.
colors = [[channel / 255 for channel in row[4]] for row in series_styles]

In [None]:
# F1 of every approach against the number of candidates; saved as Paged.png.
plt.figure(figsize=(4, 1.25))
metric = "f1"
x = num_generations
f1_values = []
for llm in models:
    series = zip(approaches, approach_names, colors, lines, markers)
    for approach, display_name, color, line_style, marker in series:
        values = [entry[approach][llm][metric] for entry in results]
        # Only the MV and AbsCon curves are kept for the later correlation test.
        if approach in ("mv", "abscon"):
            f1_values += values
        plt.plot(x, values, color=color, linestyle=line_style, label=display_name, marker=marker)
# No legend is drawn; the labels are still attached so plt.legend() can be added.
plt.title("Paged")
plt.ylabel("F1")
plt.xlabel("Candidates")
plt.savefig("Paged.png", dpi=300)
plt.show()

In [None]:
# Consistency of every approach against the number of candidates.
plt.figure(figsize=(4,2.25))
metric = "consistency"
# `results` has one entry per value of num_generations, so the x-axis must be
# that same sequence; a fixed range(1, 11) no longer matches its length.
x = num_generations
consistency_values = []
for i, llm in enumerate(models):
    for j, approach in enumerate(approaches):
        values = [data[approach][llm][metric] for data in results]
        # Collect the same series that feed f1_values (MV and AbsCon) so the
        # two lists pair up element by element in the correlation test.
        if approach in ["mv", "abscon"]:
            consistency_values.extend(values)
        # Style per approach (index j), matching the F1 plot.
        plt.plot(x, values, color=colors[j], linestyle=lines[j], label=approach_names[j], marker=markers[j])
plt.ylabel("Consistency")
plt.xlabel("Candidates")
plt.show()

In [None]:
from scipy.stats import spearmanr

# One-sided Spearman rank correlation between the collected F1 and consistency
# values (alternative hypothesis: the correlation is positive).
spearmanr(a=f1_values, b=consistency_values, alternative="greater")

## RQ3: Impact of Temperature

In [None]:
# Locations of the ground-truth files and of the model outputs.
ground_truth_folder = "data"
folder = "results"

# Temperature study: AbsCon only, two models, fixed candidate count.
dataset = "paged"
num_generation = 10
approaches = ["abscon"]
llms = ["Meta-Llama-3.1-70B-Instruct", "gpt-4o-mini"]

# Kept as strings because they are used as sub-folder names.
temperatures = ["0.2", "0.5", "0.7", "1"]

In [None]:
# Map each temperature to the get_result output computed from its own
# sub-folder "<folder>/temperature/<temperature>".
temperature_results = {
    temperature: get_result(
        ground_truth_folder,
        dataset,
        llms,
        num_generation,
        approaches,
        f"{folder}/temperature/{temperature}",
    )
    for temperature in temperatures
}

In [None]:
def transform_results(results):
    """Flatten per-temperature results into one record per temperature.

    Args:
        results: Mapping from temperature to a list of result dicts. Only the
            first dict of each list is used.

    Returns:
        A list of dicts, one per temperature in mapping order, each holding
        ``"temperature"`` followed by the entries of that first result dict.
    """
    return [
        {"temperature": temperature, **per_approach[0]}
        for temperature, per_approach in results.items()
    ]

In [None]:
# Flatten the {temperature: [result, ...]} mapping into a list of records.
# The name is rebound to the flattened list, so only transform while it still
# holds the mapping; otherwise re-running this cell would fail on `.keys()`.
if isinstance(temperature_results, dict):
    temperature_results = transform_results(temperature_results)

In [None]:
# F1 and consistency per temperature for both models, with numeric columns
# scaled by 100, printed as headerless LaTeX rows.
report_columns = [
    "temperature",
    "f1_Meta-Llama-3.1-70B-Instruct",
    "consistency_Meta-Llama-3.1-70B-Instruct",
    "f1_gpt-4o-mini",
    "consistency_gpt-4o-mini",
]

df = pd.DataFrame(temperature_results)
numeric_columns = df.select_dtypes(include=["number"]).columns
df[numeric_columns] = df[numeric_columns] * 100
df = df[report_columns]

print(df.round(2).to_latex(index=False, header=False))