In [None]:
%load_ext autoreload
%autoreload 2
from loguru import logger
import sys


In [None]:
# Reset loguru: drop the handlers registered so far, then log to stderr
# at INFO level and above.
logger.remove()
logger.add(sys.stderr, level="INFO")

In [None]:
import json
import random
import os
from tqdm import tqdm

from parser import ClevrParser
from program_executor import (
    programs_from_networkx,
    networkx_from_programs,
    set_scene,
    evaluate,
)
import numpy as np
from remote_encoder import RemoteEncoder
from tqdm import tqdm

# Seed Python's built-in `random` module so its draws repeat across runs.
# NOTE(review): only `random` is seeded here; numpy's generator is not.
random.seed(42)

### Test abstraction

In [None]:
from abscon.abstraction import ClevrAbstractor
from abscon.concretization import ClevrConcretizer
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from parser import ClevrParser
from tqdm import tqdm
from evaluation_utils import ClevrEvaluator, evaluate_graph_with_scene, evaluate_prediction


In [None]:
# Root of the stored results (one sub-folder per LLM) and the folder used to
# build the ground-truth CSV path.
folder = "results"
ground_truth_folder = "data"

# Approaches to compare; each becomes one row of the results table.
approaches = [
    "greedy",
    "mv",
    "esc",
    "escf",
    "abscon",
]

# LLMs to compare; each contributes an ACC_/SR_ column pair.
llms = [
    "gpt-4o-mini",
    "gpt-4o",
    "Meta-Llama-3.1-8B-Instruct",
    "Meta-Llama-3.1-70B-Instruct",
]

dataset = "clevr"
# Generation count handed to the evaluator and embedded in result file names.
num_generations = 10

In [None]:
# Build one row per approach, holding an ACC_<llm> / SR_<llm> pair for each LLM.
results = []

# Keyword arguments that distinguish the two execution-based variants.
execution_sc_kwargs = {"esc": {}, "escf": {"exclude_error": True}}

for approach in approaches:
    # Path of the ground-truth CSV; it is built here but not read in this cell.
    ground_truth_path = f"{ground_truth_folder}/{dataset}.csv"

    result = {"approach": approach}
    for llm in llms:
        folder_path = f"{folder}/{llm}"
        evaluator = ClevrEvaluator(folder_path=folder_path, dataset_name=dataset)

        if approach == "greedy":
            metrics = evaluator.evaluate_greedy_result()
        elif approach in execution_sc_kwargs:
            metrics = evaluator.evaluate_execution_sc(
                num_generations, **execution_sc_kwargs[approach]
            )
        else:
            # Every other approach is scored from its stored CSV (column "0").
            df = pd.read_csv(
                f"{folder_path}/{dataset}/results_{approach}_{num_generations}.csv"
            )["0"].tolist()
            metrics = evaluator.evaluate_solutions(df)

        result.update(
            {
                f"ACC_{llm}": metrics["accuracy"],
                f"SR_{llm}": metrics["success_rate"],
            }
        )
    results.append(result)

In [None]:
# Tabulate the rows, express every numeric metric as a percentage, and emit
# the table body as LaTeX (no index column, no header row).
results_df = pd.DataFrame(results)
numeric_columns = results_df.select_dtypes(include=['number']).columns
results_df[numeric_columns] *= 100
print(results_df.round(2).to_latex(index=False, header=False))

In [None]:
# Key the table by approach so individual rows can be selected with .loc.
results_df_index = results_df.set_index("approach")

In [None]:
# Per-column gain of "abscon" over "greedy"; only the ACC_* columns are summarised.
diff = results_df_index.loc["abscon"] - results_df_index.loc["greedy"]
acc_diff = []
for key, value in diff.items():
    if "ACC" in key:
        acc_diff.append(value)
print(f"Min improvement: {min(acc_diff)}, max improvement: {max(acc_diff)}, average improvement: {np.mean(acc_diff)}")

In [None]:
# Per-column gain of "abscon" over "escf"; only the ACC_* columns are summarised.
diff = results_df_index.loc["abscon"] - results_df_index.loc["escf"]
acc_diff = []
for key, value in diff.items():
    if "ACC" in key:
        acc_diff.append(value)
print(f"Min improvement: {min(acc_diff)}, max improvement: {max(acc_diff)}, average improvement: {np.mean(acc_diff)}")

## RQ2

In [None]:
def get_result(dataset, llms, num_generation, approaches, folder):
    """Evaluate every (approach, LLM) pair and return the metrics as a nested dict.

    Args:
        dataset: Dataset name; passed to ``ClevrEvaluator`` and used as the
            sub-folder that holds the per-approach result CSVs.
        llms: LLM names; each one is a sub-folder of ``folder``.
        num_generation: Generation count passed to ``evaluate_execution_sc``
            and embedded in the result CSV file name.
        approaches: Approach identifiers. "greedy", "esc", "escf" and "best"
            are computed by the evaluator; any other identifier is scored from
            ``results_{approach}_{num_generation}.csv`` (column "0").
        folder: Root directory containing one sub-folder per LLM.

    Returns:
        ``{approach: {llm: metrics}}`` where ``metrics`` is whatever the
        evaluator method returned.
    """
    # Keyword arguments that distinguish the execution-based variants.
    execution_sc_options = {
        "esc": {},
        "escf": {"exclude_error": True},
        "best": {"exclude_error": True, "best_answer": True},
    }

    def _metrics_for(approach, llm):
        # A fresh evaluator is created for every (approach, llm) pair.
        model_dir = f"{folder}/{llm}"
        evaluator = ClevrEvaluator(
            folder_path=model_dir,
            dataset_name=dataset,
        )

        if approach == "greedy":
            return evaluator.evaluate_greedy_result()
        if approach in execution_sc_options:
            return evaluator.evaluate_execution_sc(
                num_generation, **execution_sc_options[approach]
            )

        # Fallback: score the solutions stored on disk for this approach.
        solutions = pd.read_csv(
            f"{model_dir}/{dataset}/results_{approach}_{num_generation}.csv"
        )["0"].tolist()
        return evaluator.evaluate_solutions(solutions)

    return {
        approach: {llm: _metrics_for(approach, llm) for llm in llms}
        for approach in approaches
    }

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from evaluation_utils import ClevrEvaluator
import numpy as np

In [None]:
# Root of the stored results (one sub-folder per LLM) and the ground-truth folder.
folder = "results"
ground_truth_folder = "data"

# Approaches evaluated at every generation count.
approaches = [
    "mv",
    "greedy",
    "abscon",
    "esc",
    "escf",
    "best",
]

# Use ["Meta-Llama-3.1-70B-Instruct"] alone to restrict the sweep to the 70B model.
llms = [
    "Meta-Llama-3.1-8B-Instruct",
    "Meta-Llama-3.1-70B-Instruct",
]

dataset = "clevr"
# Sweep over 1 through 20 generations (the upper bound of range is exclusive).
num_generations = range(1, 21)

In [None]:
# One get_result() mapping per generation count, in sweep order.
results = []

for num_generation in tqdm(num_generations):
    results.append(get_result(dataset, llms, num_generation, approaches, folder))

In [None]:
import matplotlib.pyplot as plt
import scienceplots
import matplotlib
# Apply the "science" and "ieee" style sheets to the figures drawn below
# (presumably registered by the `scienceplots` import above — confirm).
plt.style.use(['science', "ieee"])

# Models drawn in the figure. Use ["Meta-Llama-3.1-70B-Instruct"] to plot the
# larger model instead.
models = ["Meta-Llama-3.1-8B-Instruct"]

# Display names are looked up from the model id so that the label can never
# disagree with the model that is actually plotted.
model_display_names = {
    "Meta-Llama-3.1-8B-Instruct": "Llama3.1 8b",
    "Meta-Llama-3.1-70B-Instruct": "Llama3.1 70b",
}
model_names = [model_display_names[model] for model in models]

# Per-approach styling. The five lists below are index-aligned with `approaches`.
approaches = ["escf", "abscon", "best", "esc", "greedy"]
approach_names = ["ESC-F", "AbsCon", "Best", "Median", "Direct"]
lines = ["-", "-", "--", "--", "-"]
markers = ['*', '.', '^', 'v', '']

# Colours are written as 0-255 RGB triples and rescaled to the 0-1 range.
colors = [[33, 25, 24], [195, 56, 40], [71, 133, 90], [71, 133, 90], [231, 189, 57]]
colors = [[channel / 255 for channel in rgb] for rgb in colors]

# Catch a styling list that was edited without its siblings.
assert (
    len(approaches) == len(approach_names) == len(lines) == len(markers) == len(colors)
), "approach styling lists must all have the same length"

In [None]:
# Accuracy against the number of generations, one curve per approach.
plt.figure(figsize=(4, 1.5))
metric = "accuracy"
x = num_generations
# Accuracies of the "escf" and "abscon" curves; collected but not read in this cell.
f1_values = []
for llm in models:
    for j, approach in enumerate(approaches):
        values = [data[approach][llm][metric] for data in results]
        if approach in ("escf", "abscon"):
            f1_values += values
        plt.plot(
            x,
            values,
            color=colors[j],
            linestyle=lines[j],
            label=approach_names[j],
            marker=markers[j],
        )
plt.legend(shadow=True, ncol=2)
plt.title("Clevr")
plt.xlabel("Candidates")
plt.ylabel("Accuracy")
plt.savefig("Clevr.png", dpi=300)
plt.show()

## RQ3: Impact of Temperature

In [None]:
def transform_results(results):
    """Flatten nested per-temperature metrics into one flat row per temperature.

    Args:
        results: Mapping ``{temperature: {approach: {llm: {metric: value}}}}``.

    Returns:
        A list of dicts, one per temperature in input order. Each dict has a
        ``"temperature"`` entry plus one entry per metric value. When every
        temperature holds at most one approach, the keys are
        ``"{metric}_{llm}"``. When any temperature holds several approaches,
        the keys are ``"{approach}_{metric}_{llm}"``, so that values from
        different approaches no longer overwrite each other.
    """
    # Decide once for the whole input so that every row uses the same key scheme.
    needs_approach_prefix = any(
        len(per_approach) > 1 for per_approach in results.values()
    )

    transformed = []
    for temperature, per_approach in results.items():
        temperature_result = {"temperature": temperature}
        for approach, per_llm in per_approach.items():
            for llm, metrics in per_llm.items():
                for metric, value in metrics.items():
                    key = f"{metric}_{llm}"
                    if needs_approach_prefix:
                        key = f"{approach}_{key}"
                    temperature_result[key] = value
        transformed.append(temperature_result)
    return transformed

In [None]:
# Root of the stored results and the ground-truth folder.
folder = "results"
ground_truth_folder = "data"

# Only one approach is evaluated across temperatures.
approaches = ["abscon"]
llms = [
    "Meta-Llama-3.1-70B-Instruct",
    "gpt-4o-mini",
]

dataset = "clevr"
num_generation = 10

# Temperatures are kept as strings because they are used as folder names.
temperatures = [
    "0.2",
    "0.5",
    "0.7",
    "1",
]

In [None]:
# Evaluate each temperature from its own sub-folder: <folder>/temperature/<t>.
temperature_results = {}

for temperature in temperatures:
    temperature_folder = f"{folder}/temperature/{temperature}"
    per_approach = get_result(
        dataset, llms, num_generation, approaches, temperature_folder
    )
    temperature_results[temperature] = per_approach

# Flatten the nested mapping into a list with one row per temperature.
temperature_results = transform_results(temperature_results)

In [None]:
# Tabulate the per-temperature rows and display them as-is.
df = pd.DataFrame(temperature_results)
df

In [None]:
# Rebuild the frame and express every numeric metric as a percentage.
df = pd.DataFrame(temperature_results)
numeric_columns = df.select_dtypes(include=["number"]).columns
df[numeric_columns] *= 100

# Fixed column order for the LaTeX table: temperature, then accuracy and
# success rate for each model.
latex_columns = [
    "temperature",
    "accuracy_Meta-Llama-3.1-70B-Instruct",
    "success_rate_Meta-Llama-3.1-70B-Instruct",
    "accuracy_gpt-4o-mini",
    "success_rate_gpt-4o-mini",
]
df = df[latex_columns]

print(df.round(2).to_latex(index=False, header=False))