In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import os
import pandas as pd
import numpy as np
import pprint
import json

# Cross-validation layout assumed for every results file.
repeats = 2
folds = 5

# Path noted by the original author (non-induced variant):
# Year_final/skipspecies/out/species_prop/skipspecies/non_induced/results
results_dir = "../out_final/species_prop/skipspecies/induced/results"

# Names of the entries in the results directory that end in ".csv".
results = [name for name in os.listdir(results_dir) if name.endswith(".csv")]

# Tasks handled by this notebook, split by problem type.
regression_tasks = ["band_gap", "formation_energy_per_atom"]
classification_tasks = ["is_metal", "is_magnetic"]
# Derived from the two lists above so the names cannot drift apart
# (the hand-written list used to contain "is_metallic").
supported_tasks = regression_tasks + classification_tasks


def _blank_result(
    metric: str, classification: bool, n_repeats: int = 2, n_folds: int = 5
) -> dict:
    """Build an empty result record for one results file.

    Args:
        metric: Metric suffix; ``"mae"`` yields the keys ``val_mae``,
            ``mean_mae`` and ``std_mae``, ``"auc"`` the ``*_auc`` ones.
        classification: Value stored under the ``"classification"`` key.
        n_repeats: Number of repeat slots (keys ``1..n_repeats``).
        n_folds: Number of fold slots per repeat (keys ``1..n_folds``).

    Returns:
        A dict whose metadata and metric values are all ``None``.
    """
    return {
        "representation": None,
        "pooling": None,
        "dimension": None,
        "task": None,
        "classification": classification,
        "repeat": {
            repeat: {
                "fold": {
                    fold: {f"val_{metric}": None} for fold in range(1, n_folds + 1)
                }
            }
            for repeat in range(1, n_repeats + 1)
        },
        f"mean_{metric}": None,
        f"std_{metric}": None,
    }


# Empty records for the results: regression files report MAE,
# classification files report AUC.
r_reg = _blank_result("mae", classification=False)
r_clf = _blank_result("auc", classification=True)
# print(results)


def get_repeat_fold_from_df(path: str, repeat: int, fold: int) -> pd.DataFrame:
    """Load the CSV at ``path`` and keep the rows of one repeat/fold pair.

    Args:
        path: Location of the CSV file; it needs ``fold`` and ``repeat`` columns.
        repeat: Value the ``repeat`` column must equal.
        fold: Value the ``fold`` column must equal.

    Returns:
        The matching rows (possibly none), with their original index labels.
    """
    frame = pd.read_csv(path)
    # Inside DataFrame.query, "@name" reads the local variable of that name,
    # while a bare name refers to the column.
    selection = "fold == @fold & repeat == @repeat"
    return frame.query(selection)


def check_complete(
    path: str, max_repeats: int = 2, max_folds: int = 5, expected_rows=None
) -> bool:
    """Report whether the CSV at ``path`` has rows for every repeat/fold pair.

    Repeats are numbered ``1..max_repeats`` and folds ``1..max_folds``.

    The earlier condition ``df.empty and len(df) != 100`` could only be true
    for an empty selection, so the row count was never actually enforced.
    That effective behaviour is kept as the default; pass ``expected_rows``
    to additionally require an exact number of rows per pair.

    Args:
        path: Location of the CSV file; it needs ``fold`` and ``repeat`` columns.
        max_repeats: Number of repeats that must be present.
        max_folds: Number of folds that must be present in each repeat.
        expected_rows: Optional exact row count required for every
            repeat/fold pair. ``None`` (default) only requires at least one row.

    Returns:
        ``True`` when every pair satisfies the requirement, ``False`` otherwise.
    """
    # Parse the file once instead of once per repeat/fold pair.
    frame = pd.read_csv(path)
    for repeat in range(1, max_repeats + 1):
        for fold in range(1, max_folds + 1):
            n_rows = len(frame.query("fold == @fold & repeat == @repeat"))
            if n_rows == 0:
                return False
            if expected_rows is not None and n_rows != expected_rows:
                return False
    return True


# Walk over every results file. The completeness printout is disabled,
# so this loop currently only rebinds `result` and `path`.
for result in results:
    path = "/".join((results_dir, result))
    # print(result, check_complete(path))


def get_maes(path: str, max_repeats: int = 2, max_folds: int = 5) -> tuple:
    """Return mean and standard deviation of ``val_mae`` over all repeat/fold pairs.

    For every pair (repeats ``1..max_repeats``, folds ``1..max_folds``) the
    ``val_mae`` of the last row belonging to that pair is collected
    (presumably the final epoch -- confirm against the training logger).

    Args:
        path: Location of the CSV file; it needs ``fold``, ``repeat`` and
            ``val_mae`` columns.
        max_repeats: Number of repeats to read.
        max_folds: Number of folds to read per repeat.

    Returns:
        ``(mean, std)`` of the collected values, computed with
        ``np.mean`` and ``np.std``.

    Raises:
        IndexError: If a repeat/fold pair has no rows in the file.
    """
    # Parse the file once instead of once per repeat/fold pair.
    frame = pd.read_csv(path)
    maes: list = []
    for repeat in range(1, max_repeats + 1):
        for fold in range(1, max_folds + 1):
            rows = frame.query("fold == @fold & repeat == @repeat")
            maes.append(rows.iloc[-1]["val_mae"])

    return np.mean(maes), np.std(maes)


def _rows_per_fold(frame, repeats: int, folds: int) -> dict:
    """Map every (repeat, fold) pair, both 1-based, to its rows of ``frame``."""
    selected = {}
    for repeat in range(1, repeats + 1):
        for fold in range(1, folds + 1):
            selected[(repeat, fold)] = frame.query("fold == @fold & repeat == @repeat")
    return selected


def report_results(path, repeats: int = 2, folds: int = 5) -> "dict | str":
    """Summarise one results CSV into a nested, JSON-serialisable record.

    The record has the same layout as the module-level ``r_reg`` / ``r_clf``
    templates but is built from scratch on every call, so results of
    different files never share (and overwrite) nested dictionaries.

    The task is the first of ``band_gap``, ``formation_energy_per_atom``,
    ``is_metal``, ``is_magnetic`` found as a substring of ``path``; the first
    two are scored with ``val_mae``, the last two with ``val_auc``. The file
    name is split on ``"_"``: field 1 is the representation (stored with an
    ``"_induced"`` suffix), field 2 the pooling and field 3 must look like
    ``dim<N>``. For every repeat/fold pair the metric of the last matching
    row is stored, followed by the mean and standard deviation over all pairs.

    Args:
        path: Location of the results CSV, using ``"/"`` as separator.
        repeats: Number of repeats (numbered from 1) to report.
        folds: Number of folds (numbered from 1) per repeat.

    Returns:
        The result dict, or the string ``"Incomplete"`` (after printing
        ``path``) when any repeat/fold pair has no rows.

    Raises:
        ValueError: If no known task name occurs in ``path`` or the
            dimension field is not an integer.
        IndexError: If the file name has too few ``"_"``-separated fields.
        KeyError: If the metric column is missing from the CSV.
    """
    # Parse the file once and slice it per repeat/fold pair.
    frame = pd.read_csv(path)
    fold_rows = _rows_per_fold(frame, repeats, folds)
    if any(rows.empty for rows in fold_rows.values()):
        print(path)
        return "Incomplete"

    # Check the task name from the path; the first match in this order wins.
    known_tasks = (
        ("band_gap", False),
        ("formation_energy_per_atom", False),
        ("is_metal", True),
        ("is_magnetic", True),
    )
    for task, classification in known_tasks:
        if task in path:
            break
    else:
        raise ValueError("Unknown task")
    metric = "auc" if classification else "mae"

    # Get the representation, pooling and dimension from the file name.
    parts = path.split("/")[-1].split("_")
    representation = f"{parts[1]}_induced"
    pooling = parts[2]
    dimension = int(parts[3].split("dim")[1])

    # Metric of the last row of every pair, in repeat-major order.
    scores = {
        pair: rows.iloc[-1][f"val_{metric}"] for pair, rows in fold_rows.items()
    }
    values = list(scores.values())

    return {
        "representation": representation,
        "pooling": pooling,
        "dimension": dimension,
        "task": task,
        "classification": classification,
        "repeat": {
            repeat: {
                "fold": {
                    fold: {f"val_{metric}": scores[(repeat, fold)]}
                    for fold in range(1, folds + 1)
                }
            }
            for repeat in range(1, repeats + 1)
        },
        f"mean_{metric}": np.mean(values),
        f"std_{metric}": np.std(values),
    }


# Spot check: pull the repeat=2 / fold=1 slice of the first results file.
path = f"{results_dir}/{results[0]}"
# pprint.pprint(path)
a = get_repeat_fold_from_df(path, repeat=2, fold=1)
# print(a)
# print(a.iloc[-1])
# pprint.pprint(report_results(path))

# Go through every results file and flag the ones that are incomplete.
# The JSON export of complete files is currently disabled (see below).
for result in results:
    path = f"{results_dir}/{result}"

    if check_complete(path):
        # Save the results to a json file in the out/summary folder
        # with open(f'../out_final/species_prop/skipspecies/induced/summary/{result}.json', 'w') as f:
        #    json.dump(report_results(path), f, indent=4)

        # print(f'{result} is complete')
        continue

    print(f"{result} is incomplete")

print("Done")

In [None]:
# Seaborn theme used by every figure drawn below.
sns.set_theme(context="paper", style="ticks", font_scale=1.3)

# Skipspecies induced
# One figure per task: the mean metric (MAE or AUC) is drawn against the
# dimension, the error bars are the standard deviation of that metric, and
# each pooling method gets its own line.

# Directory the JSON summaries are read from, and the directory for plots.
summary_dir = "../out_final/species_prop/skipspecies/induced/summary"
plot_dir = "../out_final/species_prop/skipspecies/induced/plots"

# Names of the entries in the summary directory that end in ".json".
results = [name for name in os.listdir(summary_dir) if name.endswith(".json")]

# Load the results


def json_loader(path: str) -> dict:
    """Read the file at ``path`` and return its parsed JSON content."""
    with open(path) as handle:
        payload = json.loads(handle.read())
    return payload


# Parse every summary file found in the summary directory.
summaries = []
for result in results:
    # print(result)
    summaries.append(json_loader(f"{summary_dir}/{result}"))

print(len(summaries))

# One row per summary, ordered by dimension.
df = pd.DataFrame(summaries)
df = df.sort_values(by="dimension")

# print(df)

# Plot settings: pooling methods to draw and the line format of each.
pools = ["mean", "max", "sum"]
formats = {"mean": "ro:", "max": "bx:", "sum": "gs:"}

# Task-specific dataframes: the rows of one task, keeping a single row per
# (pooling, dimension) combination and renumbering the index from 0.
task_df = {
    task_name: (
        df.query(f'task == "{task_name}"')
        .drop_duplicates(subset=["pooling", "dimension"])
        .reset_index(drop=True)
    )
    for task_name in (
        "band_gap",
        "formation_energy_per_atom",
        "is_metal",
        "is_magnetic",
    )
}
df_band_gap = task_df["band_gap"]
df_formation_energy_per_atom = task_df["formation_energy_per_atom"]
df_is_metal = task_df["is_metal"]
df_is_magnetic = task_df["is_magnetic"]


def plot_pools(ax, task, task_dict=task_df):
    """Draw one error-bar line per pooling method for ``task`` onto ``ax``.

    For each entry of the module-level ``pools`` list, the rows of the task's
    dataframe with that pooling are plotted as mean metric against dimension,
    with the metric's standard deviation as error bars. Rows flagged as
    classification use the ``mean_auc`` / ``std_auc`` columns, all others
    ``mean_mae`` / ``std_mae``. A pooling method without any rows is skipped
    instead of raising ``IndexError``.

    Args:
        ax: Matplotlib axes to draw on.
        task: Key into ``task_dict``; also used for the title and the unit
            shown on the y-axis.
        task_dict: Mapping from task name to its summary dataframe.

    Returns:
        The ``matplotlib.pyplot`` module (kept for backward compatibility).
    """
    task_frame = task_dict[task]
    classification = None
    for pool in pools:
        pool_rows = task_frame.query(f'pooling == "{pool}"')
        if pool_rows.empty:
            # No summaries for this pooling method: nothing to draw.
            continue
        classification = pool_rows.iloc[0].classification
        # The two metrics only differ in the column suffix.
        metric = "auc" if classification else "mae"
        ax.errorbar(
            pool_rows["dimension"],
            pool_rows[f"mean_{metric}"],
            yerr=pool_rows[f"std_{metric}"],
            fmt=formats[pool],
            label=f"{pool}-pool",
            capsize=5,
            capthick=2,
        )
    ax.set_xlabel("Dimension")
    t = task.replace("_", " ").capitalize()
    ax.set_title(f"Skipspecies induced - Task: {t}")
    if classification:
        ax.set_ylabel("AUC")
    elif task == "band_gap":
        ax.set_ylabel("MAE [eV]")
    else:
        ax.set_ylabel("MAE [eV/atom]")
    ax.legend(frameon=False)
    return plt


# Draw one stand-alone figure per task. Every figure now goes through
# tight_layout(); the band-gap figure used to be the only one skipping it.
for task in ("band_gap", "formation_energy_per_atom", "is_metal", "is_magnetic"):
    fig, ax = plt.subplots()
    plot_pools(ax, task)
    plt.tight_layout()
    # plt.savefig(f"{plot_dir}/{task}.png", dpi=300)
    plt.show()

In [None]:
# Combined 2x2 figure: one panel per task, filled row by row.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 6))

supported_tasks = ["band_gap", "formation_energy_per_atom", "is_metal", "is_magnetic"]
for ax, task in zip(axes.ravel(), supported_tasks):
    plot_pools(ax, task)

# Panel labels, positioned in figure coordinates.
fig.text(x=0.01, y=0.99, s="(a)", weight="bold")
fig.text(x=0.51, y=0.99, s="(b)", weight="bold")
fig.text(x=0.01, y=0.49, s="(c)", weight="bold")
fig.text(x=0.51, y=0.49, s="(d)", weight="bold")

plt.tight_layout()
plt.savefig(
    "../plots/Skipspecies_induced_results_final_publication.pdf",
    dpi=600,
    transparent=True,
    bbox_inches="tight",
)
plt.show()