In [None]:
%pip install anytree

In [None]:
import copy 
import itertools
from pathlib import Path
import os
import pprint
from anytree import Node, RenderTree
import pickle
import matplotlib.pyplot as plt
import scipy 
import torch
import numpy as np

# Helper functions

In [None]:
def unpickle(filepath : str):
    """
    Load and return the object stored in the pickle file at `filepath`.

    The file is opened through a context manager, so the handle is released
    even when `pickle.load` raises (e.g. for a truncated or corrupted file).

    NOTE: unpickling can execute arbitrary code. Only use this on result files
    that come from a trusted source (i.e. your own experiment runs).
    """
    with open(filepath, "rb") as file_object:
        return pickle.load(file_object)

In [None]:
def extract_setting_from_name(name :str, naming_schema : list):
    """
    Map the underscore-separated parts of `name` onto the entries of `naming_schema`.

    The i-th part of `name` becomes the value for the i-th schema entry.
    Main assumption: parameters are strictly separated by underscores '_'.
    If the number of parts and schema entries differ, the surplus on the
    longer side is ignored.
    """
    parts = name.split("_")
    return dict(zip(naming_schema, parts))
    

In [None]:
def load_results(directory, experiment_name, naming_schema=None, regex_scheme=None):
    """
    Load all pickled results of one experiment.

    Expected layout: `<directory>/<experiment_name>/<run directory>/<pickle files>`.

    directory       : Base directory that contains one sub-directory per experiment
    experiment_name : Name of the experiment sub-directory to load
    naming_schema   : Optional list of setting names. If given, the name of each run
                      directory is split at '_' and mapped onto these names
                      (see `extract_setting_from_name`).
    regex_scheme    : Optional *glob* pattern (despite the name, it is passed to
                      `Path.glob`) selecting the files inside each run directory.
                      Defaults to "*.pickle".

    Returns a list with one dict per successfully loaded file:
    `{'results': <unpickled object>}`, plus an `'attributes'` entry holding the
    settings of the file's own run directory when `naming_schema` is given.
    Entries are ordered by run directory, then by file name.

    Raises FileNotFoundError if `directory` does not exist and KeyError if the
    experiment directory is missing.
    """
    base_dir = Path(directory)
    if not base_dir.is_dir():
        raise FileNotFoundError(f"Result directory not found: {base_dir}")
    experiment_dir = base_dir / experiment_name
    if not experiment_dir.is_dir():
        raise KeyError(f"No experiment directory '{experiment_name}' inside '{base_dir}'")

    # Compare the directory *name* (not the whole path) to skip Jupyter's checkpoint folder
    run_dirs = sorted(
        entry for entry in experiment_dir.iterdir()
        if entry.is_dir() and entry.name != '.ipynb_checkpoints'
    )
    pattern = regex_scheme if regex_scheme else "*.pickle"

    results = []
    for run_dir in run_dirs:
        attributes = None
        if naming_schema is not None:
            attributes = extract_setting_from_name(run_dir.name, naming_schema)
        for pickle_path in sorted(run_dir.glob(pattern)):
            try:
                unpickled_stuff = unpickle(pickle_path)
            except Exception as error:
                # Sometimes there were unknown issues with the pickle files, in those instances we re-ran training.
                # Report which file failed and why, then continue with the remaining files.
                print(f"Catastrophic failure while loading {pickle_path}: {error!r}")
                continue
            if attributes is None:
                results.append({'results': unpickled_stuff})
            else:
                # Every file is paired with the settings of the directory it lives in
                results.append({'attributes': dict(attributes), 'results': unpickled_stuff})
    return results


In [None]:
def deep_dict_get(data : dict, path : str):
    """
    Follow a slash ('/') separated `path` of keys down into the nested
    dictionary `data` and return the value found there.

    Empty path segments (leading, trailing or doubled slashes) are skipped, so
    "/a/b", "a/b/" and "a/b" are equivalent. An empty path returns `data` itself.
    Every segment is used as a *string* key; a missing key raises KeyError.
    `data` is only read, never modified.
    """
    current = data
    for entry in path.split("/"):
        # Catches leading '/' in tree printing
        if entry == '':
            continue
        current = current[entry]
    return current

In [None]:
def compare_values(d1 : dict, d2 : dict, path : str):
    """
    Compare the values that two nested dictionaries hold at the same `path`.

    Returns False when the two values are of different types, otherwise the
    result of `==` on them.
    """
    first, second = (deep_dict_get(source, path) for source in (d1, d2))
    # Sanity check 1: values of different types never count as equal
    if type(first) != type(second):
        return False
    # This can potentially cause errors when comparing lists of lists (or Tensors/Arrays)
    return first == second

In [None]:
def create_common_root_list(root : str, values : list):
    """
    Prefix every entry of `values` with `root`, joined by '/'.

    Example: root="a", values=["x", "y"] -> ["a/x", "a/y"]
    """
    prefixed = []
    for val in values:
        prefixed.append(f"{root}/{val}")
    return prefixed

# Main functions    
What features do I need?    
- ~~Show a structure tree of the results (i.e. experiment settings and result values)~~
- ~~Filtering by setting, given a key~~
- ~~Filtering by values, given a key~~
- Create selected statistics (mean/med/std/quartiles/...) for certain values/keys
- Apply a function to certain values/keys and return the results (e.g. Eigendecomposition/Normalization/...)
- ~~Group all entries that share settings/values~~
- 

In [None]:
def generate_tree_structure(data, parent=None):
    """
    Build an anytree `Node` tree that mirrors the key structure of the nested
    dictionary `data` and return its root node.

    data   : (Nested) dictionary. Only the keys end up in the tree; every
             non-dictionary value becomes a leaf named after its key.
    parent : Node to attach the keys of `data` to. Leave as None for the
             top-level call.

    Root selection for the top-level call:
    - exactly one key: that key becomes the root and its value is expanded
      (if the value is not a dictionary, the root is returned on its own)
    - otherwise: an artificial node called "root" is created
    """
    if parent is None:
        if len(data) == 1:
            root_key = next(iter(data))
            parent = Node(root_key)
            data = data[root_key]
            # A single key holding a plain value has nothing below it to expand
            if not isinstance(data, dict):
                return parent
        else:
            parent = Node("root")

    # Solved through recursively going deeper into the data structure:
    # every key becomes a node, dictionaries are expanded below their node
    for entry, value in data.items():
        branch = Node(entry, parent=parent)
        if isinstance(value, dict):
            generate_tree_structure(value, parent=branch)
    return parent
    

In [None]:
def group_by(list_of_data : list, path, value=None):
    """
    Group the entries of `list_of_data` by the value found at `path`.

    value : If None, make subgroups of equal values.
            Otherwise return a single group where value is matched

    Returns a dict whose keys have the form "<path> = <value>" and whose values
    are the lists of matching entries.
    """
    if value is not None:
        matching = [data for data in list_of_data if deep_dict_get(data, path) == value]
        return {f"{path} = {value}": matching}

    grouped_data = {}
    for data in list_of_data:
        label = f"{path} = {deep_dict_get(data, path)}"
        grouped_data.setdefault(label, []).append(data)
    return grouped_data

In [None]:
def dict_compare(d1, d2):
    """
    Return True if `d1` and `d2` are of the same type and every key/value pair
    of `d2` is also found in `d1` (a key missing from `d1` counts as None).

    Note that the check is one-directional: extra keys in `d1` are ignored.
    """
    if type(d1) is not type(d2):
        return False
    for key, expected in d2.items():
        if not d1.get(key) == expected:
            return False
    return True
        

# Could contain an alternative head as (list_of_data : list, paths : [list, dict], values : None) 
# where paths-values would require a 1-to-1 correspondence. 
# But this could quickly become error prone on the user side...
def group_by_multiple(list_of_data : list, paths):
    """
    Group the entries of `list_of_data` by several paths at once.

    paths : - list (or tuple) of paths: grouping without values. Entries that
              share the same combination of values at all paths end up in the
              same group.
            - dict {path: expected value}: grouping by path-value combinations.
              Only returns the single group where all those pairs are true.
            Any other type yields an empty result.

    Returns a dict whose keys have the form "<path> = <value> ; <path> = <value> ; ..."
    and whose values are the lists of matching entries.
    """
    grouped_data = {}
    # Grouping without values
    if isinstance(paths, (list, tuple)):
        for data in list_of_data:
            paths_vals = {path : deep_dict_get(data, path) for path in paths}
            label = " ; ".join([f"{path} = {paths_vals[path]}" for path in paths_vals])
            # First occurrence of a combination opens a new group, later ones are appended to it
            grouped_data.setdefault(label, []).append(data)

    # Grouping by path-value combinations
    # Only returns the group where all those pairs are true
    elif isinstance(paths, dict):
        # The keys contain the paths, the values the corresponding expected values
        good_data = list()
        for data in list_of_data:
            if all([deep_dict_get(data, path) == paths[path] for path in paths]):
                good_data.append(data)
        grouped_data[" ; ".join([f"{path} = {paths[path]}" for path in paths])] = good_data
    return grouped_data

In [None]:
def filter_by(list_of_data : list, path : str, value):
    """
        returns the data dicts whose entry at `path` equals `value` (input order is kept)
    """
    return [data for data in list_of_data if deep_dict_get(data, path) == value]

In [None]:
def filter_value(list_of_data : list, path : str):
    """
        Collects the value stored at `path` from every data dict and
        returns them as a list (same order as `list_of_data`).
    """
    return [deep_dict_get(data, path) for data in list_of_data]

In [None]:
def get_attributes(data : dict):
    """
    Returns the attribute dictionary stored under data["results"]["attributes"]
    """
    payload = data["results"]
    return payload["attributes"]

def get_results(data : dict):
    """
    Returns the run results stored under data["results"]["results"]
    """
    payload = data["results"]
    return payload["results"]

def find_common_attributes(att_1 : dict, att_2 : dict):
    """
    Returns the key/value pairs that appear with equal values in both
    attribute dictionaries.
    """
    return {
        key: val
        for key, val in att_1.items()
        if key in att_2 and val == att_2[key]
    }

def find_differing_attributes(att_1 : dict, att_2 : dict):
    """
    Returns, for every key present in both attribute dictionaries with unequal
    values, the pair (value in att_1, value in att_2).
    Keys that exist in only one of the dictionaries are not reported.
    """
    return {
        key: (val, att_2[key])
        for key, val in att_1.items()
        if key in att_2 and val != att_2[key]
    }

# Things I want to check
 - **Hardcoded setting**: Do the metrics have the same order as MCMC?
 - **CO2**: Also check the order
 - **CO2**: Check the prediction of future datapoints for each favourite metric 
 - **Kernel search**: How do the kernels found by the metrics perform according to MCMC?
 - (Kernel search: How do the kernels found by the metrics predict the future datapoints?)
 - **LODE**: Does Laplace find the most appropriate differential equation, i.e. the one with the correct number of parameters?

# Load results

In [None]:
# Load every file named "results.pickle" from the run directories of the "hardcoded" experiment.
# (Despite its name, `regex_scheme` is used as a glob pattern by load_results.)
all_results = load_results('results', "hardcoded", regex_scheme="results.pickle")

# NOTE(review): `postfix` is not used in the visible part of the notebook - confirm whether it is still needed.
postfix = ""
# Key structure of the first loaded entry (all nodes below the tree root); handy for looking up valid paths.
# Requires at least one successfully loaded result, otherwise all_results[0] raises IndexError.
result_tree = generate_tree_structure(all_results[0]).descendants

In [None]:
# Combining two different seeds
# Look for the exact same set of attributes and append the corresponding runs to the first result list
import copy
def combine_results(main_results, other_results):
    """
        Merge the runs of two result lists (e.g. two different seeds) that were created with the same settings.

        main_results : If the result lists are of different size (in terms of the settings), this list is the one containing the larger set of settings
        other_results: This is the list of results added to the other

        Two entries count as the same setting when their "data_gen" and "eval_COUNT" attributes are equal.
        The runs of every matching entry in `other_results` are appended to the runs of the main entry.
        Entries of `other_results` without a counterpart in `main_results` are ignored.

        Returns a new list; neither input is modified.

        NOTE(review): assumes entry["results"]["results"] is a list of runs (it is iterated run by run
        elsewhere in this notebook) - confirm before relying on the merged output.
    """
    return_results = copy.deepcopy(main_results)
    for main_dict in return_results:
        # Check for these two
        main_attributes = main_dict["results"]["attributes"]
        setting = (main_attributes["data_gen"], main_attributes["eval_COUNT"])
        for other_dict in other_results:
            other_attributes = other_dict["results"]["attributes"]
            if (other_attributes["data_gen"], other_attributes["eval_COUNT"]) == setting:
                # Copy the runs so that the combined list shares no state with `other_results`
                main_dict["results"]["results"].extend(copy.deepcopy(other_dict["results"]["results"]))
    return return_results


In [None]:
# =============
# Preparing the results dict
# =============

# relevant keys: "data_gen" and "eval_COUNT"
# Layout: results_dict[metric][str(eval_COUNT)][GENERATING kernel][TESTED kernel] -> list with one value per run

metrics_keys = ('AIC', 'BIC', 'MLL', 'MAP', 'Nested', 'Lap', "Lap0", "LapAIC", "LapBIC")
# Metrics that are read directly from "<metric>/<kernel>/loss"; the Lap* entries are filled from the "Laplace" sub-tree
direct_metrics = ('AIC', 'BIC', 'MLL', 'MAP', 'Nested')

all_data_gens = sorted(set([get_attributes(result)["data_gen"] for result in all_results]))
all_data_sizes = sorted(set([get_attributes(result)["eval_COUNT"] for result in all_results]))
str_all_data_sizes = sorted(set([str(get_attributes(result)["eval_COUNT"]) for result in all_results]))


def _to_scalar(value):
    """Return `value` as a plain Python number; single-element tensors are unwrapped with `.item()`."""
    # `.item()` works regardless of device and autograd state, unlike `.detach().numpy()`
    return value.item() if isinstance(value, torch.Tensor) else value


results_dict = {key:
                {str(key1):
                 {key2:
                  {key3: [] for key3 in all_data_gens} # The TESTED kernel
                    for key2 in all_data_gens} # The GENERATING kernel
                      for key1 in all_data_sizes} # Then the dataset size
                        for key in metrics_keys} # Far out: The different metrics

for result in all_results:
    attributes = get_attributes(result)
    data_gen_kernel = attributes["data_gen"]
    dataset_size = str(attributes["eval_COUNT"])

    results = get_results(result)
    for run in results:
        for kernel in all_data_gens:
            for metric in direct_metrics:
                loss = _to_scalar(deep_dict_get(run, f"{metric}/{kernel}/loss"))
                if metric in ("AIC", "BIC"):
                    # AIC/BIC are stored with a factor of -2 relative to the other metrics
                    # (presumably to bring them onto the log-likelihood scale - TODO confirm)
                    loss = loss * (-0.5)
                results_dict[metric][dataset_size][data_gen_kernel][kernel].append(loss)

            # The Laplace results are keyed by threshold: 0.0, -1.0 (stored as "LapAIC") and "BIC" (stored as "LapBIC")
            laplace_results = deep_dict_get(run, f"Laplace/{kernel}")
            for threshold in laplace_results:
                laplace_entry = laplace_results[threshold]
                if threshold == 0.0:
                    results_dict['Lap'][dataset_size][data_gen_kernel][kernel].append(_to_scalar(laplace_entry["details"]["laplace without replacement"]))
                    results_dict['Lap0'][dataset_size][data_gen_kernel][kernel].append(_to_scalar(laplace_entry["loss"]))
                elif threshold == -1.0:
                    results_dict['LapAIC'][dataset_size][data_gen_kernel][kernel].append(_to_scalar(laplace_entry["loss"]))
                elif threshold == "BIC":
                    results_dict['LapBIC'][dataset_size][data_gen_kernel][kernel].append(_to_scalar(laplace_entry["loss"]))

In [None]:

# Draw a matrix of heatmaps: one row per metric, one column per dataset size.
# Inside each heatmap, every cell shows the "numpy.mean" over all runs of one
# (GENERATING kernel, TESTED kernel) pair.

# squeeze=False keeps `axs` two-dimensional even if there is only a single metric or dataset size
fig, axs = plt.subplots(len(metrics_keys), len(all_data_sizes), figsize=(20, 40), squeeze=False)
for i, metric in enumerate(metrics_keys):
    for j, dataset_size in enumerate(all_data_sizes):
        dataset_size = str(dataset_size)  # Ensure dataset_size is a string for consistency in the dictionary keys
        ax = axs[i, j]
        values_per_pair = results_dict[metric][dataset_size]

        # Rows show the GENERATING kernel, columns show the TESTED kernel.
        # The mean is taken per cell, so kernel pairs with differing numbers of runs are handled as well.
        mean_data = np.array([[np.mean(values_per_pair[data_gen][kernel]) for kernel in all_data_gens]
                              for data_gen in all_data_gens])

        # To display an ordering instead of the raw means (best = 1, second best = 2, ...),
        # replace each row of `mean_data` by its ranks at this point.

        # Create a heatmap
        im = ax.imshow(mean_data, aspect='auto', cmap='viridis_r')

        # Set the ticks and labels
        ax.set_xticks(np.arange(len(all_data_gens)))
        ax.set_yticks(np.arange(len(all_data_gens)))
        ax.set_xticklabels(all_data_gens)
        ax.set_yticklabels(all_data_gens)
        ax.set_xlabel("Tested kernel")
        ax.set_ylabel("Generating kernel")

        # Set title and labels
        ax.set_title(f"{metric} - {dataset_size}")
        # Rotate the x-tick labels for better readability
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')
        # Add a colorbar
        fig.colorbar(im, ax=ax)
plt.tight_layout()
plt.show()


In [None]:
# Helper function that returns the kendalltau correlation of two sequences
def kendalltau(list1, list2):
    """
    Thin wrapper around `scipy.stats.kendalltau`.

    Returns SciPy's result object unchanged; it unpacks into
    (correlation, p-value).
    """
    correlation_result = scipy.stats.kendalltau(list1, list2)
    return correlation_result

# Helper function that returns the spearman correlation of two sequences
def spearman(list1, list2):
    """
    Thin wrapper around `scipy.stats.spearmanr`.

    Returns SciPy's result object unchanged; it unpacks into
    (correlation, p-value).
    """
    correlation_result = scipy.stats.spearmanr(list1, list2)
    return correlation_result

# Helper function that translates a list of values to a ranking
def rank_values(values):
    """
    Translate a sequence of values into 1-based ranks.

    The smallest value receives rank 1, the largest rank len(values).
    Equal values still receive distinct ranks, in the order `np.argsort`
    places them.
    """
    order = np.argsort(values)
    ranks = np.empty_like(order)
    # The element at sorted position k (0-based) gets rank k + 1
    ranks[order] = np.arange(1, len(values) + 1)
    return ranks

In [None]:
# One table per correlation flavour, each laid out as table[metric][str(dataset size)][generating kernel].
# Every entry starts as None and is filled in by the loop below.
all_kendall_taus, all_spearman_rs, all_rank_kendall_taus, all_rank_spearman_rs = (
    {metric: {str(size) : {gen: None for gen in all_data_gens} for size in all_data_sizes} for metric in metrics_keys}
    for _ in range(4)
)

# Every metric is compared against the "Nested" values, which serve as the reference ordering
nested_key = "Nested"
for test_key in results_dict.keys():
    for dataset_size in all_data_sizes:
        dataset_size = str(dataset_size)  # Ensure dataset_size is a string for consistency in the dictionary keys
        for data_gen_kernel in all_data_gens:
            value_under_test = results_dict[test_key][dataset_size][data_gen_kernel]
            ground_truth = results_dict[nested_key][dataset_size][data_gen_kernel]

            # Mean over runs (ignoring NaNs) per tested kernel, in the fixed order of all_data_gens
            ground_truth_means = [np.nanmean(ground_truth[key]) for key in all_data_gens]
            under_test_means = [np.nanmean(value_under_test[key]) for key in all_data_gens]

            # Correlations on the raw means ...
            tau, p_value = kendalltau(ground_truth_means, under_test_means)
            spearman_corr, spearman_p_value = spearman(ground_truth_means, under_test_means)

            # ... and on the rankings derived from them
            tau_rank, p_value_rank = kendalltau(rank_values(ground_truth_means), rank_values(under_test_means))
            spearman_corr_rank, spearman_p_value_rank = spearman(rank_values(ground_truth_means), rank_values(under_test_means))

            all_kendall_taus[test_key][dataset_size][data_gen_kernel] = tau
            all_spearman_rs[test_key][dataset_size][data_gen_kernel] = spearman_corr
            all_rank_kendall_taus[test_key][dataset_size][data_gen_kernel] = tau_rank
            all_rank_spearman_rs[test_key][dataset_size][data_gen_kernel] = spearman_corr_rank




In [None]:
# Display the sorted list of distinct "data_gen" values found in the loaded results
all_data_gens

In [None]:
def find_all_possible_dict_slicings(data_dict, path):
    """
    Expand the wildcard segments ':' of a slash-separated `path` into all
    concrete paths that exist in the nested dictionary `data_dict`.

    Every ':' segment is replaced by each key available at that position, e.g.
    "Lap/:/SE" -> ["Lap/<key 1>/SE", "Lap/<key 2>/SE", ...]. Several wildcards
    are expanded independently of each other; a path without any wildcard is
    returned unchanged as a one-element list.

    Only segments that consist of ':' alone are treated as wildcards, so keys
    that merely contain a colon are left intact. Keys are written into the
    paths via `str`, so the resulting paths can only be resolved again when the
    dictionary keys are strings.
    """
    # Each candidate is the list of path segments resolved so far
    candidates = [[]]
    for key in path.split('/'):
        if key == ":":
            expanded = []
            for prefix in candidates:
                available_keys = deep_dict_get(data_dict, "/".join(prefix)).keys()
                expanded.extend(prefix + [str(value)] for value in available_keys)
            candidates = expanded
        else:
            candidates = [prefix + [key] for prefix in candidates]
    return ["/".join(segments) for segments in candidates]

def dict_slice(data_dict, path):
    """
    Return the values found at every concrete path that the wildcard `path`
    (':' as placeholder) expands to, in the order of the expanded paths.
    """
    return [
        deep_dict_get(data_dict, sliced_path)
        for sliced_path in find_all_possible_dict_slicings(data_dict, path)
    ]


In [None]:
# Example: list the concrete paths for metric "Lap" across all dataset sizes
# (assumes "SE" is one of the generating kernels in all_data_gens - TODO confirm)
find_all_possible_dict_slicings(all_kendall_taus, "Lap/:/SE")

In [None]:
# One box plot per dataset size: for every metric, the spread (over the generating kernels)
# of the rank-based Kendall tau between that metric and the "Nested" reference.
target_dict = all_rank_kendall_taus
metric_labels = list(target_dict.keys())
# squeeze=False keeps `axs` two-dimensional even if there is only a single dataset size
fig, axs = plt.subplots(len(all_data_sizes), 1,  figsize=(5, 15), squeeze=False)
for row, dataset_size in enumerate(sorted(all_data_sizes)):
    ax = axs[row, 0]
    ax.boxplot([dict_slice(target_dict, f"{metric}/{dataset_size}/:") for metric in metric_labels], tick_labels=metric_labels)
    ax.set_title(f"{dataset_size}")
    # Each box belongs to one metric (the kernels are the samples inside a box)
    ax.set_xlabel("Metric")
    ax.set_ylabel("Kendall Tau Rank")
    # Rotate the x-tick labels for better readability
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=45, ha='right')
    ax.grid(True)
plt.tight_layout()
plt.show()
