# Analysis
Loads the results files and runs the analysis on them.

# Main program

## Specifications

In [None]:
import sys
from pathlib import Path

import utils.utils as utils
from utils.plotting import plot_links, find_linked_variables

from utils.setup import Setup

# Command-line arguments, without the program name.
argv           = sys.argv[1:]
# NOTE: the uncommented assignment below unconditionally replaces the
# command-line arguments with a hard-coded configuration file. The
# commented-out lines are alternative configurations; enable exactly one,
# or comment all of them out to honour the real command line.
# argv           = ['-c', 'cfg_pipeline.yml']
# argv           = ['-c', 'cfg_concat_lon120.yml']
argv           = ['-c', 'cfg_single_lon120.yml']

# Build the run configuration object from the argument list.
setup = Setup(argv)

# Make sure the plots folder exists: missing parent folders are created and
# an already existing folder is not an error.
Path(setup.plots_folder).mkdir(parents=True, exist_ok=True)


## Aggregate results

In [None]:
def get_parents_from_links(links):
    """Flag which variables appear as a parent in at least one link.

    Parameters
    ----------
    links : dict
        Maps a variable index to the list of its parent entries. Only the
        first element of each entry (the parent's variable index) is read.
        Keys are assumed to be the integers 0..len(links)-1 -- TODO confirm.

    Returns
    -------
    list of bool
        One flag per position ``i`` in ``range(len(links))``; ``True`` when
        ``i`` is the parent in at least one entry of ``links``.
    """
    # Only parents are collected: the result describes parents, so the child
    # that owns a non-empty list is deliberately not flagged. (The previous
    # code tried to add an unbound name `child` here, which raised NameError
    # unless a global of that name happened to exist.)
    parent_indices = {
        parent[0]
        for parents_list in links.values()
        for parent in parents_list
    }
    return [i in parent_indices for i in range(len(links))]


In [None]:
def add_error(file, error_type, errors):
    """Record ``file`` under ``error_type`` in ``errors``, without duplicates.

    Parameters
    ----------
    file
        The file to record (compared with ``in`` against the ones already
        stored for this error type).
    error_type : str
        Category used as the key in ``errors``.
    errors : dict
        Maps error type to the list of files recorded for it. Modified in
        place; nothing is returned.
    """
    error_list = errors.setdefault(error_type, [])
    if file not in error_list:
        # Store the argument itself, not whatever global `results_file`
        # happens to be bound to at call time.
        error_list.append(file)

def aggregate_results(key, results_file, aggregated_results, setup, errors):
    """Fold the contents of one results file into ``aggregated_results[key]``.

    The file is loaded with ``utils.load_results`` and iterated as a mapping
    from ``pc_alpha`` to a per-alpha result. For every non-empty per-alpha
    result, the parent flags derived from its ``"links"`` entry and its
    ``"val_matrix"`` entry are appended to the lists kept under
    ``aggregated_results[key][pc_alpha]``.

    Problems are recorded through ``add_error``: ``"not_found"`` when
    ``results_file`` is not an existing file (nothing else happens in that
    case), and ``"is_empty"`` for each per-alpha result of length zero.

    Side effects: ``aggregated_results`` and ``errors`` are modified in
    place, and the module-level ``var_names`` is overwritten with the
    ``"var_names"`` entry of every non-empty per-alpha result. The function
    returns nothing. ``setup`` is accepted but not used.
    """
    # TODO Very uncomfortable way to obtain this data. Should be metadata
    global var_names

    per_alpha = aggregated_results.get(key, dict())
    if not results_file.is_file():
        add_error(results_file, "not_found", errors)
        return

    for pc_alpha, alpha_result in utils.load_results(results_file).items():
        if len(alpha_result) == 0:
            add_error(results_file, "is_empty", errors)
            continue

        bucket = per_alpha.setdefault(pc_alpha, dict())

        # Aggregate parents
        parent_flags = get_parents_from_links(alpha_result["links"])
        bucket.setdefault("parents", list()).append(parent_flags)

        # Aggregate val_matrix
        val_matrix = alpha_result["val_matrix"]
        bucket.setdefault("val_matrix", list()).append(val_matrix)

        var_names = alpha_result["var_names"]

    # Nothing is returned; the caller's `aggregated_results` is updated instead
    aggregated_results[key] = per_alpha


## Load aggregated results

In [None]:
def count_total_variables(var_children, levels):
    """Count the variables to analyse once 3D variables are split by level.

    Each element of ``var_children`` with ``dimensions == 2`` counts once,
    each one with ``dimensions == 3`` counts ``len(levels)`` times, and any
    other value of ``dimensions`` contributes nothing.
    """
    def _contribution(variable):
        if variable.dimensions == 2:
            return 1
        # `levels` is only measured when a 3D variable is actually present
        return len(levels) if variable.dimensions == 3 else 0

    return sum(_contribution(variable) for variable in var_children)


In [None]:
import numpy as np

# Key under which a 3D variable is aggregated: one entry per variable/level.
KEY_PATTERN = "{var_name}-{level}"

# Both dictionaries are filled in place by `aggregate_results` below.
aggregated_results = dict()
errors = dict()

# Totals used for the progress report here and for the error percentages
# printed later.
# NOTE(review): `total_files` multiplies by the number of gridpoints, while
# the "concat" branch requests a single results file per variable/level --
# confirm the totals are meant to be in gridpoint units for that mode.
total_vars = count_total_variables(
    setup.var_children, setup.children_idx_levs)
total_files = total_vars * len(setup.gridpoints)
file_progress = 0
step_progress = 1 # Number of the next progress step to report
step = 5 # Report progress every `step` percent

for child in setup.var_children:
    print(f"Variable: {child.name}")
    
    # 2D variables get a single [level, index] pair built from the last
    # entry of `setup.levels` and are keyed by their bare name; 3D variables
    # iterate over `setup.children_idx_levs` and get one key per level.
    # NOTE(review): any other value of `dimensions` leaves `child_levels`
    # (and possibly `key`) as set by a previous iteration, or undefined on
    # the first one.
    if child.dimensions == 2:
        child_levels = [[setup.levels[-1],0]]
        key = child.name
    elif child.dimensions == 3:
        child_levels = setup.children_idx_levs
    for level in child_levels:      
        if child.dimensions == 3:
            # level[0] is rounded to two decimals to build the key
            key = KEY_PATTERN.format(
                    var_name = child.name,
                    level = round(level[0], 2)
            )
        if setup.analysis == "single":
            # One results file per gridpoint
            for i_grid, (lat, lon) in enumerate(setup.gridpoints):
                results_file = utils.generate_results_filename_single(
                        child, level[1], lat, lon, setup.ind_test_name,
                        setup.experiment, setup.output_file_pattern,
                        setup.output_folder)
                aggregate_results(
                        key, results_file, aggregated_results, setup, errors)
        elif setup.analysis == "concat":
            # One results file for all the gridpoints together.
            # NOTE(review): uses level[-1] where the "single" branch uses
            # level[1]; these are the same element only if every level is a
            # two-item sequence -- confirm.
            results_file = utils.generate_results_filename_concat(
                    child, level[-1], setup.gridpoints, setup.ind_test_name,
                    setup.experiment, setup.output_file_pattern,
                    setup.output_folder)
            aggregate_results(
                    key, results_file, aggregated_results, setup, errors)
        
        # Progress advances by one full set of gridpoints per variable/level
        # and is printed when finished or when the next `step` percent mark
        # has been reached.
        file_progress += len(setup.gridpoints)
        if(file_progress == total_files or
           ((file_progress / total_files * 100) >= step * step_progress)):
            step_progress += 1
            print("Progress: {:.2f}% - {} of {} files".format(
                    file_progress / total_files * 100,
                    file_progress,
                    total_files
                ))


## Print errors found loading aggregated results

In [None]:
# Report, per error type, how many files failed (absolute and relative to
# `total_files`), followed by the list of affected files.
print("ERRORS\n======")
for error_type in errors:
    error_list = errors[error_type]
    n_failed = len(error_list)
    msg = "{}: {} of {} files ({:.2f}%)".format(
            error_type, n_failed, total_files, n_failed/total_files*100)
    print(msg)
    # Underline the summary with a rule of the same width
    print("-" * len(msg))
    for file in error_list:
        print(file)


## Analyze aggregated results

In [None]:
# Configuration

# Minimum fraction of aggregated results in which a variable must be flagged
# as a parent to be kept (alternatives tried: [.2, .3, .5, .6] and [1])
thresholds = [.3, .5]
# Only these pc_alpha values (compared as strings) are analysed
pc_alphas_filter = [str(alpha) for alpha in [.001, .01, .1]]
var_names_np = np.array(var_names)

# Print analysis
# Result layout: child key -> pc_alpha -> str(threshold) -> parent indices
dict_child_parents = dict()
for child, aggregated_pc_alpha in aggregated_results.items():
    print(f"\n{child}")
    dict_pc_alpha_parents = dict()
    for pc_alpha, aggregated in aggregated_pc_alpha.items():
        if pc_alpha not in pc_alphas_filter:
            continue # Skip this pc alpha
        # One row per aggregated result, one column per variable
        parents_matrix = np.array(aggregated["parents"])
        parents_percent = parents_matrix.sum(axis = 0) / parents_matrix.shape[0]
        print(f"pc_alpha = {pc_alpha}")
        dict_threshold_parents = dict()
        for threshold in thresholds:
            parents = [
                i for i, selected in enumerate(parents_percent >= threshold)
                if selected
            ]
            dict_threshold_parents[str(threshold)] = parents
            print(f"* Threshold {threshold}:\t{var_names_np[parents]}")
        dict_pc_alpha_parents[pc_alpha] = dict_threshold_parents
    dict_child_parents[child] = dict_pc_alpha_parents


## Plots

### Plot for each pc_alpha and threshold combination with all children

#### Generate results file for filter combination

In [None]:
# NOTE: Dropping the last name assumes that the list has only one child
var_names_parents = var_names[:-1]
var_names_children = list()

# One entry per "a<pc_alpha>-t<threshold>" combination, holding the "links"
# and "link_matrix" that join every child to its selected parents.
dict_combinations = dict()
len_parents = len(var_names_parents)
len_total = len_parents + len(dict_child_parents)
# Children are numbered right after the parents
for i_child, (child, dict_pc_alpha_parents) in enumerate(
        dict_child_parents.items(), start = len_parents):
    var_names_children.append(child)
    for pc_alpha, dict_threshold_parents in dict_pc_alpha_parents.items():
        for threshold, parents in dict_threshold_parents.items():
            key = f"a{pc_alpha}-t{threshold}"
            combination_results = dict_combinations.setdefault(key, dict())

            # Links: every parent starts without links; the child gets one
            # (parent, 0) entry per selected parent
            links = combination_results.setdefault(
                    "links", {i : [] for i in range(len_parents)})
            links[i_child] = [(parent, 0) for parent in parents]

            # Link matrix: len_total x len_total x 1, True at [parent][child]
            link_matrix = combination_results.setdefault(
                    "link_matrix",
                    [[[False] for _ in range(len_total)]
                         for _ in range(len_total)])
            for parent in parents:
                link_matrix[parent][i_child] = [True]

all_var_names = var_names_parents + var_names_children # Concatenation
# Convert the nested lists to arrays once every child has been added
for combination_results in dict_combinations.values():
    combination_results["link_matrix"] = np.array(combination_results["link_matrix"])


#### Print plots

In [None]:
def recommend_sizes(links):
    """Pick a figure size and node size for plotting ``links``.

    The choice depends on the length of ``find_linked_variables(links)``,
    which is also printed: up to 35 gives ``(16, 16)`` and ``0.15``, up to
    70 gives ``(32, 32)`` and ``0.10``, anything above gives ``(48, 48)``
    and ``0.05``.

    Returns the tuple ``(figsize, node_size)``.
    """
    n_linked_vars = len(find_linked_variables(links))
    print(f"n_linked_vars : {n_linked_vars}")

    # (inclusive upper bound, figsize, node_size), smallest tier first
    tiers = (
        (35, (16, 16), 0.15), # Small
        (70, (32, 32), 0.10), # Medium
    )
    for upper_bound, figsize, node_size in tiers:
        if n_linked_vars <= upper_bound:
            return figsize, node_size
    # Big
    return (48, 48), 0.05

In [None]:
val_matrix = link_width = None # TODO DELETE

# Draw one plot per pc_alpha/threshold combination and save it in the plots
# folder, skipping plots that already exist unless overwriting is enabled.
for combination, combination_results in dict_combinations.items():
    print(combination)
    # Name the plot after the configuration file without its extension.
    # `Path(...).stem` drops only the final suffix (and any directory part);
    # `rsplit(".")` without a maxsplit cut at the FIRST dot instead, so
    # "./cfg.yml" produced an empty name and "cfg.v2.yml" produced "cfg".
    plot_filename = "{cfg}_{combination}.png".format(
            cfg = Path(setup.yml_filename).stem,
            combination = combination
    )
    plot_file = Path(setup.plots_folder, plot_filename)
    if not setup.overwrite and plot_file.is_file():
        print(f"Found file {plot_file}, skipping.")
        continue # Ignore this result
    links = combination_results["links"]
    figsize, node_size = recommend_sizes(links)
    plot_links(
        combination_results["link_matrix"],
        links,
        all_var_names,
        val_matrix = val_matrix,
        link_width = link_width,
        save_name = plot_file,
        figsize = figsize,
        node_size = node_size
    )
