# Final Synthetic and Real results

This notebook will calculate the final results for the synthetic and real data.

The datasets are:
1. CE-Cha
2. CE-Multi
3. CE-Net
4. CE-Gauss
5. CE-Tueb
 

In [1]:
WORKDIR = "/vol/bitbucket/ad6013/Research/gp-causal"

In [2]:
import sys
sys.path.append(WORKDIR)
import dill
import numpy as np
from sklearn.metrics import roc_auc_score
from data import get_data
from utils import BEST_SCORES
from utils import return_all_scores, return_best_causal_scores

In [3]:
def return_pairs(data_name):
    synth_data_names = ["cha", "multi", "net", "gauss"]
    if data_name in synth_data_names:
        data_get = getattr(get_data, f"get_{data_name}_pairs_dataset")
        x, y, weight, target = data_get(data_path=f"{WORKDIR}/data/{data_name}_pairs/files")
    elif data_name == "tueb":
        data_get = getattr(get_data, f"get_tubingen_pairs_dataset")
        x, y, weight, target = data_get(data_path=f"{WORKDIR}/data/pairs/files")
    else:
        full_data_name = synth_data_names.append("tueb")
        raise ValueError(f"data_name variable must be in {full_data_name}")
    return x, y, weight, target
    

In [4]:
# File types for the results
# The files may have different formats of results saved
# For some datasets, there are multiple result files with different names
# these will be returned as a list and the best results chosen 
# Indicator next to file will indicate what kind of format the results are
# stored in

def cha_result_files():
    file_1 = [
        f"fullscore-cha_pairs-gplvm-reinit20-numind200_start:{i}_end:{i+150}.p"
        for i in np.linspace(0, 150, 2, dtype=int)
    ]
    file_2 = [
        f"fullscore-cha_pairs-gplvmgeneralised-reinit10-numind200_start:{i}_end:{i+20}.p"
        for i in np.linspace(0, 280, 15, dtype=int)
     ]
    return [(file_1, 0), (file_2, 1)]


def multi_result_files():
    files_1 = [
        f"fullscore-multi_pairs-gplvm-reinit20-numind200_start:{i}_end:{i+20}.p"
        for i in np.linspace(0, 280, 15, dtype=int)
    ]
    files_2 = [
        f"fullscore-multi_pairs-gplvm_adam-reinit2-numind200_start:{i}_end:{i+20}.p"
        for i in np.linspace(0, 280, 15, dtype=int)
    ]
    return [(files_2, 0), (files_2, 0)]


def net_result_files():
    files = [
        f"fullscore-net_pairs-gplvm-reinit20-numind200_start:{i}_end:{i+20}.p"
        for i in np.linspace(0, 280, 15, dtype=int)
    ]
    return [(files, 0)]


def gauss_result_files():
    files_1 = [
        f"fullscore-gauss_pairs-gplvm-reinit20-numind200_start:{i}_end:{i+20}.p"
        for i in np.linspace(0, 280, 15, dtype=int)
    ]
    files_2 = [
        f"fullscore-gauss_pairs-gplvm_adam-reinit2-numind200_start:{i}_end:{i+20}.p"
        for i in np.linspace(0, 280, 15, dtype=int)
    ]
    return [(files_1, 0), (files_2, 0)]


def tueb_result_files():
    files = [
        f"fullscore-cep-gplvmgeneralised-reinit20-numind200_start:{i}_end:{i+5}.p"
        for i in np.linspace(0, 100, 21, dtype=int)
    ]
    return [(files, 1)]

In [5]:
# Functions to put the results in the right format
def extract_file(file_name):
    with open(f"{WORKDIR}/results/{file_name}", "rb") as f:
        result = dill.load(f)
    return result


def find_best_scores_tuples(all_files: list):
    all_best_scores = {}
    # Need a file counter so that the keys are the correct number
    file_counter = 0 
    for file in all_files:
        result = extract_file(file)
        for idx, scores in enumerate(result["scores"]):
            best_scores = BEST_SCORES(
                scores[0][0], scores[0][1], scores[1][0], scores[1][1]
            )
            all_best_scores[idx + file_counter] = best_scores
        # Need to find the index where the file ends
        ending_idx = file[:-2].split(':')[-1]
        file_counter = int(ending_idx)
    return all_best_scores
            

def find_best_scores_from_dict(all_files: list):
    all_results = {}
    for file in all_files:
        result = extract_file(file)
        all_results.update(result["final_scores"])

    all_x, all_y_x, all_y, all_x_y = return_all_scores(all_results)
    best_scores = return_best_causal_scores(all_x, all_y_x, all_y, all_x_y)
    return best_scores


def choose_best_scores(score_dict: dict):
    """Given multiple best score dicts, this will choose the best scores among
    them.

    Args:
        dict (dict): Values are dicts with value of form BEST_SCORES 
        which is a names tuple with arguements 'best_loss_x best_loss_y_x 
        best_loss_y best_loss_x_y'.
    """
    if len(score_dict.keys()) == 1:
        return score_dict[0]
    else:
        num_datasets = len(score_dict[0].keys())
        final_best_score = {}
        # For each run, find the best score
        for run_idx in range(num_datasets):
            all_dicts_this_run = [score_dict[i][run_idx] for i in list(score_dict.keys())]
            x_scores = [d_loop.best_loss_x for d_loop in all_dicts_this_run]
            y_x_scores = [d_loop.best_loss_y_x for d_loop in all_dicts_this_run]
            y_scores = [d_loop.best_loss_y for d_loop in all_dicts_this_run]
            x_y_scores = [d_loop.best_loss_x_y for d_loop in all_dicts_this_run]
            best_scores_this_run = BEST_SCORES(
                min(x_scores), min(y_x_scores), min(y_scores), min(x_y_scores)
            )
            final_best_score[run_idx] = best_scores_this_run
        return final_best_score


def get_final_scores_from_best_scores(best_scores):
    total_runs = len(list(best_scores.keys()))
    y_scores = {}
    y_scores_array = np.zeros(total_runs)
    for idx, run_idx in enumerate(list(best_scores.keys())):
        current_run = best_scores[run_idx]
        causal_score = current_run.best_loss_x + current_run.best_loss_y_x
        anticausal_score = current_run.best_loss_y + current_run.best_loss_x_y
        y_scores[run_idx] = - causal_score + anticausal_score
        y_scores_array[idx] = - causal_score + anticausal_score 
    return y_scores, y_scores_array


def shuffle_and_find_auc(num_shuffles):
    pass


def calculate_auc(target, pred_scores):
    # Need to make sure that the classes are evenly balanced
    # TODO: Need to make sure the classes are balanced
    balance = np.sum(target)
    return roc_auc_score(target, pred_scores)


def return_auc_results(data_name):
    # Check the data name is correct
    all_data_names = ["cha", "multi", "net", "gauss", "tueb"]
    if data_name not in all_data_names:
        raise ValueError(f"data_name is not correct. Must be one of {all_data_names}")
    # Get the result files
    all_result_files = eval(f"{data_name}_result_files")()
    # processing of result file will be different for different data
    # if the number of files is more than 1, we need to select the best one
    best_score_dict = {}
    for idx, result_files in enumerate(all_result_files):
        actual_files, indicator = result_files
        if indicator == 0:
            best_scores = find_best_scores_tuples(all_files=actual_files)
        else:
            best_scores = find_best_scores_from_dict(all_files=actual_files)
        best_score_dict[idx] = best_scores
    final_best_score = choose_best_scores(best_score_dict)
    # Find the final score by adding the causla and anticausal scores
    _, final_scores = get_final_scores_from_best_scores(best_scores=final_best_score)
    _, _, _, target =  return_pairs(data_name=data_name)
    auc = calculate_auc(target=target, pred_scores=final_scores)
    return auc, final_scores, target

In [6]:
# Multi is missing a run (run number 199 due to NaNs)
for dn in ["multi"]:
    auc, score, target = return_auc_results(data_name=dn)
    print(f"{dn}: {auc}")

KeyError: 199

In [79]:
y_pred = np.zeros_like(score)
for i in range(len(score)):
    if score[i] < 0:
        y_pred[i] = -1
    else:
        y_pred[i] = 1

In [89]:
target = np.ones(99)

In [91]:
(target == y_pred).mean()

0.6868686868686869

In [88]:
y_pred.shape

(99,)

In [87]:
target.shape

(108,)

In [56]:
cha_scores_1[45]

-3227.9546961455067

In [57]:
cha_scores_2[45]

1434.5928820609242

In [58]:
cha_scores_1[45]

-3227.9546961455067