# Final Synthetic and Real results

This notebook computes the final ROC AUC results for the synthetic and real cause-effect pair datasets.

The datasets are:
1. CE-Cha
2. CE-Multi
3. CE-Net
4. CE-Gauss
5. CE-Tueb
 

In [1]:
# Absolute path of the project root. It is appended to sys.path before the
# `data` and `utils` imports, and it is the base directory for the data and
# result files read by the functions below.
# NOTE(review): machine-specific path; adjust it when running elsewhere.
WORKDIR = "/vol/bitbucket/ad6013/Research/gp-causal"

In [2]:
import sys
sys.path.append(WORKDIR)
import dill
import numpy as np
from sklearn.metrics import roc_auc_score
from data import get_data
from tqdm import trange
from utils import BEST_SCORES
from utils import return_all_scores, return_best_causal_scores

In [3]:
def return_pairs(data_name):
    """Load the cause-effect pairs of one dataset.

    Args:
        data_name (str): One of "cha", "multi", "net", "gauss" or "tueb".

    Returns:
        tuple: ``(x, y, weight, target)`` exactly as returned by the matching
        loader in ``get_data``.

    Raises:
        ValueError: If ``data_name`` is not one of the supported names.
    """
    synth_data_names = ["cha", "multi", "net", "gauss"]
    if data_name in synth_data_names:
        # Synthetic loaders follow the naming scheme get_<name>_pairs_dataset.
        data_get = getattr(get_data, f"get_{data_name}_pairs_dataset")
        x, y, weight, target = data_get(data_path=f"{WORKDIR}/data/{data_name}_pairs/files")
    elif data_name == "tueb":
        data_get = get_data.get_tubingen_pairs_dataset
        x, y, weight, target = data_get(data_path=f"{WORKDIR}/data/pairs/files")
    else:
        # list.append returns None, so build the full list by concatenation to
        # get a meaningful error message.
        full_data_name = synth_data_names + ["tueb"]
        raise ValueError(f"data_name variable must be in {full_data_name}")
    return x, y, weight, target
    

In [4]:
# Result files for each dataset.
# Result files may be saved in different formats, and some datasets have
# several sets of result files under different names. Each function below
# returns a list of (file_names, format_indicator) tuples; when more than one
# set is returned, the best scores across the sets are chosen.
# The indicator next to each file list identifies the format in which the
# results are stored.

def cha_result_files():
    """Return the result-file groups used for the "cha" dataset.

    Returns:
        list: A single ``(file_names, indicator)`` tuple holding the two
        ``gplvm`` files (150 runs each) with indicator 0.
    """
    gplvm_files = []
    for start in (0, 150):
        gplvm_files.append(
            f"fullscore-cha_pairs-gplvm-reinit20-numind200_start:{start}_end:{start+150}.p"
        )
    # Alternative "gplvmgeneralised" run; built for reference but not returned.
    generalised_files = [
        f"fullscore-cha_pairs-gplvmgeneralised-reinit10-numind200_start:{start}_end:{start+20}.p"
        for start in range(0, 300, 20)
    ]
    selected_groups = [(gplvm_files, 0)]
    return selected_groups


def multi_result_files():
    """Return the result-file groups used for the "multi" dataset.

    Returns:
        list: Two ``(file_names, indicator)`` tuples, the ``gplvm_adam`` run
        first and the ``gplvm`` run second, both with indicator 0. Each run
        is split into 15 files of 20 runs.
    """
    chunk_starts = range(0, 300, 20)
    gplvm_files = [
        f"fullscore-multi_pairs-gplvm-reinit20-numind200_start:{start}_end:{start+20}.p"
        for start in chunk_starts
    ]
    adam_files = [
        f"fullscore-multi_pairs-gplvm_adam-reinit2-numind200_start:{start}_end:{start+20}.p"
        for start in chunk_starts
    ]
    return [(adam_files, 0), (gplvm_files, 0)]


def net_result_files():
    """Return the result-file groups used for the "net" dataset.

    Returns:
        list: A single ``(file_names, indicator)`` tuple with the 15 ``gplvm``
        files (20 runs each) and indicator 0.
    """
    gplvm_files = []
    for start in range(0, 300, 20):
        gplvm_files.append(
            f"fullscore-net_pairs-gplvm-reinit20-numind200_start:{start}_end:{start+20}.p"
        )
    return [(gplvm_files, 0)]


def gauss_result_files():
    """Return the result-file groups used for the "gauss" dataset.

    Returns:
        list: A single ``(file_names, indicator)`` tuple with the 15
        ``gplvmgeneralised`` files (20 runs each) and indicator 1.
    """
    chunk_starts = list(range(0, 300, 20))
    # The "gplvm" and "gplvm_adam" runs are built for reference only; they are
    # not part of the returned selection.
    gplvm_files = [
        f"fullscore-gauss_pairs-gplvm-reinit20-numind200_start:{start}_end:{start+20}.p"
        for start in chunk_starts
    ]
    adam_files = [
        f"fullscore-gauss_pairs-gplvm_adam-reinit2-numind200_start:{start}_end:{start+20}.p"
        for start in chunk_starts
    ]
    generalised_files = [
        f"fullscore-gauss_pairs-gplvmgeneralised-reinit20-numind200_start:{start}_end:{start+20}.p"
        for start in chunk_starts
    ]
    return [(generalised_files, 1)]


def tueb_result_files():
    """Return the result-file groups used for the "tueb" dataset.

    Returns:
        list: A single ``(file_names, indicator)`` tuple with the 21
        ``reinit10`` files (5 runs each) and indicator 1.
    """
    chunk_starts = range(0, 105, 5)
    # The "reinit20" run is built for reference only and is not returned.
    reinit20_files = [
        f"fullscore-cep-gplvmgeneralised-reinit20-numind200_start:{start}_end:{start+5}.p"
        for start in chunk_starts
    ]
    reinit10_files = [
        f"fullscore-cep-gplvmgeneralised-reinit10-numind200_start:{start}_end:{start+5}.p"
        for start in chunk_starts
    ]
    return [(reinit10_files, 1)]

In [5]:
# Functions to put the results in the right format
def extract_file(file_name):
    """Load one saved result object from the ``results`` folder under WORKDIR.

    Args:
        file_name (str): Name of the file inside ``{WORKDIR}/results``.

    Returns:
        The object deserialised by ``dill.load``.
    """
    result_path = f"{WORKDIR}/results/{file_name}"
    # NOTE: dill deserialisation can execute arbitrary code; only load result
    # files from a trusted source.
    with open(result_path, "rb") as handle:
        return dill.load(handle)


def find_best_scores_tuples(all_files: list):
    """Collect per-run best scores from result files that store score tuples.

    Every file is loaded with ``extract_file`` and its ``"scores"`` entry is
    read as a sequence with one item per run. Each item is indexed as
    ``scores[0][0], scores[0][1], scores[1][0], scores[1][1]`` and those four
    values are passed positionally to ``BEST_SCORES``.

    Args:
        all_files (list): File names ending in ``.p`` whose last
            colon-separated field is the end run index, e.g.
            ``..._start:20_end:40.p``.

    Returns:
        dict: Maps the global run index to the ``BEST_SCORES`` of that run.
    """
    all_best_scores = {}
    # End index of the previously processed file. It is only used as the key
    # offset for file names that carry no "_start:" marker.
    previous_end = 0
    for file in all_files:
        result = extract_file(file)
        name_stem = file[:-2]  # drop the ".p" extension
        if "_start:" in name_stem:
            # Use the start index encoded in this file's own name so the keys
            # stay correct even if files are passed out of order or with gaps.
            first_run = int(name_stem.rsplit("_start:", 1)[1].split("_", 1)[0])
        else:
            first_run = previous_end
        for idx, scores in enumerate(result["scores"]):
            all_best_scores[first_run + idx] = BEST_SCORES(
                scores[0][0], scores[0][1], scores[1][0], scores[1][1]
            )
        previous_end = int(name_stem.split(":")[-1])
    return all_best_scores
            

def find_best_scores_from_dict(all_files: list):
    """Collect per-run best scores from result files that store a score dict.

    The ``"final_scores"`` dicts of all files are merged into one dict (later
    files overwrite earlier ones on equal keys), which is then passed through
    ``return_all_scores`` and ``return_best_causal_scores``.

    Args:
        all_files (list): Names of the result files to merge.

    Returns:
        Whatever ``return_best_causal_scores`` returns for the merged scores.
    """
    merged_scores = {}
    for file_name in all_files:
        merged_scores.update(extract_file(file_name)["final_scores"])

    scores_x, scores_y_x, scores_y, scores_x_y = return_all_scores(merged_scores)
    return return_best_causal_scores(scores_x, scores_y_x, scores_y, scores_x_y)


def choose_best_scores(score_dict: dict):
    """Given multiple best score dicts, choose the best scores among them.

    Args:
        score_dict (dict): One entry per result set. Each value is a dict
            mapping a run index to a BEST_SCORES named tuple with fields
            'best_loss_x best_loss_y_x best_loss_y best_loss_x_y'.

    Returns:
        dict: If there is a single result set, that set itself. Otherwise a
        new dict mapping each run index of the reference set (the entry under
        key 0 if present, else the first entry) to a BEST_SCORES holding the
        field-wise minimum over all result sets that contain that run.

    Raises:
        ValueError: If ``score_dict`` is empty.
    """
    if not score_dict:
        raise ValueError("score_dict must contain at least one set of best scores")
    result_sets = list(score_dict.values())
    # Do not assume the sets are keyed 0..n-1; fall back to insertion order.
    reference = score_dict[0] if 0 in score_dict else result_sets[0]
    if len(result_sets) == 1:
        return reference

    final_best_score = {}
    # For each run, find the best (lowest) score of every loss term
    for run_idx in reference:
        # A run can be missing from some result sets (e.g. a failed run), so
        # only compare the sets that actually contain it.
        candidates = [scores[run_idx] for scores in result_sets if run_idx in scores]
        final_best_score[run_idx] = BEST_SCORES(
            min(c.best_loss_x for c in candidates),
            min(c.best_loss_y_x for c in candidates),
            min(c.best_loss_y for c in candidates),
            min(c.best_loss_x_y for c in candidates),
        )
    return final_best_score


def get_final_scores_from_best_scores(best_scores):
    """Turn per-run best losses into one decision score per run.

    For every run the score is ``-(best_loss_x + best_loss_y_x) +
    (best_loss_y + best_loss_x_y)``, i.e. the anticausal total minus the
    causal total, so it is positive when the causal total is lower.

    Args:
        best_scores (dict): Maps run index to an object with the attributes
            ``best_loss_x``, ``best_loss_y_x``, ``best_loss_y`` and
            ``best_loss_x_y``.

    Returns:
        tuple: ``(scores_by_run, scores_array)`` where the dict is keyed by
        run index and the array follows the dict's iteration order.
    """
    y_scores = {}
    y_scores_array = np.zeros(len(best_scores))
    for position, (run_idx, run) in enumerate(best_scores.items()):
        causal_total = run.best_loss_x + run.best_loss_y_x
        anticausal_total = run.best_loss_y + run.best_loss_x_y
        difference = -causal_total + anticausal_total
        y_scores[run_idx] = difference
        y_scores_array[position] = difference
    return y_scores, y_scores_array


def balance_for_auc(target, pred_scores):
    """Return copies of the labels and scores with (near) balanced classes.

    Labels are expected in {-1, 1}. When they do not sum to zero,
    ``abs(sum) // 2`` randomly chosen examples of the majority class get both
    their label and their score multiplied by -1. The inputs are left
    untouched.

    Args:
        target (np.ndarray): Labels in {-1, 1}.
        pred_scores (np.ndarray): Scores aligned with ``target``.

    Returns:
        tuple: ``(final_target, final_pred_scores)``; the labels sum to 0 for
        an even imbalance and to +/-1 for an odd one.
    """
    balance = int(np.sum(target))
    switch_idx = None
    if balance != 0:
        # Negative balance -> negatives are the majority, otherwise positives.
        majority_mask = (target < 0) if balance < 0 else (target > 0)
        # Flip half of the surplus so that the classes even out.
        switch_idx = np.random.choice(
            np.where(majority_mask)[0], size=abs(balance) // 2, replace=False
        )

    final_target = target.copy()
    final_pred_scores = pred_scores.copy()
    if switch_idx is not None:
        final_target[switch_idx] *= -1
        final_pred_scores[switch_idx] *= -1

    # An odd imbalance cannot be removed completely; one example is left over.
    expected_residual = 0 if (balance % 2) == 0 else 1
    assert np.abs(np.sum(final_target)) == expected_residual
    return final_target, final_pred_scores
    

def calculate_auc(target, pred_scores, num_shuffles=1000):
    """Average the ROC AUC over random sign flips of the labelled pairs.

    For every shuffle, a random half of the examples gets both its label and
    its score multiplied by -1, the classes are evened out with
    ``balance_for_auc`` and ``roc_auc_score`` is evaluated. Every shuffle
    starts from the original inputs, which are never modified.

    Args:
        target (array-like): Labels in {-1, 1}.
        pred_scores (array-like): Scores aligned with ``target``.
        num_shuffles (int): Number of random shuffles to average over.

    Returns:
        float: Mean ROC AUC over all shuffles.

    Raises:
        ValueError: If ``target`` and ``pred_scores`` differ in length.
    """
    base_target = np.asarray(target)
    base_scores = np.asarray(pred_scores)
    total_runs = len(base_target)
    if len(base_scores) != total_runs:
        raise ValueError(
            f"target has {total_runs} entries but pred_scores has {len(base_scores)}"
        )

    auc_all = []
    for _ in trange(num_shuffles):
        flip_idx = np.random.choice(np.arange(total_runs), total_runs // 2, replace=False)
        # Work on copies so the caller's arrays are not sign-flipped as a
        # side effect and flips do not accumulate across shuffles.
        shuffled_target = base_target.copy()
        shuffled_scores = base_scores.copy()
        shuffled_target[flip_idx] *= -1
        shuffled_scores[flip_idx] *= -1
        # Need to make sure that the classes are evenly balanced
        final_target, final_pred_scores = balance_for_auc(shuffled_target, shuffled_scores)
        auc_all.append(roc_auc_score(final_target, final_pred_scores))
    return np.mean(auc_all)


def return_auc_results(data_name):
    """Compute the AUC of the saved results for one dataset.

    Args:
        data_name (str): One of "cha", "multi", "net", "gauss" or "tueb".

    Returns:
        tuple: ``(auc, final_scores, target)`` with the mean AUC from
        ``calculate_auc``, the per-run decision scores and the labels they
        were evaluated against.

    Raises:
        ValueError: If ``data_name`` is not a supported dataset name.
    """
    # Check the data name is correct
    all_data_names = ["cha", "multi", "net", "gauss", "tueb"]
    if data_name not in all_data_names:
        raise ValueError(f"data_name is not correct. Must be one of {all_data_names}")
    # Explicit dispatch table instead of eval() on a built string.
    result_file_getters = {
        "cha": cha_result_files,
        "multi": multi_result_files,
        "net": net_result_files,
        "gauss": gauss_result_files,
        "tueb": tueb_result_files,
    }
    all_result_files = result_file_getters[data_name]()
    # Processing of the result files depends on the format indicator; if there
    # is more than one set of files, the best scores across sets are selected.
    best_score_dict = {}
    for idx, (actual_files, indicator) in enumerate(all_result_files):
        if indicator == 0:
            best_scores = find_best_scores_tuples(all_files=actual_files)
        else:
            best_scores = find_best_scores_from_dict(all_files=actual_files)
        best_score_dict[idx] = best_scores
    final_best_score = choose_best_scores(best_score_dict)
    # Find the final score by combining the causal and anticausal scores
    _, final_scores = get_final_scores_from_best_scores(best_scores=final_best_score)
    if data_name != "tueb":
        _, _, _, target = return_pairs(data_name=data_name)
    else:
        # NOTE(review): hard-coded to 99 all-positive labels; presumably the
        # number of evaluated pairs with every pair oriented the same way -
        # confirm against the saved results.
        target = np.ones(99)
    # Pass copies so the returned scores and labels cannot be modified by the
    # random sign flips performed during the AUC computation.
    auc = calculate_auc(target=np.array(target), pred_scores=np.array(final_scores))
    return auc, final_scores, target

In [6]:
# Multi is missing a run (run number 199 due to NaNs)
# Compute and print the AUC of every dataset. After the loop, `auc`, `score`
# and `target` hold the values of the last dataset ("tueb").
# NOTE(review): the AUC averages over random flips drawn from NumPy's global
# RNG and no seed is set in the visible cells, so the printed values can vary
# slightly between runs.
for dn in ["cha", "multi", "net", "gauss", "tueb"]:
    auc, score, target = return_auc_results(data_name=dn)
    print(f"{dn}: {auc}")

100%|██████████| 1000/1000 [00:01<00:00, 735.89it/s]


cha: 0.8191207111111112


100%|██████████| 1000/1000 [00:01<00:00, 702.15it/s]


multi: 0.9771990666666666


100%|██████████| 1000/1000 [00:01<00:00, 700.32it/s]


net: 0.9885765777777777


2023-01-20 10:10:54.875928: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-20 10:10:54.889659: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-20 10:10:54.891285: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-01-20 10:10:54.893119: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

gauss: 0.8803990666666666


100%|██████████| 1000/1000 [00:00<00:00, 1462.70it/s]

tueb: 0.7828681632653061





In [7]:
# Load one raw "multi" result file (the gplvm chunk named start:180_end:200)
# for manual inspection.
# NOTE(review): presumably the chunk containing the missing run 199 - confirm.
with open(f"{WORKDIR}/results/fullscore-multi_pairs-gplvm-reinit20-numind200_start:180_end:200.p", "rb") as f:
    result = dill.load(f)

In [8]:
# Display the raw per-run score tuples stored in the loaded file.
result['scores']

[((2128.4078288203445, 1510.6619911914863),
  (1919.335969284949, 1653.3660872558987)),
 ((2128.4078162017154, 1394.383803719083),
  (1909.3119971328852, 1554.9223575693181)),
 ((1928.258421960225, -443.6584134761615),
  (33.3228250361326, 1499.022160586228)),
 ((1683.388408728003, 1055.2079998635677),
  (2128.4078190196906, 261.9700739423107)),
 ((2128.4078221497507, -125.04770111370397),
  (2128.4078481815973, -294.26330495430466)),
 ((2007.419390350712, 1559.8575973586412),
  (1889.0740192001977, 1896.1966678291365)),
 ((2128.4078342848843, -213.3390138456706),
  (1718.967175075185, 409.03829022199216)),
 ((2090.0999416530885, 607.4099149729673),
  (1889.2802452300093, 753.1940218845889)),
 ((2128.407815342783, -47.218483020004896),
  (338.35037724327685, 1819.9324041433656)),
 ((370.28288897569746, 1684.404157236906),
  (1934.5992372447909, -241.08342386278628)),
 ((1932.6792273422443, -153.16066690662637),
  (1451.1078732045626, 363.5866427548176)),
 ((2085.171991545442, 1760.5147