# Create Plots
This notebook creates all plots included in the paper, except for those of the DL experiments. The scripts `LOVO_via_parent_adjustment.py`, `LOVO_applied_to_RCD.py`, and `LOVO_via_LiNGAM.py` must be executed beforehand, since they create the CSV files with the simulation results that this notebook reads.

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import itertools as it
from simulate_data import simulate_data
from scipy.stats import spearmanr

# Global matplotlib styling shared by every figure in this notebook:
# bold 16pt text and bold axes titles.
font = {'weight': 'bold', 'size': 16}
plt.rc('font', **font)
plt.rcParams.update({
    'axes.titleweight': 'bold',
    # Font type 42 makes the PDF/PS backends embed TrueType fonts.
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

## How often are the lemmas successful in excluding edges?

In [None]:
# For every simulated setting, draw two figures from its result CSV:
#   1. the mean number of excluded edges per edge probability, together with a
#      grey reference curve, and
#   2. the number of runs in which no edge was excluded at all.
# Keys are the suffixes of the result files; values are the figure titles.
settings = {
    'PAG': 'Lemma 2 with PAGs',
    'PAG_varying_q': 'Lemma 2 for PAGs (varying q)',
    'ADMG': 'Lemma 2',
    'GX_ADMG_G_DAG': 'Lemma 3',
    'DAG': 'Lemma 7 with DAGs',
    'CPDAG': 'Lemma 7 with CPDAGs'
}
for filename, title in settings.items():
    # The 'ADMG' setting and every setting with a 'q' in its name are plotted
    # over the column 'q'; all other settings are plotted over the column 'p'.
    x_axis_q = filename == 'ADMG' or 'q' in filename
    x_col = 'q' if x_axis_q else 'p'
    x_label = 'Bidirected edge probability' if x_axis_q else 'Directed edge probability'

    csv_path = f'simulation_results_camera_ready/excluded_edges_{filename}.csv'
    try:
        results = pd.read_csv(csv_path)
    except FileNotFoundError:
        # Best effort: settings that were not simulated are skipped, but the
        # skip is reported so that a missing figure is not a silent surprise.
        print(f'Skipping "{title}": {csv_path} not found')
        continue

    # Number of node pairs in a graph with 10 nodes.
    # NOTE(review): assumes all simulations used 10 nodes - confirm.
    nr_potential_edges = math.comb(10, 2)
    grouped_results = results.groupby(x_col).agg(
        mean_excluded_edges=('number_excluded_edges', 'mean'),
        count_no_excluded_edges=('number_excluded_edges', lambda x: (x == 0).sum()),
    ).reset_index()

    # Figure 1: mean number of excluded edges.
    plt.plot(grouped_results[x_col], grouped_results['mean_excluded_edges'], marker='o', label='Mean number of excluded edges')

    # Grey reference curve over the probability range [0.1, 0.9].
    # NOTE(review): the factor (1-0.3) presumably reflects a fixed directed
    # edge probability p=0.3 in the varying-q settings - confirm.
    probabilities = np.linspace(0.1, 0.9, 100)
    if x_axis_q:
        upper_bound = (1-0.3)*(1-probabilities)*nr_potential_edges
    else:
        upper_bound = nr_potential_edges - nr_potential_edges*probabilities
    plt.plot(probabilities, upper_bound, color='grey', zorder=-1)
    plt.xlabel(x_label)
    plt.ylabel('Number of excluded edges')
    plt.title(title, pad=10)
    plt.ylim(-1, 41)
    plt.tight_layout()
    plt.savefig(f'simulation_results_camera_ready/mean_excluded_edges_{filename}.eps')
    plt.show()

    # Figure 2: number of runs in which not a single edge was excluded.
    plt.plot(grouped_results[x_col], grouped_results['count_no_excluded_edges'], marker='o', color='tab:orange')
    plt.xlabel(x_label)
    plt.ylabel('Runs without excluded edges')
    plt.title(title, pad=10)
    plt.ylim(-25, 1000)
    plt.tight_layout()
    plt.savefig(f'simulation_results_camera_ready/no_excluded_edges_{filename}.eps')
    plt.show()

## LOVO via Parent Adjustment

In [None]:
# For every setting, scatter the per-simulation mean LOVO loss against the
# per-simulation mean baseline loss and print a few summary statistics.
# Keys select the result file; values are the figure titles.
settings = {
    'LOVO_via_LiNGAM': 'LOVO via LiNGAM',
    'GX_ADMG_G_DAG': 'Parent adjustment (Lemma 3)',
    'ADMG': 'Parent adjustment (Lemma 2)',
    'CPDAG': 'Path adjustment (CPDAGs)',
    'PAG': 'Path adjustment (PAGs)',
    'DAG': 'Parent adjustment (DAGs)'
}

for setting, title in settings.items():
    # The LiNGAM results are read from a different directory and file name
    # than the parent/path adjustment results.
    is_lingam = setting == 'LOVO_via_LiNGAM'
    if is_lingam:
        results = pd.read_csv('simulation_results_1st_version/LOVO_via_LiNGAM_p=0.3.csv')
    else:
        results = pd.read_csv(f'simulation_results/parent_adj_{setting}.csv')

    # One row per simulation: mean losses plus the number of non-missing
    # LOVO errors that went into the mean.
    results_grouped = results.groupby('sim').agg(
        lovo_error=('lovo_error', 'mean'),
        nr_edges_considered=('lovo_error', 'count'),
        baseline_error=('baseline_error', 'mean'),
    )
    baseline_errors = results_grouped['baseline_error']
    lovo_errors = results_grouped['lovo_error']

    color = 'tab:orange' if is_lingam else 'tab:blue'
    plt.scatter(baseline_errors, lovo_errors, color=color, alpha=.8)
    # Identity line drawn behind the points, over the fixed range [0, 0.23].
    diagonal = np.linspace(0, 0.23, 100)
    plt.plot(diagonal, diagonal, color='black', zorder=-1)
    plt.title(title, pad=10)
    plt.xlabel('Baseline loss')
    plt.ylabel('LOVO loss')
    plt.tight_layout()
    plt.savefig(f'simulation_results/parent_adj_{setting}.eps')
    plt.show()

    # Simulations whose mean LOVO error is missing compare as False in the
    # first statistic and are counted separately in the second one.
    share_lovo_higher = np.mean(lovo_errors > baseline_errors)*100
    share_no_prediction = np.mean(lovo_errors.isna())*100
    mean_edges_considered = np.mean(results_grouped['nr_edges_considered'])
    print(f'LOVO loss higher in {share_lovo_higher} % of the cases')
    print(f'No prediction made in {share_no_prediction} % of the cases')
    print(f'Average number of edges considered {mean_edges_considered}')

## DirectLiNGAM and RCD

In [None]:
# For each causal discovery algorithm:
#   1. per value of n_learn, scatter the mean LOVO loss against the mean
#      baseline loss of every simulation and print summary statistics;
#   2. scatter the mean LOVO loss against SHD and against 'edge_exists' and
#      report the Spearman rank correlation.
for alg in ['RCD', 'DirectLiNGAM']:
    # Each algorithm has its own result file, identified by graph size and
    # edge probability.
    p = 0.5 if alg == 'DirectLiNGAM' else 0.3
    nr_nodes = 10 if alg == 'DirectLiNGAM' else 5
    results = pd.read_csv(f"simulation_results_camera_ready/{alg}_nr_nodes={nr_nodes}_p={p}_reps=1000_varying_nlearn.csv")

    # Drop rows without a LOVO error, then average per (simulation, n_learn).
    results_grouped = (
        results[results['lovo_error'].notna()]
        .groupby(['simulation', 'n_learn'])
        .agg({'lovo_error': 'mean', 'baseline_error': 'mean', 'SHD': 'mean', 'edge_exists': 'mean'})
    )

    # The identity line spans the largest loss over ALL n_learn values, so it
    # is the same in every figure and only needs to be computed once.
    # `default` keeps an empty result set from raising here.
    max_error = max(
        list(results_grouped['lovo_error']) + list(results_grouped['baseline_error']),
        default=0.0,
    )
    dotted_line = np.linspace(0, max_error, 100)

    for n_learn in results['n_learn'].unique():
        # Select all simulations for this n_learn (second index level).
        results_grouped_n_learn = results_grouped.loc(axis=0)[:, n_learn]
        plt.scatter(results_grouped_n_learn['baseline_error'], results_grouped_n_learn['lovo_error'], alpha=.8, s=20, color='tab:orange')
        plt.plot(dotted_line, dotted_line, color='black', zorder=-1)
        plt.xlabel('Baseline loss')
        plt.ylabel('Parent adjustment LOVO loss')
        plt.title(f'{alg}, n_learn={n_learn}', pad=10)
        plt.tight_layout()
        plt.savefig(f'simulation_results_camera_ready/{alg}_{n_learn}.eps')
        plt.show()

        # The statistics are computed outside the f-strings: reusing the
        # enclosing quote character inside a replacement field is a
        # SyntaxError before Python 3.12.
        # NOTE(review): the mean SHD is halved; the reason is not visible in
        # this notebook - confirm.
        mean_shd = np.mean(results[results['n_learn'] == n_learn]['SHD'])/2
        share_lovo_higher = np.mean(results_grouped_n_learn['lovo_error'] > results_grouped_n_learn['baseline_error'])*100
        print(f'SHD: {mean_shd}')
        print(f'LOVO loss higher in {share_lovo_higher} % of the cases')
        # 1000 is the number of repetitions encoded in the file name (reps=1000).
        print(f'No prediction made in {(1000 - len(results_grouped_n_learn))/10} % of the cases')

    for measure in ['SHD', 'edge_exists']:
        # Compute the rank correlation once; it is used for both the title
        # and the printed summary.
        correlation = spearmanr(results_grouped[measure], results_grouped['lovo_error'])
        rho, pval = correlation
        lovo_errors = results_grouped['lovo_error']
        plt.figure(figsize=(5.5, 5.5))
        if alg == 'DirectLiNGAM':
            plt.ylim(top=0.21)
            plt.yticks(np.linspace(0, 0.2, 5))
        plt.scatter(results_grouped[measure], lovo_errors, alpha=.8, color='tab:orange')
        plt.xlabel('Edge exists' if measure == 'edge_exists' else 'SHD')
        plt.ylabel('LOVO loss')
        plt.title(f"{alg}, $\\rho$={rho:.2f}", pad=10)
        plt.tight_layout()
        plt.savefig(f'simulation_results_camera_ready/{alg}_corr_{measure}.eps')
        plt.show()
        print(f'{alg}: Correlation of {measure} with lovo error is', correlation)


## Self-Compatibility

In [None]:
from scipy.stats import kendalltau

# For each algorithm, scatter the self-compatibility score against the
# difference between LOVO error and baseline error, and report rank
# correlations between the score and the LOVO error.
for df in ('DirectLiNGAM_nr_nodes=10_p=0.5_compatibility_varying_nlearn.csv',
           'RCD_nr_nodes=5_p=0.3_compatibility_varying_nlearn.csv'):
    # The algorithm name is the part of the file name before the first '_'.
    alg_name = df.partition('_')[0]

    # Only rows without any missing value are used.
    data = pd.read_csv(f'simulation_results/{df}').dropna()
    data['lovo_loss'] = data['lovo_error'] - data['baseline_error']

    # NOTE(review): rho is computed on 'lovo_error', while the scatter below
    # shows 'lovo_loss' (LOVO error minus baseline error) - confirm that the
    # title is meant to report the correlation with the raw LOVO error.
    rho, pval = spearmanr(data['graphical_sc'], data['lovo_error'])
    print(rho, pval)

    plt.scatter(data['graphical_sc'], data['lovo_loss'])
    plt.title(f"{alg_name}, $\\rho$={rho:.2f}", pad=10)
    plt.xlabel("Self-compatibility score")
    plt.ylabel("Lovo loss")
    plt.tight_layout()
    plt.savefig(f'simulation_results/comparison_self_compatibility_{alg_name}.eps')
    plt.show()

    # Kendall rank correlation matrix of score and LOVO error.
    kendall_corr = data[['graphical_sc', 'lovo_error']].corr(method='kendall')
    print(kendall_corr)