# Results of the tests on simulated data from random SCMs

### Imports

In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

## Utils functions

In [2]:
# Opaque line colours, indexed by trace/series position in the figures below.
color_list = [
    "rgb(238,44,44)",
    "rgb(0,0,205)",
    "rgb(127,255,0)",
    "rgb(191,62,255)",
    "rgb(255,20,147)",
    "rgb(255,48,48)",
    "rgb(255,215,0)",
    "rgb(0,0,255)",
]

# Same palette with an alpha of 0.2, used as fill colour for the std bands.
transparent_color_list = [
    opaque.replace("rgb(", "rgba(").replace(")", ",0.2)")
    for opaque in color_list
]

In [3]:
def compute_stats_for_line_plot_paper(results_df, features_to_group_list, column_of_interest):
    """Compute mean, mean +/- std and median of one column per group.

    Only ``column_of_interest`` is aggregated, so other columns of
    ``results_df`` (including non-numeric ones) are ignored instead of being
    pushed through the numeric aggregations.

    Args:
        results_df (DataFrame): data table
        features_to_group_list (list): List of columns to group by
        column_of_interest (str): Column for which to compute stats

    Returns:
        DataFrame: one row per group with the grouping columns followed by
        ``<column>_mean``, ``<column>_upper`` (mean + std),
        ``<column>_lower`` (mean - std) and ``<column>_median``.
    """
    group_keys = list(features_to_group_list)

    # String aggregator names select the pandas implementations directly
    # (std is the sample standard deviation, as with the np.std callable in agg).
    stats_df = (
        results_df.groupby(group_keys)[column_of_interest]
        .agg(['mean', 'std', 'median'])
        .reset_index()
    )

    stats_df[column_of_interest + '_mean'] = stats_df['mean']
    stats_df[column_of_interest + '_upper'] = stats_df['mean'] + stats_df['std']
    stats_df[column_of_interest + '_lower'] = stats_df['mean'] - stats_df['std']
    stats_df[column_of_interest + '_median'] = stats_df['median']

    output_columns = group_keys + [
        column_of_interest + '_mean',
        column_of_interest + '_upper',
        column_of_interest + '_lower',
        column_of_interest + '_median',
    ]
    return stats_df[output_columns]

In [4]:
# Specific functions for Variance treatment

def my_remove(liste, elmt):
    """Remove all occurrences of elmt in liste (in place).

    Args:
        liste (list): list from which to remove elmt; it is modified in place
        elmt (any): element to remove

    Returns:
        list: the same list object, without elmt
    """
    # Slice assignment keeps the caller's list object while filtering in a
    # single pass, instead of rescanning the list for every removal.
    # The identity check mirrors the matching rule used by `in` / `list.remove`.
    liste[:] = [item for item in liste if not (item is elmt or item == elmt)]
    return liste



def _parse_variance_array(value):
    """Turn one stored variance entry into a flat float numpy array.

    Strings are expected to look like a printed array (e.g. "[0.1 0.2\\n 0.3]"):
    brackets are dropped and the remaining text is split on any whitespace.
    Lists have their empty-string items dropped (without mutating the list).
    Anything else is handed to numpy as is.
    """
    if isinstance(value, str):
        tokens = value.replace('[', ' ').replace(']', ' ').split()
        return np.array(tokens).astype(float)
    if isinstance(value, list):
        kept = [item for item in value if not (isinstance(item, str) and item == '')]
        return np.array(kept).astype(float)
    return np.array(value).astype(float)


def _add_relative_delta(df, name, compared_col, reference_col):
    """Add (compared - reference) / reference as column `name`, plus its per-row mean and median."""
    relative_delta = (df[compared_col] - df[reference_col]) / df[reference_col]
    df[name] = relative_delta
    df[name + '_mean'] = relative_delta.apply(lambda values: np.mean(values))
    df[name + '_median'] = relative_delta.apply(lambda values: np.median(values))
    return df


def process_variance_features(df_input):
    """Process variance features. From list of variances to relative difference of variance computed among all variances in the list

    The columns 'variance_train_NOT_weighted', 'variance_aug_NOT_weighted' and
    'variance_aug_weighted' are parsed into float arrays; the relative difference
    of each augmented variance with respect to the train variance is then added,
    together with its per-row mean and median.

    Args:
        df_input (DataFrame): data table (left untouched)

    Returns:
        DataFrame: processed data table
    """
    df = df_input.copy()

    # From string to numpy
    for column in ('variance_train_NOT_weighted',
                   'variance_aug_NOT_weighted',
                   'variance_aug_weighted'):
        df[column] = df[column].apply(_parse_variance_array)

    # Compute delta var
    df = _add_relative_delta(df, 'relative_delta_var_NOT_weighted',
                             'variance_aug_NOT_weighted', 'variance_train_NOT_weighted')
    df = _add_relative_delta(df, 'relative_delta_var_weighted',
                             'variance_aug_weighted', 'variance_train_NOT_weighted')

    return df



def process_variance_features_robustness(df_input):
    """Process variance features. From list of variances to relative difference of variance computed among all variances in the list

    Same processing as process_variance_features, for the columns
    'variance_train_NOT_weighted', 'variance_aug_weighted_true' and
    'variance_aug_weighted_wrong'. Both augmented variances are compared to the
    train variance, and the "wrong" one is additionally compared to the "true" one.

    Args:
        df_input (DataFrame): data table (left untouched)

    Returns:
        DataFrame: processed data table
    """
    df = df_input.copy()

    # From string to numpy
    for column in ('variance_train_NOT_weighted',
                   'variance_aug_weighted_true',
                   'variance_aug_weighted_wrong'):
        df[column] = df[column].apply(_parse_variance_array)

    # Compute delta var
    df = _add_relative_delta(df, 'relative_delta_var_weighted_true',
                             'variance_aug_weighted_true', 'variance_train_NOT_weighted')
    df = _add_relative_delta(df, 'relative_delta_var_weighted_wrong',
                             'variance_aug_weighted_wrong', 'variance_train_NOT_weighted')
    df = _add_relative_delta(df, 'relative_delta_var_weighted_true_VS_wrong',
                             'variance_aug_weighted_wrong', 'variance_aug_weighted_true')
    return df

#### Choose whether to plot the paper results or your own results

In [5]:
# Data-source switch: when True the loading cells below read their CSV files from
# the 'paper_results' folder, otherwise from the per-experiment output folders.
use_paper_results = True

## Weight threshold

In [6]:
# Folder and file name of the weight-threshold results depend on the data source
if use_paper_results:
    folder_to_res, threshold_csv = 'paper_results', '/intermediate_threshold_res_1e-05.csv'
else:
    folder_to_res, threshold_csv = 'threshold_variation', '/threshold_res.csv'
data_threshold = pd.read_csv(folder_to_res + threshold_csv)

# log10 of the threshold, used as x axis of the plots in this section
data_threshold['weight_threshold_log10'] = data_threshold['weight_threshold'].apply(
    lambda threshold: np.log10(threshold)
)

In [56]:
### Mean, median and std of the fraction of augmented data depending on the weight_threshold

# column to summarise and x-axis column
color_columns_list = ['frac_augmented']
x_column = 'weight_threshold_log10'
metric = color_columns_list[0]

# per-threshold statistics of the metric
res_tot = compute_stats_for_line_plot_paper(data_threshold, [x_column], metric)

x = res_tot[x_column].tolist()
mean_values = res_tot[metric + '_mean'].tolist()
upper_values = res_tot[metric + '_upper'].tolist()
lower_values = res_tot[metric + '_lower'].tolist()
median_values = res_tot[metric + '_median'].tolist()

fig = go.Figure()

# std band: closed outline running forward along mean+std and back along mean-std
fig.add_trace(go.Scatter(
    x=x + x[::-1],
    y=upper_values + lower_values[::-1],
    fill='toself',
    fillcolor=transparent_color_list[0],
    line_color='rgba(255,255,255,0)',
    showlegend=True,
    name='std',
))
# mean (solid) and median (dotted) lines
fig.add_trace(go.Scatter(
    x=x,
    y=mean_values,
    line_color=color_list[0],
    showlegend=True,
    name='mean',
))
fig.add_trace(go.Scatter(
    x=x,
    y=median_values,
    line_color=color_list[0],
    line_dash='dot',
    showlegend=True,
    name='median',
))
fig.update_traces(mode='lines')

# margin, size and legend position (top right)
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

# axis titles
fig.update_yaxes(title_text='fraction of newly generated data')
fig.update_xaxes(title_text='log10 probability threshold')

fig.show()





In [55]:
# Mean, median and std of the fraction of filtered data depending on the weight_threshold

# column to summarise and x-axis column
color_columns_list = ['frac_filtered']
x_column = 'weight_threshold_log10'
metric = color_columns_list[0]

# per-threshold statistics of the metric
res_tot = compute_stats_for_line_plot_paper(data_threshold, [x_column], metric)

x = res_tot[x_column].tolist()
mean_values = res_tot[metric + '_mean'].tolist()
upper_values = res_tot[metric + '_upper'].tolist()
lower_values = res_tot[metric + '_lower'].tolist()
median_values = res_tot[metric + '_median'].tolist()

fig = go.Figure()

# std band: closed outline running forward along mean+std and back along mean-std
fig.add_trace(go.Scatter(
    x=x + x[::-1],
    y=upper_values + lower_values[::-1],
    fill='toself',
    fillcolor=transparent_color_list[0],
    line_color='rgba(255,255,255,0)',
    showlegend=True,
    name='std',
))
# mean (solid) and median (dotted) lines
fig.add_trace(go.Scatter(
    x=x,
    y=mean_values,
    line_color=color_list[0],
    showlegend=True,
    name='mean',
))
fig.add_trace(go.Scatter(
    x=x,
    y=median_values,
    line_color=color_list[0],
    line_dash='dot',
    showlegend=True,
    name='median',
))
fig.update_traces(mode='lines')

# margin, size and legend position (top left)
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)

# axis titles
fig.update_yaxes(title_text='fraction of filtered data')
fig.update_xaxes(title_text='log10 probability threshold')

fig.show()





In [9]:
### XGB median MAPE depending on the weight_threshold

# metrics to compare (one dotted median line each) and x-axis column
color_columns_list = ['MAPE_CausalDA', 'MAPE_Baseline']
x_column = 'weight_threshold_log10'

# aggregate each metric per threshold, then join both tables on the x column
res_tot = compute_stats_for_line_plot_paper(data_threshold, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(data_threshold, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

x = res_tot[x_column].tolist()

# Figure: only the medians are drawn, so mean/std series are not extracted
fig = go.Figure()
for idx, (column, trace_name) in enumerate(zip(color_columns_list, ['CausalDA', 'Baseline'])):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color_list[idx],
        line_dash='dot',
        name=trace_name,
    ))
fig.update_traces(mode='lines')

# legend position (bottom left), margin & size
fig.update_layout(
    legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=1600,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='MAPE')
fig.update_xaxes(title_text='log10 probability threshold')

fig.show()







In [54]:
### Mean, median and std of the mean relative difference in variance depending on the weight_threshold

# column to summarise and x-axis column
color_columns_list = ['relative_delta_var_weighted_mean']
x_column = 'weight_threshold_log10'
metric = color_columns_list[0]

# compute the relative variance differences and keep the columns of interest
variance_columns = [
    x_column,
    'predicted_variable_node_type',
    'relative_delta_var_weighted_mean',
    'relative_delta_var_NOT_weighted_mean',
    'relative_delta_var_NOT_weighted_median',
    'relative_delta_var_weighted_median',
]
results_df_var = process_variance_features(data_threshold)[variance_columns]

# per-threshold statistics of the metric
res_tot = compute_stats_for_line_plot_paper(results_df_var, [x_column], metric)

x = res_tot[x_column].tolist()
mean_values = res_tot[metric + '_mean'].tolist()
upper_values = res_tot[metric + '_upper'].tolist()
lower_values = res_tot[metric + '_lower'].tolist()
median_values = res_tot[metric + '_median'].tolist()

fig = go.Figure()

# std band: closed outline running forward along mean+std and back along mean-std
fig.add_trace(go.Scatter(
    x=x + x[::-1],
    y=upper_values + lower_values[::-1],
    fill='toself',
    fillcolor=transparent_color_list[0],
    line_color='rgba(255,255,255,0)',
    showlegend=True,
    name='std',
))
# mean (solid) and median (dotted) lines
fig.add_trace(go.Scatter(
    x=x,
    y=mean_values,
    line_color=color_list[0],
    showlegend=True,
    name='mean',
))
fig.add_trace(go.Scatter(
    x=x,
    y=median_values,
    line_color=color_list[0],
    line_dash='dot',
    showlegend=True,
    name='median',
))
fig.update_traces(mode='lines')

# margin, size and legend position (bottom right)
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="bottom", y=0.01, xanchor="right", x=0.99),
)

# axis titles
fig.update_yaxes(title_text='Mean relative difference in variance')
fig.update_xaxes(title_text='log10 probability threshold')

fig.show()





In [53]:
# Mean, median and std of the Wasserstein distance between the original set and
# the augmented set depending on the weight_threshold

# column to summarise and x-axis column
color_columns_list = ['Wasserstein_dist_train_vs_aug_weighted']
x_column = 'weight_threshold_log10'
metric = color_columns_list[0]

# per-threshold statistics of the metric
res_tot = compute_stats_for_line_plot_paper(data_threshold, [x_column], metric)

x = res_tot[x_column].tolist()
mean_values = res_tot[metric + '_mean'].tolist()
upper_values = res_tot[metric + '_upper'].tolist()
lower_values = res_tot[metric + '_lower'].tolist()
median_values = res_tot[metric + '_median'].tolist()

fig = go.Figure()

# std band: closed outline running forward along mean+std and back along mean-std
fig.add_trace(go.Scatter(
    x=x + x[::-1],
    y=upper_values + lower_values[::-1],
    fill='toself',
    fillcolor=transparent_color_list[0],
    line_color='rgba(255,255,255,0)',
    showlegend=True,
    name='std',
))
# mean (solid) and median (dotted) lines
fig.add_trace(go.Scatter(
    x=x,
    y=mean_values,
    line_color=color_list[0],
    showlegend=True,
    name='mean',
))
fig.add_trace(go.Scatter(
    x=x,
    y=median_values,
    line_color=color_list[0],
    line_dash='dot',
    showlegend=True,
    name='median',
))
fig.update_traces(mode='lines')

# margin, size and legend position (top right)
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

# axis titles
fig.update_yaxes(title_text='Wasserstein distance')
fig.update_xaxes(title_text='log10 probability threshold')

fig.show()





## Dataset size

In [12]:
# Results folder depends on the data source; the file name is the same in both cases
folder_to_res = 'paper_results' if use_paper_results else 'nb_observations_variation'
data_size = pd.read_csv(folder_to_res + '/nb_observations_res.csv')

# natural log of the number of observations, used as x axis of the plots in this section
data_size['nb_observations_log'] = data_size['nb_observations'].apply(
    lambda nb_obs: np.log(nb_obs)
)

In [13]:
### XGB median MAPE depending on the dataset size

# metrics to compare (one dotted median line each) and x-axis column
color_columns_list = ['MAPE_CausalDA', 'MAPE_Baseline']
x_column = 'nb_observations_log'

# aggregate each metric per dataset size, then join both tables on the x column
res_tot = compute_stats_for_line_plot_paper(data_size, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(data_size, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

x = res_tot[x_column].tolist()

# Figure: only the medians are drawn, so mean/std series are not extracted
fig = go.Figure()
for idx, (column, trace_name) in enumerate(zip(color_columns_list, ['CausalDA', 'Baseline'])):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color_list[idx],
        line_dash='dot',
        name=trace_name,
    ))
fig.update_traces(mode='lines')

# legend position (bottom left), margin & size
fig.update_layout(
    legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='MAPE')
fig.update_xaxes(title_text='log dataset size')

fig.show()







In [14]:
### XGB median R2 depending on the dataset size

# metrics to compare (one dotted median line each) and x-axis column
color_columns_list = ['r2_CausalDA', 'r2_Baseline']
x_column = 'nb_observations_log'

# aggregate each metric per dataset size, then join both tables on the x column
res_tot = compute_stats_for_line_plot_paper(data_size, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(data_size, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

x = res_tot[x_column].tolist()

# Figure: only the medians are drawn, so mean/std series are not extracted
fig = go.Figure()
for idx, (column, trace_name) in enumerate(zip(color_columns_list, ['CausalDA', 'Baseline'])):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color_list[idx],
        line_dash='dot',
        name=trace_name,
    ))
fig.update_traces(mode='lines')

# legend position (top left), margin & size
fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='R2 score')
fig.update_xaxes(title_text='log dataset size')

fig.show()







## Problem dimension

In [15]:
# Results folder depends on the data source; the file name is the same in both cases
folder_to_res = 'paper_results' if use_paper_results else 'dimension_variation'
results_dimensions = pd.read_csv(folder_to_res + '/dimensions_res.csv')

In [16]:
### Mean, median and std of the fraction of augmented and filtered data depending on the number of variables

# metrics to plot (one colour each) and x-axis column
color_columns_list = ['frac_augmented', 'frac_filtered']
x_column = 'nb_nodes'

# aggregate each metric per number of nodes, then join both tables on the x column
res_tot = compute_stats_for_line_plot_paper(results_dimensions, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(results_dimensions, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

x = res_tot[x_column].tolist()

fig = go.Figure()
for idx, (metric, legend_name) in enumerate(zip(color_columns_list, ['augmented', 'filtered'])):
    mean_values = res_tot[metric + '_mean'].tolist()
    upper_values = res_tot[metric + '_upper'].tolist()
    lower_values = res_tot[metric + '_lower'].tolist()
    median_values = res_tot[metric + '_median'].tolist()

    # std band: closed outline running forward along mean+std and back along mean-std
    fig.add_trace(go.Scatter(
        x=x + x[::-1],
        y=upper_values + lower_values[::-1],
        fill='toself',
        fillcolor=transparent_color_list[idx],
        line_color='rgba(255,255,255,0)',
        showlegend=False,
    ))
    # mean line: the only trace of the metric that appears in the legend
    fig.add_trace(go.Scatter(
        x=x,
        y=mean_values,
        line_color=color_list[idx],
        name=legend_name,
    ))
    # median line (dotted)
    fig.add_trace(go.Scatter(
        x=x,
        y=median_values,
        line_color=color_list[idx],
        line_dash='dot',
        showlegend=False,
    ))
fig.update_traces(mode='lines')

# legend position (top left), margin & size
fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=1600,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='fraction of the original train set')
fig.update_xaxes(title_text='problem dimension')

fig.show()







In [17]:
### XGB median MAPE depending on the number of variables

# metrics to compare (one dotted median line each) and x-axis column
color_columns_list = ['MAPE_CausalDA', 'MAPE_Baseline']
x_column = 'nb_nodes'

# aggregate each metric per number of nodes, then join both tables on the x column
res_tot = compute_stats_for_line_plot_paper(results_dimensions, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(results_dimensions, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

x = res_tot[x_column].tolist()

# Figure: only the medians are drawn, so mean/std series are not extracted
fig = go.Figure()
for idx, (column, trace_name) in enumerate(zip(color_columns_list, ['CausalDA', 'Baseline'])):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color_list[idx],
        line_dash='dot',
        name=trace_name,
    ))
fig.update_traces(mode='lines')

# legend position (top left), margin & size
fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='MAPE')
fig.update_xaxes(title_text='problem dimension')

fig.show()







In [18]:
### XGB median R2 depending on the number of variables

# metrics to compare (one dotted median line each) and x-axis column
color_columns_list = ['r2_CausalDA', 'r2_Baseline']
x_column = 'nb_nodes'

# aggregate each metric per number of nodes, then join both tables on the x column
res_tot = compute_stats_for_line_plot_paper(results_dimensions, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(results_dimensions, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

x = res_tot[x_column].tolist()

# Figure: only the medians are drawn, so mean/std series are not extracted
fig = go.Figure()
for idx, (column, trace_name) in enumerate(zip(color_columns_list, ['CausalDA', 'Baseline'])):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color_list[idx],
        line_dash='dot',
        name=trace_name,
    ))
fig.update_traces(mode='lines')

# legend position (bottom left), margin & size
fig.update_layout(
    legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='R2 score')
fig.update_xaxes(title_text='problem dimension')

fig.show()







## Mechanisms

In [19]:
# Results folder depends on the data source; the file name is the same in both cases
folder_to_res = 'paper_results' if use_paper_results else 'mechanisms_variation'
results_mechanisms = pd.read_csv(folder_to_res + '/mechanisms_res.csv')

In [20]:
### XGB median R2 depending on the mechanisms

# metrics to compare (one dotted median line each) and x-axis column
color_columns_list = ['r2_CausalDA', 'r2_Baseline']
x_column = 'mechanism'

# aggregate each metric per mechanism, then join both tables on the x column
res_tot = compute_stats_for_line_plot_paper(results_mechanisms, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(results_mechanisms, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

x = res_tot[x_column].tolist()

# Figure: only the medians are drawn, so mean/std series are not extracted
fig = go.Figure()
for idx, (column, trace_name) in enumerate(zip(color_columns_list, ['CausalDA', 'Baseline'])):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color_list[idx],
        line_dash='dot',
        name=trace_name,
    ))
fig.update_traces(mode='lines')

# legend position (bottom left), margin & size
fig.update_layout(
    legend=dict(yanchor="bottom", y=0.01, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='R2 score')
fig.update_xaxes(title_text='mecanism family')

fig.show()







In [21]:
### Box plot of the KL-divergence between the original set and the augmented set, per mechanism

# axis columns; the KL column is renamed so that the y-axis label reads 'KL-divergence'
y_column = 'KL-divergence'
x_column = 'mechanism'
renamed_results = results_mechanisms.rename(columns={"KL_div_train_vs_aug_weighted": "KL-divergence"})
results_mechanisms_plot = renamed_results.sort_values(by=[x_column])

# box plot
fig = px.box(
    results_mechanisms_plot,
    x=x_column,
    y=y_column,
    width=800,
    height=300,
)
fig.update_traces(quartilemethod="inclusive") # or "exclusive", or "linear" by default

# legend position and margin size
fig.update_layout(
    legend=dict(yanchor="top", y=0.60, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
)

fig.show()

In [22]:
### Box plot of the mean relative difference in variance, per mechanism

# axis columns
y_column = 'Mean relative difference in variance'
x_column = 'mechanism'

# compute the relative variance differences and keep the columns of interest
variance_columns = [
    x_column,
    'predicted_variable_node_type',
    'relative_delta_var_weighted_mean',
    'relative_delta_var_NOT_weighted_mean',
    'relative_delta_var_NOT_weighted_median',
    'relative_delta_var_weighted_median',
]
results_df_var = process_variance_features(results_mechanisms)[variance_columns]

# rename the plotted column so that the y-axis label is readable, then order by mechanism
renamed_results = results_df_var.rename(
    columns={"relative_delta_var_weighted_mean": "Mean relative difference in variance"}
)
results_mechanisms_plot = renamed_results.sort_values(by=[x_column])

# box plot
fig = px.box(
    results_mechanisms_plot,
    x=x_column,
    y=y_column,
    width=800,
    height=300,
)
fig.update_traces(quartilemethod="inclusive") # or "exclusive", or "linear" by default

# legend position and margin size
fig.update_layout(
    legend=dict(yanchor="top", y=0.60, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
)

fig.show()

In [23]:
### Box plot of the average weights of the augmented-set data, split by whether
### the data belongs to the original training set or not

# rename the two weight columns to the labels shown in the legend
results_mechanisms_plot = results_mechanisms.rename(columns={
    "avg_weight_aug_in_train": "from train",
    "avg_weight_aug_NOT_in_train": "new data",
})
color_columns_list = ['from train', 'new data']
x_column = 'mechanism'

# long format: one row per (mechanism, series) pair, series name in 'Color', value in 'Y'
selected_columns = results_mechanisms_plot[[x_column] + color_columns_list]
long_format = selected_columns.melt(id_vars=x_column, var_name='Color', value_name='Y')
results_df_pivoted = long_format.sort_values(by=[x_column])

# box plot
fig = px.box(
    results_df_pivoted,
    x=x_column,
    y='Y',
    color='Color',
    width=800,
    height=300,
    labels={"Y": "Average data weights", "Color": ""},
)
fig.update_traces(quartilemethod="inclusive") # or "exclusive", or "linear" by default

# legend position and margin size
fig.update_layout(
    legend=dict(yanchor="top", y=0.60, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
)

fig.show()

## SCM noise

In [24]:
# Results folder: 'paper_results' when use_paper_results is set,
# otherwise the 'noise_coeff_variation' folder
folder_to_res = 'paper_results' if use_paper_results else 'noise_coeff_variation'

# Load the SCM noise results table
SCM_noise = pd.read_csv(f"{folder_to_res}/noise_coeff_res.csv")

In [52]:
# KL-divergence between the original set and the augmented set:
# mean, median and std band depending on the SCM noise

# x column and aggregated metric
x_column = 'noise_coeff'
color_columns_list = ['KL_div_train_vs_aug_weighted']
stat_column = color_columns_list[0]

# aggregate results per value of the x column
res_tot = compute_stats_for_line_plot_paper(SCM_noise, [x_column], stat_column)

# x-axis values, plus a reversed copy used to close the band polygon
x = res_tot[x_column].tolist()
x_rev = list(reversed(x))
# mean line and its mean +/- std bounds (lower bound reversed to follow x_rev)
y1 = res_tot[stat_column + '_mean'].tolist()
y1_upper = res_tot[stat_column + '_upper'].tolist()
y1_lower = list(reversed(res_tot[stat_column + '_lower'].tolist()))
# median line
y2 = res_tot[stat_column + '_median'].tolist()

# Figure: shaded std band, then the mean (solid) and median (dotted) lines
fig = go.Figure(data=[
    go.Scatter(
        x=x + x_rev,
        y=y1_upper + y1_lower,
        mode='lines',
        fill='toself',
        fillcolor=transparent_color_list[0],
        line=dict(color='rgba(255,255,255,0)'),
        showlegend=True,
        name='std',
    ),
    go.Scatter(
        x=x,
        y=y1,
        mode='lines',
        line=dict(color=color_list[0]),
        showlegend=True,
        name='mean',
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode='lines',
        line=dict(color=color_list[0], dash='dot'),
        showlegend=True,
        name='median',
    ),
])

# margin, size and legend position
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)

# axis titles
fig.update_yaxes(title_text='KL-divergence')
fig.update_xaxes(title_text='SCM noise level')

fig.show()





In [26]:
### XGB median R2 depending on the SCM noise (CausalDA vs Baseline)

# x column and the two metrics to compare
color_columns_list = ['r2_CausalDA', 'r2_Baseline']
x_column = 'noise_coeff'

# aggregate each metric per value of the x column, then join both stats
# tables on the x column and order the rows along the x axis
res_tot = compute_stats_for_line_plot_paper(SCM_noise, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(SCM_noise, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

# x-axis values, shared by both lines
x = res_tot[x_column].tolist()

# Figure: one dotted median line per metric. Only the '_median' columns are
# plotted, so the mean / upper / lower columns are not extracted here.
fig = go.Figure()
for column, label, color in zip(color_columns_list, ['CausalDA', 'Baseline'], color_list):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color,
        line_dash='dot',
        name=label,
    ))
fig.update_traces(mode='lines')

# legend position
fig.update_layout(legend=dict(
    yanchor="bottom",
    y=0.01,
    xanchor="left",
    x=0.01
))

# margin & size
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='R2 score')
fig.update_xaxes(title_text='SCM noise level')

fig.show()







In [51]:
# Mean relative difference in variance: mean, median and std band
# depending on the SCM noise

# x column and aggregated metric
x_column = 'noise_coeff'
color_columns_list = ['relative_delta_var_weighted_mean']
stat_column = color_columns_list[0]

# variance features, restricted to the columns of interest
results_df_var = process_variance_features(SCM_noise)[
    [
        x_column,
        'predicted_variable_node_type',
        'relative_delta_var_weighted_mean',
        'relative_delta_var_NOT_weighted_mean',
        'relative_delta_var_NOT_weighted_median',
        'relative_delta_var_weighted_median',
    ]
]

# aggregate results per value of the x column
res_tot = compute_stats_for_line_plot_paper(results_df_var, [x_column], stat_column)

# x-axis values, plus a reversed copy used to close the band polygon
x = res_tot[x_column].tolist()
x_rev = list(reversed(x))
# mean line and its mean +/- std bounds (lower bound reversed to follow x_rev)
y1 = res_tot[stat_column + '_mean'].tolist()
y1_upper = res_tot[stat_column + '_upper'].tolist()
y1_lower = list(reversed(res_tot[stat_column + '_lower'].tolist()))
# median line
y2 = res_tot[stat_column + '_median'].tolist()

# Figure: shaded std band, then the mean (solid) and median (dotted) lines
fig = go.Figure(data=[
    go.Scatter(
        x=x + x_rev,
        y=y1_upper + y1_lower,
        mode='lines',
        fill='toself',
        fillcolor=transparent_color_list[0],
        line=dict(color='rgba(255,255,255,0)'),
        showlegend=True,
        name='std',
    ),
    go.Scatter(
        x=x,
        y=y1,
        mode='lines',
        line=dict(color=color_list[0]),
        showlegend=True,
        name='mean',
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode='lines',
        line=dict(color=color_list[0], dash='dot'),
        showlegend=True,
        name='median',
    ),
])

# margin, size and legend position
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=1600,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)

# axis titles
fig.update_yaxes(title_text='Mean relative difference in variance')
fig.update_xaxes(title_text='SCM noise level')

fig.show()





## Causal graph density

In [28]:
# Results folder: 'paper_results' when use_paper_results is set,
# otherwise the 'graph_density_variation' folder
folder_to_res = 'paper_results' if use_paper_results else 'graph_density_variation'

# Load the causal graph density results table
densities = pd.read_csv(f"{folder_to_res}/graph_density_res.csv")

In [29]:
### XGB median R2 depending on the causal graph expected degree (CausalDA vs Baseline)

# x column and the two metrics to compare
color_columns_list = ['r2_CausalDA', 'r2_Baseline']
x_column = 'expected_degree'

# aggregate each metric per value of the x column, then join both stats
# tables on the x column and order the rows along the x axis
res_tot = compute_stats_for_line_plot_paper(densities, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(densities, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

# x-axis values, shared by both lines
x = res_tot[x_column].tolist()

# Figure: one dotted median line per metric. Only the '_median' columns are
# plotted, so the mean / upper / lower columns are not extracted here.
fig = go.Figure()
for column, label, color in zip(color_columns_list, ['CausalDA', 'Baseline'], color_list):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color,
        line_dash='dot',
        name=label,
    ))
fig.update_traces(mode='lines')

# legend position
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.9,
    xanchor="right",
    x=0.99
))

# margin & size
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=1600,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='R2 score')
fig.update_xaxes(title_text='causal graph expected degree')

fig.show()







In [50]:
# Mean relative difference in variance: mean, median and std band
# depending on the causal graph expected degree

# x column and aggregated metric
x_column = 'expected_degree'
color_columns_list = ['relative_delta_var_weighted_mean']
stat_column = color_columns_list[0]

# variance features, restricted to the columns of interest
results_df_var = process_variance_features(densities)[
    [
        x_column,
        'predicted_variable_node_type',
        'relative_delta_var_weighted_mean',
        'relative_delta_var_NOT_weighted_mean',
        'relative_delta_var_NOT_weighted_median',
        'relative_delta_var_weighted_median',
    ]
]

# aggregate results per value of the x column
res_tot = compute_stats_for_line_plot_paper(results_df_var, [x_column], stat_column)

# x-axis values, plus a reversed copy used to close the band polygon
x = res_tot[x_column].tolist()
x_rev = list(reversed(x))
# mean line and its mean +/- std bounds (lower bound reversed to follow x_rev)
y1 = res_tot[stat_column + '_mean'].tolist()
y1_upper = res_tot[stat_column + '_upper'].tolist()
y1_lower = list(reversed(res_tot[stat_column + '_lower'].tolist()))
# median line
y2 = res_tot[stat_column + '_median'].tolist()

# Figure: shaded std band, then the mean (solid) and median (dotted) lines
fig = go.Figure(data=[
    go.Scatter(
        x=x + x_rev,
        y=y1_upper + y1_lower,
        mode='lines',
        fill='toself',
        fillcolor=transparent_color_list[0],
        line=dict(color='rgba(255,255,255,0)'),
        showlegend=True,
        name='std',
    ),
    go.Scatter(
        x=x,
        y=y1,
        mode='lines',
        line=dict(color=color_list[0]),
        showlegend=True,
        name='mean',
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode='lines',
        line=dict(color=color_list[0], dash='dot'),
        showlegend=True,
        name='median',
    ),
])

# margin, size and legend position
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

# axis titles
fig.update_yaxes(title_text='Mean relative difference in variance')
fig.update_xaxes(title_text='causal graph expected degree')

fig.show()





In [49]:
# Wasserstein distance between the original set and the augmented set:
# mean, median and std band depending on the causal graph expected degree

# x column and aggregated metric
x_column = 'expected_degree'
color_columns_list = ['Wasserstein_dist_train_vs_aug_weighted']
stat_column = color_columns_list[0]

# aggregate results per value of the x column
res_tot = compute_stats_for_line_plot_paper(densities, [x_column], stat_column)

# x-axis values, plus a reversed copy used to close the band polygon
x = res_tot[x_column].tolist()
x_rev = list(reversed(x))
# mean line and its mean +/- std bounds (lower bound reversed to follow x_rev)
y1 = res_tot[stat_column + '_mean'].tolist()
y1_upper = res_tot[stat_column + '_upper'].tolist()
y1_lower = list(reversed(res_tot[stat_column + '_lower'].tolist()))
# median line
y2 = res_tot[stat_column + '_median'].tolist()

# Figure: shaded std band, then the mean (solid) and median (dotted) lines
fig = go.Figure(data=[
    go.Scatter(
        x=x + x_rev,
        y=y1_upper + y1_lower,
        mode='lines',
        fill='toself',
        fillcolor=transparent_color_list[0],
        line=dict(color='rgba(255,255,255,0)'),
        showlegend=True,
        name='std',
    ),
    go.Scatter(
        x=x,
        y=y1,
        mode='lines',
        line=dict(color=color_list[0]),
        showlegend=True,
        name='mean',
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode='lines',
        line=dict(color=color_list[0], dash='dot'),
        showlegend=True,
        name='median',
    ),
])

# margin, size and legend position
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="right", x=0.99),
)

# axis titles
fig.update_yaxes(title_text='Wasserstein distance')
fig.update_xaxes(title_text='causal graph expected degree')

fig.show()





## Outliers

In [32]:
# Results folder: 'paper_results' when use_paper_results is set,
# otherwise the 'outliers_variation' folder
folder_to_res = 'paper_results' if use_paper_results else 'outliers_variation'

# Load the outliers results table
data_outliers = pd.read_csv(f"{folder_to_res}/outliers_res.csv")

In [33]:
### Median fraction of augmented data with and without outliers

# x column and the two metrics to compare
color_columns_list = ['frac_augmented_wrong', 'frac_augmented_true']
x_column = 'frac_outliers'

# aggregate each metric per value of the x column, then join both stats
# tables on the x column and order the rows along the x axis
res_tot = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

# x-axis values, shared by both lines
x = res_tot[x_column].tolist()

# Figure: one dotted median line per metric. Only the '_median' columns are
# plotted, so the mean / upper / lower columns are not extracted here.
fig = go.Figure()
for column, label, color in zip(color_columns_list, ['with outliers', 'without outliers'], color_list):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color,
        line_dash='dot',
        name=label,
    ))
fig.update_traces(mode='lines')

# legend position
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.01
))

# margin & size
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='fraction of newly generated data')
fig.update_xaxes(title_text='fraction of outliers')

fig.show()







In [34]:
### KL-divergence of the augmented data with and without outliers:
### mean, median and std band

# x column and the two metrics to compare
x_column = 'frac_outliers'
color_columns_list = ['KL_div_train_vs_aug_weighted_wrong', 'KL_div_train_vs_aug_weighted_true']

# aggregate each metric per value of the x column, then join both stats
# tables on the x column and order the rows along the x axis
res_tot = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

# x-axis values, plus a reversed copy used to close the band polygons
x = res_tot[x_column].tolist()
x_rev = x[::-1]

# Figure: for each metric, a shaded mean +/- std band, the mean (solid, the
# only trace listed in the legend) and the median (dotted)
fig = go.Figure()
series_specs = zip(
    color_columns_list,
    ['with outliers', 'without outliers'],
    color_list,
    transparent_color_list,
)
for column, label, solid_color, band_color in series_specs:
    mean_values = res_tot[column + '_mean'].tolist()
    upper_values = res_tot[column + '_upper'].tolist()
    # lower bound reversed so the polygon comes back along x_rev
    lower_values_rev = res_tot[column + '_lower'].tolist()[::-1]
    median_values = res_tot[column + '_median'].tolist()

    fig.add_trace(go.Scatter(
        x=x + x_rev,
        y=upper_values + lower_values_rev,
        fill='toself',
        fillcolor=band_color,
        line_color='rgba(255,255,255,0)',
        showlegend=False,
    ))
    fig.add_trace(go.Scatter(
        x=x,
        y=mean_values,
        line_color=solid_color,
        name=label,
    ))
    fig.add_trace(go.Scatter(
        x=x,
        y=median_values,
        line_color=solid_color,
        line_dash='dot',
        showlegend=False,
    ))
fig.update_traces(mode='lines')

# legend position, margin and size
fig.update_layout(
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    margin=dict(l=20, r=10, t=10, b=10),
    width=1600,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='KL-divergence')
fig.update_xaxes(title_text='fraction of outliers')

fig.show()







In [35]:
### Median MAPE with and without outliers

# x column and the two metrics to compare
color_columns_list = ['MAPE_CausalDA_wrong', 'MAPE_CausalDA_true']
x_column = 'frac_outliers'

# aggregate each metric per value of the x column, then join both stats
# tables on the x column and order the rows along the x axis
res_tot = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

# x-axis values, shared by both lines
x = res_tot[x_column].tolist()

# Figure: one dotted median line per metric. Only the '_median' columns are
# plotted, so the mean / upper / lower columns are not extracted here.
fig = go.Figure()
for column, label, color in zip(color_columns_list, ['with outliers', 'without outliers'], color_list):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color,
        line_dash='dot',
        name=label,
    ))
fig.update_traces(mode='lines')

# legend position
fig.update_layout(legend=dict(
    yanchor="top",
    y=0.99,
    xanchor="left",
    x=0.6
))

# margin & size
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='MAPE')
fig.update_xaxes(title_text='fraction of outliers')

fig.show()







In [36]:
### Median R2 with and without outliers

# x column and the two metrics to compare
color_columns_list = ['r2_CausalDA_wrong', 'r2_CausalDA_true']
x_column = 'frac_outliers'

# aggregate each metric per value of the x column, then join both stats
# tables on the x column and order the rows along the x axis
res_tot = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[0])
res_col = compute_stats_for_line_plot_paper(data_outliers, [x_column], color_columns_list[1])
res_tot = res_tot.merge(res_col, how="left", on=[x_column]).sort_values(by=[x_column])

# x-axis values, shared by both lines
x = res_tot[x_column].tolist()

# Figure: one dotted median line per metric. Only the '_median' columns are
# plotted, so the mean / upper / lower columns are not extracted here.
fig = go.Figure()
for column, label, color in zip(color_columns_list, ['with outliers', 'without outliers'], color_list):
    fig.add_trace(go.Scatter(
        x=x,
        y=res_tot[column + '_median'].tolist(),
        line_color=color,
        line_dash='dot',
        name=label,
    ))
fig.update_traces(mode='lines')

# legend position
fig.update_layout(legend=dict(
    yanchor="bottom",
    y=0.01,
    xanchor="left",
    x=0.01
))

# margin & size
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
)

# axis titles
fig.update_yaxes(title_text='R2 score')
fig.update_xaxes(title_text='fraction of outliers')

fig.show()







In [48]:
### Relative difference in variance of the augmented data with outliers:
### mean, median and std band

# x column and aggregated metric
x_column = 'frac_outliers'
color_columns_list = ['relative_delta_var_weighted_true_VS_wrong_mean']
stat_column = color_columns_list[0]

# variance features, restricted to the columns of interest
results_df_var = process_variance_features_robustness(data_outliers)[
    [
        x_column,
        'predicted_variable_node_type',
        'relative_delta_var_weighted_true_VS_wrong_mean',
    ]
]

# aggregate results per value of the x column
res_tot = compute_stats_for_line_plot_paper(results_df_var, [x_column], stat_column)

# x-axis values, plus a reversed copy used to close the band polygon
x = res_tot[x_column].tolist()
x_rev = list(reversed(x))
# mean line and its mean +/- std bounds (lower bound reversed to follow x_rev)
y1 = res_tot[stat_column + '_mean'].tolist()
y1_upper = res_tot[stat_column + '_upper'].tolist()
y1_lower = list(reversed(res_tot[stat_column + '_lower'].tolist()))
# median line
y2 = res_tot[stat_column + '_median'].tolist()

# Figure: shaded std band, then the mean (solid) and median (dotted) lines
fig = go.Figure(data=[
    go.Scatter(
        x=x + x_rev,
        y=y1_upper + y1_lower,
        mode='lines',
        fill='toself',
        fillcolor=transparent_color_list[0],
        line=dict(color='rgba(255,255,255,0)'),
        showlegend=True,
        name='std',
    ),
    go.Scatter(
        x=x,
        y=y1,
        mode='lines',
        line=dict(color=color_list[0]),
        showlegend=True,
        name='mean',
    ),
    go.Scatter(
        x=x,
        y=y2,
        mode='lines',
        line=dict(color=color_list[0], dash='dot'),
        showlegend=True,
        name='median',
    ),
])

# margin, size and legend position
fig.update_layout(
    margin=dict(l=20, r=10, t=10, b=10),
    width=800,
    height=300,
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
)

# axis titles
fig.update_yaxes(title_text='Mean relative difference in variance')
fig.update_xaxes(title_text='fraction of outliers')

fig.show()



