# Visualization of the residuals of the KNN-based DCE computation on simulated data (Fig. S7)

residuals = y - (slope * x + intercept)

mean_residual = np.mean(np.abs(residuals))

In [15]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.graph_objects as go

In [2]:
# Resolve the current working directory and take its parent as the repository
# root (this assumes the notebook is run from inside the notebooks folder).
repo_root = Path().resolve().parent  # parent of the notebooks folder
# Put the repository root on the import path so that the project package
# `functions` (imported below as `functions.data_analysis`) can be found.
sys.path.append(str(repo_root))

In [3]:
import functions.data_analysis as analysis

## Retrieve data

In [None]:
# Directory <repo_root>/simulations/indicators; it is the folder handed to
# `analysis.load_and_rename_files` together with the '*.dat' patterns.
mypath = repo_root / 'simulations' / 'indicators'

In [5]:
# Load the data of each simulated scenario from `mypath`; the second argument
# is the filename pattern that selects that scenario's `.dat` files.
# NOTE(review): each call presumably returns a dict indexed by names of the
# form "<model>_<X_function>_<method>_<detrend>" (that is how the results are
# accessed further down) -- confirm in functions.data_analysis.
base_data_dict = analysis.load_and_rename_files(mypath, 'base-*.dat')
confounderDecreasAC_data_dict = analysis.load_and_rename_files(mypath, 'confounderDecreasAC-*.dat')
falseAlarm_data_dict = analysis.load_and_rename_files(mypath, 'falseAlarm-*.dat')
flatControl_data_dict = analysis.load_and_rename_files(mypath, 'flatControl-*.dat')

In [6]:
# Registry of the loaded data, keyed by scenario name. The keys are the names
# used to look a scenario up via `all_data_dicts[model]`.
all_data_dicts = dict(
    base=base_data_dict,
    confounderDecreasAC=confounderDecreasAC_data_dict,
    falseAlarm=falseAlarm_data_dict,
    flatControl=flatControl_data_dict,
)

## Aggregate residuals for each model

In [7]:
# Analysis settings for this notebook.
# `detrend` is the suffix used when building the data lookup key
# "<model>_<X_function>_<method>_<detrend>".
detrend = 'rmvDet'

# NOTE(review): the three settings below are not referenced by any other code
# visible in this notebook. `window_length` is a string here, whereas the
# `window_length` parameter of aggregate_residuals_causalEE defaults to the
# integer 1000 -- confirm which form is intended.
sigma_Y = "001"
window_length = "1000"
methods = ['KNeighborsRegressor']

In [8]:
# Scenarios (models) and X_function variants that the aggregation loops over.
models = [
    'base',
    'confounderDecreasAC',  # 'confounderIncreasAC' is currently left out
    'flatControl',
    'falseAlarm',
]

X_functions = [
    'linear',
    'square',
]

In [9]:
def aggregate_residuals_causalEE(method='KNeighborsRegressor', window_length=1000):
    """Collect the residuals of every (model, X_function) pair that is plotted.

    The function reads the module-level names ``X_functions``, ``models``,
    ``all_data_dicts`` and ``detrend``; they must be defined before calling.

    Parameters
    - method: regressor name used to build the lookup key
      ``f"{model}_{X_function}_{method}_{detrend}"``.
    - window_length: currently unused; kept so existing calls keep working.

    Returns
    - dict mapping ``f"{model}_{X_function}"`` to the list of elements stored
      under ``all_data_dicts[model][<lookup key>]['residuals']``.

    Raises
    - KeyError: if a selected model or lookup key is missing from the data.
    """
    # X_function variants to keep for each model; every other combination
    # is skipped.
    wanted_combinations = {
        'base': ('linear',),
        'confounderDecreasAC': ('linear', 'square'),
        'flatControl': ('linear',),
        'falseAlarm': ('linear', 'square'),
    }

    residuals_agg_dict = {}

    for X_function in X_functions:
        for model in models:
            if X_function not in wanted_combinations.get(model, ()):
                continue

            var_name = f"{model}_{X_function}_{method}_{detrend}"
            runs = all_data_dicts[model][var_name]['residuals']

            # Positional indexing is kept on purpose: the container type of
            # `runs` is not visible here, so plain iteration might not be
            # equivalent to runs[0], runs[1], ...
            residuals_agg_dict[f"{model}_{X_function}"] = [
                runs[k] for k in range(len(runs))
            ]

    return residuals_agg_dict

In [10]:
# Aggregate the residuals obtained with the KNN regressor; `window_length`
# is left at its default value.
residuals_agg_dict = aggregate_residuals_causalEE(method='KNeighborsRegressor')

## Plot

In [11]:
def create_residuals_dataframe(agg_dict):
    """Create DataFrame for plotting residuals (no binning).

    All residual arrays stored under a key are pooled into one flat series of
    floats, and the key is replaced by its display label.

    Parameters
    - agg_dict: dictionary where each key maps to a list of residual arrays.
      Only the six scenario keys listed in ``key_order`` are used; any other
      key is ignored. A key whose list is empty contributes no rows but its
      label is still kept as a category.

    Returns
    - df: pandas DataFrame with columns ['Key','Residuals']. 'Key' is an
      ordered categorical of display labels whose categories follow
      ``key_order`` (restricted to the keys present in ``agg_dict``);
      'Residuals' holds one float per residual value.
    """
    # Define the desired order of keys
    key_order = [
        'base_linear',
        'confounderDecreasAC_linear',
        'confounderDecreasAC_square',
        'flatControl_linear',
        'falseAlarm_linear',
        'falseAlarm_square'
    ]

    # Custom labels shown on the x-axis
    custom_labels = {
        'base_linear': '(1), g(x)=0',
        'confounderDecreasAC_linear': '(2), g(x)=x',
        'confounderDecreasAC_square': '(2), g(x)=x²',
        'flatControl_linear': '(3), g(x)=0',
        'falseAlarm_linear': '(4), g(x)=x',
        'falseAlarm_square': '(4), g(x)=x²',
    }

    labels = []        # category labels, in plotting order
    code_chunks = []   # per key: the label's position in `labels`, repeated
    value_chunks = []  # per key: the pooled residual values

    for key in key_order:
        if key not in agg_dict:
            continue
        labels.append(custom_labels[key])

        runs = agg_dict[key]
        # np.concatenate raises ValueError on an empty sequence, so a scenario
        # without any run is skipped here (its category is still registered).
        if len(runs) == 0:
            continue

        # Concatenate all simulation runs for this key into one flat array.
        values = np.asarray(np.concatenate(runs), dtype=float).ravel()
        value_chunks.append(values)
        code_chunks.append(np.full(values.size, len(labels) - 1, dtype=np.intp))

    if value_chunks:
        codes = np.concatenate(code_chunks)
        residuals = np.concatenate(value_chunks)
    else:
        codes = np.empty(0, dtype=np.intp)
        residuals = np.empty(0, dtype=float)

    # Building the categorical from integer codes avoids creating one Python
    # tuple per residual value and preserves the category order.
    key_column = pd.Categorical.from_codes(codes, categories=labels, ordered=True)

    return pd.DataFrame({"Key": key_column, "Residuals": residuals})

In [12]:
# Build the long-format DataFrame of residuals: one row per residual value,
# labelled with its scenario (no binning).
df_residuals = create_residuals_dataframe(residuals_agg_dict)

In [13]:
def plot_residuals_boxplot_fast(df):
    """Draw one box per scenario from precomputed summary statistics.

    Only the quartiles and whisker ends of each group are passed to Plotly
    (not the individual samples), which keeps the figure small.

    Parameters
    - df: DataFrame with a 'Key' column (scenario label, ideally categorical)
      and a 'Residuals' column (numeric values; NaN entries are dropped).

    Returns
    - fig: plotly ``go.Figure`` with one ``go.Box`` trace per non-empty
      category, drawn in category order.
    """
    if hasattr(df['Key'].dtype, 'categories'):
        categories = list(df['Key'].cat.categories)
    else:
        # keep appearance order as first-seen
        categories = list(dict.fromkeys(df['Key'].astype(str)))

    # The string view of the keys does not depend on the category being
    # plotted, so it is computed once instead of once per box.
    keys_as_str = df['Key'].astype(str)

    fig = go.Figure()
    for key in categories:
        label = str(key)
        y = df.loc[keys_as_str == label, 'Residuals'].dropna().to_numpy()
        if y.size == 0:
            continue

        q1, median, q3 = np.percentile(y, [25, 50, 75])
        iqr = q3 - q1

        # Whiskers end at the most extreme samples that lie within 1.5 * IQR
        # of the box, never beyond the data; this mirrors what Plotly
        # computes itself when it is given the raw samples.
        inside = y[(y >= q1 - 1.5 * iqr) & (y <= q3 + 1.5 * iqr)]
        lowerf = min(q1, inside.min()) if inside.size else q1
        upperf = max(q3, inside.max()) if inside.size else q3

        fig.add_trace(go.Box(
            x=[label],              # place box at category x
            q1=[q1],
            median=[median],
            q3=[q3],
            lowerfence=[lowerf],
            upperfence=[upperf],
            boxpoints=False,
            marker_color='#AFDDE9',
            line_color='#37ABC8',
            name=label,
            showlegend=False,
        ))

    # NOTE(review): the y-axis title says 'Absolute Residuals', but this
    # function plots df['Residuals'] as given -- confirm that the values are
    # already absolute when they reach this point.
    fig.update_layout(
        width=1000,
        height=600,
        xaxis_title='Scenario',
        yaxis_title='Absolute Residuals',
        plot_bgcolor='rgba(215, 238, 244, 0.3)',
        paper_bgcolor='rgba(0, 0, 0, 0)',
    )
    # ensure x-axis preserves category order
    fig.update_xaxes(type='category', categoryorder='array', categoryarray=categories)
    return fig

In [16]:
# Build the box-plot figure from the residuals DataFrame
fig = plot_residuals_boxplot_fast(df_residuals)

In [17]:
# Display the figure in the notebook output
fig.show()

In [None]:
# fig.write_image('fig_residuals_KNN_all_combinations.pdf')