# Control experiment: Fig. S8

In [11]:
import numpy as np
import plotly.graph_objects as go
import sys, os, glob
import pickle
from pathlib import Path

In [12]:
repo_root = Path().resolve().parent
sys.path.append(str(repo_root))

In [None]:
def load_and_rename_files(directory, pattern):
    """Load every pickled file in ``directory`` whose name matches ``pattern``.

    Parameters
    ----------
    directory : str or os.PathLike
        Folder in which the files are searched.
    pattern : str
        Glob pattern, joined onto ``directory``, selecting the files to load.

    Returns
    -------
    dict
        Maps each file's base name -- with a trailing ``.dat`` extension
        removed and every ``-`` replaced by ``_`` -- to the unpickled object.
        Empty when no file matches.
    """
    extension = '.dat'
    data_dict = {}

    for file_path in glob.glob(os.path.join(directory, pattern)):
        file_name = os.path.basename(file_path)
        # Drop the extension only where it ends the name; a blanket
        # str.replace would also eat '.dat' inside the stem
        # (e.g. 'x.data-set.dat' would become 'xa-set').
        if file_name.endswith(extension):
            file_name = file_name[:-len(extension)]
        new_name = file_name.replace('-', '_')

        # NOTE: unpickling can execute arbitrary code; only load trusted files.
        with open(file_path, 'rb') as file:
            data_dict[new_name] = pickle.load(file)

    return data_dict

## Retrieve simulated data

In [None]:
# Folder holding the indicator files; point this at your own copy
# (see pipeline_indicator_controlXP.py).
path = repo_root.joinpath('simulations', 'indicators')

In [None]:
# Load the four indicator files (base/control x LinearRegression/KNN).
# Each result is a dict keyed by the file name with '-' turned into '_'.
(base_data_LinReg,
 base_data_KNN,
 control_data_LinReg,
 control_data_KNN) = (
    load_and_rename_files(path, file_pattern)
    for file_pattern in (
        'base-linear-LinearRegression-False.dat',
        'base-linear-KNeighborsRegressor-False.dat',
        'control-linear-LinearRegression-False.dat',
        'control-linear-KNeighborsRegressor-False.dat',
    )
)

  data_dict[new_name] = pickle.load(file)


In [None]:
# Check that the Y series in base_data matches the one in control_data.
# The comparison must be element-wise: `a.all() == b.all()` only compares two
# booleans ("is every entry non-zero?") and passes for almost any pair of arrays.
# NOTE(review): column 1 (base) and column 2 (control) are taken to hold Y,
# as in the original check -- confirm against the simulation pipeline.
for i in range(100):
    base_y = base_data_KNN['base_linear_KNeighborsRegressor_False']['data'][i]['data'][:, 1]
    control_y = control_data_KNN['control_linear_KNeighborsRegressor_False']['data'][i]['data'][:, 2]
    assert np.array_equal(base_y, control_y), (
        f'Y differs between base and control data for realisation {i}'
    )

In [None]:
# Select which regressor's results feed the analysis below:
# keep exactly one of the two pairs active.

# KNN (active)
base_data = base_data_KNN['base_linear_KNeighborsRegressor_False']
control_data = control_data_KNN['control_linear_KNeighborsRegressor_False']

# Linear regression (uncomment to use instead)
# base_data = base_data_LinReg['base_linear_LinearRegression_False']
# control_data = control_data_LinReg['control_linear_LinearRegression_False']

## Retrieve slope values and p values for each realisation

In [33]:
# Fit a first-degree polynomial to each control indicator series (against its
# sample index) and keep only the leading coefficient, i.e. the slope.
len_control = len(control_data['indicator'])
slopes_control = [
    np.polyfit(np.arange(len(series)), series, 1)[0]
    for series in (control_data['indicator'][k] for k in range(len_control))
]

In [34]:
# Same linear fit for the base data, restricted to the first len_control
# realisations so both slope lists have the same length.
slopes_base = [
    np.polyfit(np.arange(len(series)), series, 1)[0]
    for series in (base_data['indicator'][k] for k in range(len_control))
]

In [35]:
# p-values per realisation as arrays; the base set is cut to the control length.
pvalues_base = np.array(base_data['pvalue'][:len_control])
pvalues_control = np.array(control_data['pvalue'])

## Boxplot of slope difference

In [36]:
# Change of the control slope relative to the base slope, in percent.
# NOTE(review): the denominator is the signed base slope, so the sign of the
# result flips when a base slope is negative -- confirm this is intended.
slope_differences = np.divide(
    np.subtract(slopes_control, slopes_base),
    slopes_base,
) * 100

In [37]:
# Box plot of the relative slope differences, with every realisation drawn
# as a jittered point on top of the box.
fig = go.Figure(
    data=[
        go.Box(
            y=slope_differences,
            name='',
            marker_color='rgb(115,175,72)',
            boxpoints='all',
            jitter=0.3,
            pointpos=0,
            showlegend=False,
        )
    ]
)

fig.update_layout(
    title='Slope Differences between Control and Base Data',
    height=500,
    width=400,
    template='simple_white',
    # An empty tickvals list removes the x tick labels.
    xaxis=dict(range=[-0.35, 0.35], tickvals=[]),
    yaxis=dict(title='Slope difference (%)', range=[-150, 150]),
)

fig.show()

In [38]:
# fig.write_image('fig_controlXP_boxplot_knn.svg')

## Plot of slopes for each realisation

In [39]:
# Slope of the indicator for every realisation, base vs. control
fig = go.Figure()

for label, values, colour in (
    ('Base', slopes_base, 'black'),
    ('Control', slopes_control, 'rgb(115,175,72)'),
):
    fig.add_trace(
        go.Scatter(
            x=np.arange(len(values)),
            y=values,
            mode='lines+markers',
            name=label,
            marker=dict(color=colour, size=8),
        )
    )

fig.update_layout(
    title='Slopes of Indicators for Control and Base Data',
    xaxis_title='Realisation',
    yaxis_title='Slope',
    height=500,
    width=800,
    template='simple_white',
    legend=dict(yanchor='bottom', y=0.86, x=0.05),
)
fig.show()

In [40]:
# fig.write_image('fig_controlXP_slopes_knn.svg')

## Plot of p-values for each realisation

In [41]:
# p-value of the indicator for every realisation, base vs. control, with
# dashed reference lines at the 0.05 and 0.01 levels.
fig = go.Figure()

# Data traces: the x positions follow the length of the matching slope list.
for label, slopes, pvalues, colour in (
    ('Base', slopes_base, pvalues_base, 'black'),
    ('Control', slopes_control, pvalues_control, 'rgb(115,175,72)'),
):
    fig.add_trace(
        go.Scatter(
            x=np.arange(len(slopes)),
            y=pvalues,
            mode='lines+markers',
            name=label,
            marker=dict(color=colour, size=8),
        )
    )

# Horizontal threshold lines drawn from x=0 to x=100.
for level, colour, label in (
    (0.05, 'red', 'p = 0.05'),
    (0.01, 'orange', 'p = 0.01'),
):
    fig.add_trace(
        go.Scatter(
            x=[0, 100],
            y=[level, level],
            mode='lines',
            line=dict(color=colour, width=2, dash='dash'),
            name=label,
            showlegend=True,
        )
    )

fig.update_layout(
    title='pvalues of Indicators for Control and Base Data',
    xaxis_title='Realisation',
    yaxis_title='pvalue',
    height=500,
    width=800,
    template='simple_white',
    legend=dict(yanchor='bottom', y=0.75, x=0.05),
)

fig.show()

In [None]:
# fig.write_image('fig_controlXP_pvalues_knn.svg')