# Model Diagnosis Cross-Prediction

In [11]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plotting defaults: "paper" context on a white grid, slightly
# enlarged fonts, and square 10x10 figures rendered at 300 dpi.
sns.set_theme(
    context="paper",
    style="whitegrid",
    font_scale=1.2,
    rc={
        'figure.figsize': (10, 10),
        'figure.dpi': 300,
    },
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get Data

In [12]:
from os.path import join
from common.data import get_data
from common.paths import BIOBANK_LABELS

# Label files for the two ADHD sub-cohorts and the healthy cohort.
adhd_group_one, adhd_group_two, healthy = (
    join(BIOBANK_LABELS, csv_name)
    for csv_name in (
        'Subjects_with_WISC (adhd 1).csv',
        'Subjects_with_WISC (adhd 2).csv',
        'Subjects_with_WISC (healthy).csv',
    )
)

# Load features (X_*) and targets (Y_*) for every cohort. The first argument (5)
# is forwarded to `get_data` unchanged; see `common.data.get_data` for its meaning.
# NOTE: `demographics` and `population` are rebound by each call, so once this
# cell has run they hold the values returned for the healthy cohort (last call).
X_adhd_one, Y_adhd_one, demographics, population = get_data(5, adhd_group_one)
X_adhd_two, Y_adhd_two, demographics, population = get_data(5, adhd_group_two)
X_healthy, Y_healthy, demographics, population = get_data(5, healthy)

print(
    f'X_adhd_one: {X_adhd_one.shape} | '
    f'X_adhd_two: {X_adhd_two.shape} | '
    f'X_healthy: {X_healthy.shape}'
)

X_adhd_one: (195, 34716) | X_adhd_two: (195, 34716) | X_healthy: (106, 34716)


In [13]:
from scipy import stats
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import RepeatedKFold, cross_validate, permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [14]:
# Analysis configuration: which age bin and which target to model.
age_group = 'all'
selected_target = "WISC_FSIQ"

# Pick the selected target out of each cohort's Y container.
y_adhd_one, y_adhd_two, y_healthy = (
    cohort_targets[selected_target]
    for cohort_targets in (Y_adhd_one, Y_adhd_two, Y_healthy)
)

print(
    f'{selected_target}: '
    f'{y_adhd_one.shape}, {y_adhd_two.shape}, {y_healthy.shape}'
)

WISC_FSIQ: (195,), (195,), (106,)


In [15]:
# Configuration A: train on the healthy cohort, test transfer to both ADHD
# sub-cohorts. This notebook has three such cells that bind the same names;
# whichever one ran last determines what the modelling cells use.
X, y = X_healthy, y_healthy
X_out_group_one, y_out_group_one = X_adhd_one, y_adhd_one
X_out_group_two, y_out_group_two = X_adhd_two, y_adhd_two

In [20]:
# Configuration B: train on ADHD sub-cohort one, test transfer to the healthy
# cohort and ADHD sub-cohort two. This notebook has three such cells that bind
# the same names; whichever one ran last determines what the modelling cells use.
X, y = X_adhd_one, y_adhd_one
X_out_group_one, y_out_group_one = X_healthy, y_healthy
X_out_group_two, y_out_group_two = X_adhd_two, y_adhd_two

In [25]:
# Configuration C: train on ADHD sub-cohort two, test transfer to ADHD
# sub-cohort one and the healthy cohort. This notebook has three such cells that
# bind the same names; whichever one ran last determines what the modelling
# cells use.
X, y = X_adhd_two, y_adhd_two
X_out_group_one, y_out_group_one = X_adhd_one, y_adhd_one
X_out_group_two, y_out_group_two = X_healthy, y_healthy

### Set up the model pipeline, metrics, and cross-validation approach

In [26]:
def regression_scorer(reg, X, y):
    """Score a fitted regressor by Pearson's r between targets and predictions.

    The signature ``(estimator, X, y)`` is the callable form accepted by
    scikit-learn's ``scoring=`` arguments, which is how this function is used
    with ``RidgeCV`` in this notebook.

    Parameters
    ----------
    reg : object
        Fitted estimator (or pipeline) exposing ``predict(X)``.
    X : array-like
        Samples to predict on; passed to ``reg.predict`` unchanged.
    y : array-like
        True target values, one per sample.

    Returns
    -------
    float
        Pearson correlation coefficient between ``y`` and the predictions.

    Notes
    -----
    Predictions and targets are flattened to 1-D before correlating, so an
    estimator that returns its predictions as an ``(n_samples, 1)`` column is
    scored the same way as one that returns a flat ``(n_samples,)`` vector.
    This is meant for single-target regression only.
    """
    y_pred = np.ravel(reg.predict(X))
    y_true = np.ravel(y)
    return stats.pearsonr(y_true, y_pred)[0]

# 5-fold cross-validation repeated 10 times (50 train/test partitions in total).
# A fixed random_state makes every `rkf.split(...)` call produce the same
# partitions, so results are reproducible across cells and kernel restarts.
rkf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=0)

# Number of permutations; not referenced by the cells visible here — presumably
# intended for `permutation_test_score` (imported above). TODO confirm.
n_perm = 3000

In [27]:
%%time

# Tune the ridge penalty over alphas 5000, 10000, ..., 50000 using the repeated
# K-fold splitter and Pearson-r scorer defined above.
# NOTE(review): the scaler and alpha are fitted on all of X, y, and the
# cross-validation cell later reuses `best_alpha` on folds of the same data, so
# the reported test scores are not independent of this tuning step.
estimators = [
    StandardScaler(),
    RidgeCV(
        alphas=list(range(5000, 55000, 5000)),
        scoring=regression_scorer,
        cv=rkf,
    ),
]
pipe = make_pipeline(*estimators).fit(X, y)
ridge_cv = pipe.named_steps['ridgecv']
best_alpha = ridge_cv.alpha_

print(f'Target: {selected_target} | Alpha: {best_alpha} | Score: {ridge_cv.best_score_:.2f}')

Target: WISC_FSIQ | Alpha: 25000 | Score: 0.35
CPU times: user 1min 37s, sys: 2.03 s, total: 1min 39s
Wall time: 10 s


### Run cross-validation

In [28]:
%%time
# Repeated K-fold evaluation of a StandardScaler + Ridge pipeline trained on
# (X, y). For every partition we record the Pearson r on the training fold, on
# the held-out fold, and on the two out-of-group cohorts, plus the fitted ridge
# coefficients and intercept.
train_scores, test_scores, out_group_one, out_group_two = [], [], [], []
coefs, inters = [], []

# Index targets by position: fold indices are row positions, and this works the
# same whether `y` is a NumPy array or a pandas Series.
y_values = np.asarray(y)

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]

    pipe = make_pipeline(StandardScaler(), Ridge(alpha=best_alpha))
    pipe.fit(X_train, y_train)
    # Not used further in this cell; after the loop these hold the last fold's
    # predictions.
    y_train_pred = pipe.predict(X_train)
    y_test_pred = pipe.predict(X_test)

    train_scores.append(regression_scorer(pipe, X_train, y_train))
    test_scores.append(regression_scorer(pipe, X_test, y_test))
    out_group_one.append(regression_scorer(pipe, X_out_group_one, y_out_group_one))
    out_group_two.append(regression_scorer(pipe, X_out_group_two, y_out_group_two))

    coefs.append(pipe['ridge'].coef_)
    inters.append(pipe['ridge'].intercept_)

# Fold-averaged model weights.
avg_coef = np.mean(coefs, axis=0)
avg_inte = np.mean(inters, axis=0)

# Label the summary with the cohort the model was actually trained on.
# `population` is whatever the last `get_data` call returned, regardless of
# which cohort is currently bound to `X`, so it is only used as a fallback.
train_population = next(
    (
        cohort_name
        for cohort_name, cohort_features in (
            ('adhd_one', X_adhd_one),
            ('adhd_two', X_adhd_two),
            ('healthy', X_healthy),
        )
        if cohort_features is X
    ),
    population,
)

# Printed order: train, test, out-group one, out-group two (mean Pearson r).
print(f'{train_population}_{selected_target}_{age_group}')
for metric in [train_scores, test_scores, out_group_one, out_group_two]:
    print(f'{np.mean(metric):.2f}')

healthy_WISC_FSIQ_all
0.93
0.35
0.23
0.25
CPU times: user 1min 23s, sys: 1.63 s, total: 1min 24s
Wall time: 8.49 s


In [24]:
# Snapshot (shallow copy) of the per-fold coefficient list from the most recent
# cross-validation run. It represents ADHD sub-cohort one only if that run was
# trained with X = X_adhd_one.
adhd_one_coefs = list(coefs)

In [29]:
# Snapshot (shallow copy) of the per-fold coefficient list from the most recent
# cross-validation run. It represents ADHD sub-cohort two only if that run was
# trained with X = X_adhd_two.
adhd_two_coefs = list(coefs)

In [19]:
# Snapshot (shallow copy) of the per-fold coefficient list from the most recent
# cross-validation run. It represents the healthy cohort only if that run was
# trained with X = X_healthy.
healthy_coefs = list(coefs)

### Transfer model weights

#### ICC

In [34]:
%%time
import pingouin as pg

# Agreement between the model weights learned on two cohorts, measured with the
# intraclass correlation. Each row of the coefficient frames is one
# cross-validation fit; each column is one feature weight ("connection").
group_one = pd.DataFrame(adhd_one_coefs).assign(group='g1')
group_two = pd.DataFrame(healthy_coefs).assign(group='g2')

# Long format: one row per (fit, connection) with the cohort tag as the "rater".
group_one_two = pd.concat([group_one, group_two])
icc_data = group_one_two.melt(
    id_vars='group',
    var_name='connection',
    value_name='weight',
    ignore_index=False,
)

icc = pg.intraclass_corr(
    data=icc_data,
    targets='connection',
    raters='group',
    ratings='weight',
).round(3)
icc.set_index("Type")

CPU times: user 32.2 s, sys: 228 ms, total: 32.4 s
Wall time: 32.4 s


Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.091,1.2,34715,34716,0.0,"[0.08, 0.1]"
ICC2,Single random raters,0.091,1.2,34715,34715,0.0,"[0.08, 0.1]"
ICC3,Single fixed raters,0.091,1.2,34715,34715,0.0,"[0.08, 0.1]"
ICC1k,Average raters absolute,0.167,1.2,34715,34716,0.0,"[0.15, 0.18]"
ICC2k,Average random raters,0.167,1.2,34715,34715,0.0,"[0.15, 0.18]"
ICC3k,Average fixed raters,0.167,1.2,34715,34715,0.0,"[0.15, 0.18]"
