# Ridge

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Matplotlib overrides applied through seaborn: square 10x10 figures at 300 dpi.
figure_rc = {"figure.figsize": (10, 10), "figure.dpi": 300}

# Notebook-wide plot theme.
sns.set_theme(context="paper", style="whitegrid", font_scale=1.2, rc=figure_rc)

## Get Data

In [3]:
from os.path import join
from common.data import get_data
from common.paths import BIOBANK_LABELS

# Build the three label-file paths up front, then load each cohort in turn.
adhd_group_one = join(BIOBANK_LABELS, 'Subjects_with_WISC (set 1).csv')
adhd_group_two = join(BIOBANK_LABELS, 'Subjects_with_WISC (set 2).csv')
healthy = join(BIOBANK_LABELS, 'Subjects_with_WISC (healthy).csv')

# NOTE(review): `demographics` and `population` are rebound by every call, so
# after this cell they hold only the values returned for the healthy file.
X_adhd_one, Y_adhd_one, demographics, population = get_data(5, adhd_group_one)
X_adhd_two, Y_adhd_two, demographics, population = get_data(5, adhd_group_two)
X_healthy, Y_healthy, demographics, population = get_data(5, healthy)

# Sanity check: report the feature-matrix shape of each cohort.
print(f'X_adhd_one: {X_adhd_one.shape} | X_adhd_two: {X_adhd_two.shape} | X_healthy: {X_healthy.shape}')

X_adhd_one: (195, 34716) | X_adhd_two: (195, 34716) | X_healthy: (101, 34716)


In [6]:
from scipy import stats
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import RepeatedKFold, cross_validate, permutation_test_score
from sklearn.multioutput import MultiOutputRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [4]:
selected_target = "WISC_FSIQ"

# Select the chosen target from each cohort's Y in a single unpacking.
y_adhd_one, y_adhd_two, y_healthy = (
    labels[selected_target] for labels in (Y_adhd_one, Y_adhd_two, Y_healthy)
)

print(f'{selected_target}: {y_adhd_one.shape}, {y_adhd_two.shape}, {y_healthy.shape}')

WISC_FSIQ: (195,), (195,), (101,)


In [46]:
# The healthy cohort is the group the model is fit and cross-validated on;
# the two ADHD sets are scored as out-of-group data.
X, y = X_healthy, y_healthy
X_out_group_one, y_out_group_one = X_adhd_one, y_adhd_one
X_out_group_two, y_out_group_two = X_adhd_two, y_adhd_two

### Set up the model pipeline, metrics, and cross-validation approach

In [47]:
def regression_scorer(reg, X, y):
    """Score a fitted regressor by the Pearson correlation of its predictions.

    Has the ``(estimator, X, y)`` signature so it can be passed as the
    ``scoring`` callable to the scikit-learn utilities used in this notebook.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``.
    X : samples to predict on.
    y : true target values for ``X``.

    Returns
    -------
    The Pearson correlation coefficient between ``y`` and ``reg.predict(X)``;
    the accompanying p-value is discarded.
    """
    correlation, _p_value = stats.pearsonr(y, reg.predict(X))
    return correlation

# Fixed seed so the repeated K-fold partitions are reproducible. Without it
# every use of `rkf` draws fresh random splits, so the selected alpha, the
# cross-validation scores and the permutation p-values change on each run.
RANDOM_STATE = 0

# 10 repeats of 10-fold cross-validation, shared by every model in the notebook.
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=RANDOM_STATE)

# Number of permutations passed to `permutation_test_score`.
n_perm = 3000

In [48]:
%%time
age_group = 'all'

# Standardise the features, then let RidgeCV choose alpha from
# 5000, 10000, ..., 50000 using the Pearson scorer over the repeated K-fold splits.
# NOTE(review): the recorded run selected alpha=5000, the smallest candidate;
# consider extending the grid downwards - confirm.
estimators = [
    StandardScaler(),
    RidgeCV(alphas=list(range(5000, 55000, 5000)), scoring=regression_scorer, cv=rkf),
]
pipe = make_pipeline(*estimators).fit(X, y)

# Keep the fitted RidgeCV step and its selected alpha for the following cells.
ridge_cv = pipe['ridgecv']
best_alpha = ridge_cv.alpha_

print(f'Target: {selected_target} | Alpha: {best_alpha} | Score: {ridge_cv.best_score_:.2f}')

Target: WISC_FSIQ | Alpha: 5000 | Score: 0.09
CPU times: user 1min 42s, sys: 1.71 s, total: 1min 44s
Wall time: 10.5 s


### Run cross-validation

In [49]:
%%time
# Repeated K-fold evaluation of a Ridge model at the previously selected alpha.
# Per fold we record the Pearson score on the training fold, the held-out fold
# and both out-of-group cohorts, plus the fitted coefficients and intercept.
# NOTE(review): `best_alpha` was chosen on the full X, y, so the held-out fold
# scores below are not independent of the alpha selection - confirm this is intended.
train_scores, test_scores, out_group_one, out_group_two = [], [], [], []
coefs, inters = [], []

# The fold indices produced by `rkf.split` are positions. Indexing a pandas
# Series with `y[indices]` is a label lookup when its index is integer-valued,
# so go through a plain array to guarantee positional selection.
y_values = np.asarray(y)

for train_index, test_index in rkf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_values[train_index], y_values[test_index]

    # New pipeline per fold so the scaler is fit on the training fold only.
    pipe = make_pipeline(StandardScaler(), Ridge(alpha=best_alpha))
    pipe.fit(X_train, y_train)

    train_scores.append(regression_scorer(pipe, X_train, y_train))
    test_scores.append(regression_scorer(pipe, X_test, y_test))
    out_group_one.append(regression_scorer(pipe, X_out_group_one, y_out_group_one))
    out_group_two.append(regression_scorer(pipe, X_out_group_two, y_out_group_two))

    coefs.append(pipe['ridge'].coef_)
    inters.append(pipe['ridge'].intercept_)

# Fold-averaged model parameters.
avg_coef = np.mean(coefs, axis=0)
avg_inte = np.mean(inters, axis=0)

# Mean score per metric, in the order: train, test, out-group one, out-group two.
print(f'ridge_{population}_{selected_target}_{age_group}')
for metric in [train_scores, test_scores, out_group_one, out_group_two]:
    print(f'{np.mean(metric):.2f}')

ridge_healthy_WISC_FSIQ_all
1.00
0.05
0.26
0.29
CPU times: user 1min 58s, sys: 1.86 s, total: 2min
Wall time: 12 s


In [45]:
# Snapshot (shallow copy) of the per-fold coefficients currently held in `coefs`.
# NOTE(review): `coefs` comes from whichever X/y the cross-validation cell was
# last run on; the execution counts suggest that cell was re-run with a different
# cohort before this snapshot - confirm, since Run All would not reproduce that.
adhd_one_coefs = list(coefs)

In [40]:
# Snapshot (shallow copy) of the per-fold coefficients currently held in `coefs`.
# NOTE(review): `coefs` comes from whichever X/y the cross-validation cell was
# last run on; the execution counts suggest that cell was re-run with a different
# cohort before this snapshot - confirm, since Run All would not reproduce that.
adhd_two_coefs = list(coefs)

In [50]:
# Snapshot (shallow copy) of the per-fold coefficients currently held in `coefs`.
# NOTE(review): `coefs` comes from whichever X/y the cross-validation cell was
# last run on; with X = X_healthy above this matches the healthy cohort - confirm.
healthy_coefs = list(coefs)

### Transfer model weights

#### Intraclass correlation (ICC)

In [77]:
%%time
import pingouin as pg

# One row per cross-validation fold, one column per model coefficient,
# plus a `group` column tagging which set of coefficients the row came from.
# NOTE(review): "g1" is built from `adhd_two_coefs`; `adhd_one_coefs` is not
# used in this cell - confirm this is the intended comparison.
group_one = pd.DataFrame(adhd_two_coefs).assign(group='g1')
group_two = pd.DataFrame(healthy_coefs).assign(group='g2')
group_one_two = pd.concat([group_one, group_two])

# Long format: one (group, connection, weight) row per fold and coefficient,
# keeping the original fold index.
icc_data = group_one_two.melt(
    id_vars='group', var_name='connection', value_name='weight', ignore_index=False
)
display(icc_data)

# Intraclass correlation with connections as targets and groups as raters.
icc = pg.intraclass_corr(
    data=icc_data, targets='connection', raters='group', ratings='weight'
).round(3)
icc.set_index("Type")

Unnamed: 0,group,connection,weight
0,g1,0,0.001331
1,g1,0,-0.000645
2,g1,0,0.002651
3,g1,0,0.000743
4,g1,0,0.003804
...,...,...,...
95,g2,34715,-0.005430
96,g2,34715,-0.004315
97,g2,34715,-0.004543
98,g2,34715,-0.005158


CPU times: user 34 s, sys: 416 ms, total: 34.4 s
Wall time: 34.3 s


Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.121,1.275,34715,34716,0.0,"[0.11, 0.13]"
ICC2,Single random raters,0.121,1.276,34715,34715,0.0,"[0.11, 0.13]"
ICC3,Single fixed raters,0.121,1.276,34715,34715,0.0,"[0.11, 0.13]"
ICC1k,Average raters absolute,0.216,1.275,34715,34716,0.0,"[0.2, 0.23]"
ICC2k,Average random raters,0.216,1.276,34715,34715,0.0,"[0.2, 0.23]"
ICC3k,Average fixed raters,0.216,1.276,34715,34715,0.0,"[0.2, 0.23]"


### Run permutation test

In [None]:
%%time
# Permutation test for the single-target model on the data selected above.
# Uses `X, y`: the names `X_cv` / `y_cv` are only created by the all-targets
# loop later in the notebook, so referencing them here fails on a top-to-bottom run.
# The per-permutation scores go to a named variable, not `_`, so IPython's
# last-output variable is not shadowed.
score, perm_scores, pvalue = permutation_test_score(
    pipe, X, y, cv=rkf, scoring=regression_scorer, n_permutations=n_perm, n_jobs=-1)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(f'Alpha: {ridge_cv.alpha_} | Score: {score:.2f} | p-value: {pvalue:.4f}')

## Run for all targets, all age bins

In [None]:
%%time
from common.binning import bin_by_age
from common.wisc import FSIQ, PRIMARY_INDICES
from common.paths import RIDGE_RESULTS

# For every WISC target and every age bin: pick alpha with RidgeCV, then run a
# permutation test with a plain Ridge at that alpha and collect one result row.
results = []
targets = FSIQ + PRIMARY_INDICES

# Candidate alphas 1000, 2000, ..., 20000; identical for every fit, so built once.
alpha_grid = list(range(1000, 21000, 1000))

for target in targets:
    # NOTE(review): `Y` and `ages` are not defined anywhere in the visible
    # notebook (only Y_adhd_one / Y_adhd_two / Y_healthy are); they presumably
    # come from an earlier kernel state - define them explicitly before running.
    # The target is kept in `y_target` so the notebook-level `y` used by the
    # single-target cells is not overwritten by this loop.
    y_target = Y[target]
    # NOTE(review): the target is passed twice (2nd and 4th argument), as in
    # the original call - verify against bin_by_age's signature.
    bins = bin_by_age(X, y_target, ages, y_target)
    bin_1, bin_2, bin_3 = bins[0], bins[1], bins[2]
    X_all = [X, bin_1[0], bin_2[0], bin_3[0]]
    y_all = [y_target, bin_1[1], bin_2[1], bin_3[1]]
    bin_labels = ["All  ", "Bin 1", "Bin 2", "Bin 3"]

    for X_cv, y_cv, bin_label in zip(X_all, y_all, bin_labels):
        # Find best alpha
        estimators = [StandardScaler(), RidgeCV(alphas=alpha_grid,
                                                scoring=regression_scorer, cv=rkf)]
        pipe = make_pipeline(*estimators).fit(X_cv, y_cv)
        ridge_cv = pipe['ridgecv']

        # Do permutation test
        estimators = [StandardScaler(), Ridge(alpha=ridge_cv.alpha_)]
        pipe = make_pipeline(*estimators)

        # Per-permutation scores are unused; a named variable avoids shadowing
        # IPython's `_` last-output variable.
        score, perm_scores, pvalue = permutation_test_score(
            pipe, X_cv, y_cv, cv=rkf, scoring=regression_scorer, n_permutations=n_perm, n_jobs=-1)
        results.append({
            'Model': 'ridge',
            'Population': population,
            'Target': target,
            'Bin': bin_label,
            'Alpha': ridge_cv.alpha_,
            'Score': score,
            'P-value': pvalue,
        })
        # Progress log: one line per finished (target, bin) combination.
        print(results[-1])

results_df = pd.DataFrame(results)
display(results_df)
results_df.to_csv(RIDGE_RESULTS)

{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'All  ', 'Alpha': 16000, 'Score': 0.37091336391988167, 'P-value': 0.0003332222592469177}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 1', 'Alpha': 1000, 'Score': 0.1735052160991861, 'P-value': 0.07630789736754415}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 2', 'Alpha': 5000, 'Score': 0.34779987507031157, 'P-value': 0.0003332222592469177}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 3', 'Alpha': 1000, 'Score': 0.0789517941358648, 'P-value': 0.242919026991003}
