# Ridge

In [7]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plot styling: "paper" context on a white grid with enlarged
# fonts; the rc overrides set the default figure size and resolution.
figure_rc = {
    'figure.figsize': (10, 10),
    'figure.dpi': 300,
}
sns.set_theme(context="paper", style="whitegrid", font_scale=1.2, rc=figure_rc)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Get Data

In [8]:
from common.data import get_data

# Load everything the notebook works with in one call. `Y` is indexed by
# target name further down; `population` is a label used in printed run names.
X, Y, demographics, population = get_data()

# Per-subject ages, used later for age binning.
ages = demographics['Age']

# Quick sanity check of what was loaded.
print(f'X: {X.shape} | Y: {len(Y.keys())} | Age: {ages.shape} | Population: {population}')

X: (390, 34716) | Y: 16 | Age: (390,) | Population: adhd


In [9]:
from scipy import stats
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import RepeatedKFold, cross_validate, permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [28]:
# Target analysed in the single-target section below; must be a key of `Y`.
selected_target = "WISC_PSI"

# Values of the chosen target.
y = Y[selected_target]

print(f'{selected_target}: {y.shape}')

WISC_PSI: (390,)


In [71]:
from common.binning import bin_by_age

# Split the sample into age bins and keep the first three.
# NOTE(review): each bin is indexed as bin[0] / bin[1]; presumably these are
# the bin's features and target values -- confirm against `bin_by_age`.
bins = bin_by_age(X, y, ages, y)
bin_1, bin_2, bin_3 = (bins[idx] for idx in range(3))

# Report the shape of the first element of every bin.
print(f'Bin 1: {bin_1[0].shape} | Bin 2: {bin_2[0].shape} | Bin 3: {bin_3[0].shape}')

Bin 0 Range: 6.22 -> 8.80
Bin 1 Range: 9.34 -> 11.97
Bin 2 Range: 12.50 -> 15.87
---
Bin 0 Range: 80.00 -> 144.00
Bin 1 Range: 86.00 -> 123.00
Bin 2 Range: 66.00 -> 126.00
---
Bin 1: (19, 34716) | Bin 2: (16, 34716) | Bin 3: (16, 34716)


### Set up the model pipeline, metrics, and cross-validation approach

In [12]:
def regression_scorer(reg, X, y):
    """Score a fitted regressor by the Pearson correlation of truth vs. prediction.

    The ``(estimator, X, y)`` signature lets this function be passed directly
    as a ``scoring=`` callable.

    Args:
        reg: Fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: True target values for ``X``.

    Returns:
        Pearson correlation coefficient between ``y`` and ``reg.predict(X)``.
    """
    predictions = reg.predict(X)
    # pearsonr yields (r, p-value); only the correlation is used as the score.
    pearson_r, _p_value = stats.pearsonr(y, predictions)
    return pearson_r

# Keys of the `cross_validate` result dict that get summarised after evaluation.
scoring = ['train_score', 'test_score']

# 10-fold cross-validation repeated 10 times (100 fits per evaluation).
# A fixed random_state makes the fold assignment reproducible across kernel
# restarts; without it every run shuffles differently and the reported scores
# cannot be reproduced.
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=42)

# Number of label permutations used by the permutation tests.
n_perm = 3000

In [29]:
%%time
# Data used for this run: the full sample. To analyse a single age bin instead,
# point these at that bin (e.g. `X_cv, y_cv = bin_1[0], bin_1[1]`) and update
# `age_group` so printed and saved names stay accurate.
X_cv, y_cv = X, y
age_group = 'all'

# Stage 1: pick the ridge penalty. RidgeCV scores every candidate alpha with
# the repeated K-fold splitter and the Pearson-r scorer.
candidate_alphas = list(range(5000, 55000, 5000))
estimators = [
    StandardScaler(),
    RidgeCV(alphas=candidate_alphas, scoring=regression_scorer, cv=rkf),
]
pipe = make_pipeline(*estimators)
pipe.fit(X_cv, y_cv)
ridge_cv = pipe.named_steps['ridgecv']

print(f'Best alpha: {ridge_cv.alpha_}')
print(f'Best score: {ridge_cv.best_score_:.2f}')

# Stage 2: replace the search pipeline with an unfitted plain Ridge pipeline
# fixed at the selected alpha; later cells evaluate this `pipe`.
estimators = [StandardScaler(), Ridge(alpha=ridge_cv.alpha_)]
pipe = make_pipeline(*estimators)

Best alpha: 30000
Best score: 0.08
CPU times: user 10min 33s, sys: 28.2 s, total: 11min 1s
Wall time: 1min 20s


### Run cross-validation

In [30]:
%%time

# Evaluate the fixed-alpha pipeline with repeated K-fold CV, keeping the
# train scores and the fitted estimator of every split.
scores = cross_validate(
    pipe,
    X_cv,
    y_cv,
    cv=rkf,
    scoring=regression_scorer,
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

# One coefficient vector per fitted split, stacked row-wise, then averaged
# over splits to get a single weight per feature.
split_weights = [fitted['ridge'].coef_ for fitted in scores['estimator']]
coefs = np.array(split_weights)
avg_coef = coefs.mean(axis=0)

# Summary: run identifier followed by the mean of each tracked metric.
print(f'ridge_{population}_{selected_target}_{age_group}')
for metric in scoring:
    metric_values = scores[metric]
    print(f'Avg {metric}: {np.mean(metric_values):.3f}')

ridge_adhd_WISC_PSI_all
Avg train_score: 0.899
Avg test_score: 0.101
CPU times: user 2.33 s, sys: 123 ms, total: 2.45 s
Wall time: 7.26 s


In [31]:
from os.path import join
from common.paths import RIDGE_WEIGHTS

# Persist the split-averaged ridge weights under a name identifying the run
# (population, target and age group).
filename = f'ridge_{population}_{selected_target}_{age_group}.npy'
weights_path = join(RIDGE_WEIGHTS, filename)
np.save(weights_path, avg_coef)

#### ICC (intraclass correlation of ridge weights across CV runs)

In [16]:
import pingouin as pg

# Reshape the weights to long format: one row per (row of `coefs`, column of
# `coefs`) pair. The original row index is kept and copied into `cv_run_num`,
# while the column index becomes `connection`.
icc_data = (
    pd.DataFrame(coefs)
    .melt(var_name='connection', value_name='weight', ignore_index=False)
    .assign(cv_run_num=lambda long_df: long_df.index)
)

# Intraclass correlation with connections as targets and CV runs as raters.
icc = pg.intraclass_corr(
    data=icc_data,
    targets='connection',
    raters='cv_run_num',
    ratings='weight',
).round(3)
icc.set_index("Type")

Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.906,968.25,34715,3436884,0.0,"[0.91, 0.91]"
ICC2,Single random raters,0.906,968.346,34715,3436785,0.0,"[0.91, 0.91]"
ICC3,Single fixed raters,0.906,968.346,34715,3436785,0.0,"[0.91, 0.91]"
ICC1k,Average raters absolute,0.999,968.25,34715,3436884,0.0,"[1.0, 1.0]"
ICC2k,Average random raters,0.999,968.346,34715,3436785,0.0,"[1.0, 1.0]"
ICC3k,Average fixed raters,0.999,968.346,34715,3436785,0.0,"[1.0, 1.0]"


### Run permutation test

In [37]:
%%time
# Permutation test for the fixed-alpha pipeline: the cross-validated score on
# the true labels is compared against scores obtained after shuffling `y_cv`
# `n_perm` times.
# The per-permutation scores are bound to a named variable instead of `_`, so
# IPython's last-output variable is not shadowed.
score, perm_scores, pvalue = permutation_test_score(
    pipe,
    X_cv,
    y_cv,
    cv=rkf,
    scoring=regression_scorer,
    n_permutations=n_perm,
    n_jobs=-1,
    random_state=42,  # fixed seed: reproducible label permutations and p-value
)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(f'Alpha: {ridge_cv.alpha_} | Score: {score:.2f} | p-value: {pvalue:.4f}')

ridge_healthy_WISC_PSI_all
Alpha: 5000 | Score: -0.04 | p-value: 0.6268
CPU times: user 1min 13s, sys: 2.12 s, total: 1min 15s
Wall time: 1h 5min 49s


## Run for all targets, all age bins

In [5]:
%%time
from common.binning import bin_by_age
from common.wisc import FSIQ, PRIMARY_INDICES
from common.paths import RIDGE_RESULTS

# Collect one result record per (target, age bin) combination.
results = []
targets = FSIQ + PRIMARY_INDICES

# Loop-invariant settings, built once instead of on every iteration.
# To run on the full sample only, truncate `X_all` / `y_all` below to their
# first element; `zip` then stops after the "All" label.
bin_labels = ["All  ", "Bin 1", "Bin 2", "Bin 3"]
candidate_alphas = list(range(1000, 21000, 1000))

# NOTE(review): only the first three targets are evaluated although this
# section is titled "all targets" -- confirm the `[:3]` slice is intentional.
for target in targets[:3]:
    y = Y[target]
    bins = bin_by_age(X, y, ages, y)
    bin_1, bin_2, bin_3 = bins[0], bins[1], bins[2]
    X_all = [X, bin_1[0], bin_2[0], bin_3[0]]
    y_all = [y, bin_1[1], bin_2[1], bin_3[1]]

    for X_cv, y_cv, bin_label in zip(X_all, y_all, bin_labels):
        # Find best alpha
        estimators = [StandardScaler(), RidgeCV(alphas=candidate_alphas,
                                                scoring=regression_scorer, cv=rkf)]
        pipe = make_pipeline(*estimators).fit(X_cv, y_cv)
        ridge_cv = pipe['ridgecv']

        # Do permutation test on an unfitted pipeline fixed at the chosen alpha
        estimators = [StandardScaler(), Ridge(alpha=ridge_cv.alpha_)]
        pipe = make_pipeline(*estimators)

        score, perm_scores, pvalue = permutation_test_score(
            pipe, X_cv, y_cv, cv=rkf, scoring=regression_scorer,
            n_permutations=n_perm, n_jobs=-1,
            random_state=42,  # fixed seed: reproducible permutations and p-values
        )
        results.append({
            'Model': 'ridge',
            'Population': population,
            'Target': target,
            'Bin': bin_label,
            'Alpha': ridge_cv.alpha_,
            'Score': score,
            'P-value': pvalue,
        })
        print(results[-1])

        # Checkpoint after every combination: the full run takes hours, so a
        # crash or interrupted kernel should not discard finished results.
        # The final write below produces the same file.
        pd.DataFrame(results).to_csv(RIDGE_RESULTS)

results_df = pd.DataFrame(results)
display(results_df)
results_df.to_csv(RIDGE_RESULTS)

ridge_adhd_WISC_FSIQ_All   | Alpha: 3000 | Score: 0.42 | p-value: 0.0003
---
ridge_adhd_WISC_VSI_All   | Alpha: 4000 | Score: 0.28 | p-value: 0.0013
---
ridge_adhd_WISC_VCI_All   | Alpha: 14000 | Score: 0.41 | p-value: 0.0003
---
CPU times: user 1h 53min 20s, sys: 9min 39s, total: 2h 3min
Wall time: 12h 5min 6s
