# PLS Regression

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Global plot styling for the notebook: seaborn "paper" context on a white
# grid, fonts scaled by 1.2, and 10x10 figures rendered at 300 dpi.
sns.set_theme(
    style="whitegrid",
    context="paper",
    font_scale=1.2,
    rc={"figure.dpi": 300, "figure.figsize": (10, 10)},
)

## Get Data

In [2]:
from common.data import get_data

# get_data() returns three objects; the 'Age' entry of the third one is kept
# separately because the age-binning steps below need it.
X, Y, demographics = get_data()
ages = demographics['Age']

# The cohort label is inferred from the row count alone: exactly 390 rows is
# labelled 'adhd', any other sample size is labelled 'healthy'.
if X.shape[0] == 390:
    population = 'adhd'
else:
    population = 'healthy'

print(f'X: {X.shape} | Y: {len(Y.keys())} | Age: {ages.shape}')

X: (390, 34716) | Y: 16 | Age: (390,) | Sex: (390,)


In [3]:
from scipy import stats
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold, cross_validate, permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [4]:
# Name of the single target analysed in this section. It is used to look up
# `y` in Y and is echoed in the result printouts of the cells below.
selected_target = "WISC_FSIQ"
y = Y[selected_target]

print(f'y: {y.shape}')

y: (390,)


In [42]:
from common.binning import bin_by_age

# Split the samples into age bins.
# NOTE(review): `y` is passed as both the 2nd and the 4th argument, and the
# trailing positional `True` is unnamed -- confirm against bin_by_age's
# signature what each one controls (the flag presumably enables the
# "Bin N Range" printout; TODO confirm).
bins = bin_by_age(X, y, ages, y, True)
# Elsewhere in this notebook element [0] of a bin is used as its X and element
# [1] as its y. The names/labels here are 1-based while `bins` is 0-indexed.
bin_1, bin_2, bin_3 = bins[0], bins[1], bins[2]
print(f'Bin 1: {bin_1[0].shape} | Bin 2: {bin_2[0].shape} | Bin 3: {bin_3[0].shape}')

Bin 0 Range: 6.22 -> 8.80
Bin 1 Range: 9.34 -> 11.97
Bin 2 Range: 12.50 -> 15.87
---
Bin 0 Range: 80.00 -> 144.00
Bin 1 Range: 86.00 -> 123.00
Bin 2 Range: 66.00 -> 126.00
---
Bin 1: (19, 34716) | Bin 2: (16, 34716) | Bin 3: (16, 34716)


### Set up the model pipeline, metrics, and cross-validation approach

In [15]:
def regression_scorer(reg, X, y):
    """Score a fitted regressor by the Pearson correlation of its predictions.

    Has the ``(estimator, X, y)`` signature expected of a scikit-learn scoring
    callable.

    Parameters
    ----------
    reg : fitted estimator
        Any object exposing ``predict(X)``.
    X : array-like
        Samples to predict from.
    y : array-like, 1-D
        True target values for ``X``.

    Returns
    -------
    float
        Pearson r between ``y`` and the predicted values.
    """
    y_pred = np.asarray(reg.predict(X))
    # Older scikit-learn releases return an (n_samples, 1) array from
    # PLSRegression.predict even for a 1-D target, while releases >= 1.3
    # return a 1-D array. Only select the first column when there is one,
    # so the scorer works with both.
    if y_pred.ndim > 1:
        y_pred = y_pred[:, 0]
    return stats.pearsonr(y, y_pred)[0]

# Model: standardise every feature, then fit a 4-component PLS regression.
estimators = [StandardScaler(), PLSRegression(n_components=4)]
pipe = make_pipeline(*estimators)

# Keys of the cross_validate() result dict that are averaged and printed by the
# cross-validation cell (these are result keys, not scorer names).
scoring = ['train_score', 'test_score']
# 10-fold CV repeated 10 times. The splitter is seeded so that the folds, and
# therefore the scores and collected PLS weights, are reproducible across
# kernel restarts; the same seed is used by the other splitters in this notebook.
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=251183)

In [14]:
# Seeded 10x10 repeated K-fold splitter. Rebinding `rkf` here replaces the
# splitter created in the setup cell above.
rkf = RepeatedKFold(n_splits=10, n_repeats=10, random_state=251183)

# Sanity check: print the held-out (test) row indices of every split.
for train_index, test_index in rkf.split(X):
    print("TEST:", test_index)

TEST: [  1   7  32  49  56  69  96 101 110 121 122 128 138 149 150 152 168 172
 176 182 185 200 210 213 214 223 227 229 231 233 243 261 275 302 316 323
 328 336 386]
TEST: [  3  11  17  21  22  38  39  51  64  65  71 113 118 155 160 175 183 186
 211 219 224 238 245 254 264 269 277 284 290 292 293 304 308 321 325 337
 341 376 384]
TEST: [  2  12  15  29  36  53  61  70  73  80  82  85  86  90  98 120 134 141
 174 209 215 232 234 239 248 263 278 283 289 298 299 300 306 347 356 361
 366 383 389]
TEST: [  4   5  68  75  77  88 107 108 117 127 137 139 179 193 202 212 216 218
 228 230 252 271 282 285 288 296 307 309 311 326 327 329 330 338 345 352
 364 365 380]
TEST: [ 10  24  27  34  35  44  47  52  67  72  74  79  91  94  97 114 116 129
 132 146 177 190 207 226 240 241 242 246 250 260 266 270 272 294 312 344
 346 375 385]
TEST: [  9  14  23  37  46  48  54  81  93 100 109 126 143 144 153 157 159 161
 180 189 194 198 205 206 217 256 273 276 303 305 317 339 340 349 358 362
 377 379 382]
TEST

### Run cross-validation

In [16]:
%%time
# Cross-validate on the full sample. To analyse a single age bin instead, use
# e.g. `X_cv, y_cv = bin_3[0], bin_3[1]`.
X_cv = X
y_cv = y

# return_estimator=True keeps every fitted pipeline so that the PLS weights of
# each CV run can be collected below.
scores = cross_validate(pipe, X_cv, y_cv, cv=rkf, scoring=regression_scorer, n_jobs=-1,
                        return_train_score=True, return_estimator=True)

# Stack one weight vector per CV run into an (n_runs, n_features) array.
# PLSRegression.coef_ is (n_features, 1) for a single target in older
# scikit-learn releases and (1, n_features) from 1.3 on; flattening each run
# handles both layouts, whereas squeezing a fixed axis only handles the former.
coefs = np.array([estimator['plsregression'].coef_ for estimator in scores['estimator']])
coefs = coefs.reshape(coefs.shape[0], -1)
avg_coef = np.mean(coefs, axis=0)

print(f'Target: {selected_target}')
for metric in scoring:
    metric_values = scores[metric]
    print(f'Avg {metric}: {np.mean(metric_values):.2f}')

Target: WISC_FSIQ
Avg train_score: 0.84
Avg test_score: 0.35
CPU times: user 1.93 s, sys: 868 ms, total: 2.8 s
Wall time: 24.2 s


In [44]:
from common.paths import PLS_WEIGHTS

# Persist the PLS weight vector averaged over all CV runs to PLS_WEIGHTS.
np.save(PLS_WEIGHTS, avg_coef)

#### Intraclass correlation (ICC) of the PLS weights across CV runs

In [17]:
print(coefs.shape)
# Reshape the per-run weight matrix to long format: one row per
# (connection, CV run) pair. ignore_index=False keeps the original row index
# (the CV run number), which is then exposed as an explicit column.
icc_data = (
    pd.DataFrame(coefs)
    .melt(var_name='connection', value_name='weight', ignore_index=False)
    .assign(cv_run_num=lambda frame: frame.index)
)

(100, 34716)


In [32]:
# Long-format table: one row per (connection, CV run) pair with its weight.
display(icc_data)

Unnamed: 0,connection,weight,cv_run_num
0,0,0.005839,0
1,0,0.003551,1
2,0,0.004353,2
3,0,0.001609,3
4,0,0.004748,4
...,...,...,...
95,34715,0.002623,95
96,34715,0.000145,96
97,34715,-0.000327,97
98,34715,0.002205,98


In [18]:
%%time

import pingouin as pg

# Intraclass correlation of the PLS weights: every connection is a target and
# every CV run acts as a rater. The recorded run of this cell took roughly
# 17 minutes of wall time.
icc = pg.intraclass_corr(
    data=icc_data,
    targets='connection',
    raters='cv_run_num',
    ratings='weight',
)
icc = icc.round(3)
# Bare last expression: rendered as the cell output, indexed by ICC type.
icc.set_index("Type")

CPU times: user 16min 38s, sys: 7.17 s, total: 16min 45s
Wall time: 16min 45s


Unnamed: 0_level_0,Description,ICC,F,df1,df2,pval,CI95%
Type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ICC1,Single raters absolute,0.91,1012.551,34715,3436884,0.0,"[0.91, 0.91]"
ICC2,Single random raters,0.91,1012.637,34715,3436785,0.0,"[0.91, 0.91]"
ICC3,Single fixed raters,0.91,1012.637,34715,3436785,0.0,"[0.91, 0.91]"
ICC1k,Average raters absolute,0.999,1012.551,34715,3436884,0.0,"[1.0, 1.0]"
ICC2k,Average random raters,0.999,1012.637,34715,3436785,0.0,"[1.0, 1.0]"
ICC3k,Average fixed raters,0.999,1012.637,34715,3436785,0.0,"[1.0, 1.0]"


### Run permutation test

In [46]:
%%time
# Permutation test on the full sample. To test a single age bin instead, use
# e.g. `X_cv, y_cv = bin_1[0], bin_1[1]`.
X_cv, y_cv = X, y

# permutation_test_score returns (score, permutation_scores, pvalue); the
# per-permutation scores are not needed here and are discarded.
score, _, pvalue = permutation_test_score(
    pipe,
    X_cv,
    y_cv,
    scoring=regression_scorer,
    cv=rkf,
    n_permutations=3000,
    n_jobs=-1,
)

print(f'Target: {selected_target}')
print(f'Score: {score:.2f} | p-value: {pvalue:.4f}')

Target: WISC_PSI
Score: 0.06 | p-value: 0.3949
CPU times: user 29.5 s, sys: 1.13 s, total: 30.7 s
Wall time: 4min 43s


## Run for all targets, all age bins

In [38]:
def regression_scorer(reg, X, y):
    """Score a fitted regressor by the Pearson correlation of its predictions.

    Re-declared in this section so it can be run on its own; an earlier cell
    defines a function with the same name.

    Parameters
    ----------
    reg : fitted estimator
        Any object exposing ``predict(X)``.
    X : array-like
        Samples to predict from.
    y : array-like, 1-D
        True target values for ``X``.

    Returns
    -------
    float
        Pearson r between ``y`` and the predicted values.
    """
    y_pred = np.asarray(reg.predict(X))
    # Older scikit-learn releases return an (n_samples, 1) array from
    # PLSRegression.predict even for a 1-D target, while releases >= 1.3
    # return a 1-D array. Only select the first column when there is one,
    # so the scorer works with both.
    if y_pred.ndim > 1:
        y_pred = y_pred[:, 0]
    return stats.pearsonr(y, y_pred)[0]

# Model for the all-targets run: standardise every feature, then fit a
# 4-component PLS regression.
estimators = [StandardScaler(), PLSRegression(n_components=4)]
pipe = make_pipeline(*estimators)

# Result keys of interest, and a single seeded pass of 5-fold CV (fewer folds
# and repeats than the 10x10 splitter used in the single-target section).
scoring = ['train_score', 'test_score']
rkf = RepeatedKFold(n_repeats=1, n_splits=5, random_state=251183)

In [39]:
%%time
from common.binning import bin_by_age
from common.wisc import FSIQ, PRIMARY_INDICES

targets = FSIQ + PRIMARY_INDICES
# Row labels for the printout, in the same order as the data sets built below.
age_bin_label = ["All  ", "Bin 1", "Bin 2", "Bin 3"]

for target in targets:
    # NOTE: rebinding `y` here overwrites the `y` chosen in the single-target
    # section; re-run that section's selection cell before reusing it.
    y = Y[target]
    bins = bin_by_age(X, y, ages, y)
    bin_1, bin_2, bin_3 = bins[0], bins[1], bins[2]
    # Full sample first, followed by the three age bins.
    X_all = [X] + [age_bin[0] for age_bin in (bin_1, bin_2, bin_3)]
    y_all = [y] + [age_bin[1] for age_bin in (bin_1, bin_2, bin_3)]

    # One permutation test per (target, data set); the per-permutation scores
    # (middle return value) are discarded.
    for bin_label, X_cv, y_cv in zip(age_bin_label, X_all, y_all):
        score, _, pvalue = permutation_test_score(
            pipe, X_cv, y_cv, cv=rkf, scoring=regression_scorer, n_permutations=3000, n_jobs=-1)
        print(f'Bin: {bin_label} | Target: {target} | Score: {score:.2f} | p-value: {pvalue:.4f}')
    print('---')

Bin: All   | Target: WISC_FSIQ | Score: 0.36 | p-value: 0.0133
Bin: Bin 1 | Target: WISC_FSIQ | Score: -0.03 | p-value: 0.5475
Bin: Bin 2 | Target: WISC_FSIQ | Score: 0.22 | p-value: 0.2459
Bin: Bin 3 | Target: WISC_FSIQ | Score: -0.07 | p-value: 0.6015
---
Bin: All   | Target: WISC_VSI | Score: 0.34 | p-value: 0.0210
Bin: Bin 1 | Target: WISC_VSI | Score: -0.34 | p-value: 0.8254
Bin: Bin 2 | Target: WISC_VSI | Score: 0.15 | p-value: 0.3159
Bin: Bin 3 | Target: WISC_VSI | Score: 0.25 | p-value: 0.2206
---
Bin: All   | Target: WISC_FRI | Score: 0.20 | p-value: 0.1123
Bin: Bin 1 | Target: WISC_FRI | Score: -0.03 | p-value: 0.5315
Bin: Bin 2 | Target: WISC_FRI | Score: 0.23 | p-value: 0.2423
Bin: Bin 3 | Target: WISC_FRI | Score: -0.15 | p-value: 0.6924
---
Bin: All   | Target: WISC_WMI | Score: -0.00 | p-value: 0.4972
Bin: Bin 1 | Target: WISC_WMI | Score: -0.31 | p-value: 0.8064
Bin: Bin 2 | Target: WISC_WMI | Score: 0.50 | p-value: 0.0447
Bin: Bin 3 | Target: WISC_WMI | Score: -0.05 | 