# PLS Regression

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plot styling: seaborn's "paper" context on a white grid with
# 1.2x fonts; figures default to a 10x10 size at 300 dpi.
sns.set_theme(
    rc={"figure.figsize": (10, 10), "figure.dpi": 300},
    font_scale=1.2,
    style="whitegrid",
    context="paper",
)

## Get Data

In [2]:
from common.data import get_data

# Load the inputs (X), the keyed collection of targets (Y) and the demographics
# table from the project helper.
X, Y, demographics = get_data()
ages = demographics['Age']
sexes = demographics['Sex']

# Quick sanity check of what was loaded.
print(f'X: {X.shape} | Y: {len(Y.keys())} | Age: {ages.shape} | Sex: {sexes.shape}')

X: (65, 34716) | Y: 16 | Age: (65,) | Sex: (65,)


In [4]:
from scipy import stats
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RepeatedKFold, cross_validate, permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [41]:
# Key of the single target analysed in this section; it indexes Y and labels the
# results printed by the cells below.
selected_target = "WISC_PSI"
y = Y[selected_target]

# Sanity check: show the shape of the selected target vector.
print(f'y: {y.shape}')

y: (51,)


In [42]:
from common.binning import bin_by_age

# Split the data into age bins with the project helper.
# NOTE(review): the trailing positional True is presumably a verbosity flag (the
# saved output shows per-bin ranges being printed) — confirm in common.binning.
bins = bin_by_age(X, y, ages, y, True)
# Each bin is indexed as bin[0] (its X, whose shape is printed here) and bin[1]
# (used as the matching y by the cross-validation cells).
bin_1, bin_2, bin_3 = bins[0], bins[1], bins[2]
print(f'Bin 1: {bin_1[0].shape} | Bin 2: {bin_2[0].shape} | Bin 3: {bin_3[0].shape}')

Bin 0 Range: 6.22 -> 8.80
Bin 1 Range: 9.34 -> 11.97
Bin 2 Range: 12.50 -> 15.87
---
Bin 0 Range: 80.00 -> 144.00
Bin 1 Range: 86.00 -> 123.00
Bin 2 Range: 66.00 -> 126.00
---
Bin 1: (19, 34716) | Bin 2: (16, 34716) | Bin 3: (16, 34716)


### Set up the model pipeline, metrics, and cross-validation approach

In [8]:
def regression_scorer(reg, X, y):
    """Score a fitted regressor by the Pearson correlation of y and its predictions.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``
    X : samples to predict on
    y : observed values of the (single) target, one per sample

    Returns
    -------
    float
        Pearson r between ``y`` and the predictions for the first target.
    """
    y_pred = np.asarray(reg.predict(X))
    # Depending on the scikit-learn version, PLSRegression.predict returns either
    # an (n_samples, 1) column or a flat (n_samples,) vector for a 1-D target.
    # Indexing [:, 0] unconditionally raises IndexError on the flat form.
    if y_pred.ndim > 1:
        y_pred = y_pred[:, 0]
    return stats.pearsonr(y, y_pred)[0]

# Standardise the features, then fit a 4-component PLS regression.
estimators = [
    StandardScaler(),
    PLSRegression(n_components=4),
]
pipe = make_pipeline(*estimators)

# Entries of the cross_validate result that are summarised after a run.
scoring = ['train_score', 'test_score']
# A single repeat of 5-fold CV; the fixed seed keeps the splits reproducible.
rkf = RepeatedKFold(random_state=251183, n_repeats=1, n_splits=5)

### Run cross-validation

In [17]:
%%time
# To cross-validate a single age bin instead of the full sample, uncomment:
# X_cv = bin_3[0]
# y_cv = bin_3[1]
X_cv = X
y_cv = y

# return_estimator=True keeps the fitted pipeline of every fold so that the PLS
# coefficients can be averaged across folds below.
scores = cross_validate(
    pipe, X_cv, y_cv, cv=rkf, scoring=regression_scorer, n_jobs=-1,
    return_train_score=True, return_estimator=True)

coefs = [estimator['plsregression'].coef_ for estimator in scores['estimator']]
# For a single target, coef_ is (n_features, 1) on older scikit-learn releases and
# (1, n_features) from 1.3 on. Slicing [:, 0] would silently return one value on
# the newer layout, so flatten instead: ravel() yields the n_features weights in
# both cases. (Only valid while y_cv is a single target, as it is in this cell.)
avg_coef = np.mean(coefs, axis=0).ravel()

print(f'Target: {selected_target}')
for metric in scoring:
    metric_values = scores[metric]
    print(f'Avg {metric}: {np.mean(metric_values):.2f}')

Target: WISC_FSIQ
Avg train_score: 0.85
Avg test_score: 0.36
CPU times: user 115 ms, sys: 273 ms, total: 387 ms
Wall time: 2.81 s


In [26]:
from common.paths import PLS_WEIGHTS

# Persist the fold-averaged PLS coefficients at the project-configured path.
np.save(PLS_WEIGHTS, avg_coef)

### Run permutation statistic

In [46]:
%%time
# To test a single age bin instead of the whole sample, uncomment:
# X_cv = bin_1[0]
# y_cv = bin_1[1]
X_cv, y_cv = X, y

# Compare the cross-validated score with the scores of 3000 refits on permuted
# targets; the per-permutation scores (second return value) are not used.
score, _, pvalue = permutation_test_score(
    pipe,
    X_cv,
    y_cv,
    scoring=regression_scorer,
    cv=rkf,
    n_permutations=3000,
    n_jobs=-1,
)

print(f'Target: {selected_target}')
print(f'Score: {score:.2f} | p-value: {pvalue:.4f}')

Target: WISC_PSI
Score: 0.06 | p-value: 0.3949
CPU times: user 29.5 s, sys: 1.13 s, total: 30.7 s
Wall time: 4min 43s


## Run for all targets, all age bins

In [28]:
def regression_scorer(reg, X, y):
    """Pearson correlation between the observed target and the model's predictions.

    Parameters
    ----------
    reg : fitted estimator exposing ``predict``
    X : samples to predict on
    y : observed values of the (single) target, one per sample

    Returns
    -------
    float
        Pearson r between ``y`` and the predictions for the first target.
    """
    predictions = np.asarray(reg.predict(X))
    # A 1-D target can come back from PLSRegression.predict either as an
    # (n_samples, 1) column or as a flat vector, depending on the scikit-learn
    # version; only take the first column when there is a column axis to index.
    if predictions.ndim > 1:
        predictions = predictions[:, 0]
    return stats.pearsonr(y, predictions)[0]

# One seeded repeat of 5-fold CV, passed as `cv` to the permutation tests below.
rkf = RepeatedKFold(random_state=251183, n_repeats=1, n_splits=5)
# Names of the cross_validate result entries reported by the cross-validation cell.
scoring = ['train_score', 'test_score']

# Standardise the features, then regress with 4 PLS components.
estimators = [
    StandardScaler(),
    PLSRegression(n_components=4),
]
pipe = make_pipeline(*estimators)

In [None]:
%%time
from common.binning import bin_by_age
from common.wisc import FSIQ, PRIMARY_INDICES

targets = FSIQ + PRIMARY_INDICES
# The labels do not depend on the target, so build them once outside the loop.
age_bin_label = ["All  ", "Bin 1", "Bin 2", "Bin 3"]
# Keep every (target, bin) result as a record: each permutation test takes
# minutes, and values that are only printed are lost if the run is interrupted
# or the output is cleared.
permutation_results = []

for target in targets:
    y = Y[target]
    bins = bin_by_age(X, y, ages, y)
    bin_1, bin_2, bin_3 = bins[0], bins[1], bins[2]
    # First entry is the full sample, followed by the three age bins.
    X_all = [X, bin_1[0], bin_2[0], bin_3[0]]
    y_all = [y, bin_1[1], bin_2[1], bin_3[1]]

    for X_cv, y_cv, bin_label in zip(X_all, y_all, age_bin_label):
        score, _, pvalue = permutation_test_score(
            pipe, X_cv, y_cv, cv=rkf, scoring=regression_scorer, n_permutations=3000, n_jobs=-1)
        permutation_results.append(
            {'target': target, 'bin': bin_label.strip(), 'score': score, 'pvalue': pvalue})
        print(f'Bin: {bin_label} | Target: {target} | Score: {score:.2f} | p-value: {pvalue:.4f}')
    print('---')

Bin: All   | Target: WISC_FSIQ | Score: 0.12 | p-value: 0.2469


In [25]:
def regression_scorer(reg, X, y):
    """Return one Pearson correlation per target column of a multi-target model.

    Parameters
    ----------
    reg : fitted estimator whose ``predict`` returns one column per target
    X : samples to predict on
    y : 2-D array-like of observed targets, indexed as (sample, target)

    Returns
    -------
    list of float
        Pearson r between each column of ``y`` and the matching prediction column.

    Notes
    -----
    NOTE(review): scikit-learn documents callable scorers as returning a single
    value; the per-target list is kept for compatibility — consider averaging it
    if a scalar score is wanted.
    """
    y_true = np.asarray(y)
    y_pred = np.asarray(reg.predict(X))
    # Score every target column that is present rather than a hard-coded 6, so the
    # scorer works for any number of targets.
    return [
        stats.pearsonr(y_true[:, measure], y_pred[:, measure])[0]
        for measure in range(y_true.shape[1])
    ]

In [26]:
%%time
from common.binning import bin_by_age
from common.wisc import FSIQ, PRIMARY_INDICES

keys = FSIQ + PRIMARY_INDICES
# Stack the selected targets column-wise into one multi-target matrix. Y is
# indexed directly so that a missing key raises KeyError instead of silently
# inserting None into the array.
y = np.array([Y[key] for key in keys]).T
# All targets are scored jointly here, so label the output with every key.
# `target` would be a stale leftover from a per-target loop in another cell, or
# undefined on a fresh kernel.
target_label = ', '.join(map(str, keys))

bins = bin_by_age(X, y, ages, y)
bin_1, bin_2, bin_3 = bins[0], bins[1], bins[2]
# First entry is the full sample, followed by the three age bins.
X_all = [X, bin_1[0], bin_2[0], bin_3[0]]
y_all = [y, bin_1[1], bin_2[1], bin_3[1]]
age_bin_label = ["All  ", "Bin 1", "Bin 2", "Bin 3"]

for X_cv, y_cv, bin_label in zip(X_all, y_all, age_bin_label):
    score, _, pvalue = permutation_test_score(
        pipe, X_cv, y_cv, cv=rkf, scoring=regression_scorer, n_permutations=3000, n_jobs=-1)
    print(f'Bin: {bin_label} | Target: {target_label} | Score: {score:.2f} | p-value: {pvalue:.4f}')
print('---')

KeyboardInterrupt: 