# PLS Regression

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plot styling: "paper" context on a white grid, fonts scaled by
# 1.2, and 10x10 figures rendered at 300 dpi.
sns.set_theme(
    context="paper",
    style="whitegrid",
    font_scale=1.2,
    rc={"figure.figsize": (10, 10), "figure.dpi": 300},
)

## Get Data

In [3]:
from common.data import get_data
from common.paths import HEALTHY, ADHD

# Load the feature matrix, the targets and the demographics for the data
# labelled by HEALTHY, then pull out the two demographic columns used later.
X, Y, demographics, population = get_data(label_path=HEALTHY)
ages = demographics['Age']
sexes = demographics['Sex']

# One-line summary of what was loaded, as a sanity check before modelling.
print(' | '.join([
    f'X: {X.shape}',
    f'Y: {len(Y.keys())}',
    f'Age: {ages.shape}',
    f'Population: {population}',
]))

X: (106, 34716) | Y: 6 | Age: (106,) | Population: healthy


In [12]:
# Target used by the single-target sections of this notebook.
selected_target = "WISC_FSIQ"
y = Y[selected_target]

print('y: {}'.format(y.shape))

y: (373,)


In [16]:
from common.binning import bin_data

# Split the samples and the selected target by age.
# NOTE(review): the positional arguments are presumably include_all=False and
# three bins — confirm against common.binning.bin_data.
X_bins, y_bins, bin_labels = bin_data(X, y, ages, False, 3)

# Show how many samples (and features) ended up in each bin.
for bin_label, age_bin in zip(bin_labels, X_bins):
    print('{}: {}'.format(bin_label, age_bin.shape))

Bin 1: (114, 34716)
Bin 2: (147, 34716)
Bin 3: (112, 34716)


In [17]:
# Down-sample the second age bin to a fixed number of samples.
EQUAL_BIN_SIZE = 113  # number of samples to keep in bin 2
SUBSAMPLE_SEED = 0    # fixed seed so the same subset is drawn on every run

# Size the draw from the bin itself instead of a hard-coded 147: the cell then
# keeps working when the bin size differs (other population / binning) and can
# be re-run after the bin has already been reduced without indexing past its end.
bin_2_size = len(X_bins[1])
bin_2_subsample_indices = np.random.default_rng(SUBSAMPLE_SEED).choice(
    bin_2_size, min(EQUAL_BIN_SIZE, bin_2_size), replace=False)
X_bins[1] = X_bins[1][bin_2_subsample_indices]
y_bins[1] = y_bins[1][bin_2_subsample_indices]
bin_labels[1] = 'Bin 2 Equal'

print(X_bins[1].shape, y_bins[1].shape)

(113, 34716) (113,)


## Run for one target, one age bin

In [4]:
from common.scoring import unimetric_scorer, multimetric_scorer, N_PERM, SCORING, RKF_10_10
from sklearn.cross_decomposition import PLSRegression
from sklearn.model_selection import cross_validate, permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

### Set up the model pipeline, metrics, and cross-validation approach

In [6]:
# Model: standardise the features, then fit a 4-component PLS regression.
estimators = [
    StandardScaler(),
    PLSRegression(n_components=4),
]
pipe = make_pipeline(estimators[0], estimators[1])

### Run cross-validation

In [6]:
%%time

# Cross-validate on the full sample. To evaluate a single age bin instead,
# substitute the matching entries of X_bins / y_bins here.
X_cv = X
y_cv = y
age_group = 'all'

scores = cross_validate(pipe, X_cv, y_cv, cv=RKF_10_10, scoring=unimetric_scorer, n_jobs=-1,
                        return_train_score=True, return_estimator=True)

# One coefficient vector per fitted fold. np.ravel flattens the single-target
# coef_ whether scikit-learn stores it as (n_features, 1) (releases before 1.3)
# or (1, n_features) (1.3 and later); squeezing axis 2 only works for the former.
coefs = np.array([np.ravel(estimator['plsregression'].coef_) for estimator in scores['estimator']])
avg_coef = np.mean(coefs, axis=0)

print(f'pls_{population}_{selected_target}_{age_group}')
# Report the SCORING entries that cross_validate actually returned. With a
# single-metric scorer the result keys are 'train_score' / 'test_score', so if
# none of the SCORING names are present, fall back to every train_/test_ entry
# instead of raising KeyError.
reported_metrics = [metric for metric in SCORING if metric in scores] or [
    key for key in scores if key.startswith(('train_', 'test_'))]
for metric in reported_metrics:
    metric_values = scores[metric]
    print(f'Avg {metric}: {np.mean(metric_values):.2f}')

0.18997362711567398
CPU times: user 1.94 s, sys: 1.07 s, total: 3.01 s
Wall time: 6.84 s


In [None]:
# Distribution of the per-fold test scores from the cross-validation above.
# cross_validate was run with a single-metric scorer, so the per-fold values
# are stored under 'test_score'; there is no 'test_p_value' entry to plot, and
# the y-axis is labelled as the score itself, not a p-value.
fig, ax = plt.subplots()
sns.boxplot(data=scores['test_score'], palette="Set2", ax=ax)
sns.swarmplot(data=scores['test_score'], color="0.3", ax=ax)
ax.set_ylabel('Test Score (Pearson r)')
ax.set_xlabel('Groups')
# Fill the previously empty "( -> )" placeholder with what was modelled.
ax.set_title(f'Cross Validation Results ({population} -> {selected_target})');

In [30]:
from os.path import join
from common.paths import PLS_WEIGHTS

# Store the fold-averaged PLS coefficients under PLS_WEIGHTS, named after the
# population, target and age group of the run that produced them.
filename = 'pls_{}_{}_{}.npy'.format(population, selected_target, age_group)

np.save(
    join(PLS_WEIGHTS, filename),
    avg_coef,
)

### Run permutation statistic

In [23]:
%%time
# Permutation test on the full sample. To test a single age bin instead,
# substitute the matching entries of X_bins / y_bins here.
X_cv, y_cv = X, y
age_group = 'all'

# The per-permutation scores (second return value) are not needed in this cell.
score, _, pvalue = permutation_test_score(
    pipe,
    X_cv,
    y_cv,
    cv=RKF_10_10,
    scoring=unimetric_scorer,
    n_permutations=N_PERM,
    n_jobs=-1,
)

print('pls_{}_{}_{}'.format(population, selected_target, age_group))
print('Score: {:.2f} | p-value: {:.4f}'.format(score, pvalue))

pls_healthy_WISC_PSI_all
Score: -0.06 | p-value: 0.6934
CPU times: user 2min 9s, sys: 4.18 s, total: 2min 13s
Wall time: 4h 47min 31s


In [9]:
%%time
# Permutation test with age as the regression target.
# NOTE(review): `cust_r2` is not defined or imported anywhere in the visible
# notebook — confirm where it comes from; on a fresh kernel this cell would
# fail with a NameError unless it is defined elsewhere.
X_cv, y_cv = X, ages

score, perm_scores, pvalue = permutation_test_score(
    pipe,
    X_cv,
    y_cv,
    cv=RKF_10_10,
    scoring=cust_r2,
    n_permutations=N_PERM,
    n_jobs=-1,
)

print('Score: {:.2f} | p-value: {:.4f}'.format(score, pvalue))

Score: 0.17 | p-value: 0.0020
CPU times: user 1min 44s, sys: 2.31 s, total: 1min 47s
Wall time: 39min 18s


## Run for all targets, all age bins

### Run cross-validation

In [7]:
%%time
from os.path import join
from common.binning import bin_data
from common.wisc import WISC_LEVEL
from common.paths import PLS_WEIGHTS, PLS_RESULTS
from common.results import save_results, save_perm_score

# Cross-validate the PLS pipeline for every target and every age group,
# collecting the mean test score and saving the fold-averaged coefficients.
results = []
targets = WISC_LEVEL[5]
# bin_data gets None instead of the ages for the healthy population (the
# recorded output then contains only the 'All' group). This is held in its own
# name so the notebook-level `ages` is not overwritten and earlier cells that
# read it still work when re-run.
bin_ages = None if population == 'healthy' else ages

for target in targets:
    # Separate name so the notebook-level `y` (the selected target used by the
    # single-target cells) is not silently replaced by the last loop target.
    y_target = Y[target]
    X_all, y_all, bin_labels = bin_data(X, y_target, bin_ages, include_all=True)

    for X_cv, y_cv, bin_label in zip(X_all, y_all, bin_labels):
        estimators = [StandardScaler(), PLSRegression(n_components=4)]
        pipe = make_pipeline(*estimators)

        scores = cross_validate(pipe, X_cv, y_cv, cv=RKF_10_10,
                                scoring=unimetric_scorer, n_jobs=-1,
                                return_train_score=False,
                                return_estimator=True)
        # One coefficient vector per fitted fold. np.ravel handles both
        # single-target coef_ layouts: (n_features, 1) before scikit-learn 1.3
        # and (1, n_features) from 1.3 on; squeezing axis 2 only fits the former.
        coefs = np.array([np.ravel(estimator['plsregression'].coef_) for estimator in scores['estimator']])
        avg_coef = np.mean(coefs, axis=0)

        results.append({
            'Model': 'pls',
            'Population': population,
            'Target': target,
            'Bin': bin_label,
            'Score': np.mean(scores['test_score']),
        })
        print(results[-1])

        fn = f'pls_{population}_{target}_{bin_label}.npy'
        np.save(join(PLS_WEIGHTS, fn), avg_coef)

results_df = pd.DataFrame(results)
display(results_df.round(4))

{'Model': 'pls', 'Population': 'healthy', 'Target': 'WISC_FSIQ', 'Bin': 'All', 'Score': 0.053455548841894907}
{'Model': 'pls', 'Population': 'healthy', 'Target': 'WISC_VSI', 'Bin': 'All', 'Score': 0.1159526196215209}
{'Model': 'pls', 'Population': 'healthy', 'Target': 'WISC_VCI', 'Bin': 'All', 'Score': 0.24470233655032808}
{'Model': 'pls', 'Population': 'healthy', 'Target': 'WISC_FRI', 'Bin': 'All', 'Score': -0.10988945686470723}
{'Model': 'pls', 'Population': 'healthy', 'Target': 'WISC_WMI', 'Bin': 'All', 'Score': 0.08617518866321351}
{'Model': 'pls', 'Population': 'healthy', 'Target': 'WISC_PSI', 'Bin': 'All', 'Score': -0.09936570873375464}


Unnamed: 0,Model,Population,Target,Bin,Score
0,pls,healthy,WISC_FSIQ,All,0.0535
1,pls,healthy,WISC_VSI,All,0.116
2,pls,healthy,WISC_VCI,All,0.2447
3,pls,healthy,WISC_FRI,All,-0.1099
4,pls,healthy,WISC_WMI,All,0.0862
5,pls,healthy,WISC_PSI,All,-0.0994


CPU times: user 1min 43s, sys: 22.9 s, total: 2min 6s
Wall time: 2min 20s


In [9]:
# Save the cross-validation summary table; the value returned by save_results
# is the cell's displayed output.
filename = 'pls_cv_{}'.format(population)
save_results(results_df, filename, PLS_RESULTS)

'/home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/PLS/pls_cv_healthy.csv'

### Run permutation test statistic (pts)

In [19]:
%%time
from common.binning import bin_data
from common.wisc import WISC_LEVEL
from common.paths import PLS_RESULTS, PLS_PSCORES
from common.results import save_results, save_perm_score

# Permutation test of the PLS pipeline for every target and every age group,
# collecting score and p-value and saving the per-permutation scores.
results = []
targets = WISC_LEVEL[5]
# bin_data gets None instead of the ages for the healthy population. This is
# held in its own name so the notebook-level `ages` is not overwritten and
# earlier cells that read it still work when re-run.
bin_ages = None if population == 'healthy' else ages

# Build the pipeline here rather than relying on whichever earlier cell last
# assigned `pipe`; this is the same model used throughout the notebook.
pipe = make_pipeline(StandardScaler(), PLSRegression(n_components=4))

for target in targets:
    # Separate name so the notebook-level `y` (the selected target used by the
    # single-target cells) is not silently replaced by the last loop target.
    y_target = Y[target]
    X_all, y_all, bin_labels = bin_data(X, y_target, bin_ages, include_all=True)

    for X_cv, y_cv, bin_label in zip(X_all, y_all, bin_labels):
        score, permutation_scores, pvalue = permutation_test_score(
            pipe, X_cv, y_cv, cv=RKF_10_10, scoring=unimetric_scorer, n_permutations=N_PERM,
            n_jobs=-1)
        results.append({
            'Model': 'pls',
            'Population': population,
            'Target': target,
            'Bin': bin_label,
            'Score': score,
            'P-value': pvalue,
        })
        print(results[-1])
        save_perm_score(permutation_scores, f'pls_{population}_{target}_{bin_label}_perm_scores', PLS_PSCORES)

results_df = pd.DataFrame(results)
display(results_df.round(4))

{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'All', 'Score': 0.3519368647281619, 'P-value': 0.001996007984031936}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 1', 'Score': 0.242330533117219, 'P-value': 0.027944111776447105}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 2', 'Score': 0.3876011741186956, 'P-value': 0.001996007984031936}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 3', 'Score': 0.08624648992519683, 'P-value': 0.2275449101796407}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_VSI', 'Bin': 'All', 'Score': 0.27546671228488767, 'P-value': 0.001996007984031936}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_VSI', 'Bin': 'Bin 1', 'Score': 0.2434753169683444, 'P-value': 0.017964071856287425}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_VSI', 'Bin': 'Bin 2', 'Score': 0.21067601885844467, 'P-value': 0.017964071856287425}
{'Model': 'pls', 'Populatio

Unnamed: 0,Model,Population,Target,Bin,Score,P-value
0,pls,adhd,WISC_FSIQ,All,0.3519,0.002
1,pls,adhd,WISC_FSIQ,Bin 1,0.2423,0.0279
2,pls,adhd,WISC_FSIQ,Bin 2,0.3876,0.002
3,pls,adhd,WISC_FSIQ,Bin 3,0.0862,0.2275
4,pls,adhd,WISC_VSI,All,0.2755,0.002
5,pls,adhd,WISC_VSI,Bin 1,0.2435,0.018
6,pls,adhd,WISC_VSI,Bin 2,0.2107,0.018
7,pls,adhd,WISC_VSI,Bin 3,0.0822,0.2575
8,pls,adhd,WISC_VCI,All,0.3598,0.002
9,pls,adhd,WISC_VCI,Bin 1,0.191,0.0599


CPU times: user 1h 13min 28s, sys: 2min 15s, total: 1h 15min 44s
Wall time: 1d 5h 1min 43s


In [20]:
# Save the permutation-test summary table; the value returned by save_results
# is the cell's displayed output.
filename = 'pls_pts_{}'.format(population)
save_results(results_df, filename, PLS_RESULTS)

'/home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/PLS/pls_pts_adhd.csv.csv'

### Run permutation test statistic (pts) with equal sample size

In [21]:
%%time
from common.binning import bin_data
from common.wisc import WISC_LEVEL
from common.paths import PLS_RESULTS, PLS_PSCORES
from common.results import save_results, save_perm_score

# Permutation test for every target on a fixed-size random subset of the
# second age bin.
results = []
targets = WISC_LEVEL[5]
# Held in its own name so the notebook-level `ages` is not overwritten and
# earlier cells that read it still work when re-run.
bin_ages = None if population == 'healthy' else ages
bin_label = 'Bin 2 Equal'

EQUAL_BIN_SIZE = 113  # number of samples to keep from bin 2
SUBSAMPLE_SEED = 0    # fixed seed so the same subset is drawn on every run
# Drawn once, inside the loop, from the actual size of bin 2 (instead of a
# hard-coded 147) and then reused for every target, as before.
bin_2_subsample_indices = None

# Build the pipeline here rather than relying on whichever earlier cell last
# assigned `pipe`; this is the same model used throughout the notebook.
pipe = make_pipeline(StandardScaler(), PLSRegression(n_components=4))

for target in targets:
    # Separate name so the notebook-level `y` is not replaced by the loop target.
    y_target = Y[target]
    X_bins, y_bins, bin_labels = bin_data(X, y_target, bin_ages, False)
    if bin_2_subsample_indices is None:
        bin_2_size = len(X_bins[1])
        bin_2_subsample_indices = np.random.default_rng(SUBSAMPLE_SEED).choice(
            bin_2_size, min(EQUAL_BIN_SIZE, bin_2_size), replace=False)
    X_bin_2, y_bin_2 = X_bins[1][bin_2_subsample_indices], y_bins[1][bin_2_subsample_indices]

    score, permutation_scores, pvalue = permutation_test_score(
        pipe, X_bin_2, y_bin_2, cv=RKF_10_10, scoring=unimetric_scorer, n_permutations=N_PERM,
        n_jobs=-1)
    results.append({
        'Model': 'pls',
        'Population': population,
        'Target': target,
        'Bin': bin_label,
        'Score': score,
        'P-value': pvalue,
    })
    print(results[-1])
    save_perm_score(permutation_scores, f'pls_{population}_{target}_{bin_label}_perm_scores', PLS_PSCORES)

results_df = pd.DataFrame(results)
display(results_df.round(4))

{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 2 Equal', 'Score': 0.4491841116126827, 'P-value': 0.001996007984031936}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_VSI', 'Bin': 'Bin 2 Equal', 'Score': 0.14465678304229956, 'P-value': 0.1217564870259481}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_VCI', 'Bin': 'Bin 2 Equal', 'Score': 0.411203191954623, 'P-value': 0.001996007984031936}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_FRI', 'Bin': 'Bin 2 Equal', 'Score': 0.37123883320068274, 'P-value': 0.001996007984031936}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_WMI', 'Bin': 'Bin 2 Equal', 'Score': 0.3291763002361479, 'P-value': 0.007984031936127744}
{'Model': 'pls', 'Population': 'adhd', 'Target': 'WISC_PSI', 'Bin': 'Bin 2 Equal', 'Score': 0.1775488689394752, 'P-value': 0.06786427145708583}


Unnamed: 0,Model,Population,Target,Bin,Score,P-value
0,pls,adhd,WISC_FSIQ,Bin 2 Equal,0.4492,0.002
1,pls,adhd,WISC_VSI,Bin 2 Equal,0.1447,0.1218
2,pls,adhd,WISC_VCI,Bin 2 Equal,0.4112,0.002
3,pls,adhd,WISC_FRI,Bin 2 Equal,0.3712,0.002
4,pls,adhd,WISC_WMI,Bin 2 Equal,0.3292,0.008
5,pls,adhd,WISC_PSI,Bin 2 Equal,0.1775,0.0679


CPU times: user 10min 48s, sys: 11.1 s, total: 10min 59s
Wall time: 4h 17min 38s


In [23]:
# Save the equal-sample permutation-test table; the value returned by
# save_results is the cell's displayed output.
filename = 'pls_pts_{}_bin_two_equal_samples'.format(population)
save_results(results_df, filename, PLS_RESULTS)

'/home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/PLS/pls_pts_adhd_bin_two_equal_samples.csv'