# Ridge

This script trains and tests the Ridge regression model. The general workflow is to search for the optimal alpha hyperparameter and then cross-validate the model. The model can also undergo permutation testing to obtain a p-value (significance) for the model's testing score.

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_theme(
    context="paper",
    style="whitegrid",
    font_scale=1.2,
    rc={'figure.figsize': (10, 10), 'figure.dpi': 300}
)

## Get Data

In [2]:
from common.data import get_data
from common.paths import HEALTHY, ADHD

X, Y, demographics, population = get_data(label_path=ADHD)
ages, sexes = demographics['Age'], demographics['Sex']

print(f'X: {X.shape} | Y: {len(Y.keys())} | Age: {ages.shape} | Population: {population}')

X: (373, 34716) | Y: 6 | Age: (373,) | Population: adhd


In [3]:
selected_target = "WISC_FSIQ"
y = Y[selected_target]

print(f'y: {y.shape}')

y: (373,)


In [4]:
from common.binning import bin_data

X_bins, y_bins, bin_labels = bin_data(X, y, ages, False, 3)

for age_bin, bin_label in zip(X_bins, bin_labels):
    print(f'{bin_label}: {age_bin.shape}')

Bin 1: (114, 34716)
Bin 2: (147, 34716)
Bin 3: (112, 34716)


In [5]:
# Randomly subsample Bin 2 to match number of samples
from common.binning import subsample_bin

X_bins[1], y_bins[1] = subsample_bin(X_bins[1], y_bins[1], 113)
bin_labels[1] = 'Bin 2 Equal'

print(X_bins[1].shape, y_bins[1].shape)

(113, 34716) (113,)


## Run for one target, one age bin

In [3]:
from common.scoring import unimetric_scorer, multimetric_scorer, N_PERM, SCORING, RKF_10_10
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import cross_validate, permutation_test_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

### Set up the model pipeline, metrics, and cross-validation approach

In [6]:
%%time

X_cv = X_bins[0]
y_cv = y_bins[0]
# X_cv = X
# y_cv = y
age_group = 'Bin 1'
model = 'ridge'

estimators = [StandardScaler(), RidgeCV(alphas=[a for a in range(1, 10000, 100)], 
                                        scoring=unimetric_scorer, cv=RKF_10_10)]
pipe = make_pipeline(*estimators)
pipe.fit(X_cv, y_cv)
ridge_cv = pipe['ridgecv']

print(f'Target: {selected_target} | Alpha: {ridge_cv.alpha_} | Score: {ridge_cv.best_score_:.2f}')

Target: WISC_FSIQ | Alpha: 9001 | Score: 0.36
CPU times: user 9min 8s, sys: 21.5 s, total: 9min 29s
Wall time: 57.5 s


### Run cross-validation

In [9]:
%%time

estimators = [StandardScaler(), Ridge(alpha=ridge_cv.alpha_)]
pipe = make_pipeline(*estimators)

scores = cross_validate(pipe, X_cv, y_cv, cv=RKF_10_10, scoring=unimetric_scorer, n_jobs=-1, 
                        return_train_score=True, return_estimator=True)

coefs = np.array([estimator['ridge'].coef_ for estimator in scores['estimator']])
avg_coef = np.mean(coefs, axis=0)
intercepts = np.array([estimator['ridge'].intercept_ for estimator in scores['estimator']])
avg_inte = np.mean(intercepts, axis=0)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(f'Avg test score: {np.mean(scores["test_score"])}')

ridge_adhd_WISC_PSI_Bin 2
Avg test score: 0.64697580295615
CPU times: user 570 ms, sys: 280 ms, total: 850 ms
Wall time: 2.47 s


### Save model weights

In [38]:
from os.path import join
from common.paths import RIDGE_WEIGHTS

coef_fn = f'ridge_{population}_{selected_target}_{age_group}_coef.npy'
inte_fn = f'ridge_{population}_{selected_target}_{age_group}_inte.npy'

np.save(join(RIDGE_WEIGHTS, coef_fn), avg_coef)
np.save(join(RIDGE_WEIGHTS, inte_fn), avg_inte)

### Run permutation test statistic (pts)

In [11]:
%%time
estimators = [StandardScaler(), Ridge(alpha=ridge_cv.alpha_)]
pipe = make_pipeline(*estimators)

score, permutation_scores, pvalue = permutation_test_score(
    pipe, X_cv, y_cv, cv=RKF_10_10, scoring=unimetric_scorer, n_permutations=N_PERM, n_jobs=-1)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(f'Alpha: {ridge_cv.alpha_} | Score: {score:.2f} | p-value: {pvalue:.4f}')

ridge_adhd_WISC_FSIQ_Bin 2
Alpha: 9901 | Score: 0.33 | p-value: 0.0100
CPU times: user 56.9 s, sys: 1.5 s, total: 58.4 s
Wall time: 15min 57s


## Run for all targets, all age bins

### Run cross-validation

In [7]:
%%time

from os.path import join
from common.binning import bin_data
from common.wisc import WISC_LEVEL
from common.paths import RIDGE_WEIGHTS, RIDGE_RESULTS
from common.results import save_results

results = []
targets = WISC_LEVEL[5]
ages = None if population == 'healthy' else ages

# alphas = {
#     'WISC_FSIQ': {"All": 9901, "Bin 1": 701, "Bin 2": 5701, "Bin 3": 1 },
#     'WISC_VSI': {"All": 9901, "Bin 1": 9901, "Bin 2": 9901, "Bin 3": 9901 },
#     'WISC_VCI': {"All": 9601, "Bin 1": 1, "Bin 2": 9901, "Bin 3": 7101 },
#     'WISC_FRI': {"All": 9901, "Bin 1": 2301, "Bin 2": 8401, "Bin 3": 1 },
#     'WISC_WMI': {"All": 5501, "Bin 1": 1, "Bin 2": 4401, "Bin 3": 9901 },
#     'WISC_PSI': {"All": 9901, "Bin 1": 9901, "Bin 2": 4401, "Bin 3": 1 },
# }

alphas = {
    'WISC_FSIQ': {"All": 1},
    'WISC_VSI': {"All": 1},
    'WISC_VCI': {"All": 9901},
    'WISC_FRI': {"All": 1},
    'WISC_WMI': {"All": 9901},
    'WISC_PSI': {"All": 1},
}

for target in targets:
    y = Y[target]
    X_all, y_all, bin_labels = bin_data(X, y, ages, include_all=True)
    
    for X_cv, y_cv, bin_label in zip(X_all, y_all, bin_labels):
        # Find best alpha
        best_alpha = alphas[target][bin_label]
        
        # Do cross-validation
        estimators = [StandardScaler(), Ridge(alpha=best_alpha)]
        pipe = make_pipeline(*estimators)
        
        scores = cross_validate(pipe, X_cv, y_cv, cv=RKF_10_10, 
                                scoring=unimetric_scorer, n_jobs=-1, 
                                return_train_score=False, 
                                return_estimator=True)
        coefs = np.array([estimator['ridge'].coef_ for estimator in scores['estimator']])
        avg_coef = np.mean(coefs, axis=0)
        intercepts = np.array([estimator['ridge'].intercept_ for estimator in scores['estimator']])
        avg_inte = np.mean(intercepts, axis=0)
        
        results.append({    
            'Model': 'ridge',
            'Population': population,
            'Target': target,
            'Bin': bin_label,
            'Alpha': best_alpha,
            'Score': np.mean(scores['test_score']),
        })
        print(results[-1])
        
        coef_fn = f'ridge_{population}_{target}_{bin_label}_coef.npy'
        inte_fn = f'ridge_{population}_{target}_{bin_label}_inte.npy'
        np.save(join(RIDGE_WEIGHTS, coef_fn), avg_coef)
        np.save(join(RIDGE_WEIGHTS, inte_fn), avg_inte)

results_df = pd.DataFrame(results)
display(results_df.round(4))

{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_FSIQ', 'Bin': 'All', 'Alpha': 1, 'Score': 0.03649812587586866}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_VSI', 'Bin': 'All', 'Alpha': 1, 'Score': 0.12715840734542766}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_VCI', 'Bin': 'All', 'Alpha': 9901, 'Score': 0.24395247322565755}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_FRI', 'Bin': 'All', 'Alpha': 1, 'Score': -0.06909279808880493}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_WMI', 'Bin': 'All', 'Alpha': 9901, 'Score': 0.1193958291669926}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_PSI', 'Bin': 'All', 'Alpha': 1, 'Score': -0.048414619120956175}


Unnamed: 0,Model,Population,Target,Bin,Alpha,Score
0,ridge,healthy,WISC_FSIQ,All,1,0.0365
1,ridge,healthy,WISC_VSI,All,1,0.1272
2,ridge,healthy,WISC_VCI,All,9901,0.244
3,ridge,healthy,WISC_FRI,All,1,-0.0691
4,ridge,healthy,WISC_WMI,All,9901,0.1194
5,ridge,healthy,WISC_PSI,All,1,-0.0484


CPU times: user 1min 28s, sys: 20.9 s, total: 1min 49s
Wall time: 2min 5s


In [8]:
filename = f'ridge_cv_{population}'
save_results(results_df, filename, RIDGE_RESULTS)

'/home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/Ridge/ridge_cv_healthy.csv'

### Run permutation test statistic (pts)

In [13]:
%%time

from common.binning import bin_data
from common.wisc import WISC_LEVEL
from common.paths import RIDGE_RESULTS, RIDGE_PSCORES
from common.results import save_results, save_perm_score

results = []
targets = WISC_LEVEL[5]
ages = None if population == 'healthy' else ages
alphas = {
    'WISC_FSIQ': {"All": 1},
    'WISC_VSI': {"All": 1},
    'WISC_VCI': {"All": 9901},
    'WISC_FRI': {"All": 1},
    'WISC_WMI': {"All": 9901},
    'WISC_PSI': {"All": 1},
}

for target in targets:
    y = Y[target]
    X_bins, y_bins, bin_labels = bin_data(X, y, ages, include_all=True)
    
    for X_cv, y_cv, bin_label in zip(X_bins, y_bins, bin_labels):
        # Find best alpha
        best_alpha = alphas[target][bin_label]
#         estimators = [StandardScaler(), RidgeCV(alphas=[a for a in range(1, 10000, 100)], 
#                                         scoring=unimetric_scorer, cv=RKF_10_10)]
#         pipe = make_pipeline(*estimators).fit(X_cv, y_cv)
#         ridge_cv = pipe['ridgecv']
        
        # Do permutation test
        estimators = [Ridge(alpha=best_alpha)]
#         estimators = [Ridge(alpha=ridge_cv.alpha_)]
        pipe = make_pipeline(*estimators)
        
        score, permutation_scores, pvalue = permutation_test_score(
            pipe, X_cv, y_cv, cv=RKF_10_10, scoring=unimetric_scorer, n_permutations=N_PERM, 
            n_jobs=-1)
        results.append({    
            'Model': 'ridge',
            'Population': population,
            'Target': target,
            'Bin': bin_label,
            'Alpha': best_alpha,
#             'Alpha': ridge_cv.alpha_,
            'Score': score,
            'P-value': pvalue,
        })
        print(results[-1])
        save_perm_score(permutation_scores, f'ridge_{population}_{target}_{bin_label}_perm_scores', 
                        RIDGE_PSCORES)

results_df = pd.DataFrame(results)
display(results_df.round(4))

{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_FSIQ', 'Bin': 'All', 'Alpha': 1, 'Score': 0.03979761025110626, 'P-value': 0.4171656686626746}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_VSI', 'Bin': 'All', 'Alpha': 1, 'Score': 0.16181370001187687, 'P-value': 0.10778443113772455}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_VCI', 'Bin': 'All', 'Alpha': 9901, 'Score': 0.196704856115268, 'P-value': 0.05189620758483034}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_FRI', 'Bin': 'All', 'Alpha': 1, 'Score': -0.07283880181186352, 'P-value': 0.7345309381237525}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_WMI', 'Bin': 'All', 'Alpha': 9901, 'Score': 0.11698659032521132, 'P-value': 0.21357285429141717}
{'Model': 'ridge', 'Population': 'healthy', 'Target': 'WISC_PSI', 'Bin': 'All', 'Alpha': 1, 'Score': -0.05608048819314832, 'P-value': 0.6986027944111777}


Unnamed: 0,Model,Population,Target,Bin,Alpha,Score,P-value
0,ridge,healthy,WISC_FSIQ,All,1,0.0398,0.4172
1,ridge,healthy,WISC_VSI,All,1,0.1618,0.1078
2,ridge,healthy,WISC_VCI,All,9901,0.1967,0.0519
3,ridge,healthy,WISC_FRI,All,1,-0.0728,0.7345
4,ridge,healthy,WISC_WMI,All,9901,0.117,0.2136
5,ridge,healthy,WISC_PSI,All,1,-0.0561,0.6986


CPU times: user 34min 14s, sys: 4min 45s, total: 38min 59s
Wall time: 2h 27min 51s


In [16]:
filename = f'ridge_pts_{population}'
save_results(results_df, filename, RIDGE_RESULTS)

'/home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/Ridge/ridge_pts_healthy.csv'

### Run permutation test statistic (pts) with equal sample size

In [8]:
%%time

from os.path import join
from common.binning import bin_data
from common.wisc import WISC_LEVEL
from common.paths import RIDGE_RESULTS, RIDGE_PSCORES
from common.results import save_results, save_perm_score

results = []
targets = WISC_LEVEL[5]
ages = None if population == 'healthy' else ages
bin_label = 'Bin 2 Equal'
bin_2_subsample_indices = np.random.choice(147, 113, replace=False)

for target in targets:
    y = Y[target]
    X_bins, y_bins, bin_labels = bin_data(X, y, ages, False)
    X_bin_2, y_bin_2 = X_bins[1][bin_2_subsample_indices], y_bins[1][bin_2_subsample_indices]
        
    # Find best alpha
    estimators = [StandardScaler(), RidgeCV(alphas=[a for a in range(1, 10000, 100)], 
                                    scoring=unimetric_scorer, cv=RKF_10_10)]
    pipe = make_pipeline(*estimators).fit(X_bin_2, y_bin_2)
    ridge_cv_alpha = pipe['ridgecv'].alpha_

    # Do permutation test
    estimators = [StandardScaler(), Ridge(alpha=ridge_cv_alpha)]
    pipe = make_pipeline(*estimators)

    score, permutation_scores, pvalue = permutation_test_score(
        pipe, X_bin_2, y_bin_2, cv=RKF_10_10, scoring=unimetric_scorer, n_permutations=N_PERM, 
        n_jobs=-1)
    results.append({    
        'Model': 'ridge',
        'Population': population,
        'Target': target,
        'Bin': bin_label,
        'Alpha': ridge_cv_alpha,
        'Score': score,
        'P-value': pvalue,
    })
    print(results[-1])
    save_perm_score(permutation_scores, f'ridge_{population}_{target}_{bin_label}_perm_scores', 
                    RIDGE_PSCORES)
        
results_df = pd.DataFrame(results)
display(results_df.round(4))

{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_FSIQ', 'Bin': 'Bin 2 Equal', 'Alpha': 9901, 'Score': 0.37268582592832233, 'P-value': 0.001996007984031936}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_VSI', 'Bin': 'Bin 2 Equal', 'Alpha': 9901, 'Score': 0.27402582973158046, 'P-value': 0.00998003992015968}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_VCI', 'Bin': 'Bin 2 Equal', 'Alpha': 6901, 'Score': 0.36517093118210603, 'P-value': 0.001996007984031936}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_FRI', 'Bin': 'Bin 2 Equal', 'Alpha': 9201, 'Score': 0.30226867233259, 'P-value': 0.005988023952095809}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_WMI', 'Bin': 'Bin 2 Equal', 'Alpha': 9901, 'Score': 0.34980147754878743, 'P-value': 0.001996007984031936}
{'Model': 'ridge', 'Population': 'adhd', 'Target': 'WISC_PSI', 'Bin': 'Bin 2 Equal', 'Alpha': 9901, 'Score': 0.04161685374305273, 'P-value': 0.3772455089820359}


Unnamed: 0,Model,Population,Target,Bin,Alpha,Score,P-value
0,ridge,adhd,WISC_FSIQ,Bin 2 Equal,9901,0.3727,0.002
1,ridge,adhd,WISC_VSI,Bin 2 Equal,9901,0.274,0.01
2,ridge,adhd,WISC_VCI,Bin 2 Equal,6901,0.3652,0.002
3,ridge,adhd,WISC_FRI,Bin 2 Equal,9201,0.3023,0.006
4,ridge,adhd,WISC_WMI,Bin 2 Equal,9901,0.3498,0.002
5,ridge,adhd,WISC_PSI,Bin 2 Equal,9901,0.0416,0.3772


CPU times: user 1h 59min 45s, sys: 1min 46s, total: 2h 1min 32s
Wall time: 1h 16min 48s


In [None]:
filename = f'ridge_pts_{population}_bin_two_equal_samples.csv'
results_df.to_csv(join(RIDGE_RESULTS, filename))