# Age Cross-Prediction

In [2]:
import numpy as np
import pandas as pd

%load_ext autoreload
%autoreload 2

%matplotlib inline

import seaborn as sns
# Notebook-wide plotting defaults: square 10x10 figures rendered at 300 dpi,
# on seaborn's "paper" context with the "whitegrid" style.
figure_rc = {"figure.dpi": 300, "figure.figsize": (10, 10)}
sns.set_theme(style="whitegrid", context="paper", font_scale=1.2, rc=figure_rc)

## Get Data

In [36]:
from common.data import get_data
from common.paths import ADHD

# Load the feature matrix, the collection of prediction targets, the
# demographics table and the population label for the ADHD label file.
X, Y, demographics, population = get_data(label_path=ADHD, wisc_level=5)
ages = demographics["Age"]

# One-line sanity summary of everything that was loaded.
summary_parts = [
    f"X: {X.shape}",
    f"Y: {len(Y.keys())}",
    f"Age: {ages.shape}",
    f"Population: {population}",
]
print(" | ".join(summary_parts))

X: (373, 34716) | Y: 6 | Age: (373,) | Population: adhd


In [24]:
from common.cross_prediction import (get_group_cv_splits, get_group_order,
                                     cross_prediction_permutation_test_score)
from common.paths import CROSS_PRED_RESULTS, CROSS_PRED_PSCORES
from common.results import CVResult, save_results, save_perm_score
from common.scoring import unimetric_scorer, N_PERM, RKF_10_10
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [37]:
# Target to predict; used as a key into Y and embedded in saved result names.
# NOTE(review): the outputs recorded in this notebook report WISC_PSI, not
# WISC_FSIQ -- confirm which target is intended and re-run all cells so the
# code and the stored outputs agree.
selected_target = "WISC_FSIQ"
y = Y[selected_target]

print(selected_target, y.shape, sep=": ")

WISC_PSI: (373,)


In [38]:
from common.binning import bin_data

# Split features and target into age bins; bin_data returns the per-bin
# feature arrays, the per-bin targets and a label for each bin.
# NOTE(review): the positional `False, 3` arguments are opaque here --
# presumably a flag plus the number of bins; confirm against bin_data.
X_bins, y_bins, bin_labels = bin_data(X, y, ages, False, 3)

# Report how many rows ended up in each bin.
for label, X_bin in zip(bin_labels, X_bins):
    print(label, X_bin.shape, sep=": ")

Bin 1: (114, 34716)
Bin 2: (147, 34716)
Bin 3: (112, 34716)


In [39]:
# Down-sample the second age bin (without replacement) so its size is close to
# the other two bins, and relabel it 'Bin 2 Equal'.
BIN_2_TARGET_SIZE = 113
SUBSAMPLE_SEED = 0  # fixed seed so the same rows are drawn on every run

# Read the population size from the data instead of hard-coding it.
bin_2_size = X_bins[1].shape[0]

# Guard keeps the cell idempotent: once Bin 2 has been reduced, re-running the
# cell must not try to index the already-reduced arrays with indices that
# were drawn for the original, larger bin.
if bin_2_size > BIN_2_TARGET_SIZE:
    rng = np.random.default_rng(SUBSAMPLE_SEED)
    bin_2_subsample_indices = rng.choice(bin_2_size, BIN_2_TARGET_SIZE, replace=False)
    # The same indices are applied to features and target to keep rows aligned.
    X_bins[1] = X_bins[1][bin_2_subsample_indices]
    y_bins[1] = y_bins[1][bin_2_subsample_indices]
    bin_labels[1] = 'Bin 2 Equal'

print(X_bins[1].shape, y_bins[1].shape)

(113, 34716) (113,)


In [34]:
# Pair each bin's features with its target, then build the cross-validation
# splits for every bin using the RKF_10_10 scheme.
bins = list(zip(X_bins, y_bins))
bins_cv = get_group_cv_splits(bins, RKF_10_10)

# Sanity check: feature shape and number of CV splits for each of the 3 bins.
bin_shapes = [bins[i][0].shape for i in range(3)]
cv_counts = [len(bins_cv[i]) for i in range(3)]
print('Bin 1: {} | Bin 2: {} | Bin 3: {}'.format(*bin_shapes))
print('bin_1_cv: {} | bin_2_cv: {} | bin_3_cv: {}'.format(*cv_counts))

Bin 1: (114, 34716) | Bin 2: (113, 34716) | Bin 3: (112, 34716)
bin_1_cv: 100 | bin_2_cv: 100 | bin_3_cv: 100


### Run permutation test (train bin, test bin)

In [40]:
%%time

# Ridge alpha used when training on each bin, in the same order as the
# orderings returned by get_group_order.
bin_alphas = [9901, 9901, 1]  # From previous results

# Each ordering puts the training bin first, followed by the two test bins;
# CV splits and labels are ordered the same way.
bin_order, cv_order, label_order = get_group_order(bins, bins_cv, bin_labels)
results = []

# The loop variables deliberately do NOT reuse the names `bins` / `bins_cv`:
# shadowing them would leave the notebook-level variables holding only the
# last ordering after the loop, so re-running this cell (or any cell that
# reads `bins`) would silently operate on the wrong data.
for bin_alpha, ordered_bins, ordered_cvs, labels in zip(bin_alphas, bin_order, cv_order, label_order):
    train_bin, test_bin_one, test_bin_two = ordered_bins[0], ordered_bins[1], ordered_bins[2]
    train_bin_cv, test_bin_one_cv, test_bin_two_cv = ordered_cvs[0], ordered_cvs[1], ordered_cvs[2]

    # Standardise features, then fit ridge with this training bin's alpha.
    pipe = make_pipeline(StandardScaler(), Ridge(alpha=bin_alpha))
    rs, perms, ps = cross_prediction_permutation_test_score(
        pipe, train_bin, test_bin_one, test_bin_two,
        train_bin_cv, test_bin_one_cv, test_bin_two_cv, N_PERM, unimetric_scorer)

    # labels[0] names the training bin; one result row is stored per label
    # (the training bin itself first, then the two test bins).
    train_group = labels[0]
    for r, p, test_group in zip(rs, ps, labels):
        results.append(
            CVResult('ridge', selected_target, train_group, test_group, r, p, population, N_PERM)
        )
    print(f'Train Group: {train_group}')
    save_perm_score(perms, f'ridge_{population}_{selected_target}_{train_group}_cross_prediction_perm_scores', CROSS_PRED_PSCORES)

# Collect all (train, test) results into one table, show it and save it.
results_df = pd.DataFrame([r.to_dict() for r in results])
display(results_df.round(4))
filename = 'ridge_pts_age_cross_prediction_bin_two_equal_samples'
results_fp = save_results(results_df, filename, CROSS_PRED_RESULTS, True)
print('Results saved to:', results_fp)

Train Group: Bin 1
Train Group: Bin 2 Equal
Train Group: Bin 3


Unnamed: 0,Model,Target,Train,Test,Score,P-value,Population,Num Permutations
0,ridge,WISC_PSI,Bin 1,Bin 1,0.1579,0.1158,adhd,500
1,ridge,WISC_PSI,Bin 1,Bin 2 Equal,-0.0259,0.6028,adhd,500
2,ridge,WISC_PSI,Bin 1,Bin 3,0.1409,0.0399,adhd,500
3,ridge,WISC_PSI,Bin 2 Equal,Bin 2 Equal,-0.0736,0.7026,adhd,500
4,ridge,WISC_PSI,Bin 2 Equal,Bin 3,0.0092,0.4631,adhd,500
5,ridge,WISC_PSI,Bin 2 Equal,Bin 1,-0.0052,0.505,adhd,500
6,ridge,WISC_PSI,Bin 3,Bin 3,0.272,0.012,adhd,500
7,ridge,WISC_PSI,Bin 3,Bin 1,0.1022,0.1078,adhd,500
8,ridge,WISC_PSI,Bin 3,Bin 2 Equal,0.0702,0.2335,adhd,500


Results saved to: /home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/Cross Prediction/ridge_pts_age_cross_prediction_bin_two_equal_samples.csv
CPU times: user 5h 5min 13s, sys: 4min 20s, total: 5h 9min 33s
Wall time: 30min 58s
