# Age Cross-Prediction

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plotting defaults: seaborn "paper" context on a white grid,
# with the matplotlib figure size and DPI overridden through rc.
figure_rc = {'figure.figsize': (10, 10), 'figure.dpi': 300}
sns.set_theme(context="paper", style="whitegrid", font_scale=1.2, rc=figure_rc)

## Get Data

In [28]:
from common.data import get_data
from common.paths import ADHD

# Load the dataset for the ADHD label file at WISC level 5 and unpack the four
# values get_data returns.
dataset = get_data(wisc_level=5, label_path=ADHD)
X, Y, demographics, population = dataset

# Age of each subject; used further down to split the samples into age bins.
ages = demographics['Age']

# One-line sanity check of what was loaded.
data_summary = f'X: {X.shape} | Y: {len(Y.keys())} | Age: {ages.shape} | Population: {population}'
print(data_summary)

X: (373, 34716) | Y: 6 | Age: (373,) | Population: adhd


In [29]:
from common.binning import bin_by_age, ONLY_BIN_LABELS
from common.cross_prediction import get_group_cv_splits, get_group_order
from common.paths import CROSS_PRED_RESULTS
from common.results import CVResult, save_results
from common.scoring import (unimetric_scorer, 
                            cross_prediction_permutation_test_score, 
                            N_PERM, SCORING, RKF_10_10)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, with each age bin as the training group

In [30]:
# Name of the single target modelled in the rest of the notebook, and its values.
selected_target = "WISC_PSI"
y = Y[selected_target]

# Confirm the target has the expected number of samples.
target_summary = f'{selected_target}: {y.shape}'
print(target_summary)

WISC_PSI: (373,)


In [31]:
# Group the samples by age, then build cross-validation splits for every bin.
# NOTE(review): RKF_10_10 is presumably a 10x10 repeated k-fold (each bin
# reports 100 splits) - confirm in common.scoring.
bins = bin_by_age(X, y, ages)
bins_cv = get_group_cv_splits(bins, RKF_10_10)

# Report the feature-matrix shape and the number of CV splits for each bin.
print(
    f'Bin 1: {bins[0][0].shape} | Bin 2: {bins[1][0].shape} | Bin 3: {bins[2][0].shape}',
    f'bin_1_cv: {len(bins_cv[0])} | bin_2_cv: {len(bins_cv[1])} | bin_3_cv: {len(bins_cv[2])}',
    sep='\n',
)

Bin 1: (114, 34716) | Bin 2: (147, 34716) | Bin 3: (112, 34716)
bin_1_cv: 100 | bin_2_cv: 100 | bin_3_cv: 100


### Run permutation-test (train bin, test bin)

In [32]:
%%time

# Ridge alpha for each training bin; zip() below pairs them positionally with
# the entries of bin_order.
bin_alphas = [9901, 4401, 1]  # From previous results

bin_order, bin_cv_order, bin_labels = get_group_order(bins, bins_cv, ONLY_BIN_LABELS)
results = []
perm_scores = []

# The loop variables must not be called `bins` / `bins_cv`: notebook variables
# are global, so those names would overwrite the inputs read by get_group_order
# above and re-running this cell would silently use the last iteration's values.
for bin_alpha, ordered_bins, ordered_cv, labels in zip(bin_alphas, bin_order, bin_cv_order, bin_labels):
    # Position 0 is the training bin; positions 1 and 2 are the two test bins.
    train_bin, test_bin_one, test_bin_two = ordered_bins[0], ordered_bins[1], ordered_bins[2]
    train_bin_cv, test_bin_one_cv, test_bin_two_cv = ordered_cv[0], ordered_cv[1], ordered_cv[2]

    pipe = make_pipeline(StandardScaler(), Ridge(alpha=bin_alpha))
    # Use the scorer that is actually imported from common.scoring; the former
    # name `custom_permutation_test_score` is not defined anywhere in this
    # notebook and raises NameError on a fresh kernel.
    # NOTE(review): assumes the imported function takes the same arguments and
    # returns (scores, permutation scores, p-values) like the old call - confirm.
    rs, perms, ps = cross_prediction_permutation_test_score(
        pipe, train_bin, test_bin_one, test_bin_two,
        train_bin_cv, test_bin_one_cv, test_bin_two_cv, N_PERM, unimetric_scorer)

    # labels[0] names the training group; the scores and p-values are paired
    # with the labels in order, one result row per test group.
    train_group = labels[0]
    for r, p, test_group in zip(rs, ps, labels):
        results.append(
            CVResult('ridge', selected_target, train_group, test_group, r, p, population, N_PERM)
        )
    perm_scores.append(perms)
    print(f'Train Group: {train_group}')

# Collect every (train, test) result into one table, show it, and save it.
results_df = pd.DataFrame([r.to_dict() for r in results])
display(results_df.round(4))
filename = 'ridge_pts_age_cross_prediction'
results_fp = save_results(results_df, filename, CROSS_PRED_RESULTS)
print('Results saved to:', results_fp)

Train Group: Bin 1
Train Group: Bin 2
Train Group: Bin 3


Unnamed: 0,Model,Target,Train,Test,Score,P-value,Population,Num Permutations
0,ridge,WISC_PSI,Bin 1,Bin 1,-0.0791,0.7226,adhd,500
1,ridge,WISC_PSI,Bin 1,Bin 2,0.0662,0.1756,adhd,500
2,ridge,WISC_PSI,Bin 1,Bin 3,-0.0291,0.6427,adhd,500
3,ridge,WISC_PSI,Bin 2,Bin 2,0.0424,0.3433,adhd,500
4,ridge,WISC_PSI,Bin 2,Bin 3,-0.0533,0.7465,adhd,500
5,ridge,WISC_PSI,Bin 2,Bin 1,0.0256,0.3952,adhd,500
6,ridge,WISC_PSI,Bin 3,Bin 3,0.0918,0.2236,adhd,500
7,ridge,WISC_PSI,Bin 3,Bin 1,0.0279,0.4152,adhd,500
8,ridge,WISC_PSI,Bin 3,Bin 2,-0.063,0.7924,adhd,500


Results saved to: /home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/Cross Prediction/ridge_pts_age_cross_prediction.csv
CPU times: user 1d 3h 28min 41s, sys: 28min 26s, total: 1d 3h 57min 7s
Wall time: 2h 47min 50s
