# Diagnosis Cross-Prediction

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plot styling: "paper" context on a white grid with slightly
# enlarged fonts; figures default to 10x10 at 300 dpi.
sns.set_theme(
    style="whitegrid",
    context="paper",
    font_scale=1.2,
    rc={"figure.dpi": 300, "figure.figsize": (10, 10)},
)

## Get Data

In [2]:
from os.path import join
from common.data import get_data
from common.paths import HEALTHY, ADHD_ONE, ADHD_TWO

# Load the three diagnostic groups through the project helper. Each call gets the
# same first argument (5) and its four return values are unpacked as
# (X, Y, demo, pop).
# NOTE(review): the meaning of the literal 5 is not visible in this notebook —
# confirm against common.data.get_data and consider naming it as a constant.
X_healthy, Y_healthy, healthy_demo, healthy_pop = get_data(5, HEALTHY)
X_adhd_one, Y_adhd_one, adhd_one_demo, adhd_one_pop = get_data(5, ADHD_ONE)
X_adhd_two, Y_adhd_two, adhd_two_demo, adhd_two_pop = get_data(5, ADHD_TWO)

# Shape check so a size mismatch between the groups is visible immediately.
print(f'X_healthy: {X_healthy.shape} | X_adhd_one: {X_adhd_one.shape} | X_adhd_two: {X_adhd_two.shape}')

X_healthy: (106, 34716) | X_adhd_one: (106, 34716) | X_adhd_two: (106, 34716)


In [3]:
# Compare the spread of the feature matrices across the three groups.
# Line 1: variance over every entry of the matrix.
print(*(np.var(X) for X in (X_healthy, X_adhd_one, X_adhd_two)))
# Line 2: variance of each column (axis=0), averaged over columns.
print(*(np.mean(np.var(X, axis=0)) for X in (X_healthy, X_adhd_one, X_adhd_two)))
# Line 3: variance of each row (axis=1), averaged over rows.
print(*(np.mean(np.var(X, axis=1)) for X in (X_healthy, X_adhd_one, X_adhd_two)))

0.056671523 0.059738882 0.06267111
0.03862412 0.041490708 0.043794744
0.049966704 0.05083683 0.052254844


In [4]:
from common.cross_prediction import get_group_cv_splits, get_group_order
from common.paths import CROSS_PRED_RESULTS
from common.results import CVResult, save_results
from common.scoring import (unimetric_scorer, 
                            custom_permutation_test_score, 
                            N_PERM, SCORING, RKF_10_10)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [5]:
# Name of the target to predict; the same key is looked up in each group's Y_*.
selected_target = "WISC_FSIQ"
y_healthy, y_adhd_one, y_adhd_two = (
    group_targets[selected_target]
    for group_targets in (Y_healthy, Y_adhd_one, Y_adhd_two)
)

# Each target vector should have one entry per row of the matching X matrix.
print(f'{selected_target}: {y_healthy.shape}, {y_adhd_one.shape}, {y_adhd_two.shape}')

WISC_FSIQ: (106,), (106,), (106,)


In [7]:
from common.data import generate_fake_data

# Replace every group's (X, y) pair with the output of generate_fake_data, called
# in the order healthy -> adhd_one -> adhd_two.
# NOTE(review): this cell overwrites its own inputs, so running it a second time
# feeds the generated data back into generate_fake_data; re-run the loading and
# target-selection cells first if the original data is needed again.
(X_healthy, y_healthy), (X_adhd_one, y_adhd_one), (X_adhd_two, y_adhd_two) = (
    generate_fake_data(features, target)
    for features, target in (
        (X_healthy, y_healthy),
        (X_adhd_one, y_adhd_one),
        (X_adhd_two, y_adhd_two),
    )
)

# Confirm the replacement matrices kept the expected shapes.
print(f'X_healthy: {X_healthy.shape} | X_adhd_one: {X_adhd_one.shape} | X_adhd_two: {X_adhd_two.shape}')

X_healthy: (106, 34716) | X_adhd_one: (106, 34716) | X_adhd_two: (106, 34716)


In [22]:
# Group names and the ridge alpha used for each group (alphas come from
# previous results). Both lists follow the same group order as `diags` below.
diag_labels = ['Healthy', 'ADHD_One', 'ADHD_Two']
diag_alphas = [5000] * 3

# Pair each group's features with its target vector.
healthy, adhd_one, adhd_two = (
    (X_healthy, y_healthy),
    (X_adhd_one, y_adhd_one),
    (X_adhd_two, y_adhd_two),
)
diags = [healthy, adhd_one, adhd_two]

# One collection of CV splits per group, produced by the project helper from RKF_10_10.
diags_cv = get_group_cv_splits(diags, RKF_10_10)

# Report the group shapes and how many splits each group received.
print(
    f'Healthy: {healthy[0].shape} | ADHD_ONE: {adhd_one[0].shape} | ADHD_TWO: {adhd_two[0].shape}',
    f'healthy_cv: {len(diags_cv[0])} | adhd_one_cv: {len(diags_cv[1])} | adhd_two_cv: {len(diags_cv[2])}',
    sep='\n',
)

Healthy: (106, 34716) | ADHD_ONE: (106, 34716) | ADHD_TWO: (106, 34716)
healthy_cv: 100 | adhd_one_cv: 100 | adhd_two_cv: 100


### Run stats between the two ADHD groups

In [6]:
# Count how often each 'Sex' code occurs in the two ADHD groups. Each result is
# the (unique values, counts) pair returned by np.unique.
adhd_one_sex_counts, adhd_two_sex_counts = (
    np.unique(demo['Sex'], return_counts=True)
    for demo in (adhd_one_demo, adhd_two_demo)
)
print(adhd_one_sex_counts, adhd_two_sex_counts, sep='\n')

(array([0, 1]), array([74, 32]))
(array([0, 1]), array([76, 30]))


In [17]:
from scipy import stats

# Check whether the two ADHD groups differ on the target, on age and on sex.
# NOTE(review): y_adhd_one / y_adhd_two are reassigned by the generate_fake_data
# cell; if that cell has already run, this t-test compares its output rather
# than Y_adhd_*[selected_target] — confirm that is what is intended here.
target_t_stat, target_p = stats.ttest_ind(y_adhd_one, y_adhd_two)
age_t_stat, age_p = stats.ttest_ind(adhd_one_demo['Age'], adhd_two_demo['Age'])

# Build the group-by-sex contingency table from the raw labels so both rows
# always share the same category columns. Stacking two independent np.unique
# count arrays is only correct when both groups contain exactly the same set of
# categories; otherwise the columns are misaligned or the rows have different
# lengths.
sex_values = np.concatenate([
    np.asarray(adhd_one_demo['Sex']),
    np.asarray(adhd_two_demo['Sex']),
])
group_ids = np.repeat(
    ['ADHD_One', 'ADHD_Two'],
    [len(adhd_one_demo['Sex']), len(adhd_two_demo['Sex'])],
)
sex_table = pd.crosstab(group_ids, sex_values)
sex_chisq, sex_p, _, _ = stats.chi2_contingency(sex_table.to_numpy())

print(f'{selected_target} t-test: {target_t_stat:.2f}, p-value: {target_p:.2f}')
print(f'Age t-test: {age_t_stat:.2f}, p-value: {age_p:.2f}')
print(f'Sex chi-square: {sex_chisq:.2f}, p-value: {sex_p:.2f}')

WISC_FSIQ t-test: -0.95, p-value: 0.34
Age t-test: 0.41, p-value: 0.68
Sex chi-square: 0.05, p-value: 0.82


### Run permutation-test (train group, test group)

In [9]:
%%time

# Ask the project helper for the train/test orderings of the groups, their CV
# splits and their labels. The reordered labels get their own name, and the loop
# below uses its own target names, so the notebook-level `diags`, `diags_cv` and
# `diag_labels` are left intact. Rebinding them made this cell fail or mislabel
# its results when it was run a second time.
diag_order, diag_cv_order, diag_label_order = get_group_order(diags, diags_cv, diag_labels)
results = []
perm_scores = []

# diag_alphas is matched positionally with the orderings: the i-th alpha is used
# for the i-th ordering, whose first entry is the training group.
for diag_alpha, ordered_groups, ordered_cv, ordered_labels in zip(
        diag_alphas, diag_order, diag_cv_order, diag_label_order):
    # Position 0 is the group the model is trained on; positions 1 and 2 are the
    # other two groups it is evaluated on.
    train_diag, test_diag_one, test_diag_two = (
        ordered_groups[0], ordered_groups[1], ordered_groups[2])
    train_diag_cv, test_diag_one_cv, test_diag_two_cv = (
        ordered_cv[0], ordered_cv[1], ordered_cv[2])

    # Standardise the features, then fit a ridge model with this group's alpha.
    pipe = make_pipeline(StandardScaler(), Ridge(alpha=diag_alpha))
    rs, perms, ps = custom_permutation_test_score(
        pipe, train_diag, test_diag_one, test_diag_two,
        train_diag_cv, test_diag_one_cv, test_diag_two_cv, N_PERM, unimetric_scorer)

    # One score / p-value per test group, in the same order as the labels
    # (the training group itself comes first).
    train_group = ordered_labels[0]
    for score, p_value, test_group in zip(rs, ps, ordered_labels):
        # The training group is passed twice: as the train label and as the
        # value that ends up in the 'Population' column of the results table.
        results.append(
            CVResult('ridge', selected_target, train_group, test_group,
                     score, p_value, train_group, N_PERM)
        )
    perm_scores.append(perms)
    print(f'Train Group: {train_group}')

results_df = pd.DataFrame([r.to_dict() for r in results])
display(results_df.round(4))
# NOTE(review): the '_random_' file name presumably marks a run on the output of
# generate_fake_data; the alternative name previously kept here as a comment was
# 'ridge_pts_diagnosis_cross_prediction.csv'. Make sure the name matches the
# data actually used so results from one run do not overwrite the other.
filename = 'ridge_pts_random_diagnosis_cross_prediction.csv'
results_fp = save_results(results_df, filename, CROSS_PRED_RESULTS)
print('Results saved to:', results_fp)

Train Group: Healthy
Train Group: ADHD_One
Train Group: ADHD_Two


Unnamed: 0,Model,Target,Train,Test,Score,P-value,Population,Num Permutations
0,ridge,WISC_FSIQ,Healthy,Healthy,0.0438,0.3373,Healthy,500
1,ridge,WISC_FSIQ,Healthy,ADHD_One,0.0943,0.1397,Healthy,500
2,ridge,WISC_FSIQ,Healthy,ADHD_Two,0.0717,0.2335,Healthy,500
3,ridge,WISC_FSIQ,ADHD_One,ADHD_One,0.1015,0.2435,ADHD_One,500
4,ridge,WISC_FSIQ,ADHD_One,ADHD_Two,0.1681,0.0379,ADHD_One,500
5,ridge,WISC_FSIQ,ADHD_One,Healthy,0.0915,0.1437,ADHD_One,500
6,ridge,WISC_FSIQ,ADHD_Two,ADHD_Two,-0.023,0.5709,ADHD_Two,500
7,ridge,WISC_FSIQ,ADHD_Two,Healthy,0.0611,0.2635,ADHD_Two,500
8,ridge,WISC_FSIQ,ADHD_Two,ADHD_One,0.1381,0.0459,ADHD_Two,500


Results saved to: /home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/Cross Prediction/ridge_pts_random_diagnosis_cross_prediction.csv
CPU times: user 1d 4h 17min 27s, sys: 30min 32s, total: 1d 4h 48min
Wall time: 2h 52min 48s
