# Diagnosis Cross-Prediction

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Notebook-wide plot defaults: seaborn "paper" context on a white grid with
# fonts scaled by 1.2, plus matplotlib rc overrides for figure size and DPI.
figure_rc = {'figure.figsize': (10, 10), 'figure.dpi': 300}
sns.set_theme(context="paper", style="whitegrid", font_scale=1.2, rc=figure_rc)

## Get Data

In [2]:
from os.path import join
from common.data import get_data
from common.paths import BIOBANK_LABELS

# Label CSVs for the three diagnosis groups, all located under BIOBANK_LABELS.
healthy = join(BIOBANK_LABELS, 'Subjects_with_WISC (healthy).csv')
adhd_group_one = join(BIOBANK_LABELS, 'Subjects_with_WISC (adhd 1).csv')
adhd_group_two = join(BIOBANK_LABELS, 'Subjects_with_WISC (adhd 2).csv')

# Load every group with the same first argument (5) to get_data.
# NOTE(review): the meaning of 5 is defined by common.data.get_data -- confirm.
# `demographics` and `population` are rebound by each call, so once this cell
# has run they hold only the values returned for the second ADHD group.
X_healthy, Y_healthy, demographics, population = get_data(5, healthy)
X_adhd_one, Y_adhd_one, demographics, population = get_data(5, adhd_group_one)
X_adhd_two, Y_adhd_two, demographics, population = get_data(5, adhd_group_two)

# Sanity check: report the shape of each group's feature matrix.
shape_summary = f'X_healthy: {X_healthy.shape} | X_adhd_one: {X_adhd_one.shape} | X_adhd_two: {X_adhd_two.shape}'
print(shape_summary)

X_healthy: (106, 34716) | X_adhd_one: (190, 34716) | X_adhd_two: (190, 34716)


In [3]:
from os.path import exists
from common.paths import CROSS_PRED_RESULTS
from common.scoring import (unimetric_scorer, 
                            custom_permutation_test_score, 
                            N_PERM, SCORING, RKF_10_10)
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Run for one target, one age bin

In [5]:
# Choose the single target to predict and pull it out of each group's targets.
selected_target = "WISC_FSIQ"
y_healthy, y_adhd_one, y_adhd_two = (
    group_targets[selected_target]
    for group_targets in (Y_healthy, Y_adhd_one, Y_adhd_two)
)

# Sanity check: report the shape of the selected target for each group.
print(f'{selected_target}: {y_healthy.shape}, {y_adhd_one.shape}, {y_adhd_two.shape}')

WISC_FSIQ: (106,), (190,), (190,)


In [6]:
# Bundle each group's features with its selected target as an (X, y) tuple.
# Note: `healthy` held the label CSV path in the data-loading cell; from this
# cell onward it refers to the (X, y) tuple instead.
healthy = X_healthy, y_healthy
adhd_one = X_adhd_one, y_adhd_one
adhd_two = X_adhd_two, y_adhd_two

# Materialise the (train, test) index pairs produced by RKF_10_10 for each
# group as lists, so the same folds can be iterated more than once.
healthy_cv = [(train_idx, test_idx) for train_idx, test_idx in RKF_10_10.split(X_healthy)]
adhd_one_cv = [(train_idx, test_idx) for train_idx, test_idx in RKF_10_10.split(X_adhd_one)]
adhd_two_cv = [(train_idx, test_idx) for train_idx, test_idx in RKF_10_10.split(X_adhd_two)]

### Run permutation test (train group, test group)

In [None]:
%%time
# Ridge regularisation strength per training group, taken from previous results.
# Entry i is paired with row i of the three lists below.
diag_alphas = [5000, 15000, 15000]

# The next three lists are parallel rotations of (Healthy, ADHD_One, ADHD_Two):
# position 0 of each row is the training group, positions 1 and 2 are the two
# groups the trained model is additionally tested on. Rows must stay aligned
# across all three lists.
diag_labels = [
    ['Healthy', 'ADHD_One', 'ADHD_Two'],
    ['ADHD_One', 'ADHD_Two', 'Healthy'],
    ['ADHD_Two', 'Healthy', 'ADHD_One']
]
diag_order = [
    [healthy, adhd_one, adhd_two],
    [adhd_one, adhd_two, healthy],
    [adhd_two, healthy, adhd_one]
]
diag_cv_order = [
    [healthy_cv, adhd_one_cv, adhd_two_cv],
    [adhd_one_cv, adhd_two_cv, healthy_cv],
    [adhd_two_cv, healthy_cv, adhd_one_cv]
]
# zip() silently truncates to the shortest input, so fail loudly if the
# hand-maintained parallel lists ever drift apart.
assert len(diag_alphas) == len(diag_labels) == len(diag_order) == len(diag_cv_order), \
    'diag_alphas, diag_labels, diag_order and diag_cv_order must have equal length'

results = []

for diag_alpha, diags, diags_cv, label in zip(diag_alphas, diag_order, diag_cv_order, diag_labels):
    train_diag, test_diag_one, test_diag_two = diags
    train_diag_cv, test_diag_one_cv, test_diag_two_cv = diags_cv

    # Standardise features, then fit a ridge model with this group's alpha.
    pipe = make_pipeline(StandardScaler(), Ridge(alpha=diag_alpha))
    result = custom_permutation_test_score(
        pipe, train_diag, test_diag_one, test_diag_two,
        train_diag_cv, test_diag_one_cv, test_diag_two_cv, N_PERM, unimetric_scorer)

    # result[0] and result[2] are read here as per-group scores and p-values, in
    # the same order as `label` (train group first).
    # NOTE(review): confirm against common.scoring.custom_permutation_test_score.
    for score, p_value, group in zip(result[0], result[2], label):
        results.append({
            'Model': 'ridge',
            'Target': selected_target,
            'Num Permutations': N_PERM,
            'Train Group': label[0],
            'Test Group': group,
            'Score': score,
            'P-value': p_value,
        })
    # Progress marker: one line per completed training group.
    print(f'Train Group: {label[0]}')

results_df = pd.DataFrame(results)
display(results_df.round(4))

# Persist the unrounded results. If the file already exists the rows are
# appended without a header, so re-running this cell adds another copy of the
# rows. The DataFrame index is written as the first column (to_csv default),
# which keeps appended rows aligned with rows already in the file.
filename = 'ridge_pts_diagnosis_cross_prediction.csv'
output_path = join(CROSS_PRED_RESULTS, filename)
# Check for the file once so the write mode and the header flag cannot disagree.
output_exists = exists(output_path)
mode = 'a' if output_exists else 'w'
results_df.to_csv(output_path, mode=mode, header=not output_exists)

### Visualize permutation results