# Age and Sex Prediction

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_theme(
    context="paper", 
    style="whitegrid", 
    font_scale=1.2,
    rc={'figure.figsize': (10, 10), 'figure.dpi': 300}
)

from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Get Data

In [14]:
from common.data import get_data
from common.paths import HEALTHY, ADHD

X, Y, demographics, population = get_data(label_path=ADHD)
ages, sexes = demographics['Age'], demographics['Sex']

print(f'X: {X.shape} | Y: {len(Y.keys())} | Age: {ages.shape} | Population: {population}')

X: (373, 34716) | Y: 6 | Age: (373,) | Population: adhd


## Age

In [9]:
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.model_selection import permutation_test_score

from common.scoring import RKF_10_10
from common.paths import RIDGE_RESULTS, RIDGE_PSCORES
from common.results import save_results, save_perm_score

### Ridge

In [15]:
%%time

selected_target = "age"
age_group = 'all'

pipe = make_pipeline(StandardScaler(), RidgeCV(alphas=[a for a in range(1, 10000, 100)],
                                                       scoring='r2', cv=RKF_10_10))
pipe.fit(X, ages)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(f'best score: {pipe["ridgecv"].best_score_:.2f}, best alpha: {pipe["ridgecv"].alpha_}')

ridge_adhd_age_all
best score: 0.46, best alpha: 4501
CPU times: user 1h 22min 38s, sys: 2min 16s, total: 1h 24min 54s
Wall time: 8min 29s


In [16]:
%%time

selected_target = "age"
age_group = 'all'
best_alpha = 1

pipe = make_pipeline(StandardScaler(), Ridge(alpha=best_alpha))
score, perm_scores, pvalue = permutation_test_score(pipe, X, ages, scoring='r2', cv=RKF_10_10)
results = pd.DataFrame([{    
            'Model': 'ridge',
            'Population': population,
            'Target': selected_target,
            'Bin': age_group,
            'Alpha': best_alpha,
            'Score': score,
            'P-value': pvalue,
        }])
save_perm_score(perm_scores, f'ridge_{population}_{selected_target}_{age_group}_perm_scores', 
                RIDGE_PSCORES)
display(results)

filename = f'ridge_pts_{population}_{selected_target}'
save_results(results, filename, RIDGE_RESULTS)

Unnamed: 0,Model,Population,Target,Bin,Alpha,Score,P-value
0,ridge,adhd,age,all,1,0.445012,0.009901


CPU times: user 5h 22min 32s, sys: 7min 33s, total: 5h 30min 6s
Wall time: 34min 2s


'/home/bpho/Documents/MSc_Research-Project/scratch_data/model_results/Ridge/ridge_pts_adhd_age.csv'

## Sex

In [37]:
from sklearn.linear_model import RidgeClassifierCV
from scipy.stats import binom
from common.scoring import RKF_10_10

### Ridge

In [33]:
%%time

selected_target = "sex"
age_group = 'all'

pipe = make_pipeline(StandardScaler(), RidgeClassifierCV(alphas=[a for a in range(1, 10000, 100)],
                                                         class_weight='balanced', cv=RKF_10_10))
pipe.fit(X, sexes)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(f'best score: {pipe["ridgeclassifiercv"].best_score_:.2f}')
print(f'best alpha: {pipe["ridgeclassifiercv"].alpha_}')

ridge_adhd_sex_all
best score: 0.74
CPU times: user 3h 28min 51s, sys: 14min 21s, total: 3h 43min 13s
Wall time: 22min 20s


In [36]:
print(sexes.shape, pipe["ridgeclassifiercv"].alpha_)

(373,) 2301


In [None]:
%%time

selected_target = "sex"
age_group = 'all'

pipe = make_pipeline(StandardScaler(), RidgeClassifier(alpha=2301, class_weight='balanced', 
                                                       cv=RKF_10_10))
pipe.fit(X, sexes)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(f'best score: {pipe["ridgeclassifiercv"].best_score_:.2f}')

In [58]:
q, n, p = 0.95, 106, 0.5
print(binom.ppf(q, n, p))
print(1 - binom.cdf(65, n, p))

61.0
0.007389705258519119
