# Age and Sex Prediction

In [1]:
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Plot defaults applied to every figure drawn after this cell:
# "paper" context, white-grid style, fonts scaled by 1.2, and the
# matplotlib rc overrides below (figure size and dpi).
_figure_rc = {
    'figure.figsize': (10, 10),
    'figure.dpi': 300,
}
sns.set_theme(context="paper", style="whitegrid", font_scale=1.2, rc=_figure_rc)

from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

## Get Data

In [2]:
from common.data import get_data
from common.paths import HEALTHY, ADHD

# Load the feature matrix, the targets, the demographics and the population
# label for the cohort selected by HEALTHY.
X, Y, demographics, population = get_data(label_path=HEALTHY)

# Pull out the two demographic variables predicted later in the notebook.
ages = demographics['Age']
sexes = demographics['Sex']

# One-line sanity summary of what was loaded.
summary = f'X: {X.shape} | Y: {len(Y.keys())} | Age: {ages.shape} | Population: {population}'
print(summary)

X: (106, 34716) | Y: 6 | Age: (106,) | Population: healthy


## Age

In [6]:
from common.scoring import RKF_10_10
from sklearn.cross_decomposition import PLSRegression
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score

In [4]:
def r2(model, X, y):
    """Scorer callable: R^2 of ``model``'s predictions on ``X`` against ``y``.

    The ``(estimator, X, y)`` signature lets it be passed as ``scoring=`` to
    the cross-validation utilities used in this notebook.

    Args:
        model: A fitted estimator exposing ``predict``.
        X: Samples to predict on.
        y: Ground-truth targets for ``X``.

    Returns:
        The value of ``r2_score(y, model.predict(X))``.
    """
    predictions = model.predict(X)
    return r2_score(y, predictions)

### PLS

In [None]:
# UNUSED
# Repeated-subsample estimate of PLS age-prediction performance: draw a
# subsample of subjects, cross-validate a standardised 4-component PLS
# regressor on it, and average the per-draw mean test R^2 over all draws.
rng = np.random.default_rng(0)  # fixed seed so the draws are reproducible
subsample_size = min(106, len(X))  # never request more rows than exist
ages_arr = np.asarray(ages)  # positional indexing whether `ages` is an ndarray or a Series
scores = []

for _ in range(100):
    # Sample WITHOUT replacement: a duplicated subject could otherwise end up
    # in both the training and the test fold of the same split, leaking
    # information and inflating the cross-validated score.
    subsample_idx = rng.choice(len(X), size=subsample_size, replace=False)
    X_sub, ages_sub = X[subsample_idx], ages_arr[subsample_idx]

    # A fresh pipeline per draw so no fitted state carries over between draws.
    pipe = make_pipeline(StandardScaler(), PLSRegression(n_components=4))
    score = cross_validate(pipe, X_sub, ages_sub, cv=RKF_10_10, scoring=r2, n_jobs=-1,
                           return_train_score=False, return_estimator=False)
    scores.append(np.mean(score['test_score']))

print(f'PLS age prediction: {np.mean(scores)}')

### Ridge

In [9]:
# Ridge regression of age from X. RidgeCV selects alpha with its internal
# cross-validation (cv=RKF_10_10, scored by `r2`); the reported number is
# `best_score_` of the fitted RidgeCV step.
X_cv = X
y_cv = ages
selected_target = "age"
age_group = 'all'
scores = []

pipe = make_pipeline(StandardScaler(), RidgeCV(alphas=list(range(1, 10000, 100)),
                                               scoring=r2, cv=RKF_10_10))
if population == 'adhd':
    # For this population the model is refit on 100 random subsamples and the
    # scores are averaged.
    rng = np.random.default_rng(0)  # fixed seed so the draws are reproducible
    subsample_size = min(106, len(X_cv))  # never request more rows than exist
    y_cv_arr = np.asarray(y_cv)  # positional indexing whether y is an ndarray or a Series

    for _ in range(100):
        # Sample WITHOUT replacement: duplicated subjects would otherwise be
        # shared between training and validation folds and inflate the score.
        subsample_idx = rng.choice(len(X_cv), size=subsample_size, replace=False)
        X_cv_sub, y_cv_sub = X_cv[subsample_idx], y_cv_arr[subsample_idx]

        pipe.fit(X_cv_sub, y_cv_sub)
        score = pipe['ridgecv'].best_score_
        scores.append(score)
else:
    # Any other population: a single fit on the full data set.
    pipe.fit(X_cv, y_cv)
    scores.append(pipe['ridgecv'].best_score_)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(np.mean(scores))

ridge_healthy_age_all
0.19233819888661383


## Sex

In [10]:
from sklearn.linear_model import RidgeClassifierCV

### Ridge

In [11]:
%%time

# Ridge classification of sex from X. RidgeClassifierCV selects alpha with its
# internal cross-validation (cv=RKF_10_10) using balanced class weights; the
# reported number is `best_score_` of the fitted classifier step.
X_cv = X
y_cv = sexes
selected_target = "sex"
age_group = 'all'
scores = []

pipe = make_pipeline(StandardScaler(), RidgeClassifierCV(alphas=list(range(1, 10000, 100)),
                                                         class_weight='balanced', cv=RKF_10_10))
if population == 'adhd':
    # For this population the model is refit on 100 random subsamples and the
    # scores are averaged.
    rng = np.random.default_rng(0)  # fixed seed so the draws are reproducible
    subsample_size = min(106, len(X_cv))  # never request more rows than exist
    y_cv_arr = np.asarray(y_cv)  # positional indexing whether y is an ndarray or a Series

    for _ in range(100):
        # Sample WITHOUT replacement: duplicated subjects would otherwise be
        # shared between training and validation folds and inflate the score.
        subsample_idx = rng.choice(len(X_cv), size=subsample_size, replace=False)
        X_cv_sub, y_cv_sub = X_cv[subsample_idx], y_cv_arr[subsample_idx]

        pipe.fit(X_cv_sub, y_cv_sub)
        score = pipe['ridgeclassifiercv'].best_score_
        scores.append(score)
else:
    # Any other population: a single fit on the full data set.
    pipe.fit(X_cv, y_cv)
    scores.append(pipe['ridgeclassifiercv'].best_score_)

print(f'ridge_{population}_{selected_target}_{age_group}')
print(np.mean(scores))

ridge_healthy_sex_all
0.5899090909090909
CPU times: user 54min 48s, sys: 1min 47s, total: 56min 35s
Wall time: 6min 2s
