In [75]:
import nibabel as nib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn import metrics
from scipy.stats import zscore
from src.utils.data import getDataPandas

In [7]:
data = getDataPandas()
data = data.drop_duplicates(subset=['PATNO', 'EVENT_ID'], keep='first').reset_index(drop=True)

In [8]:
data = data.drop(data[data['NUPDR3OF'] < 5].index).reset_index(drop=True)

In [9]:
def load_img(rec):
    img_data = np.array(nib.load(rec.T1_GM_PATH).get_fdata())
    return img_data

In [10]:
data['T1'] = data.apply(load_img, axis=1)

In [11]:
vox = np.array([np.array(l) for l in data['T1']])
vox = np.reshape(vox, (data.shape[0], -1))

In [12]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.9)
features = pca.fit_transform(vox)
fe = pd.DataFrame(features)

In [13]:
fe.shape

(236, 135)

In [109]:
x = data[['NUPDR3OF', 'NUPDR3ON', 'SCORE', 'AGE_AT_VISIT', 'SEX', 'DURATION']]
x = x.join(fe.apply(zscore))
#x[['NUPDR3OF', 'AGE_AT_VISIT', 'DURATION']] = x[['NUPDR3OF', 'AGE_AT_VISIT', 'DURATION']].apply(zscore)
y = data[['CAT']]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=12)

In [110]:
from scipy.stats import ttest_ind, chi2_contingency, normaltest, ranksums
print(normaltest(x_train['NUPDR3OF']))
print(normaltest(x_train['NUPDR3ON']))
print(normaltest(x_train['AGE_AT_VISIT']))
print(normaltest(x_train['DURATION']))
print(normaltest(x_train['SCORE']))
print()
print(normaltest(x_test['NUPDR3OF']))
print(normaltest(x_test['NUPDR3ON']))
print(normaltest(x_test['AGE_AT_VISIT']))
print(normaltest(x_test['DURATION']))
print(normaltest(x_test['SCORE']))
print()
print(ranksums(x_train['NUPDR3OF'], x_test['NUPDR3OF']))
print(ranksums(x_train['NUPDR3ON'], x_test['NUPDR3ON']))
print(ttest_ind(x_train['AGE_AT_VISIT'], x_test['AGE_AT_VISIT']))
print(ranksums(x_train['DURATION'], x_test['DURATION']))
print(ttest_ind(x_train['SCORE'], x_test['SCORE']))
_, p, _, _ = chi2_contingency([[len(x_test[x_test['SEX']==0]), len(x_train[x_train['SEX']==0])], [len(x_test[x_test['SEX']==1]), len(x_train[x_train['SEX']==1])]])
print(p)
_, p, _, _ = chi2_contingency([[len(y_test[y_test['CAT']==0]), len(y_train[y_train['CAT']==0])], [len(y_test[y_test['CAT']==1]), len(y_train[y_train['CAT']==1])]])
print(p)

NormaltestResult(statistic=7.665538969853419, pvalue=0.021649574356674188)
NormaltestResult(statistic=19.63985660250209, pvalue=5.4357481419856984e-05)
NormaltestResult(statistic=4.57309756441518, pvalue=0.10161655825076794)
NormaltestResult(statistic=12.774408014190307, pvalue=0.0016829551598869497)
NormaltestResult(statistic=4.9940877685637854, pvalue=0.08232801038703247)

NormaltestResult(statistic=4.69628848103336, pvalue=0.09554630876260099)
NormaltestResult(statistic=19.030603023005114, pvalue=7.371520199116631e-05)
NormaltestResult(statistic=4.953061641536057, pvalue=0.08403425036791416)
NormaltestResult(statistic=5.3725560313142475, pvalue=0.06813406193217383)
NormaltestResult(statistic=0.6712030263173397, pvalue=0.7149079304659041)

RanksumsResult(statistic=-0.5234909527662854, pvalue=0.6006326453262173)
RanksumsResult(statistic=-0.35175749541082985, pvalue=0.7250201375324923)
Ttest_indResult(statistic=0.34693694405823267, pvalue=0.7289504945388563)
RanksumsResult(statistic=-0

In [111]:
x1 = x[y['CAT']==1]
x0 = x[y['CAT']==0]
print(normaltest(x1['NUPDR3OF']))
print(normaltest(x1['NUPDR3ON']))
print(normaltest(x1['AGE_AT_VISIT']))
print(normaltest(x1['DURATION']))
print()
print(normaltest(x0['NUPDR3OF']))
print(normaltest(x0['NUPDR3ON']))
print(normaltest(x0['AGE_AT_VISIT']))
print(normaltest(x0['DURATION']))
print()
print(ranksums(x0['NUPDR3OF'], x1['NUPDR3OF']))
print(ranksums(x0['NUPDR3ON'], x1['NUPDR3ON']))
print(ranksums(x0['AGE_AT_VISIT'], x1['AGE_AT_VISIT']))
print(ttest_ind(x0['DURATION'], x1['DURATION']))
_, p, _, _ = chi2_contingency([[len(x1[x1['SEX']==0]), len(x0[x0['SEX']==0])], [len(x1[x1['SEX']==1]), len(x0[x0['SEX']==1])]])
print(p)

NormaltestResult(statistic=4.349438844218418, pvalue=0.11364003407915704)
NormaltestResult(statistic=6.651504515461585, pvalue=0.03594546832382435)
NormaltestResult(statistic=2.1933814126138955, pvalue=0.333974476594561)
NormaltestResult(statistic=4.07495714050031, pvalue=0.13035698282177208)

NormaltestResult(statistic=6.50169085089067, pvalue=0.03874144098271398)
NormaltestResult(statistic=15.138369784001751, pvalue=0.000516112965012224)
NormaltestResult(statistic=6.188144537159751, pvalue=0.04531703500166281)
NormaltestResult(statistic=5.933532013824612, pvalue=0.0514694934556066)

RanksumsResult(statistic=-0.20308790076037553, pvalue=0.8390663284689516)
RanksumsResult(statistic=7.249189246390306, pvalue=4.192737282670555e-13)
RanksumsResult(statistic=2.2015491213882963, pvalue=0.02769717324567565)
Ttest_indResult(statistic=-4.246434784214753, pvalue=3.134735560315373e-05)
1.0


In [57]:
model = XGBClassifier()
parameters = {'nthread': [4],
              'objective': ['binary:logistic'],
              'learning_rate': [0.1, 0.15, 0.2],
              'max_depth': [5, 10, 15],
              'min_child_weight': [5, 10, 15],
              'subsample': [0.8, 0.85, 0.9],
              'colsample_bytree': [0.75, 0.8, 0.85],
              'n_estimators': [50, 100, 200],
              'missing': [-999],
              'seed': [1]}
clf = GridSearchCV(model, parameters,
                        n_jobs=5,
                        cv=5,
                        verbose=3,
                        #n_iter=200,
                        scoring='roc_auc')

In [68]:
clf.fit(x_train, y_train)

Fitting 5 folds for each of 729 candidates, totalling 3645 fits


In [69]:
clf.best_params_

{'colsample_bytree': 0.75,
 'learning_rate': 0.1,
 'max_depth': 5,
 'min_child_weight': 10,
 'missing': -999,
 'n_estimators': 50,
 'nthread': 4,
 'objective': 'binary:logistic',
 'seed': 1,
 'subsample': 0.85}

In [70]:
y_prob = clf.best_estimator_.predict_proba(x_test)
print('AUC train {}, test {}'.format(clf.best_score_, metrics.roc_auc_score(list(y_test['CAT']), y_prob[:, 1])))

AUC train 0.6875723299476763, test 0.5731922398589064


In [71]:
y_pred = clf.best_estimator_.predict(x_train)
print(metrics.classification_report(list(y_train['CAT']), y_pred))
y_pred = clf.best_estimator_.predict(x_test)
print(metrics.classification_report(list(y_test['CAT']), y_pred))

              precision    recall  f1-score   support

           0       0.71      0.72      0.71        97
           1       0.70      0.68      0.69        91

    accuracy                           0.70       188
   macro avg       0.70      0.70      0.70       188
weighted avg       0.70      0.70      0.70       188

              precision    recall  f1-score   support

           0       0.55      0.57      0.56        21
           1       0.65      0.63      0.64        27

    accuracy                           0.60        48
   macro avg       0.60      0.60      0.60        48
weighted avg       0.61      0.60      0.61        48

