In [1]:
import pandas as pd
import os
import os.path
from sklearn.model_selection import GroupShuffleSplit, StratifiedShuffleSplit
from src.utils.data import getDataPandas, getConfig
from pipe.stats import stats_analyze

In [4]:
data_config = getConfig('data')
group = data_config['data_group']

In [15]:
data = getDataPandas()
data = data.sample(frac=1, random_state=1)
data = data.drop_duplicates(subset=['PATNO', 'EVENT_ID'], keep='first')
data = data.drop(data[data['NUPDR3OF'] < 5].index)
data = data.drop(data[data['LEDD'] > 5000].index)

In [16]:
data['KEY'] = data['PATNO'].astype(str) + data['EVENT_ID']
data = data.reset_index()

In [17]:
splitter = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=0)
split = splitter.split(data, groups=data['PATNO'])
train_inds, test_inds = next(split)

In [18]:
x = data[group['demo'] + group['clinic'] + ['PATNO']]
y = data[['CAT']]

In [19]:
x_clinic_train = x.iloc[train_inds].reset_index(drop=True)
x_clinic_test = x.iloc[test_inds].reset_index(drop=True)
y_train = y.iloc[train_inds].reset_index(drop=True)
y_test = y.iloc[test_inds].reset_index(drop=True)

In [20]:
stats_analyze(x_clinic_train, x_clinic_test, y_train, y_test, data_config)

SEX chi2 p: 0.5414803876006901
CAT chi2 p: 0.9510706628888684
AGE_AT_VISIT Normaltest p_train: [0.07505303], p_test: [0.48389882]
AGE_AT_VISIT t-test p: Ttest_indResult(statistic=array([1.54953415]), pvalue=array([0.12271054]))
NUPDR3OF Normaltest p_train: [0.01407325], p_test: [0.53859285]
NUPDR3OF ranksums p: RanksumsResult(statistic=array([0.53408144]), pvalue=array([0.59328519]))
LEDD Normaltest p_train: [2.56158925e-12], p_test: [0.1798008]
LEDD ranksums p: RanksumsResult(statistic=array([0.51837316]), pvalue=array([0.60419794]))
DURATION Normaltest p_train: [0.00553476], p_test: [0.13860719]
DURATION ranksums p: RanksumsResult(statistic=array([-0.45554005]), pvalue=array([0.64872076]))



In [21]:
x_clinic_train.describe()

Unnamed: 0,AGE_AT_VISIT,SEX,NUPDR3OF,LEDD,DURATION,PATNO
count,173.0,173.0,173.0,173.0,173.0,173.0
mean,64.116763,0.67052,27.514451,645.397595,41.884393,26864.751445
std,9.248878,0.471389,12.413237,428.209154,21.673541,28148.970004
min,35.1,0.0,7.0,50.0,2.0,3107.0
25%,57.3,0.0,18.0,300.0,26.0,3558.0
50%,65.0,1.0,26.0,550.0,44.0,4019.0
75%,71.3,1.0,35.0,850.0,54.0,50027.0
max,86.3,1.0,62.0,2579.5,112.0,149511.0


In [22]:
x_clinic_test.describe()

Unnamed: 0,AGE_AT_VISIT,SEX,NUPDR3OF,LEDD,DURATION,PATNO
count,46.0,46.0,46.0,46.0,46.0,46.0
mean,61.736957,0.608696,25.913043,579.815217,44.152174,21970.23913
std,9.293112,0.493435,11.796094,321.575262,23.766426,29019.844196
min,38.7,0.0,5.0,30.0,4.0,3124.0
25%,55.625,0.0,16.25,300.0,25.25,3309.0
50%,62.2,1.0,26.5,555.0,43.0,3700.0
75%,68.425,1.0,34.25,717.5,55.75,40744.0
max,77.1,1.0,53.0,1396.5,107.0,101295.0


In [23]:
data_config['train_index'] = data.iloc[train_inds]['index'].tolist()
data_config['test_index'] = data.iloc[test_inds]['index'].tolist()

In [24]:
import json
with open(os.path.join('pipe', 'data_config.json'), 'w+', encoding="utf-8") as f:
        json.dump(data_config, f, ensure_ascii=False, indent=4)

In [25]:
data = getDataPandas()
data = data.sample(frac=1, random_state=1)
data = data.drop_duplicates(subset=['PATNO'], keep='first')
data = data.drop(data[data['NUPDR3OF'] < 5].index)
data = data.drop(data[data['LEDD'] > 5000].index)

In [26]:
data['KEY'] = data['PATNO'].astype(str) + data['EVENT_ID']
data = data.reset_index()

In [30]:
splitter = StratifiedShuffleSplit(test_size=0.2, n_splits=1, random_state=0)
split = splitter.split(data, y=data['CAT'])
train_inds, test_inds = next(split)

In [31]:
x_clinic_train = x.iloc[train_inds].reset_index(drop=True)
x_clinic_test = x.iloc[test_inds].reset_index(drop=True)
y_train = y.iloc[train_inds].reset_index(drop=True)
y_test = y.iloc[test_inds].reset_index(drop=True)

In [32]:
stats_analyze(x_clinic_train, x_clinic_test, y_train, y_test, data_config)

SEX chi2 p: 0.20052523173968406
CAT chi2 p: 0.6621696457420525
AGE_AT_VISIT Normaltest p_train: [0.45248025], p_test: [0.07807652]
AGE_AT_VISIT t-test p: Ttest_indResult(statistic=array([-1.20141059]), pvalue=array([0.23156421]))
NUPDR3OF Normaltest p_train: [0.16106472], p_test: [0.57039518]
NUPDR3OF t-test p: Ttest_indResult(statistic=array([-1.16355465]), pvalue=array([0.24652891]))
LEDD Normaltest p_train: [4.04550757e-10], p_test: [0.0412787]
LEDD ranksums p: RanksumsResult(statistic=array([0.26638209]), pvalue=array([0.78994496]))
DURATION Normaltest p_train: [0.00391606], p_test: [0.39589411]
DURATION ranksums p: RanksumsResult(statistic=array([-0.89843414]), pvalue=array([0.36895414]))



In [33]:
data_config['p_train_index'] = data.iloc[train_inds]['index'].tolist()
data_config['p_test_index'] = data.iloc[test_inds]['index'].tolist()

In [34]:
import json
with open(os.path.join('pipe', 'data_config.json'), 'w+', encoding="utf-8") as f:
        json.dump(data_config, f, ensure_ascii=False, indent=4)

In [46]:
data_config = getConfig('data')
group = data_config['data_group']
data = getDataPandas()
data = data.sample(frac=1, random_state=1)
data = data.drop_duplicates(subset=['PATNO', 'EVENT_ID'], keep='first')
data = data.drop(data[data['NUPDR3OF'] < 5].index)
data = data.drop(data[data['LEDD'] > 5000].index)
data['KEY'] = data['PATNO'].astype(str) + data['EVENT_ID']
#data = data.reset_index()

In [47]:
data['CAT10'] = pd.qcut(data['SCORE'], 10, labels=[i for i in range(10)])

In [67]:
pd.qcut(data['SCORE'], 10)

139     (0.231, 0.292]
232     (0.292, 0.367]
213      (-0.274, 0.0]
184       (0.5, 0.649]
95        (0.5, 0.649]
            ...       
71        (0.438, 0.5]
133    (0.0617, 0.167]
72        (0.438, 0.5]
235     (0.167, 0.231]
37       (-0.274, 0.0]
Name: SCORE, Length: 219, dtype: category
Categories (10, interval[float64, right]): [(-0.274, 0.0] < (0.0, 0.0617] < (0.0617, 0.167] < (0.167, 0.231] ... (0.367, 0.438] < (0.438, 0.5] < (0.5, 0.649] < (0.649, 0.957]]

In [48]:
x = data[group['demo'] + group['clinic'] + ['PATNO']]
y = data[['CAT10']]

In [51]:
splitter = StratifiedShuffleSplit(test_size=0.2, n_splits=1, random_state=0)
split = splitter.split(data, y=data['CAT10'])
train_inds, test_inds = next(split)
x_clinic_train = x.iloc[train_inds].reset_index(drop=True)
x_clinic_test = x.iloc[test_inds].reset_index(drop=True)
y_train = y.iloc[train_inds].reset_index(drop=True)
y_test = y.iloc[test_inds].reset_index(drop=True)

In [55]:
stats_analyze(x_clinic_train, x_clinic_test, y_train, y_test, data_config)

SEX chi2 p: 0.610929162141215
AGE_AT_VISIT Normaltest p_train: [0.03490096], p_test: [0.56331825]
AGE_AT_VISIT ranksums p: RanksumsResult(statistic=array([1.61555821]), pvalue=array([0.10618986]))
NUPDR3OF Normaltest p_train: [0.01972879], p_test: [0.38520182]
NUPDR3OF ranksums p: RanksumsResult(statistic=array([0.1224311]), pvalue=array([0.90255761]))
LEDD Normaltest p_train: [9.62697236e-13], p_test: [0.03043977]
LEDD ranksums p: RanksumsResult(statistic=array([-0.93952562]), pvalue=array([0.34746095]))
DURATION Normaltest p_train: [0.00030812], p_test: [0.1821253]
DURATION ranksums p: RanksumsResult(statistic=array([-0.17832356]), pvalue=array([0.85846888]))



In [59]:
data_config['train_cat10_index'] = train_inds.tolist()
data_config['test_cat10_index'] = test_inds.tolist()

In [60]:
import json
with open(os.path.join('pipe', 'data_config.json'), 'w+', encoding="utf-8") as f:
        json.dump(data_config, f, ensure_ascii=False, indent=4)

In [61]:
ori = getDataPandas()

In [62]:
ori['CAT10'] = data['CAT10']

In [63]:
ori

Unnamed: 0,PATNO,EVENT_ID,INFODT,NUPDR3OF,NUPDR3ON,IMG_ID,IMG_REL_PATH,AGE_AT_VISIT,SEX,ORIG_ENTRY,...,IMG_ROOT,T1_MNI_PATH,T1_GM_PATH,T1_SGM_PATH,IQR,TIV,GM_VOL,WM_VOL,ANTS_T1_MNI_PATH,CAT10
0,3107,V04,2012-03-01,2,2,I296431,../t1raw/3107/MPRAGE_GRAPPA/2012-03-28_10_35_2...,70.6,1,03/2011,...,../t1/3107V04I296431/,../t1/3107V04I296431/mri/wmt1.nii,../t1/3107V04I296431/mri/mwp1t1.nii,../t1/3107V04I296431/mri/smwp1t1.nii,78.43,1609.965165,688.315612,559.248086,../t1/3107V04I296431/reg_Warped.nii.gz,
1,3107,V06,2013-05-01,18,6,I378218,../t1raw/3107/MPRAGE_GRAPPA/2013-05-15_10_04_1...,71.7,1,03/2011,...,../t1/3107V06I378218/,../t1/3107V06I378218/mri/wmt1.nii,../t1/3107V06I378218/mri/mwp1t1.nii,../t1/3107V06I378218/mri/smwp1t1.nii,78.22,1615.716149,685.614294,553.943910,../t1/3107V06I378218/reg_Warped.nii.gz,9
2,3107,V10,2015-05-01,19,6,I498876,../t1raw/3107/Sag_MPRAGE_GRAPPA/2015-05-08_09_...,73.7,1,03/2011,...,../t1/3107V10I498876/,../t1/3107V10I498876/mri/wmt1.nii,../t1/3107V10I498876/mri/mwp1t1.nii,../t1/3107V10I498876/mri/smwp1t1.nii,79.42,1565.091831,656.401612,538.581196,../t1/3107V10I498876/reg_Warped.nii.gz,9
3,3108,V06,2013-04-01,13,15,I378222,../t1raw/3108/MPRAGE_GRAPPA/2013-04-24_10_04_3...,51.8,0,04/2011,...,../t1/3108V06I378222/,../t1/3108V06I378222/mri/wmt1.nii,../t1/3108V06I378222/mri/mwp1t1.nii,../t1/3108V06I378222/mri/smwp1t1.nii,78.86,1359.360710,632.728941,464.780004,../t1/3108V06I378222/reg_Warped.nii.gz,0
4,3108,V10,2015-05-01,22,12,I498885,../t1raw/3108/Sag_MPRAGE_GRAPPA/2015-05-06_09_...,53.8,0,04/2011,...,../t1/3108V10I498885/,../t1/3108V10I498885/mri/wmt1.nii,../t1/3108V10I498885/mri/mwp1t1.nii,../t1/3108V10I498885/mri/smwp1t1.nii,80.30,1333.413412,624.616871,457.013478,../t1/3108V10I498885/reg_Warped.nii.gz,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
320,101038,V04,2022-05-01,37,34,I1616556,../t1raw/101038/3D_T1/2022-06-09_16_58_22.0/S1...,65.5,1,04/2021,...,../t1/101038V04I1616556/,../t1/101038V04I1616556/mri/wmt1.nii,../t1/101038V04I1616556/mri/mwp1t1.nii,../t1/101038V04I1616556/mri/smwp1t1.nii,81.63,1631.955648,657.348178,601.336097,../t1/101038V04I1616556/reg_Warped.nii.gz,2
321,101175,V04,2022-05-01,53,40,I1582565,../t1raw/101175/3D_T1-weighted/2022-05-10_10_1...,72.0,1,04/2021,...,../t1/101175V04I1582565/,../t1/101175V04I1582565/mri/wmt1.nii,../t1/101175V04I1582565/mri/mwp1t1.nii,../t1/101175V04I1582565/mri/smwp1t1.nii,84.09,1570.875112,622.725396,521.507827,../t1/101175V04I1582565/reg_Warped.nii.gz,4
322,101179,V04,2022-04-01,45,25,I1571515,../t1raw/101179/3D_T1-weighted/2022-04-04_10_4...,45.0,0,03/2021,...,../t1/101179V04I1571515/,../t1/101179V04I1571515/mri/wmt1.nii,../t1/101179V04I1571515/mri/mwp1t1.nii,../t1/101179V04I1571515/mri/smwp1t1.nii,84.97,1221.022532,574.971436,465.306622,../t1/101179V04I1571515/reg_Warped.nii.gz,7
323,101295,V04,2022-08-01,31,23,I1616170,"../t1raw/101295/T1-weighted,_3D_VOLUMETRIC/202...",66.3,1,04/2021,...,../t1/101295V04I1616170/,../t1/101295V04I1616170/mri/wmt1.nii,../t1/101295V04I1616170/mri/mwp1t1.nii,../t1/101295V04I1616170/mri/smwp1t1.nii,81.80,1704.232476,688.404724,568.994237,../t1/101295V04I1616170/reg_Warped.nii.gz,4


In [65]:
data_json = ori.to_dict(orient='records')
with open('data.json', 'w+') as f:
    json.dump(data_json, f, ensure_ascii=False, indent=4)