In [13]:
import json
import os
import os.path

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit

from src.utils.data import getDataPandas, getConfig
from pipe.stats import stats_analyze

In [20]:
# Load the 'data' configuration and keep a handle on its 'data_group' entry;
# later cells index `group` by 'demo' and 'clinic' to choose feature columns.
data_config = getConfig('data')
group = data_config['data_group']

In [3]:
# Load the raw table and clean it in a single pipeline:
#   1. shuffle all rows with a fixed seed, so that "keep first" below picks a
#      reproducible row among duplicates;
#   2. keep one row per (PATNO, EVENT_ID) pair;
#   3. drop rows whose NUPDR3OF is below 5;
#   4. drop rows whose LEDD is above 5000.
# Rows are removed by index label (not by the negated condition), so rows where
# the comparison is not True (e.g. missing values) are kept.
data = (
    getDataPandas()
    .sample(frac=1, random_state=1)
    .drop_duplicates(subset=['PATNO', 'EVENT_ID'], keep='first')
    .reset_index(drop=True)
    .pipe(lambda frame: frame.drop(index=frame.loc[frame['NUPDR3OF'] < 5].index))
    .reset_index(drop=True)
    .pipe(lambda frame: frame.drop(index=frame.loc[frame['LEDD'] > 5000].index))
    .reset_index(drop=True)
)

In [6]:
# Add a KEY column: PATNO rendered as text, concatenated with EVENT_ID.
data = data.assign(
    KEY=lambda frame: frame['PATNO'].astype(str) + frame['EVENT_ID']
)

In [14]:
# One seeded, group-aware shuffle split. Rows are grouped by PATNO, so all rows
# sharing a PATNO fall on the same side; test_size=0.2 is the share of groups
# (not rows) assigned to the test side.
splitter = GroupShuffleSplit(test_size=0.2, n_splits=1, random_state=0)
split = splitter.split(data, groups=data['PATNO'])
# split() is lazy: pull the single (train, test) pair of positional row indices.
train_inds, test_inds = next(split)

In [25]:
# Features: the configured 'demo' and 'clinic' columns, plus PATNO.
x = data.loc[:, group['demo'] + group['clinic'] + ['PATNO']]
# Target: CAT, selected with a list so it stays a one-column DataFrame.
y = data.loc[:, ['CAT']]

In [26]:
# Slice features and target by the positional split indices, renumbering each
# partition's index from 0. The train partition is produced first, then test.
x_clinic_train, x_clinic_test = (
    x.iloc[rows].reset_index(drop=True) for rows in (train_inds, test_inds)
)
y_train, y_test = (
    y.iloc[rows].reset_index(drop=True) for rows in (train_inds, test_inds)
)

In [27]:
# Compare the train and test partitions; the cell output reports per-column
# p-values (chi2, normality test, t-test / rank-sum) for the two partitions.
stats_analyze(x_clinic_train, x_clinic_test, y_train, y_test, data_config)

SEX chi2 p: 0.5414803876006902
CAT chi2 p: 0.9510706628888684
AGE_AT_VISIT Normaltest p_train: [0.07505303], p_test: [0.48389882]
AGE_AT_VISIT t-test p: Ttest_indResult(statistic=array([1.54953415]), pvalue=array([0.12271054]))
NUPDR3OF Normaltest p_train: [0.01407325], p_test: [0.53859285]
NUPDR3OF ranksums p: RanksumsResult(statistic=array([0.53408144]), pvalue=array([0.59328519]))
LEDD Normaltest p_train: [2.38365408e-12], p_test: [0.16081336]
LEDD ranksums p: RanksumsResult(statistic=array([0.1754091]), pvalue=array([0.86075819]))
DURATION Normaltest p_train: [0.00553476], p_test: [0.13860719]
DURATION ranksums p: RanksumsResult(statistic=array([-0.45554005]), pvalue=array([0.64872076]))



In [29]:
# Summary statistics for the training-partition feature columns.
x_clinic_train.describe()

Unnamed: 0,AGE_AT_VISIT,SEX,NUPDR3OF,LEDD,DURATION,PATNO
count,173.0,173.0,173.0,173.0,173.0,173.0
mean,64.116763,0.67052,27.514451,604.188925,41.884393,26864.751445
std,9.248878,0.471389,12.413237,417.367176,21.673541,28148.970004
min,35.1,0.0,7.0,40.0,2.0,3107.0
25%,57.3,0.0,18.0,300.0,26.0,3558.0
50%,65.0,1.0,26.0,504.0,44.0,4019.0
75%,71.3,1.0,35.0,765.0,54.0,50027.0
max,86.3,1.0,62.0,2379.5,112.0,149511.0


In [30]:
# Summary statistics for the test-partition feature columns.
x_clinic_test.describe()

Unnamed: 0,AGE_AT_VISIT,SEX,NUPDR3OF,LEDD,DURATION,PATNO
count,46.0,46.0,46.0,46.0,46.0,46.0
mean,61.736957,0.608696,25.913043,560.032609,44.152174,21970.23913
std,9.293112,0.493435,11.796094,319.334055,23.766426,29019.844196
min,38.7,0.0,5.0,30.0,4.0,3124.0
25%,55.625,0.0,16.25,300.0,25.25,3309.0
50%,62.2,1.0,26.5,489.4,43.0,3700.0
75%,68.425,1.0,34.25,717.5,55.75,40744.0
max,77.1,1.0,53.0,1396.5,107.0,101295.0


In [36]:
# Record the split in the config as plain Python lists of positional row
# indices (into the cleaned `data` frame), under 'train_index' / 'test_index'.
data_config.update(
    train_index=train_inds.tolist(),
    test_index=test_inds.tolist(),
)

In [37]:
# Persist the config (including the split indices stored on it) to
# pipe/data_config.json as UTF-8, 4-space-indented JSON.
#
# Serialise to a string *before* opening the file: opening in write mode
# truncates the file immediately, so an error raised while encoding (e.g. a
# non-JSON-serialisable value) would otherwise leave a truncated config behind.
config_path = os.path.join('pipe', 'data_config.json')
config_text = json.dumps(data_config, ensure_ascii=False, indent=4)
# Plain 'w' is enough: the file is only written, never read back here.
with open(config_path, 'w', encoding="utf-8") as f:
    f.write(config_text)