In [12]:
import pandas as pd
import numpy as np

In [13]:
# Directory holding the exported CSV tables. Kept in one place so the notebook
# can be pointed at another copy of the data by editing a single line.
DATA_DIR = '/home/ryan/dl-data-exploration'


def _load_table(filename, data_dir=DATA_DIR):
    """Read `filename` from `data_dir` and return it without its first column.

    The first column of each CSV is discarded; the original cell describes it
    as an index column.
    """
    table = pd.read_csv('{}/{}'.format(data_dir, filename))
    return table.drop(table.columns[0], axis=1)


# Load patient physiological features and health outcomes
patient_features = _load_table('patient_features.csv')
patient_info = _load_table('patient_info.csv')

In [14]:
# Count the patients listed in patient_info that have no rows in
# patient_features. A membership test is used instead of the difference between
# the two unique-ID counts: that difference only equals the number of unmatched
# patients when every featured patient also appears in patient_info, and its
# absolute value hides which table is the larger one.
info_ids = patient_info['eDWID'].dropna().drop_duplicates()
num_patients_no_feats = (~info_ids.isin(patient_features['eDWID'])).sum()
print('Number of patients without features: {}'.format(num_patients_no_feats))

Number of patients without features: 5522


In [15]:
# Death outcomes: one row per distinct (eDWID, dflag) pair, restricted to the
# patients that have at least one row in the feature table
has_features = patient_info['eDWID'].isin(patient_features['eDWID'].unique())
dflags = patient_info.loc[has_features, ['eDWID', 'dflag']].drop_duplicates()

In [20]:
# Order the feature rows by patient and then by YRM; order outcomes by patient
patient_features = patient_features.sort_values(by=['eDWID', 'YRM'])
dflags = dflags.sort_values(by='eDWID')

# Number of non-null YRM entries per patient, and summary statistics of it
tmp = patient_features.groupby('eDWID')['YRM'].count()
avg_recordings, std_recordings = tmp.mean(), tmp.std()
max_recordings, min_recordings = tmp.max(), tmp.min()

print('\n'.join([
    'Average number of recordings per patient: {}'.format(avg_recordings),
    'Standard deviation of the number of recordings per patient: {}'.format(std_recordings),
    'Maximum number of recordings per patient: {}'.format(max_recordings),
    'Minimum number of recordings per patient: {}'.format(min_recordings),
]))

Average number of recordings per patient: 11.8159893220417
Standard deviation of the number of recordings per patient: 7.867254489336622
Maximum number of recordings per patient: 30
Minimum number of recordings per patient: 1


In [6]:
# Compute number of rows and columns with missing values
def count_nan(x):
    """Return True if `x` contains at least one missing value.

    Despite its name, this returns a boolean flag, not a count. It is written
    to be passed to ``DataFrame.apply``, which calls it with one row or one
    column (a Series) at a time.
    """
    # pd.isnull accepts object/string data as well as floats, whereas
    # np.isnan raises TypeError on non-numeric dtypes.
    return pd.isnull(x).any()

# Flag every missing cell once, then reduce along each axis. This replaces a
# Python-level function call per row (apply with axis=1) with vectorized
# operations, and also works when the table holds non-numeric columns.
missing_cells = patient_features.isnull()
num_incomplete_rows = missing_cells.any(axis=1).sum()
num_incomplete_cols = missing_cells.any(axis=0).sum()

num_rows, num_cols = patient_features.shape

print('Number of incomplete rows: {}'.format(num_incomplete_rows))
print('Number of incomplete cols: {}'.format(num_incomplete_cols))

Number of incomplete rows: 513452
Number of incomplete cols: 36


In [9]:
# Find every feature column with at least one missing value, then build a
# copy of the table that leaves those columns out
incomplete_mask = patient_features.apply(count_nan, axis=0)
cols_to_drop = patient_features.columns[incomplete_mask]
complete_patient_features = patient_features.drop(cols_to_drop, axis='columns')

In [11]:
def is_binary(x):
    """Return True when `x` holds exactly two distinct non-null values."""
    return x.nunique() == 2


# Per-column flag: does the feature take exactly two distinct values?
binary_features = complete_patient_features.apply(is_binary, axis=0)

# Number of binary features.
# NOTE(review): the disabled draft of this cell subtracted 2 from the boolean
# mask itself, which gives a per-column Series rather than a count. The reason
# for that offset is not evident, so it is not reproduced here; confirm the
# intended adjustment before relying on this number.
num_binary_vars = binary_features.sum()

binary_features

eDWID                     False
YRM                       False
mssa                       True
mrsa                       True
h_flu                      True
pseudo                     True
burkho_complex             True
alcalig                    True
steno                      True
enterobacter               True
serratia_marcescens        True
aspergillus                True
candida                    True
scedosporium               True
mabscessus                 True
mai                        True
bd_age                    False
sex                        True
suff                       True
trunc03                   False
all_tob                    True
all_mod                    True
all_bd                     True
dnase                      True
inhcolistin                True
inhaztreonam               True
hypersaline                True
chronic_macrolide          True
oral_steroids              True
inh_steroids               True
inhsteroids_bronchodil     True
oral_oth