In [99]:
import pandas as pd
import numpy as np
import pickle

In [73]:
# Read the two source tables: physiological features and health outcomes
features_csv = '/home/ryan/dl-data-exploration/patient_features.csv'
info_csv = '/home/ryan/dl-data-exploration/patient_info.csv'
patient_features = pd.read_csv(features_csv)
patient_info = pd.read_csv(info_csv)

# The leading column of each CSV is a stored index column; discard it by label
patient_features = patient_features.drop(columns=[patient_features.columns[0]])
patient_info = patient_info.drop(columns=[patient_info.columns[0]])

# Report (rows, columns) of each table
print('Feature Shape: {}'.format(patient_features.shape))
print('Patient Info Shape: {}'.format(patient_info.shape))

Feature Shape: (513452, 70)
Patient Info Shape: (2993116, 12)


In [74]:
# Preview the first five rows of the feature table
patient_features.head(n=5)

Unnamed: 0,eDWID,YRM,mssa,mrsa,h_flu,pseudo,burkho_complex,alcalig,steno,enterobacter,...,hypersaline,chronic_macrolide,oral_steroids,inh_steroids,inhsteroids_bronchodil,oral_other_abx,txflag,othertx,nexttxflag,nextothertx
0,900000702,2003,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0.0,0.0
1,900000702,2004,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0.0,0.0
2,900000702,2005,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,0.0,0.0
3,900000702,2006,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0.0,0.0
4,900000702,2007,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,0,0,0.0,0.0


In [75]:
# Preview the first five rows of the patient info table
patient_info.head(n=5)

Unnamed: 0,eDWID,YRM,jedate,jbdate,jddate,mut_name1,mut_name2,mut_name3,survdays,dflag,surv5,dflag5
0,900000702,1997,13669,13623,,F508del,F508del,,6784,0,1826.21,0
1,900000702,1997,13696,13623,,F508del,F508del,,6757,0,1826.21,0
2,900000702,1997,13697,13623,,F508del,F508del,,6756,0,1826.21,0
3,900000702,1997,13788,13623,,F508del,F508del,,6665,0,1826.21,0
4,900000702,1998,13913,13623,,F508del,F508del,,6540,0,1826.21,0


In [76]:
# Count the patients that appear in patient_info but have no rows in patient_features.
# Membership is tested directly instead of subtracting the two unique-ID counts: the
# subtraction only gives the right answer when every patient in patient_features is
# also present in patient_info, which nothing here guarantees.
feature_patient_ids = patient_features['eDWID'].unique()
lacks_features = ~patient_info['eDWID'].isin(feature_patient_ids)
num_patients_no_feats = patient_info.loc[lacks_features, 'eDWID'].nunique()
print('Number of patients without features: {}'.format(num_patients_no_feats))

Number of patients without features: 5522


In [78]:
# Collect the distinct (eDWID, dflag) pairs for patients that have feature rows
patients_with_feats = patient_features['eDWID'].unique()
in_feature_table = patient_info['eDWID'].isin(patients_with_feats)  # False for patients with no features
dflags = patient_info.loc[in_feature_table, ['eDWID', 'dflag']].drop_duplicates()
print('dflags Shape: {}'.format(dflags.shape))

dflags Shape: (43454, 2)


In [79]:
# Order both tables by patient ID
patient_features = patient_features.sort_values(by=['eDWID'])
dflags = dflags.sort_values(by=['eDWID'])

# Number of non-null YRM entries per patient. The column is selected before counting
# so that only YRM is aggregated, rather than counting every column and discarding
# all but one.
recordings_per_patient = patient_features.groupby('eDWID')['YRM'].count()

# Summary statistics of the per-patient recording counts
avg_recordings = recordings_per_patient.mean()
std_recordings = recordings_per_patient.std()
max_recordings = recordings_per_patient.max()
min_recordings = recordings_per_patient.min()

print('Average number of recordings per patient: {}'.format(avg_recordings))
print('Standard deviation of the number of recordings per patient: {}'.format(std_recordings))
print('Maximum number of recordings per patient: {}'.format(max_recordings))
print('Minimum number of recordings per patient: {}'.format(min_recordings))

Average number of recordings per patient: 11.8159893220417
Standard deviation of the number of recordings per patient: 7.867254489336622
Maximum number of recordings per patient: 30
Minimum number of recordings per patient: 1


In [84]:
# Compute number of rows and columns with missing values
def count_nan(x):
    """Return True if `x` contains at least one missing value.

    Despite the name, the result is a boolean flag, not a count.

    Missing values are detected with `pd.isna`, so the check also works on
    object/string data, where `np.isnan` raises a TypeError.

    Parameters
    ----------
    x : array-like or scalar
        Typically the row or column Series that `DataFrame.apply` passes in.

    Returns
    -------
    numpy.bool_
        True when any element of `x` is missing, False otherwise (including
        for empty input).
    """
    # np.asarray lets the same reduction handle Series, ndarray and scalar input
    return np.asarray(pd.isna(x)).any()

# Flag every missing cell once, then reduce along each axis. This replaces a
# row-wise `apply`, which calls a Python function once per row.
missing_mask = patient_features.isna()
num_incomplete_rows = missing_mask.any(axis=1).sum()  # rows with at least one missing value
num_incomplete_cols = missing_mask.any(axis=0).sum()  # columns with at least one missing value

print('Number of incomplete rows: {}'.format(num_incomplete_rows))
print('Number of incomplete cols: {}'.format(num_incomplete_cols))

Number of incomplete rows: 513452
Number of incomplete cols: 36


In [93]:
# Keep only the columns that have no missing values, ordered by patient and then YRM
has_missing = patient_features.apply(count_nan, axis=0)  # one boolean flag per column
cols_to_drop = patient_features.columns[has_missing]
complete_patient_features = (
    patient_features
    .drop(columns=cols_to_drop)
    .sort_values(by=['eDWID', 'YRM'])
)

In [96]:
# Count the number of binary categorical variables
def is_binary(x):
    """Return True when `x.nunique()` reports at most two distinct values."""
    return x.nunique() <= 2
# One boolean per column: does the column pass is_binary?
binary_mask = complete_patient_features.apply(is_binary, axis=0)
# NOTE(review): the `- 2` is unexplained. If it is meant to discount the eDWID and
# YRM identifier columns, confirm that those columns actually pass is_binary; the
# per-column nunique listing in this notebook shows 43454 and 30 distinct values
# for them, in which case this subtraction would undercount by two.
num_binary_vars = binary_mask.sum() - 2
print('Number of binary categorical variables: {}'.format(num_binary_vars))

Number of binary categorical variables: 28


In [98]:
# Evaluate the variable type of all complete features
def num_vals(x):
    """Return the number of distinct values in `x`, as reported by `x.nunique()`."""
    return x.nunique()
# Distinct-value count for every column of the complete-feature table
complete_patient_features.nunique(axis=0)

eDWID                     43454
YRM                          30
mssa                          2
mrsa                          2
h_flu                         2
pseudo                        2
burkho_complex                2
alcalig                       2
steno                         2
enterobacter                  2
serratia_marcescens           2
aspergillus                   2
candida                       2
scedosporium                  2
mabscessus                    2
mai                           2
bd_age                       82
sex                           2
suff                          2
trunc03                       6
all_tob                       2
all_mod                       2
all_bd                        2
dnase                         2
inhcolistin                   2
inhaztreonam                  2
hypersaline                   2
chronic_macrolide             2
oral_steroids                 2
inh_steroids                  2
inhsteroids_bronchodil        2
oral_oth

In [104]:
# Persist the complete-feature table as a pickle file
complete_feats_pkl = '/home/ryan/dl-data-exploration/processed_data_complete_feats.pkl'
complete_patient_features.to_pickle(complete_feats_pkl)