In [1]:
#my stuff
import icu_data_defs
import transformers
import utils
import features
from constants import column_names,variable_type,clinical_source
import units
import mimic
import logger

#other stuff
from sklearn.model_selection import train_test_split,cross_val_score,ShuffleSplit
from sklearn.linear_model import LinearRegression,ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

#make pretty pictures
import seaborn as sns
%matplotlib inline

In [16]:
#HELPER FUNCTIONS

def run_crossval(pipeline,X,y):
    scores_r2 = cross_val_score(pipeline,X,y, scoring='r2',cv=10)
    scores_nmse = cross_val_score(pipeline,X,y, scoring='neg_mean_squared_error',cv=10)

    print 'Cross Validation, K-Fold'
    print 'R^2: {}, {}'.format(scores_r2.mean(),scores_r2.std())
    print 'RMSE: {}, {}'.format(np.sqrt(-1.0*scores_nmse).mean(),np.sqrt(-1.0*scores_nmse).std())

    cv_shuffle = ShuffleSplit(n_splits=10,test_size=0.1)

    scores_r2 = cross_val_score(pipeline,X,y, scoring='r2',cv=cv_shuffle)
    scores_nmse = cross_val_score(pipeline,X,y, scoring='neg_mean_squared_error', cv=cv_shuffle)

    print '\nCross Validation, ShuffleSplit'
    print 'R^2: {}, {}'.format(scores_r2.mean(),scores_r2.std())
    print 'RMSE: {}, {}'.format(np.sqrt(-1.0*scores_nmse).mean(),np.sqrt(-1.0*scores_nmse).std())
    return

"""
Getting lactate labels (next and delta)
"""
def get_labels_next_lac(lactate_series):
    return lactate_series.shift(-1).dropna().iloc[:,0]

def get_labels_delta_lac(lactate_series):
    id_grouped = lactate_series.groupby(level='id')
    lac_filled = id_grouped.ffill()
    lac_next = id_grouped.shift(-1)
    lac_all = lac_filled
    lac_all.columns = ['last']
    lac_all['next'] = lac_next
    lac_all = lac_all.dropna()
    return lac_all['next'] - lac_all['last']


"""
Visualize data
"""
#Visualize
def viz_per_feature(df_features,label_dict):  
    plot_cnt = len(label_dict)+1
    
    df_corr = pd.DataFrame(index=df_features.columns,columns=label_dict.keys())
    
    for col_name in df_features:
        print col_name
        col = df_features[col_name]
        display(col.describe().apply(lambda x: '%.4f' % x).to_frame())
        #determine # of filled values
        mode = col.mode()
        print mode
        mode_count = (col == mode).sum()
        print "MODE:",mode
        print mode_count
        print mode_count/float(col.shape[0])


        # plot histogram of column (all of df_train)
        fig, axarr  = plt.subplots(1,plot_cnt,figsize=(5*(plot_cnt), 5))
        ax = plt.subplot(1, plot_cnt, 1)
        std = col.std()
        mean = col.mean()
        col.loc[(col < (mean + 3.0*std)) & (col > (mean - 3.0*std))].hist()
        ax.set_title('{}_{}\n{}'.format(col_name[0],col_name[1],col_name[2:]))
        ax.set_xlabel(col[-2])
        ax.set_ylabel('COUNT')

        #plot this column vs. each label
        for i,label_name in label_dict.keys():
            y = label_dict[label_name]
            
            x = col.loc[y.index]
            ax = plt.subplot(1, plot_cnt, 1+i)
            sns.regplot(x, y)
            corr = np.corrcoef(x, y)[0][1]
            ax.set_title('{} \n PCC (r) = {}'.format(label_name,corr))
            df_corr.loc[col_name,label_name]=corr
        
        plt.tight_layout()
        plt.show()
    
    return df_corr
        
"""
Test/train/validate split
"""

def test_train_val_split(all_ids=None,test_size=0.1,random_state=42,print_ids=False):

    if all_ids is None:
        all_ids = mimic.get_all_hadm_ids()
    
    validate_size = test_size/(1-test_size)
    train_size = (1-test_size)*(1-validate_size)
    #these test IDs will never be touched again. They are sacred
    train_val_ids,test_ids = train_test_split(all_ids,test_size=test_size,random_state=random_state)
    train_ids,validate_ids = train_test_split(train_val_ids,test_size=validate_size,random_state=random_state)

    if print_ids:
        print 'Train {}:'.format(int(train_size*100)), len(train_ids),'>',train_ids[:5],'...'
        print 'Validate {}:'.format(int(train_size*100)), len(validate_ids),'>',validate_ids[:5],'...'
        print 'Test {}:'.format(int(test_size*100)), len(test_ids),'>',test_ids[:5],'...'
    return train_ids,validate_ids,test_ids

# Setup

In [7]:
random_state=42
#test/train/val split
train_ids,validate_ids,test_ids = test_train_val_split(print_ids=True,random_state=random_state);

# Load Our Data Dict
data_dict = icu_data_defs.data_dictionary('config/data_definitions.xlsx')
display(data_dict.get_defs())

#init ETL Manager => mimic_extract data
hdf5_fname = 'data/mimic_extract.h5'
mimic_etlM = mimic.MimicETLManager(hdf5_fname,'config/mimic_item_map.csv',data_dict)

#init feature factory using data_dict, etl_manager
factory = features.DataSetFactory(features=None,
                                      resample_freq=None,
                                      data_dict=data_dict,
                                      ETL_manager=mimic_etlM,
                                      hdf5_fname_target=None,
                                      panel_id=12 #limit to simple data
                                  )

#create all features
m_ureg = units.MedicalUreg()
is_summable = lambda x: m_ureg.is_volume(str(x)) or m_ureg.is_mass(str(x))

"""
Data Specs
"""
qn_not_sum = {
    column_names.VAR_TYPE : variable_type.QUANTITATIVE,
    column_names.UNITS: lambda units: not is_summable(units)
}

weight = {
    column_names.COMPONENT : data_dict.components.WEIGHT_BODY
}

intervention_summable = {
    column_names.CLINICAL_SOURCE : clinical_source.INTERVENTION,
    column_names.UNITS: is_summable
}

uop_summable = {
    column_names.COMPONENT : data_dict.components.OUTPUT_URINE,
    column_names.UNITS: is_summable
}

not_nominal = {
    column_names.VAR_TYPE : [variable_type.QUANTITATIVE, variable_type.ORDINAL]
}

is_nominal = {
    column_names.VAR_TYPE : variable_type.NOMINAL
}

"""
Features
"""
# MEAN & LAST
# For all non-summable quantities: 
# 1. ffill values
# 2. Then resample and aggregate 
# 3. Then fill with mean of means
f_qn_mean = features.Feature('MEAN','mean',
                               data_specs=[qn_not_sum,weight],
                               pre_processor=transformers.GroubyAndFFill(level=column_names.ID),
                               fillna_method=transformers.fill_mean()
                            )

f_qn_most_recent = features.Feature('LAST','last',
                                       data_specs=[qn_not_sum,weight],
                                       pre_processor=transformers.GroubyAndFFill(level=column_names.ID),
                                       fillna_method=transformers.fill_mean()
                                   )
# STD - fill NaN with 0
f_qn_std = features.Feature('STD','std',
                                data_specs=[not_nominal],
                                fillna_method=transformers.fill_zero()
                           )

# SUM - for UOP volumes and intervention volumes/masses
f_sum = features.Feature('SUM','sum',
                             data_specs=[intervention_summable,uop_summable],
                             fillna_method=transformers.fill_zero()
                        )

#only COUNT ordinal or quantitative data
f_count = features.Feature('COUNT','count',
                           data_specs=[not_nominal],
                           fillna_method=transformers.fill_zero())

#use SUM for nominal data (0's will be counted if we use count)
f_count_nom = features.Feature('COUNT','sum',
                             data_specs=[is_nominal],
                             fillna_method=transformers.fill_zero()
                        )

"""
Lactate Label
"""
# Label is lactate. just resampling; no preprocessing or filling
label = features.Feature('LABEL','mean',{
                                column_names.COMPONENT : data_dict.components.LACTATE,
                                column_names.VAR_TYPE : variable_type.QUANTITATIVE
                            })

Train 80: 47180 > [139698, 127590, 178959, 139276, 196600] ...
Validate 80: 5898 > [112338, 107467, 158733, 144544, 115417] ...
Test 10: 5898 > [167957, 164747, 124147, 184424, 136508] ...


Unnamed: 0_level_0,component,units,variable_type,clinical_source,lower,upper,list_id
def_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,heart rate,beats/min,qn,observation,0.0,500.0,
1,blood pressure systolic,mmHg,qn,observation,0.0,500.0,
2,blood pressure diastolic,mmHg,qn,observation,0.0,500.0,
3,blood pressure mean,mmHg,qn,observation,0.0,500.0,
4,respiratory rate,insp/min,qn,observation,0.0,150.0,
5,temperature body,degF,qn,observation,0.0,150.0,
6,oxygen saturation pulse oximetry,percent,qn,observation,0.0,100.0,
7,weight body,kg,qn,observation,0.0,700.0,
8,output urine,mL,qn,observation,0.0,30000.0,
9,output urine,mL/hr,qn,observation,0.0,5000.0,


# Get features

In [8]:
#start with a smaller data set
reload(logger)

train_subset = pd.Series(train_ids).sample(frac=0.2, random_state=random_state).sort_values().tolist()

print train_subset[:5], len(train_subset)

[100014L, 100029L, 100039L, 100046L, 100052L] 9436


In [9]:
factory.hdf5_fname_target = 'data/combine_like.h5'
factory.resample_freq='2H'
factory.pre_processors = Pipeline([
                                ('drop_small_columns',transformers.remove_small_columns(threshold=100)),
                                ('drop_low_id_count',transformers.record_threshold(threshold=20)),
                                ('combine_like_columns',transformers.combine_like_cols())
                            ])
factory.features = [f_qn_mean,f_qn_most_recent,f_qn_std,f_sum,f_count,f_count_nom,label]
factory.get_params()

{'ETL_manager': <mimic.MimicETLManager at 0xe103630>,
 'data_dict': <icu_data_defs.data_dictionary at 0x1067d940>,
 'features': [<features.Feature at 0xe65ae48>,
  <features.Feature at 0x10698f60>,
  <features.Feature at 0xebd4518>,
  <features.Feature at 0xebd44a8>,
  <features.Feature at 0xebd4358>,
  <features.Feature at 0xebd4630>,
  <features.Feature at 0xebd4748>],
 'force_preprocessing': True,
 'hdf5_fname_target': 'data/combine_like.h5',
 'panel_id': 12,
 'pre_processors': Pipeline(steps=[('drop_small_columns', remove_small_columns(threshold=100)), ('drop_low_id_count', record_threshold(threshold=20)), ('combine_like_columns', combine_like_cols())]),
 'pre_processors__combine_like_columns': combine_like_cols(),
 'pre_processors__drop_low_id_count': record_threshold(threshold=20),
 'pre_processors__drop_low_id_count__threshold': 20,
 'pre_processors__drop_small_columns': remove_small_columns(threshold=100),
 'pre_processors__drop_small_columns__threshold': 100,
 'pre_processors_

In [10]:
df_train = factory.fit_transform(train_subset)

(2017-08-03 11:32:22) FEATURIZE... #F=7, #ids=9436, fit->True
(2017-08-03 11:32:22)>> PRE-PROCESSING & JOIN: #C=18, ['blood pressure diastolic', 'blood pressure mean', 'blood pressure systolic', 'glasgow coma scale eye opening', 'glasgow coma scale motor', 'glasgow coma scale verbal', 'heart rate', 'hemoglobin', 'lactate', 'lactated ringers', 'norepinephrine', 'normal saline', 'output urine', 'oxygen saturation pulse oximetry', 'respiratory rate', 'temperature body', 'vasopressin', 'weight body']
(2017-08-03 11:32:24)>>>> blood pressure diastolic - 1/18
(2017-08-03 11:32:24)>>>>>> READ DF...
(2017-08-03 11:32:45)<<<<<< --- (21.0s)
(2017-08-03 11:32:45)>>>>>> PREPROCESS...
(2017-08-03 11:32:45)<<<<<< --- (0.0s)
(2017-08-03 11:32:45)>>>>>> *fit* Filter columns (remove_small_columns) (985854, 42)
(2017-08-03 11:32:45)<<<<<< --- (0.0s)
(2017-08-03 11:32:45)>>>>>> *transform* Filter columns (remove_small_columns) (985854, 42)
(2017-08-03 11:32:45)<<<<<< --- (0.0s)
(2017-08-03 11:32:45)>>>>>

In [12]:
y_next = get_labels_next_lac(df_train.loc[:,'LABEL'])
y_delta = get_labels_delta_lac(df_train.loc[:,'LABEL'])
label_dict = {'NEXT_lactate' : y_next, 'DELTA_lactate':y_delta}

df_features = df_train.drop('LABEL',axis=1)

In [17]:
df_corr = viz_per_feature(df_features,label_dict)

('MEAN', 'blood pressure diastolic', 'known', 'qn', 'mmHg', 'all')


Unnamed: 0_level_0,MEAN
Unnamed: 0_level_1,blood pressure diastolic
Unnamed: 0_level_2,known
Unnamed: 0_level_3,qn
Unnamed: 0_level_4,mmHg
Unnamed: 0_level_5,all
count,1052741.0
mean,58.4419
std,10.5781
min,0.0
25%,58.4419
50%,58.4419
75%,58.4419
max,228.0


ValueError: Can only compare identically-labeled Series objects

In [None]:
df_corr