### Load data from csv using pandas

In [1]:
import pandas as pd

In [2]:
#encounters = pd.read_csv('/data/MIMIC/mini_encounter_vectors_processed.csv') 
encounters = pd.read_csv('/data/MIMIC/encounter_vectors_processed.csv')

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import roc_auc_score

def run_cross_val(train_X, train_y, test_X, test_y, roc_auc):
    print('mlp')
    mlp = MLPClassifier(hidden_layer_sizes=(100, ), activation='relu',
                        solver='adam', batch_size=256, max_iter=20,
                        early_stopping=True, verbose=True)
    mlp_pred = mlp.fit(train_X, train_y).predict(test_X)
    roc_auc['mlp'].append(roc_auc_score(test_y, mlp_pred,
                                        average='macro',
                                        sample_weight=None))
    
    print('rf')
    rf = RandomForestClassifier(n_jobs=-1, n_estimators=5, max_features='log2',
                                random_state=123)
    rf_pred = rf.fit(train_X, train_y).predict(test_X)
    roc_auc['rf'].append(roc_auc_score(test_y, rf_pred,
                                        average='macro',
                                        sample_weight=None))
    
    print('lr via hinge SGD')
    lr = SGDClassifier(loss="hinge", penalty="l2", random_state=123)
    lr_pred = lr.fit(train_X, train_y).predict(test_X)
    roc_auc['lr'].append(roc_auc_score(test_y, lr_pred,
                                        average='macro',
                                        sample_weight=None))
    
    print('svm via log SGD')
    svm = SGDClassifier(loss="log", penalty="l2", random_state=123)
    svm_pred = svm.fit(train_X, train_y).predict(test_X)
    roc_auc['svm'].append(roc_auc_score(test_y, svm_pred,
                                        average='macro',
                                        sample_weight=None))
    
    
    return roc_auc

In [6]:
from sklearn.cross_validation import StratifiedKFold
import pickle as pkl

# for 1, 3, 5, 10, 20, 30 Encounters
# for i in [1, 3, 5, 10, 20, 30]:
all_scores = {}
for i in [1, 3, 5, 10, 20, 30, 50]:
    print(i)
    
    encounters['6MONTH'] = 0
    encounters['1YEAR'] = 0
    encounters.loc[(encounters['SURVIVAL'] > 0) & (encounters['SURVIVAL'] < 183), '6MONTH'] = 1
    encounters.loc[(encounters['SURVIVAL'] > 0) & (encounters['SURVIVAL'] < 366), '1YEAR'] = 1
    print('Survival :', encounters['6MONTH'].sum(), encounters['1YEAR'].sum())

    y_1year = encounters['1YEAR'].values
    
    encounters_grouped = (encounters.groupby('HADM_ID').head(i).groupby('HADM_ID',
                          as_index=False).mean().drop(['SUBJECT_ID', 'HADM_ID', 'ENCOUNTER_ID',
                                                       'SURVIVAL', '6MONTH'], axis=1))    
    
    # do 1 year survival after grouping
    y_1year = encounters_grouped['1YEAR'].values
    encounters_grouped.drop(['1YEAR'], axis=1, inplace=True)
    e = encounters_grouped.values
    
    print(type(e), e.shape)
    
    cv = StratifiedKFold(y_1year, n_folds=5, random_state=123)
    roc_auc = {'svm':[], 'lr': [], 'mlp':[], 'rf':[]}
    
    for j, (train, test) in enumerate(cv):
        roc_auc = run_cross_val(e[train], y_1year[train],
                                e[test], y_1year[test],
                                roc_auc)
        print('Cross fold: ', j, roc_auc)
        
    all_scores[i] = roc_auc
pkl.dump(all_scores, open('/data/MIMIC/encounter_scores_mlp_svm_lr_rf.p', 'wb'))

1
('Survival :', 344049L, 411879L)
(<type 'numpy.ndarray'>, (58438, 1196))
mlp
Iteration 1, loss = 7.69646541
Validation score: 0.775615
Iteration 2, loss = 7.45237615
Validation score: 0.779251
Iteration 3, loss = 7.29281908
Validation score: 0.403422
Iteration 4, loss = 6.93997604
Validation score: 0.785241
Iteration 5, loss = 6.92599588
Validation score: 0.695187
Iteration 6, loss = 6.81351608
Validation score: 0.644064
Iteration 7, loss = 6.96382583
Validation score: 0.787594
Iteration 8, loss = 6.92453485
Validation score: 0.786096
Iteration 9, loss = 6.93762674
Validation score: 0.787807
Iteration 10, loss = 6.93649341
Validation score: 0.778610
Iteration 11, loss = 6.78405246
Validation score: 0.787166
Iteration 12, loss = 6.89997719
Validation score: 0.780107
Validation score did not improve more than tol=0.000100 for two consecutive epochs. Stopping.
rf
lr via hinge SGD
svm via log SGD
('Cross fold: ', 0, {'mlp': [0.54416107743459141], 'svm': [0.50752947213691357], 'lr': [0.50