In [1]:
# Run this cell first when re-running the notebook: %reset clears the
# interactive namespace and -f skips the confirmation prompt.
%reset -f

In [2]:
from sklearn import preprocessing
from time import time
import numpy as np
import csv
from sklearn import metrics
from sklearn.preprocessing import scale
from sklearn.feature_selection import VarianceThreshold
from sklearn.cross_validation import StratifiedShuffleSplit, cross_val_score

from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB

from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE,ADASYN, RandomOverSampler
from imblearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline

from IPython.display import display, HTML
from operator import truediv
from datetime import datetime
import pandas as pd
import time
import os

from pylab import *
import seaborn as sns
import matplotlib.pyplot as plt

# Print small floats in fixed-point notation instead of scientific notation.
np.set_printoptions(suppress=True)
# Render pandas floats with a thousands separator and four decimals.
pd.options.display.float_format = '{:,.4f}'.format
# Use matplotlib's "classic" style sheet for every figure.
plt.style.use('classic')

# Draw figures inside the notebook output cells.
%matplotlib inline



#### Required domain methods

In [3]:
import sys
sys.path.insert(1, "../src/")
from TypeFeatImputer import TypeFeatImputer
from UnivCombineFilter import UnivCombineFilter
import MLpipeline as MLpipeline
import readmision_methods as rm

In [4]:
# Experiment selectors. The bracketed lists are the options noted by the
# original author; the accepted spellings are defined by the readmision_methods
# helpers, so verify them there before changing a value.

# Options: ['first', 'last']
typeEncounter = "last"
# Options: ['all_readmisssion_vs_none', 'early_readmission_vs_none']
typeHypothesis = "early_readmission_vs_none"
# Options: ['reduced', 'extended', 'extended_extra', 'extended_extra_diag_1', 'extended_extra_diag_3']
typeDataFeatures = "extended_extra_diag_3"
# Options: ['all', 'disease']
typeDataExperiment = "all"

In [5]:
# Run configuration. cv_folds, cv_thr, fs_methods, sm_method, sm_types,
# cls_methods and lms are forwarded to train_ensemble further down.
verbose = True
# NOTE(review): presumably the validation fraction used inside each CV split;
# the meaning is defined by MLpipeline.run_pipeline -- confirm there.
cv_thr = 0.3
cv_folds = 5

# Alternative values kept from earlier experiments: [0.1,0.2,0.4,0.6,1.0]
tr_thrs = [1.0]
# NOTE(review): presumably the hold-out fraction for the inner train/test
# partition -- confirm against MLpipeline.train_test_partition.
ts_thr = 0.30

# Feature-selection methods to try. Options: ["none","combine_fs","lasso_fs","rfe_rf_fs"]
fs_methods = ["none"]
# Classifiers to try. Options: ["rf","svmRBF","logReg","knn","nn","gbt"]
cls_methods = ["logReg"]
# Model-selection metrics to try. Options: ["f1_weighted","average_precision","roc_auc","recall"]
lms = ["recall"]
# Over-sampling placement. Options: ["none","after"]
sm_types = ["none"]
# Over-sampling method identifier handed to MLpipeline.create_pipelines.
sm_method = "sm_smote"

#### Prepare initial data

In [6]:
# Load the dataset selected by the encounter type and feature set chosen above.
#Load data
df_all = rm.load_data(typeEncounter, typeDataFeatures)
print "\nSHAPE:"
print df_all.shape

# Keep only the rows relevant to the chosen hypothesis. df_all is rebound to
# the filtered frame, so the unfiltered data is only available by re-running
# the load above.
#Filter data by class
df_all = rm.filter_data_by_class(df_all, typeHypothesis)
print "\nSHAPE FILTERED:"
print df_all.shape

# Show the distinct values of the last column and the number of rows with
# readmitted == 0 and readmitted == 1.
print "\nRows by class type:"
print df_all.iloc[:,-1].sort_values().unique(), np.sum(df_all["readmitted"] == 0), np.sum(df_all["readmitted"] == 1)

# Split into train/test arrays, then rebuild DataFrames by appending the label
# vector as the last column and reusing df_all's column names.
# NOTE(review): this assumes the label is the last column of df_all -- confirm
# against MLpipeline.train_test_partition.
#Train & Test
X_train, X_test, y_train, y_test = MLpipeline.train_test_partition(df_all)
df_train = pd.DataFrame(np.hstack((X_train, y_train.reshape(-1,1))), columns=df_all.columns)
df_test = pd.DataFrame(np.hstack((X_test, y_test.reshape(-1,1))), columns=df_all.columns)

print "\nTrain:", X_train.shape, "Test:",  X_test.shape

# Build the (name, mask) feature filters and print, for each one, its name and
# the sum of its mask (the number of selected features if the mask is 0/1).
#Create filters
featFilters = rm.create_filters(df_all)
print [[f[0],np.sum(f[1])] for f in featFilters]


SHAPE:
(67182, 69)

SHAPE FILTERED:
(45779, 69)

Rows by class type:
[0 1] 39785 5994

Train: (32045, 68) Test: (13734, 68)
[['patient_filter', 5], ['admision_discharge_filter', 29], ['hospital_filter', 9], ['Visits_filter', 8], ['diagnosis_filter', 14], ['medicines_filter', 15], ['none_filter', 68]]


#### Train ensemble

In [84]:
def train_ensemble(df_train, ts_thr, cv_folds, cv_thr, fs_methods, sm_method, sm_types,
                                            cls_methods, lms, featFilters):
    """Train one inner model per feature filter plus a stacker on top of them.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training data; the frame is rebuilt below with the label appended as
        the last column and accessed as ``readmitted``.
    ts_thr : float
        Forwarded to ``MLpipeline.train_test_partition`` to carve an inner
        hold-out split out of ``df_train``.
    cv_folds, cv_thr, fs_methods, sm_method, sm_types, cls_methods, lms
        Forwarded unchanged to ``train_inner_models``.
    featFilters : sequence of (name, mask) pairs
        One inner model is trained per entry.

    Returns
    -------
    tuple
        ``(models, stacker, cv_preds, y_train_cv, ts_preds, y_test_ts)``.
    """
    #Split data
    X_train_cv, X_test_ts, y_train_cv, y_test_ts = MLpipeline.train_test_partition(df_train, ts_thr)
    df_train_cv = pd.DataFrame(np.hstack((X_train_cv, y_train_cv.reshape(-1,1))), columns=df_train.columns)
    df_train_cv.readmitted = df_train_cv.readmitted.astype(int)

    #Train inner models
    # Pass the inner training frame (not the full df_train) so that the
    # feature-type detection in train_inner_models never sees hold-out rows.
    models, cv_preds, ts_preds = train_inner_models(df_train_cv, X_train_cv, X_test_ts, y_train_cv, y_test_ts, cv_folds,
                                    cv_thr, fs_methods, sm_method, sm_types, cls_methods, lms, featFilters)

    #Train stack model
    stacker = train_stacker(cv_preds, y_train_cv, ts_preds, y_test_ts)

    return models, stacker, cv_preds, y_train_cv, ts_preds, y_test_ts

def train_inner_models(df_train_cv, X_train_cv, X_test_ts, y_train_cv, y_test_ts, cv_folds, cv_thr, fs_methods,
                       sm_method, sm_types, cls_methods, lms, featFilters):
    """Fit one pipeline per feature filter and collect its predictions.

    For every ``(name, mask)`` pair in ``featFilters`` the function detects the
    feature types of the selected columns, builds a pipeline with the default
    hyper-parameters file, runs it, and then calls ``predict`` on the last
    element of the first result for both the training and the hold-out matrix.

    Returns
    -------
    tuple of lists
        ``(models, cv_preds, ts_preds)``: the first result returned by
        ``MLpipeline.run_pipeline`` for each filter, and the corresponding
        predictions on ``X_train_cv`` and ``X_test_ts`` as plain lists.

    NOTE(review): ``cv_preds`` are predictions on the same rows handed to
    ``run_pipeline`` as training data; whether they are in-sample depends on
    how ``run_pipeline`` fits the returned model -- confirm.
    """
    models, cv_preds, ts_preds = [], [], []

    for filter_name, filter_mask in featFilters[:]:

        # Columns picked by this filter, with the label column appended
        selected_cols = df_train_cv.columns[:-1][filter_mask==1].values.tolist() + ["readmitted"]
        catCols, reducedCols = rm.compute_type_features(df_train_cv[selected_cols])

        # Default hyper-parameters are re-read from disk for every filter
        hyperparams = np.load("../src/default_hyperparams.npy")

        # Build the pipeline(s) restricted to this filter's features
        pipe = MLpipeline.create_pipelines(catCols, reducedCols, hyperparams, fs_methods, sm_method, sm_types,
                                            cls_methods, lms, filter_mask)

        # Labels must be integers before running the pipeline
        y_train_cv = y_train_cv.astype(int)
        y_test_ts = y_test_ts.astype(int)
        all_results = MLpipeline.run_pipeline(filter_name, pipe, X_train_cv, X_test_ts, y_train_cv, y_test_ts,
                                              cv_folds, cv_thr, verbose=True, save=False)
        first_result = all_results[0]

        # The last element of the result exposes predict()
        fitted_model = first_result[-1]
        train_pred = fitted_model.predict(X_train_cv).tolist()
        holdout_pred = fitted_model.predict(X_test_ts).tolist()

        models.append(first_result)
        cv_preds.append(train_pred)
        ts_preds.append(holdout_pred)

    return models, cv_preds, ts_preds
    
    
def train_stacker(cv_preds, y_cv, ts_preds, y_test):
        
    #Train stacker
    cv_preds = np.array(cv_preds).astype(int).T
    ts_preds = np.array(ts_preds).astype(int).T
    y_cv = y_cv.astype(int)    
    y_test = y_test.astype(int)
    
    #cls = SVC(random_state=13)
    #params = {'kernel':['linear'], 'C':[1e-8,1e-5,0.001,0.01, 0.1, 1]}
    
    cls = RandomForestClassifier(random_state=13,)
    params = {'n_estimators':[10,50,100,125,150,200,300],"class_weight":["balanced"],"max_depth":[1,2]}
    grid = GridSearchCV(cls, param_grid=params, verbose=True, 
                                 n_jobs=-1, cv=5, scoring= "recall", error_score = 0) 
    grid.fit(cv_preds, y_cv)    
    y_pred = grid.best_estimator_.predict(ts_preds)
    
    print "STACKER:"
    print "*******"
    print "y==0", np.sum(y_test == 0)
    print "y==1", np.sum(y_test == 1)    
    print grid.best_params_
    print grid.best_score_
    
    #Evaluate results
    evaluate("train", y_test, y_pred)
    
    return grid

def evaluate(name, y_test, y_pred):

    test_f1_w = metrics.f1_score(y_test, y_pred, average='weighted', pos_label=None)
    test_p, test_r, test_f1, test_s = metrics.precision_recall_fscore_support(y_test, y_pred,labels=None,average=None, sample_weight=None)
    fpr, tpr, _ = metrics.roc_curve(y_test, y_pred)
    test_auc = metrics.auc(fpr, tpr)                
    cm_test = metrics.confusion_matrix(y_test, y_pred)
    tn = cm_test[0,0]
    fp = cm_test[0,1]
    fn = cm_test[1,0]
    tp = cm_test[1,1]
    test_sens = test_r[1]
    test_spec = tn / float(tn+fp)

    print "\Evaluation:", name
    print "*******************"
    
    print "TEST f1 (weighted): %0.3f" % (test_f1_w)
    print "TEST Precision [c=0,1]:", test_p
    print "TEST Recall [c=0,1]:", test_r                
    print "TEST AUC: %0.3f" % (test_auc)                
    print "TEST sensitivity:", test_sens
    print "TEST Specificity:", test_spec
    print "Confussion matrix:"
    print "         | PRED"
    print "REAL-->  v "
    print cm_test
    
def test_ensemble(df_test, models, stacker):
    
    test_preds = []
    for m in models:
        test_preds.append(m[-1].predict(df_test.iloc[:,:-1].values).tolist())
    
    test_preds = np.array(test_preds).astype(int).T
    test_preds = test_preds.astype(int)
    print test_preds.shape
    y_pred = stacker.predict(test_preds)
    y_test = df_test.iloc[:,-1].values.astype(int)
    print type(y_pred), np.unique(y_pred)
    print type(y_test), np.unique(y_test)
    
    #Evaluate results
    evaluate("test", y_test, y_pred.astype(int))

In [None]:
#Train ensemble
# The inner hold-out fraction comes from the ts_thr setting in the
# configuration cell rather than a repeated literal, so changing the setting
# actually changes the split.
models, stacker, cv_preds, y_train_cv, ts_preds, y_test_ts = train_ensemble(df_train, ts_thr, cv_folds, cv_thr,
                                                                            fs_methods, sm_method, sm_types,
                                                                            cls_methods, lms, featFilters)


DataSet:
**********
**********
SIZE: 5
NAME: patient_filter
ALL TRAIN: (22431, 68)
TRAIN: [0's: 19494 1's: 2937 ]
ALL TEST: (9614, 68)
TEST: [0's: 8355 1's: 1259 ]

Num experiment: 0 / 0
****************
FS: none
SM: none
CLS: logReg
METRIC: recall
Fitting 5 folds for each of 56 candidates, totalling 280 fits


[Parallel(n_jobs=-1)]: Done  23 tasks      | elapsed:    7.5s


In [None]:
#Test ensemble
# Score the trained inner models and stacker on df_test, the test frame built
# in the data-preparation cell.
test_ensemble(df_test, models, stacker)

In [None]:
#Train stacker
# Re-fit and re-evaluate the stacker alone from the predictions already
# returned by train_ensemble. The return value is not assigned, so the
# `stacker` variable is left unchanged by this cell.
train_stacker(cv_preds, y_train_cv, ts_preds, y_test_ts)

In [None]:
# Number of inner-model results returned by train_ensemble.
print len(models)

In [None]:
# Lay the inner-model results out as a table: one row per result, 35 fields
# per row, named in the order given below.
res = pd.DataFrame(
    np.array(models).reshape(-1,35),
    columns=[
        "exp", "name",
        "size_tr", "fs", "sm", "cls", "metric", "params",
        "tr_sens", "tr_spec", "tr_auc",
        "tr_prec", "tr_rec", "tr_f1",
        "cv_sens_mean", "cv_sens_std",
        "cv_spec_mean", "cv_spec_std",
        "cv_auc_mean", "cv_auc_std",
        "cv_prec_mean", "cv_prec_std",
        "cv_rec_mean", "cv_rec_std",
        "cv_f1_mean", "cv_f1_std",
        "test_sens", "test_spec", "test_auc",
        "test_rec", "test_prec", "test_f1",
        "cm_test",
        "time", "pipeline",
    ])
# Display every field except the experiment id, the timing and the pipeline object.
res.drop(["exp", "time", "pipeline"], axis=1)