In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os

# Step one directory up from wherever the kernel currently is, then echo the
# resulting working directory as the cell's output.
# NOTE(review): the move is relative to the current directory, so running this
# cell a second time climbs one more level.
os.chdir(os.pardir)
os.getcwd()

'C:\\GitHub\\DeepComputationalPhenotyping'

In [None]:
import theano
from tengwar.data import make_theano_shared

In [None]:
import theano
import tengwar.nnet.NewSdA # multitask logistic regression
import tengwar.nnet # all neural network training functions 
import tengwar.eval # most of the functions to compute performance metrics

# all non-neural network baselines
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from pandas import Series, DataFrame

from tengwar.data import make_theano_shared # make theano shared_variables for input
from tengwar.eval import do_per_frame_performance, do_per_episode_combined_performance, make_firstN_indeces # performance evaluation functions

In [None]:
# Inspect (without modifying) Theano's `optdb.max_use_ratio` setting; being the
# only expression in the cell, its value is what the cell displays.
theano.config.optdb.max_use_ratio # a ratio that prevents an infinite loop in EquilibriumOptimizer

In [20]:
# Frame-level data set.
#
# Naming convention used below: a trailing `U` marks an array that keeps every
# row (labeled and unlabeled); the same name without `U` keeps only the rows
# selected by `L`.
PATH = '/home/mldata/KDD/physionet' # path to data folder
SUB_FOLDER = 'frames-60min-frame12-stride6'
LABEL_FILE = 'labels_etc.npz' # file that contains all the raw data except the features
FRAME_FILE = 'frames-imputed.npy' # file that contains raw frames (different level from FEATURE_FILE)

ndata = np.load(os.path.join(PATH, SUB_FOLDER, LABEL_FILE))

# Row selector for labeled frames (presumably a boolean mask -- TODO confirm).
L = ndata['islabeled'].ravel()

Y = ndata['Y'][L].astype(theano.config.floatX)
FU = ndata['fold10'].ravel()
F = ndata['fold10'][L].ravel()
Ep = ndata['Ep'][L].ravel()

# Build the all-rows versions first and derive the labeled subsets from them.
# The previous code masked S and V by `L` while loading and then masked the
# result by `L` again, so SU/VU were not the all-rows arrays and the second
# mask no longer matched the array length whenever some rows were unlabeled.
SU = ndata['S'].astype(theano.config.floatX)
S = SU[L]
VU = ndata['V'].astype(theano.config.floatX)
V = VU[L]

# Raw frames, flattened to one feature vector per row.
XU = np.load(os.path.join(PATH, SUB_FOLDER, FRAME_FILE)) # raw frames
XU = XU.reshape(XU.shape[0],-1).astype(theano.config.floatX)
X = XU[L]

# Split masks derived from the `fold10` assignment: folds above 2 are used for
# training, fold 2 for validation and fold 1 for testing.
trixU = FU>2
trix = F>2
vix = F==2
teix = F==1

In [None]:
# get the Laplacian prior
Ytr = Y[trix,...]
yden = np.sqrt(Ytr.sum(axis=0)[None,:])
Lap = Ytr.T.dot(Ytr) / yden.T.dot(yden)
#Lap = (Ytr[Ltr,...].T.dot(Ytr[Ltr,...]) + (1-Ytr[Ltr,...]).T.dot(1-Ytr[Ltr,...])) / Ytr[Ltr,...].shape[0]
Lap = Lap.astype(theano.config.floatX)
print Lap

In [22]:
# Episode-level arrays (names ending in `E`), read from the 60-minute
# challenge archive that sits directly under PATH. Note that this rebinds
# `ndata`, replacing the frame-level archive handle.
episode_archive = os.path.join(PATH, 'physionet_challenge-60min.npz')
ndata = np.load(episode_archive)

# Selector for labeled episodes; every array below keeps only those rows.
LE = ndata['islabeled'].ravel()

YE = ndata['Y'][LE].astype(int)
SE = ndata['S'][LE]
VE = ndata['V'][LE]
FE = ndata['fold10'][LE].ravel()
EpE = ndata['Ep'][LE].ravel()

# Same fold convention as the frame-level split masks:
# fold 1 -> test, fold 2 -> validation, folds above 2 -> training.
teixE = (FE == 1)
vixE = (FE == 2)
trixE = (FE > 2)

In [None]:
# describe all hyperparameters and make neural net object
HL = [300, 300]   # passed as hidden_layers_sizes
CL = [0.2, 0.4]   # passed as corruption_levels (same length as HL)
lambda_h_l2 = 0 #0.0001

# Explicit seed so the random stream handed to the network is the same on
# every run; the value itself is arbitrary. Previously RandomState() was
# created without a seed, so each run drew from a different stream.
# NOTE(review): whether this alone makes training fully deterministic depends
# on how NewSdA uses the generator -- confirm.
RNG_SEED = 1234

# Prefix shared by every weights file written further down; it encodes the
# layer sizes and corruption levels, e.g. 'weights-physio-hl300_300-co0.2_0.4'.
fn_prefix = 'weights-physio-hl{0}-co{1}'.format('_'.join([ str(h) for h in HL ]),
                                  '_'.join([ str(c) for c in CL ]))

# Pick up any edits made to the tengwar sources since they were imported.
reload(tengwar.nnet)
reload(tengwar.nnet.NewSdA)

numpy_rng = np.random.RandomState(RNG_SEED)
sda = tengwar.nnet.NewSdA.NewSdA(numpy_rng, hidden_layers_sizes=HL, corruption_levels=CL,
                                 n_ins=X.shape[1], n_outs=Y.shape[1], Py_emp=0.5,
                                 lambda_h_l2 = lambda_h_l2,
                                 S_matrix = Lap)


In [None]:
# do unsupervised pretraining
# The pretraining input is XU restricted to trixU, i.e. training-fold rows
# regardless of whether they carry labels.
XtrS, _ = make_theano_shared(XU[trixU])

# Weights written by this step get the '-pretrain' suffix.
fn_prefix_unsup = fn_prefix + '-pretrain'

pretrain_settings = dict(epochs=50, learn_rate=0.001, batch_size=50)
sda.do_unsupervised_pretraining(train_set_x=XtrS,
                                save_fnbase=fn_prefix_unsup,
                                **pretrain_settings)

In [42]:
# do supervised finetuning
# Labeled rows only, split by fold into train / validation / test and wrapped
# by make_theano_shared. XtrS is rebound here (it previously held the
# pretraining input).
XtrS, YtrS = make_theano_shared(X[trix], Y[trix])
XvS, YvS = make_theano_shared(X[vix], Y[vix])
XteS, YteS = make_theano_shared(X[teix], Y[teix])

# Weights written by this step get the '-finetuned' suffix.
fn_prefix_sup = fn_prefix + '-finetuned'

finetune_sets = dict(train_set_x=XtrS, train_set_y=YtrS,
                     valid_set_x=XvS, valid_set_y=YvS,
                     test_set_x=XteS, test_set_y=YteS)
sda.do_supervised_finetuning(epochs=50, batch_size=50, learn_rate=0.1,
                             use_auc=False, save_fnbase=fn_prefix_sup,
                             **finetune_sets)

In [44]:
from tengwar.nnet import FeedForwardNetwork

from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [45]:
nn = FeedForwardNetwork.from_saved_weights(fn_prefix_sup + '-best.npz')
Yh = nn.predict(X)
Yd = nn.decision_function(X)
print 'FRAC POS:', Yh.mean(axis=0)
print 'MEAN DEC:', Yd.mean(axis=0)
print ''

print 'ALL ACC', accuracy_score(Y[trix].ravel(), Yh[trix].ravel()), accuracy_score(Y[teix].ravel(), Yh[teix].ravel())
print 'MAC AUC', roc_auc_score(Y[trix], Yd[trix], average='macro'), roc_auc_score(Y[teix], Yd[teix], average='macro')
print 'MIC AUC', roc_auc_score(Y[trix], Yd[trix], average='micro'), roc_auc_score(Y[teix], Yd[teix], average='micro')
print ''

for task in range(Y.shape[1]):
    print task, 'ACC', accuracy_score(Y[trix,task], Yh[trix,task]), accuracy_score(Y[teix,task], Yh[teix,task])
    print task, 'AUC', roc_auc_score(Y[trix,task], Yd[trix,task]), roc_auc_score(Y[teix,task], Yd[teix,task])
    print ''

FRAC POS: [ 0.  0.  0.  0.]
MEAN DEC: [ 0.09141014  0.0925527   0.44607225  0.2474691 ]

ALL ACC 0.71875 0.711875
MAC AUC 0.550479124923 0.546661362309
MIC AUC 0.550479124923 0.715871287162

0 ACC 0.861528822055 0.86
0 AUC 0.723376724116 0.726155163062

1 ACC 0.858709273183 0.8575
1 AUC 0.721867554018 0.721939688872

2 ACC 0.516917293233 0.4875
2 AUC 0.36713673702 0.368950492017

3 ACC 0.637844611529 0.6425
3 AUC 0.389535484539 0.369600105287



In [46]:
print 'test combining per-frame predictions, mean dec val'

#YdC = np.vstack([ np.median(Yd[Ep==e],axis=0) for e in EpE ])
YdC = np.vstack([ Yd[Ep==e].mean(axis=0) for e in EpE ])
#YdC = np.vstack([ Yd[Ep==e].max(axis=0) for e in EpE ])
YhC = (YdC>0.5).astype(int)
#YdC = np.vstack([ Yh[Ep==e].mean(axis=0) for e in EpE ])
#YhC = (YdC>0.5).astype(int)

print 'FRAC POS:', YhC.mean(axis=0)
print 'MEAN DEC:', YdC.mean(axis=0)
print ''

print 'ALL ACC', accuracy_score(YE[trixE].ravel(), YhC[trixE].ravel()), accuracy_score(YE[teixE].ravel(), YhC[teixE].ravel())
print 'MAC AUC', roc_auc_score(YE[trixE], YdC[trixE], average='macro'), roc_auc_score(YE[teixE], YdC[teixE], average='macro')
print 'MIC AUC', roc_auc_score(YE[trixE], YdC[trixE], average='micro'), roc_auc_score(YE[teixE], YdC[teixE], average='micro')
print ''

for task in range(Y.shape[1]):
    print task, 'ACC', accuracy_score(YE[trixE,task], YhC[trixE,task]), accuracy_score(YE[teixE,task], YhC[teixE,task])
    print task, 'AUC', roc_auc_score(YE[trixE,task], YdC[trixE,task]), roc_auc_score(YE[teixE,task], YdC[teixE,task])
    print ''

test combining per-frame predictions, mean dec val
FRAC POS: [ 0.  0.  0.  0.]
MEAN DEC: [ 0.09141029  0.09255241  0.44608113  0.24746855]

ALL ACC 0.71875 0.711875
MAC AUC 0.560927140542 0.554713128332
MIC AUC 0.709480028088 0.717128279745

0 ACC 0.861528822055 0.86
0 AUC 0.749489099136 0.756333056478

1 ACC 0.858709273183 0.8575
1 AUC 0.747714956669 0.752391182037

2 ACC 0.516917293233 0.4875
2 AUC 0.362185866447 0.361375859912

3 ACC 0.637844611529 0.6425
3 AUC 0.384318639914 0.3487524149



In [47]:
H = nn.transform_features(X)

print 'fitting per-episode classifier, all feats...'
HE = np.vstack([ H[Ep==e].ravel() for e in EpE ])
#print 'fitting per-episode classifier, median feat...'
#HE = np.vstack([ np.median(H[Ep==e], axis=0) for e in EpE ])
#print 'fitting per-episode classifier, max feat...'
#HE = np.vstack([ H[Ep==e].max(axis=0) for e in EpE ])
clE = OneVsRestClassifier(LinearSVC(penalty='l1', class_weight='auto', dual=False), n_jobs=-1)
clE.fit(HE[trixE], YE[trixE])
print 'done!'

YhE = clE.predict(HE)
YdE = clE.decision_function(HE)
print 'FRAC POS:', YhE.mean(axis=0)
print 'MEAN DEC:', YdE.mean(axis=0)
print''

print 'ALL ACC', accuracy_score(YE[trixE].ravel(), YhE[trixE].ravel()), accuracy_score(YE[teixE].ravel(), YhE[teixE].ravel())
print 'MAC AUC', roc_auc_score(YE[trixE], YdE[trixE], average='macro'), roc_auc_score(YE[teixE], YdE[teixE], average='macro')
print 'MIC AUC', roc_auc_score(YE[trixE], YdE[trixE], average='micro'), roc_auc_score(YE[teixE], YdE[teixE], average='micro')
print''

for task in range(Y.shape[1]):
    print task, 'ACC', accuracy_score(YE[trixE,task], YhE[trixE,task]), accuracy_score(YE[teixE,task], YhE[teixE,task])
    print task, 'AUC', roc_auc_score(YE[trixE,task], YdE[trixE,task]), roc_auc_score(YE[teixE,task], YdE[teixE,task])
    print''

fitting per-episode classifier, all feats...
done!
FRAC POS: [ 0.32214429  0.31738477  0.4509018   0.36673347]
MEAN DEC: [-0.24593891 -0.24212055 -0.00638984 -0.1086729 ]

ALL ACC 0.782189849624 0.751875
MAC AUC 0.863258542086 0.826217042871
MIC AUC 0.859323978587 0.82362082658

0 ACC 0.762218045113 0.7275
0 AUC 0.854548745372 0.818781146179

1 ACC 0.765977443609 0.7425
1 AUC 0.854362311326 0.817298347911

2 ACC 0.768796992481 0.7375
2 AUC 0.848639311402 0.810281425891

3 ACC 0.831766917293 0.8
3 AUC 0.895483800246 0.858507251503

