--------
Trains the neural network on the medical1 dataset. The dataset is located in `/home/mldata/KDD/medical1`.
This code has been used in most of the Python scripts that train the neural nets.

In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os
# Switch to the parent of the current working directory and show the result.
# NOTE: this is not idempotent -- re-running the cell moves up one more level.
os.chdir('..')
os.getcwd()

'C:\\Users\\PeterChe1990\\Desktop\\Kdd2015DeepPhenotyping'

In [3]:
import theano
import tengwar.nnet.NewSdA # multitask logistic regression
import tengwar.nnet # all neural network training functions 
import tengwar.eval # most of the functions to compute performance metrics

# all non-neural network baselines
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier

from pandas import Series, DataFrame

from tengwar.data import make_theano_shared # make theano shared_variables for input
from tengwar.eval import do_per_frame_performance, do_per_episode_combined_performance, make_firstN_indeces # performance evaluation functions

In [4]:
# Display the current value of Theano's optdb.max_use_ratio setting.
theano.config.optdb.max_use_ratio # A ratio that prevents infinite loop in EquilibriumOptimizer

5.0

In [None]:
# ---------------------------------------------------------------------------
# Experiment configuration
# ---------------------------------------------------------------------------
PATH = '/home/mldata/KDD/medical1'  # root folder of the dataset
SPLIT = 4                           # index of the data split to use
FIRSTN = 3                          # number of windows for the FirstN experiments
                                    # (3 = first 24 hours)

# Similarity matrix used by the Laplacian regularizer. Pick one of:
#   None    -> no similarity prior ('No prior' in the KDD15 paper)
#   'Sinf'  -> infectious diseases
#   'Sont'  -> ontology tree ('ICD-9 Tree' prior in the KDD15 paper)
#   'Sprob' -> pairwise joint probability ('Co-occurrence' prior in the KDD15 paper)
#   'Scos'  -> pairwise cosine similarity
S_MAT = None

# Kind of Laplacian regularizer. Pick one of:
#   None -> no regularizer
#   'l2' -> traditional Laplacian regularizer
#   'l1' -> shared sparsity regularizer
S_TYPE = None

# Strength of the Laplacian regularizer. When a similarity matrix is selected
# and this is still None, it falls back to 0.0001 further down.
LAMBDA_S = None

# ---------------------------------------------------------------------------
# Network architecture
# ---------------------------------------------------------------------------
SHALLOW = False  # True -> shallow model without hidden layers
HL = []          # hidden layer sizes
CL = []          # corruption level of the denoising AE for each hidden layer
if not (SHALLOW or HL or CL):
    # Nothing was specified for a deep model: use the default architecture.
    HL, CL = [500, 100, 100], [0.3, 0.3, 0.3]
assert len(HL) == len(CL)

LAMBDA_O_L2 = 0.0001   # plain L2 regularization of outputs
LAMBDA_O_L1 = 0.0001   # plain L1 regularization of outputs
LAMBDA_H_L2 = 0.00001  # L2 regularization of hidden unit weights

USE_CLASS_WEIGHTS = False  # re-weight classes to compensate for class imbalance

# ---------------------------------------------------------------------------
# File naming
# ---------------------------------------------------------------------------
# Prefix under which weights and performance details are saved.
if SHALLOW:
    fn_prefix = 'weights-medical1-split{0:02d}-shallow'.format(SPLIT)
else:
    fn_prefix = 'weights-medical1-split{2:02d}-hl{0}-co{1}'.format(
        '_'.join(str(h) for h in HL),
        '_'.join(str(c) for c in CL),
        SPLIT)

# Suffix that encodes the regularizer configuration.
# NOTE(review): no later cell visible in this notebook references fn_suffix --
# confirm whether it is meant to be appended to the saved file names.
fn_suffix = ''
if not SHALLOW:
    if S_MAT is None:
        fn_suffix += '-Snone'
    else:
        if LAMBDA_S is None:
            LAMBDA_S = 0.0001
        fn_suffix += '-' + S_MAT + '_' + '{0}'.format(LAMBDA_S)
    if LAMBDA_O_L2 is not None:
        fn_suffix += '-Ol2{0}'.format(LAMBDA_O_L2)
    if LAMBDA_O_L1 is not None:
        fn_suffix += '-Ol1{0}'.format(LAMBDA_O_L1)
    if LAMBDA_H_L2 is not None:
        fn_suffix += '-Hl2{0}'.format(LAMBDA_H_L2)

# Random number generator handed to the network. It is left unseeded, so runs
# are not reproducible; the commented line gives a fixed-seed alternative.
#numpy_rng = np.random.RandomState(89677)
numpy_rng = np.random.RandomState()

In [None]:
# variables for per-frame data
# Per-frame data: file locations
SUB_FOLDER = 'frames-60min-frame12-stride6'
LABEL_FILE = 'labels_etc.npz' # file that contains all the raw data except the features

# In the two following files, each variable is sampled uniformly.
# Missing data has been handled using some forward-backward pass with the
# previous values or the empirical mean.
FEATURE_FILE = 'features-imputed.npy' # file that contains raw features
FRAME_FILE = 'frames-imputed.npy' # file that contains raw frames (different level from FEATURE_FILE)

ndata = np.load(os.path.join(PATH, SUB_FOLDER, LABEL_FILE))
Y = ndata['Y'].astype(int) # labels (outcomes)
split = ndata['split'] # a number of splits which are used for training, validation and testing.
Ep = ndata['Ep'].ravel() # id of the episode that this frame belongs to
X = np.load(os.path.join(PATH, SUB_FOLDER, FEATURE_FILE)) # raw features
#X = np.load(os.path.join(PATH, SUB_FOLDER, FRAME_FILE)) # raw frames
# Flatten each sample into a single feature vector and cast to Theano's float type.
X = X.reshape(X.shape[0],-1).astype(theano.config.floatX)

# Boolean masks for the training, validation and test sets.
# Column SPLIT of `split` is compared against 0 (train), 1 (validation), 2 (test).
trix = split[:,SPLIT]==0
vix = split[:,SPLIT]==1
teix = split[:,SPLIT]==2

# Set class weights to be used in the objective function.
# The empirical label frequency is taken from the per-frame training labels:
# `trix` indexes rows of Y. (The per-episode labels YE are only loaded in a
# later cell and are indexed with the per-episode mask trixE instead.)
if USE_CLASS_WEIGHTS:
    PY_EMP = Y[trix].mean(axis=0)
else:
    PY_EMP = 0.5

In [None]:
# Per-episode data. VAR_E is simiar to VAR in per-frame data.

# Load the per-episode archive. Names ending in E mirror the per-frame
# variables of the same name.
ndata = np.load(os.path.join(PATH, 'medical1-60min.npz'))

# Similarity matrix for the Laplacian regularizer; None when no prior is selected.
if S_MAT is None:
    S_MATRIX = None
else:
    S_MATRIX = ndata[S_MAT]

YE = ndata['Y'].astype(int)   # per-episode labels
splitE = ndata['split']       # per-episode split assignments
EpE = ndata['Ep'].ravel()     # id of this episode

# The two label lists are kept separately and also joined into one array.
ydlist = ndata['ydlist']
yclist = ndata['yclist']
ylist = np.hstack([ydlist, yclist])

# Boolean masks over episodes: 0 = train, 1 = validation, 2 = test.
trixE = splitE[:, SPLIT] == 0
vixE = splitE[:, SPLIT] == 1
teixE = splitE[:, SPLIT] == 2

# Empirical label frequency on the training episodes, or 0.5 when class
# re-weighting is switched off.
PY_EMP_E = YE[trixE].mean(axis=0) if USE_CLASS_WEIGHTS else 0.5

In [None]:
# create shared_variables of X and Y
print('Creating shared variables to store data')
# Wrap the train / validation / test subsets of X and Y in Theano shared variables.
XtrS, YtrS = make_theano_shared(X[trix], Y[trix])
XvS, YvS = make_theano_shared(X[vix], Y[vix])
XteS, YteS = make_theano_shared(X[teix], Y[teix])

In [None]:
# make neural net object

# Reload the network modules so that edits made on disk are picked up.
reload(tengwar.nnet)
reload(tengwar.nnet.NewSdA)

# Build the network; every hyperparameter comes from the configuration cell.
sda = tengwar.nnet.NewSdA.NewSdA(
    numpy_rng,
    n_ins=X.shape[1],            # width of the flattened feature matrix
    n_outs=Y.shape[1],           # number of label columns
    hidden_layers_sizes=HL,
    corruption_levels=CL,
    Py_emp=PY_EMP,
    S_matrix=S_MATRIX,
    S_type=S_TYPE,
    lambda_S=LAMBDA_S,
    lambda_O_l2=LAMBDA_O_L2,
    lambda_O_l1=LAMBDA_O_L1,
    lambda_H_l2=LAMBDA_H_L2,
)


In [None]:
# do unsupervised pretraining
# Unsupervised pretraining. It is skipped for shallow models, and replaced by
# loading saved parameters when a pretrain weight file already exists.
fn_prefix_unsup = fn_prefix + '-pretrain'
if SHALLOW:
    print('Shallow model so no pretraining')
elif os.path.isfile(fn_prefix_unsup + '.npz'):
    print('Loading pretrain weights from file ' + fn_prefix_unsup + '.npz')
    sda.load_pretrained_params(fn_prefix_unsup + '.npz')
else:
    print('Running pretraining')
    sda.do_unsupervised_pretraining(
        train_set_x=XtrS,
        epochs=100,
        learn_rate=0.01,
        batch_size=200,
        save_fnbase=fn_prefix_unsup,
    )

In [None]:
# do supervised finetuning
# Supervised finetuning on the training set; the validation and test sets are
# handed to the trainer as well. Output files are based on fn_prefix_sup.
fn_prefix_sup = fn_prefix + '-finetuned'
sda.do_supervised_finetuning(
    train_set_x=XtrS, train_set_y=YtrS,
    valid_set_x=XvS, valid_set_y=YvS,
    test_set_x=XteS, test_set_y=YteS,
    epochs=1000,
    batch_size=200,
    learn_rate=0.1,
    use_auc=False,
    save_fnbase=fn_prefix_sup,
)

------
The cells below compute performance metrics and save them to their respective files. The file names are created using the prefixes and suffixes computed above. The file names encode the configuration of the nnet.

In [None]:
# Reload the evaluation module. Functions are called through `tengwar.eval.`
# below so that the reloaded versions are the ones used.
reload(tengwar.eval)

# Performance files share the weight-file prefix, with 'weights' swapped out.
fn_prefix_perf = fn_prefix.replace('weights', 'performance')

print('Compute per-frame performance:')
# Rebuild a feed-forward network from the '-best' weights of the finetuning run.
fn_best = fn_prefix_sup + '-best.npz'
nn = tengwar.nnet.FeedForwardNetwork.from_saved_weights(fn_best)
Pf = tengwar.eval.do_per_frame_performance(
    nn, X, Y, trix, teix, ydlist, yclist)
Pf.to_csv(fn_prefix_perf + '-frame.csv')
Pf

In [None]:
print('Compute per-episode performance by combination:')
# Per-frame inputs (Ep, X) are paired with the per-episode ids, labels and masks.
Pc = tengwar.eval.do_per_episode_combined_performance(
    nn, Ep, EpE, X, YE, trixE, teixE, ydlist, yclist)
Pc.to_csv(fn_prefix_perf + '-episode.csv')
Pc

In [None]:
print('Compute per-frame first N performance:')
# Select the first-N frames and restrict every per-frame array to them.
# NOTE(review): FIRSTN is only used in the output file name; it is not passed
# to make_firstN_indeces -- confirm that the helper's default matches FIRSTN.
IxN = tengwar.eval.make_firstN_indeces(Ep)
XN, YN, EpN = X[IxN], Y[IxN], Ep[IxN]
trixN, teixN = trix[IxN], teix[IxN]

PfN = tengwar.eval.do_per_frame_performance(
    nn, XN, YN, trixN, teixN, ydlist, yclist=yclist)
PfN.to_csv(fn_prefix_perf + '-frame-first{0}.csv'.format(FIRSTN))
PfN

In [None]:
print('Compute per-episode first N performance by combination:')
# Same per-episode evaluation as before, fed with the first-N frame subset.
PcN = tengwar.eval.do_per_episode_combined_performance(
    nn, EpN, EpE, XN, YE, trixE, teixE, ydlist, yclist)
PcN.to_csv(fn_prefix_perf + '-episode-first{0}.csv'.format(FIRSTN))
PcN

In [None]:
# Classifier using the features learnt by the nnet.

print('fitting per-episode classifier, all feats...')
# Transform the first-N frames with the network, then build one row per
# episode by flattening all transformed frames that belong to that episode.
HN = nn.transform_features(XN)
HE = np.vstack([HN[EpN == ep_id].ravel() for ep_id in EpE])


print('One vs. Rest sklearn classifier')
fn_prefix_cl = (fn_prefix_perf + '-episode-onevsrest').replace('weights', 'performance')
# Alternative base estimator:
# cl = OneVsRestClassifier(LinearSVC(penalty='l2', C=100.0, class_weight=None, dual=False), n_jobs=-1)
cl = OneVsRestClassifier(
    LogisticRegression(penalty='l2', C=1, class_weight=None),
    n_jobs=-1)
cl.fit(HE[trixE], YE[trixE])
print('done!')

# Evaluate the fitted classifier on the per-episode features and save the table.
Pe = tengwar.eval.do_per_frame_performance(
    cl, HE, YE, trixE, teixE, ydlist, yclist)
Pe.to_csv(fn_prefix_cl + '.csv')
Pe