## Classifying the Top 20 Leaf ICD-9 Codes

Running with the full file

In [1]:
%load_ext autoreload
%autoreload 2
# General imports
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import sys 

#keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate

# Custom functions
sys.path.append("../pipeline")
import icd9_cnn_model
import database_selection
import vectorization
import helpers

Using TensorFlow backend.


## Read Input File

In [2]:
# Load the discharge-note export. Column names are supplied here via `names`
# rather than being read from the file.
NOTE_COLUMNS = ['HADM_ID', 'SUBJECT_ID', 'DATE', 'ICD9', 'TEXT']
df = pd.read_csv('../data/disch_notes_all_icd9.csv', names=NOTE_COLUMNS)

In [3]:
# Number of ICD-9 codes kept as prediction targets (also used as the model's
# num_classes further down).
N_TOP = 20
full_df, top_codes = database_selection.filter_top_codes(
    df, 'ICD9', N_TOP, filter_empty=True
)
# Work with every filtered row. For a quick experiment, point `df` at a
# subset instead, e.g. full_df.head(1000).
df = full_df

## Vectorize Labels

In [4]:
# Turn the ICD9 column into the label array used for training, restricted to
# `top_codes`. The split output below shows one column per code (20 columns);
# presumably these are 0/1 indicators -- confirm in vectorization.py.
labels = vectorization.vectorize_icd_column(df, 'ICD9', top_codes)


## Vectorize Notes

In [5]:
# Limits for note preprocessing. Both names are re-bound below to the values
# returned by the vectorization helpers.
MAX_VOCAB = None         # to limit original number of words (None if no limit)
MAX_SEQ_LENGTH = 5000    # to limit length of word sequence (None if no limit)

# Replace the TEXT column with its cleaned version, then vectorize the notes.
df['TEXT'] = vectorization.clean_notes(df, 'TEXT')
data, dictionary, MAX_VOCAB = vectorization.vectorize_notes(
    df['TEXT'], MAX_VOCAB, verbose=True
)

# Bring every note to a common length (the split output shows 5000 columns).
data, MAX_SEQ_LENGTH = vectorization.pad_notes(data, MAX_SEQ_LENGTH)

print("Final Vocabulary: %s" % MAX_VOCAB)
print("Final Max Sequence Length: %s" % MAX_SEQ_LENGTH)

Vocabulary size: 130488
Average note length: 1728.09244863
Max note length: 10924
Final Vocabulary: 130488
Final Max Sequence Length: 5000


In [8]:
# Build the embedding matrix for the note vocabulary.
# The vectors in use were pre-trained with all MIMIC notes; the generic GloVe
# file '../data/glove.6B.100d.txt' is the alternative that was tried before.
EMBEDDING_LOC = '../data/notes.100.txt'
# Presumably must match the vector width stored in EMBEDDING_LOC (the file
# name suggests 100) -- confirm if the embedding file changes.
EMBEDDING_DIM = 100
embedding_matrix, embedding_dict = vectorization.embedding_matrix(
    EMBEDDING_LOC, dictionary, EMBEDDING_DIM, verbose=True
)


('Vocabulary in notes:', 130488)
('Vocabulary in original embedding:', 21056)
('Vocabulary intersection:', 20620)


## Split Files

In [6]:
# Partition the vectorized notes and labels into train / validation / test
# sets with a fixed random_state so the split is repeatable.
X_train, X_val, X_test, y_train, y_val, y_test = helpers.train_val_test_split(
    data,
    labels,
    val_size=0.2,
    test_size=0.1,
    random_state=101,
)

# Report the shape of each split.
for split_label, split_X, split_y in (
        ("Train: ", X_train, y_train),
        ("Validation: ", X_val, y_val),
        ("Test: ", X_test, y_test)):
    print(split_label, split_X.shape, split_y.shape)

('Train: ', (30794, 5000), (30794, 20))
('Validation: ', (8798, 5000), (8798, 20))
('Test: ', (4400, 5000), (4400, 20))


In [7]:
# Delete temporary variables to free some memory.
# `df` and `full_df` are two names for the same DataFrame (df = full_df above),
# so both must be removed before the frame can actually be released.
del df, full_df, data, labels

## CNN for text classification

Based on the following papers and links:
* "Convolutional Neural Networks for Sentence Classification"   
* "A Sensitivity Analysis of (and Practitioners' Guide to) Convolutional Neural Networks for Sentence Classification"
* http://www.wildml.com/2015/11/understanding-convolutional-neural-networks-for-nlp/
* https://github.com/alexander-rakhlin/CNN-for-Sentence-Classification-in-Keras/blob/master/sentiment_cnn.py

In [9]:
# Build the CNN classifier from the project's icd9_cnn_model module.
# No explicit reload(icd9_cnn_model) is needed: `%autoreload 2` in the first
# cell already re-imports edited modules before each execution, and the bare
# `reload` builtin does not exist on Python 3.
model = icd9_cnn_model.build_icd9_cnn_model(
    input_seq_length=MAX_SEQ_LENGTH,
    max_vocab=MAX_VOCAB,
    external_embeddings=True,         # use the pre-computed embedding_matrix
    embedding_dim=EMBEDDING_DIM,
    embedding_matrix=embedding_matrix,
    num_filters=100,
    filter_sizes=[2, 3, 4, 5],
    training_dropout_keep_prob=0.5,
    num_classes=N_TOP,                # one output per retained ICD-9 code
)

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 5000)          0                                            
____________________________________________________________________________________________________
embedding_1 (Embedding)          (None, 5000, 100)     13048900    input_1[0][0]                    
____________________________________________________________________________________________________
conv1d_1 (Conv1D)                (None, 4999, 100)     20100       embedding_1[0][0]                
____________________________________________________________________________________________________
conv1d_2 (Conv1D)                (None, 4998, 100)     30100       embedding_1[0][0]                
___________________________________________________________________________________________

In [10]:
# First training pass: 5 epochs, evaluated on the validation split after
# each epoch (verbose=2 prints one summary line per epoch).
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=50,
    epochs=5,
    verbose=2,
)

Train on 30794 samples, validate on 8798 samples
Epoch 1/5
1008s - loss: 0.4447 - acc: 0.8289 - val_loss: 0.3207 - val_acc: 0.8677
Epoch 2/5
984s - loss: 0.3245 - acc: 0.8698 - val_loss: 0.2738 - val_acc: 0.8868
Epoch 3/5
981s - loss: 0.2889 - acc: 0.8835 - val_loss: 0.2522 - val_acc: 0.8978
Epoch 4/5
980s - loss: 0.2708 - acc: 0.8915 - val_loss: 0.2422 - val_acc: 0.9047
Epoch 5/5
977s - loss: 0.2605 - acc: 0.8965 - val_loss: 0.2391 - val_acc: 0.9050


<keras.callbacks.History at 0x7f87e163d310>

In [11]:
# Checkpoint the model as trained so far (5 epochs at this point).
# NOTE(review): the relative 'models/' directory presumably has to exist
# before this runs -- confirm.
model.save('models/cnn_5_epochs_allr.h5')

In [12]:
# Score the training and validation splits with the 5-epoch model, then
# print the F1 table across decision thresholds.
pred_train, pred_dev = (
    model.predict(split_X, batch_size=200) for split_X in (X_train, X_val)
)
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.358      0.353
0.030:      0.407      0.400
0.040:      0.456      0.446
0.050:      0.502      0.488
0.055:      0.523      0.508
0.058:      0.534      0.518
0.060:      0.541      0.525
0.080:      0.602      0.582
0.100:      0.645      0.621
0.200:      0.732      0.704
0.300:      0.747      0.717
0.400:      0.738      0.707
0.500:      0.712      0.679
0.600:      0.668      0.631
0.700:      0.594      0.558


In [13]:
# Continue training the same model for 2 additional epochs (7 in total),
# again validating after every epoch.
model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    batch_size=50,
    epochs=2,
    verbose=2,
)

Train on 30794 samples, validate on 8798 samples
Epoch 1/2
834s - loss: 0.2518 - acc: 0.9010 - val_loss: 0.2371 - val_acc: 0.9076
Epoch 2/2
837s - loss: 0.2437 - acc: 0.9041 - val_loss: 0.2367 - val_acc: 0.9075


<keras.callbacks.History at 0x7f87a57c4310>

In [14]:
# Checkpoint the model after the extra 2 epochs (7 epochs in total).
# NOTE(review): the relative 'models/' directory presumably has to exist
# before this runs -- confirm.
model.save('models/cnn_7_epochs_allr.h5')

In [15]:
# Re-score the training and validation splits with the 7-epoch model and
# print the F1 table across decision thresholds.
pred_train, pred_dev = (
    model.predict(split_X, batch_size=200) for split_X in (X_train, X_val)
)
helpers.show_f1_score(y_train, pred_train, y_val, pred_dev)

F1 scores
threshold | training | dev  
0.020:      0.373      0.366
0.030:      0.424      0.412
0.040:      0.472      0.455
0.050:      0.515      0.494
0.055:      0.536      0.512
0.058:      0.548      0.522
0.060:      0.555      0.528
0.080:      0.622      0.587
0.100:      0.669      0.629
0.200:      0.759      0.713
0.300:      0.774      0.724
0.400:      0.767      0.714
0.500:      0.746      0.691
0.600:      0.708      0.651
0.700:      0.644      0.584
