In [1]:
import tensorflow as tf

with tf.Session() as sess:
    devices = sess.list_devices()

  from ._conv import register_converters as _register_converters


In [2]:
devices

[_DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 268435456, 5249275645212859955)]

In [3]:
import pandas as pd
import numpy as np

from keras.preprocessing.text import Tokenizer

from keras.preprocessing.sequence import pad_sequences

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_curve, auc

from keras.models import Sequential, Model
from keras.layers import Dense, Input
from keras.layers.normalization import BatchNormalization
from keras.utils import to_categorical

import matplotlib.pyplot as plt

from keras.layers.embeddings import Embedding

from keras.layers import Flatten

Using TensorFlow backend.


##### Reading the dataset

In [4]:
data = pd.read_csv("train-1546603042473.csv", header=0)

In [5]:
test_data = pd.read_csv("test-1546603743049.csv", header = 0)

##### Checking the head and tail of the data.

In [6]:
data.head()

Unnamed: 0,categories,converse
0,QUERIES FROM PHARMACY,please to verify instructions for drugname pat...
1,NEW APPOINTMENT,lmovm for patients mother to and schd rov trac...
2,OTHERS,labtype and insurance approval other incoming ...
3,OTHERS,clinical list changes medfusion secure electro...
4,MEDICATION RELATED,wants to wean off medication work phone name d...


In [7]:
test_data.head()

Unnamed: 0,id,converse
0,1,request to speak with rn no given patients ref...
1,2,patients husband ret name spouse for other pat...
2,3,fyi in his szs mom other he has an appointment...
3,4,Rx refill drugname from pharmacy name reason f...
4,5,need more time for testing dad is requesting a...


In [8]:
data.tail()

Unnamed: 0,categories,converse
48694,OTHERS,fyi name spouse other spouse to ask what infom...
48695,REFILL,strattera name patient prescription refill str...
48696,REFILL,Rx request aricept mg rxrf medfusion secure el...
48697,NEW APPOINTMENT,patients wants to know if she can be worked in...
48698,MEDICATION RELATED,sudden aphasia and trouble walking significant...


In [9]:
test_data.tail()

Unnamed: 0,id,converse
8576,8577,duopa qs next rov please earlier if needed pat...
8577,8578,patient will have mom cb to schedule rov jcg m...
8578,8579,ha injection rathke patients patient patients ...
8579,8580,faxed to wmc neurosych faxed demographics insu...
8580,8581,patient appointment name mom for need other pa...


In [10]:
data.shape

(48699, 2)

In [11]:
test_data.shape

(8581, 2)

In [12]:
data.columns

Index(['categories', 'converse'], dtype='object')

In [13]:
test_data.columns

Index(['id', 'converse'], dtype='object')

In [14]:
data.dtypes

categories    object
converse      object
dtype: object

In [15]:
test_data.dtypes

id           int64
converse    object
dtype: object

In [16]:
data.isnull().sum()

categories     0
converse      32
dtype: int64

In [17]:
test_data.isnull().sum()

id          0
converse    4
dtype: int64

#####  Now removing the Null values from the Converse Column.

In [18]:
data['converse'].fillna(0, inplace=True)

In [19]:
test_data['converse'].fillna(0, inplace = True)

In [20]:
data.isnull().sum()

categories    0
converse      0
dtype: int64

In [21]:
test_data.isnull().sum()

id          0
converse    0
dtype: int64

In [22]:
test_unseen = test_data["converse"]
print(test_unseen.shape)

(8581,)


##### label Encoding of the categories column.

In [23]:
np.unique(data.categories, return_counts=True)[0]

array(['CANCELLATION', 'CHANGE OF HOSPITAL', 'CHANGE OF PHARMACY',
       'CHANGE OF PROVIDER', 'FOLLOW UP ON PREVIOUS REQUEST', 'JUNK',
       'LAB RESULTS', 'MEDICATION RELATED', 'NEW APPOINTMENT', 'OTHERS',
       'PRIOR AUTHORIZATION', 'PROVIDER', 'QUERIES FROM INSURANCE FIRM',
       'QUERIES FROM PHARMACY', 'QUERY ON CURRENT APPOINTMENT', 'REFILL',
       'RESCHEDULING', 'RUNNING LATE TO APPOINTMENT',
       'SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)',
       'SHARING OF LAB RECORDS (FAX, E-MAIL, ETC.)', 'SYMPTOMS'],
      dtype=object)

In [24]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

data['categories']=le.fit_transform(data['categories'])

In [25]:
np.unique(data.categories, return_counts=True)[0]

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])

#####  Spliting the dataset into Train and Test.

In [26]:
X_train, X_val, y_train, y_val = train_test_split(data['converse'], 
                                                  data['categories'], 
                                                  test_size=0.3, 
                                                  random_state=123, 
                                                  stratify = data['categories'])

##### Checking the shape of the train and test data.

In [27]:
print(X_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(34089,)
(34089,)
(14610,)
(14610,)


In [28]:
X_train.head()

12270    patients on cellcept mg she has questions conc...
3387     needs rescheduling mom appointments patients t...
2567     drugname taper prescription other prescription...
33161    clinical list changes medfusion secure electro...
23536    ret re medication patient for medication infor...
Name: converse, dtype: object

In [29]:
y_train.head()

12270    15
3387     16
2567      7
33161     9
23536     7
Name: categories, dtype: int64

##### Now performing Tokenization.


In [30]:
MAX_NB_WORDS = 20000

# get the raw text data
texts_train = X_train.astype(str)
texts_test = X_val.astype(str)

In [31]:
# finally, vectorize the text samples into a 2D integer tensor
tokenizer = Tokenizer(nb_words=MAX_NB_WORDS, char_level=False)
tokenizer.fit_on_texts(texts_train)
sequences = tokenizer.texts_to_sequences(texts_train)
sequences_test = tokenizer.texts_to_sequences(texts_test)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))



Found 29429 unique tokens.


In [32]:
word_index

{'to': 1,
 'patients': 2,
 'and': 3,
 'the': 4,
 'for': 5,
 'she': 6,
 'of': 7,
 'patient': 8,
 'timephrase': 9,
 'is': 10,
 'a': 11,
 'mg': 12,
 'drugname': 13,
 'on': 14,
 'rn': 15,
 'with': 16,
 'in': 17,
 'that': 18,
 'her': 19,
 'please': 20,
 'rx': 21,
 'rna': 22,
 'labtype': 23,
 'pm': 24,
 'by': 25,
 'follow': 26,
 'appointment': 27,
 'at': 28,
 'am': 29,
 'po': 30,
 'was': 31,
 'mom': 32,
 'tabs': 33,
 'he': 34,
 'not': 35,
 'will': 36,
 'other': 37,
 'has': 38,
 'from': 39,
 'if': 40,
 'i': 41,
 'this': 42,
 'be': 43,
 'md': 44,
 'clinical': 45,
 'doctor': 46,
 'have': 47,
 'list': 48,
 'x': 49,
 'changes': 50,
 'would': 51,
 'name': 52,
 'phone': 53,
 'message': 54,
 'it': 55,
 'authorized': 56,
 'pharmacy': 57,
 'entered': 58,
 'medication': 59,
 'prescription': 60,
 'you': 61,
 'states': 62,
 'like': 63,
 'refill': 64,
 'can': 65,
 'completed': 66,
 'advise': 67,
 'but': 68,
 'then': 69,
 'fax': 70,
 'new': 71,
 'back': 72,
 'or': 73,
 'up': 74,
 'as': 75,
 'schedule': 76,

In [33]:
sequences[0]

[2,
 14,
 3196,
 12,
 6,
 38,
 227,
 911,
 115,
 1162,
 8,
 60,
 64,
 3196,
 12,
 158,
 33,
 30,
 133,
 2580,
 40,
 175,
 10,
 129,
 79,
 18,
 13,
 869,
 129,
 11027,
 4322,
 18,
 6,
 86,
 1,
 1667,
 10,
 55,
 149,
 1,
 103,
 59,
 95,
 6,
 38,
 4,
 554,
 22,
 26,
 20,
 477,
 3,
 67,
 101,
 59,
 1,
 1667,
 172,
 317,
 112,
 22,
 26,
 149,
 1,
 103,
 16,
 1162,
 44,
 22,
 26,
 2,
 136,
 7,
 4,
 480,
 149,
 1,
 103,
 13,
 3,
 11028,
 1204,
 6,
 10,
 3342,
 2336,
 3,
 1,
 103,
 3196,
 16,
 1162,
 53,
 66,
 172,
 317,
 112]

In [34]:
type(tokenizer.word_index), len(tokenizer.word_index)

(dict, 29429)

In [35]:
index_to_word = dict((i, w) for w, i in tokenizer.word_index.items())

In [36]:
index_to_word

{1: 'to',
 2: 'patients',
 3: 'and',
 4: 'the',
 5: 'for',
 6: 'she',
 7: 'of',
 8: 'patient',
 9: 'timephrase',
 10: 'is',
 11: 'a',
 12: 'mg',
 13: 'drugname',
 14: 'on',
 15: 'rn',
 16: 'with',
 17: 'in',
 18: 'that',
 19: 'her',
 20: 'please',
 21: 'rx',
 22: 'rna',
 23: 'labtype',
 24: 'pm',
 25: 'by',
 26: 'follow',
 27: 'appointment',
 28: 'at',
 29: 'am',
 30: 'po',
 31: 'was',
 32: 'mom',
 33: 'tabs',
 34: 'he',
 35: 'not',
 36: 'will',
 37: 'other',
 38: 'has',
 39: 'from',
 40: 'if',
 41: 'i',
 42: 'this',
 43: 'be',
 44: 'md',
 45: 'clinical',
 46: 'doctor',
 47: 'have',
 48: 'list',
 49: 'x',
 50: 'changes',
 51: 'would',
 52: 'name',
 53: 'phone',
 54: 'message',
 55: 'it',
 56: 'authorized',
 57: 'pharmacy',
 58: 'entered',
 59: 'medication',
 60: 'prescription',
 61: 'you',
 62: 'states',
 63: 'like',
 64: 'refill',
 65: 'can',
 66: 'completed',
 67: 'advise',
 68: 'but',
 69: 'then',
 70: 'fax',
 71: 'new',
 72: 'back',
 73: 'or',
 74: 'up',
 75: 'as',
 76: 'schedule',

In [37]:
" ".join([index_to_word[i] for i in sequences[0]])

'patients on cellcept mg she has questions concerning medicines food patient prescription refill cellcept mg oral tabs po bid wonder if there is any medications that drugname etc any paticular foods that she needs to avoid is it ok to take medication when she has the j rna follow please review and advise about medication to avoid mary morton lpn rna follow ok to take with food md rna follow patients notified of the above ok to take drugname and eve though she is jcv positive and to take cellcept with food phone completed mary morton lpn'

In [38]:
seq_lens = [len(s) for s in sequences]
print("average length: %0.1f" % np.mean(seq_lens))
print("max length: %d" % max(seq_lens))

average length: 81.8
max length: 424


In [39]:
MAX_SEQUENCE_LENGTH = 150

# pad sequences with 0s
x_train = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
x_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', x_train.shape)
print('Shape of data test tensor:', x_test.shape)

Shape of data tensor: (34089, 150)
Shape of data test tensor: (14610, 150)


In [40]:
texts_test1 = test_unseen.astype(str)
sequences_test1 = tokenizer.texts_to_sequences(texts_test1)
test11 = pad_sequences(sequences_test1, maxlen=MAX_SEQUENCE_LENGTH)

In [41]:
MAX_SEQUENCE_LENGTH = 150

# pad sequences with 0s
x_test = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_test = pad_sequences(sequences_test, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', x_test.shape)
print('Shape of data test tensor:', y_test.shape)

Shape of data tensor: (34089, 150)
Shape of data test tensor: (14610, 150)


In [42]:
x_train

array([[   0,    0,    0, ...,  172,  317,  112],
       [   0,    0,    0, ...,   66, 1130,  662],
       [   0,    0,    0, ...,   15,    9,   24],
       ...,
       [   0,    0,    0, ...,  145,  147,   70],
       [   0,    0,    0, ...,  147,   70,  261],
       [  93,   31,   35, ...,  203,  165,    6]], dtype=int32)

In [43]:
x_test

array([[   0,    0,    0, ...,  172,  317,  112],
       [   0,    0,    0, ...,   66, 1130,  662],
       [   0,    0,    0, ...,   15,    9,   24],
       ...,
       [   0,    0,    0, ...,  145,  147,   70],
       [   0,    0,    0, ...,  147,   70,  261],
       [  93,   31,   35, ...,  203,  165,    6]], dtype=int32)

In [44]:
y_train = y_train
y_test = y_val

y_train = to_categorical(np.asarray(y_train))
print('Shape of label tensor:', y_train.shape)

Shape of label tensor: (34089, 21)


In [45]:
y_test = to_categorical(np.asarray(y_test))
print('Shape of label tensor:', y_test.shape)

Shape of label tensor: (14610, 21)


In [46]:
from keras.layers import Dense, Input, Flatten
from keras.layers import GlobalAveragePooling1D, Embedding
from keras.models import Model

EMBEDDING_DIM = 50
N_CLASSES = 21

# input: a sequence of MAX_SEQUENCE_LENGTH integers
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

embedding_layer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM,
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
embedded_sequences = embedding_layer(sequence_input)

average = GlobalAveragePooling1D()(embedded_sequences)
predictions = Dense(N_CLASSES, activation='softmax')(average)

model = Model(sequence_input, predictions)
model.compile(loss='categorical_crossentropy',
              optimizer='adam', metrics=['acc'])

In [108]:
model.fit(x_train, 
          y_train, 
          validation_split=0.1,
          nb_epoch=20, 
          batch_size=128)

Train on 30680 samples, validate on 3409 samples
Epoch 1/20
  768/30680 [..............................] - ETA: 7s - loss: 0.3743 - acc: 0.8802

  """


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1a2ca16198>

In [91]:
from sklearn.metrics import roc_auc_score

output_test = model.predict(x_train)

In [92]:
output_test

array([[3.98823295e-06, 7.44577892e-06, 7.60557770e-04, ...,
        1.10469637e-05, 7.89250862e-06, 2.02886687e-04],
       [4.10353839e-02, 6.77577918e-05, 1.26645145e-05, ...,
        4.40337717e-05, 9.53826311e-05, 6.03424909e-04],
       [2.89841613e-04, 7.06468185e-04, 1.00973819e-03, ...,
        2.25475561e-02, 3.79383634e-03, 2.15917174e-03],
       ...,
       [1.64494395e-07, 6.98598660e-06, 1.49068568e-04, ...,
        2.72667294e-05, 3.27383827e-06, 4.52154549e-04],
       [7.64834596e-09, 3.54477976e-07, 6.37733494e-04, ...,
        1.43661046e-05, 1.63371114e-05, 3.55273144e-08],
       [1.71272608e-04, 4.05202247e-02, 6.96135930e-06, ...,
        2.81428033e-03, 2.69139018e-05, 1.38468113e-05]], dtype=float32)

In [93]:
output_test1 = output_test.reshape(-1, 1) 

In [94]:
output_test1

array([[3.9882329e-06],
       [7.4457789e-06],
       [7.6055777e-04],
       ...,
       [2.8142803e-03],
       [2.6913902e-05],
       [1.3846811e-05]], dtype=float32)

In [53]:
# print("test auc:", roc_auc_score(y_train, output_test1[:,1]))

In [95]:
from sklearn.metrics import roc_auc_score

output_test = model.predict(x_test)

In [96]:
output_test

array([[3.98823295e-06, 7.44577892e-06, 7.60557770e-04, ...,
        1.10469637e-05, 7.89250862e-06, 2.02886687e-04],
       [4.10353839e-02, 6.77577918e-05, 1.26645145e-05, ...,
        4.40337717e-05, 9.53826311e-05, 6.03424909e-04],
       [2.89841613e-04, 7.06468185e-04, 1.00973819e-03, ...,
        2.25475561e-02, 3.79383634e-03, 2.15917174e-03],
       ...,
       [1.64494395e-07, 6.98598660e-06, 1.49068568e-04, ...,
        2.72667294e-05, 3.27383827e-06, 4.52154549e-04],
       [7.64834596e-09, 3.54477976e-07, 6.37733494e-04, ...,
        1.43661046e-05, 1.63371114e-05, 3.55273144e-08],
       [1.71272608e-04, 4.05202247e-02, 6.96135930e-06, ...,
        2.81428033e-03, 2.69139018e-05, 1.38468113e-05]], dtype=float32)

#####  -------------Prediction on test data

In [97]:
from sklearn.metrics import roc_auc_score

output_test = model.predict(test11)

In [98]:
output_test

array([[3.5495541e-05, 4.9273908e-04, 2.8189519e-04, ..., 3.1755488e-05,
        2.3863804e-05, 3.9422889e-03],
       [5.0318050e-03, 1.0764631e-03, 2.2605753e-04, ..., 1.0797062e-02,
        1.1884748e-03, 5.8329175e-03],
       [1.3462568e-05, 3.0615265e-05, 2.6257280e-06, ..., 8.5764514e-05,
        2.6443415e-06, 1.3621393e-01],
       ...,
       [1.5702572e-06, 5.0715457e-06, 2.2321876e-06, ..., 3.0602796e-06,
        1.1080617e-06, 4.4833493e-04],
       [2.0011006e-08, 1.8471803e-06, 2.2262162e-07, ..., 9.9755269e-01,
        2.3363035e-03, 2.8638854e-09],
       [1.1046480e-03, 1.2688494e-02, 1.2546685e-04, ..., 6.9742578e-01,
        7.5171195e-02, 1.9230335e-03]], dtype=float32)

In [99]:
test1_pred =[]
for i in output_test:
    test1_pred.append(np.argmax(i))

print(test1_pred)

[7, 8, 7, 13, 7, 7, 7, 20, 9, 7, 8, 8, 6, 6, 9, 15, 15, 11, 6, 8, 7, 7, 8, 3, 18, 11, 15, 18, 9, 15, 9, 20, 9, 18, 9, 7, 10, 8, 15, 16, 7, 6, 18, 7, 15, 7, 8, 11, 9, 9, 4, 7, 9, 15, 15, 7, 9, 8, 9, 7, 8, 15, 9, 3, 15, 18, 8, 7, 7, 18, 7, 16, 15, 16, 7, 7, 8, 8, 6, 8, 6, 12, 7, 7, 9, 9, 7, 6, 15, 11, 7, 18, 17, 7, 15, 8, 20, 3, 8, 7, 9, 15, 8, 15, 15, 15, 18, 15, 10, 8, 9, 8, 7, 7, 3, 15, 7, 7, 8, 15, 8, 9, 7, 8, 7, 8, 6, 8, 9, 15, 9, 7, 8, 3, 15, 9, 9, 17, 3, 10, 8, 18, 7, 7, 13, 15, 7, 15, 17, 7, 9, 8, 15, 15, 7, 18, 15, 15, 15, 9, 19, 15, 8, 3, 3, 18, 15, 9, 8, 18, 8, 9, 18, 3, 11, 11, 10, 8, 9, 8, 14, 8, 15, 15, 9, 9, 8, 7, 7, 15, 9, 8, 9, 7, 7, 7, 7, 8, 9, 9, 8, 15, 8, 8, 7, 6, 15, 6, 7, 8, 15, 6, 9, 8, 18, 15, 8, 15, 8, 6, 13, 8, 15, 9, 7, 9, 11, 9, 7, 7, 11, 9, 8, 11, 15, 7, 7, 8, 15, 15, 9, 15, 4, 19, 6, 15, 9, 7, 13, 9, 15, 15, 9, 8, 7, 9, 7, 7, 7, 18, 8, 9, 7, 7, 3, 17, 16, 15, 15, 7, 18, 9, 7, 10, 13, 18, 7, 6, 13, 8, 8, 9, 9, 18, 11, 9, 15, 7, 15, 6, 7, 15, 11, 15, 7, 9, 13,

In [100]:
output1 =le.inverse_transform(test1_pred)
# d_3 = {'id': test_data['id'], 'categories': output_test}
# df_3 = pd.DataFrame(data=d_3)
# df_3.head()
# #df = pd.DataFrame(op,columns=['categories'])
# out_csv = 'CUTe_04/predict1.csv'
# df_3.to_csv(out_csv, index=False, header=True)
# print("Predictions saved to disk: {0}".format(out_csv))

  if diff:


In [101]:
output1

array(['MEDICATION RELATED', 'NEW APPOINTMENT', 'MEDICATION RELATED', ...,
       'MEDICATION RELATED',
       'SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)',
       'SHARING OF HEALTH RECORDS (FAX, E-MAIL, ETC.)'], dtype=object)

In [102]:
output1 = pd.DataFrame(output1)

In [103]:
test_index = test_data["id"]

In [104]:
test_index.head()

0    1
1    2
2    3
3    4
4    5
Name: id, dtype: int64

In [105]:
final_submission = pd.concat([test_index, output1], axis=1)

final_submission.to_csv("final_submission_2275_4.csv")

In [106]:
type(final_submission)

pandas.core.frame.DataFrame

In [107]:
final_submission.head()

Unnamed: 0,id,0
0,1,MEDICATION RELATED
1,2,NEW APPOINTMENT
2,3,MEDICATION RELATED
3,4,QUERIES FROM PHARMACY
4,5,MEDICATION RELATED
