## Document Representation

In [None]:
from gensim.models import Word2Vec
import pandas as pd
from gensim.models import KeyedVectors
import numpy as np
import nltk
# Load the medical transcription samples from CSV into a DataFrame.
mt = pd.read_csv("data/medicaltranscriptions.csv")

In [None]:
mt["description"][1]

### word2vec representation of the tokens using a pretrained neural network
##### Download the pretrained PubMed word2vec vectors: https://archive.org/download/pubmed2018_w2v_200D.tar/pubmed2018_w2v_200D.tar.gz

In [None]:
# Load the pretrained PubMed word2vec vectors (binary word2vec format).
# Vectors for specific words can then be accessed with a keyed lookup,
# e.g. model["allergy"].
model = KeyedVectors.load_word2vec_format("data/pubmed2018_w2v_200D/pubmed2018_w2v_200D.bin", binary=True)

In [70]:
# Length of a single word vector, fetched with a keyed lookup
# (KeyedVectors.word_vec is deprecated in gensim 4 in favour of
# get_vector / model[key]).
len(model["allergy"])

200

### Document representation based on word2vec representations

In [72]:
mt["description"][0]

' A 23-year-old white female presents with complaint of allergies.'

In [73]:
# Split the first transcript description into tokens (words and punctuation).
first_description = mt.loc[0, "description"]
mt_token = nltk.word_tokenize(first_description)

In [74]:
mt_token

['A',
 '23-year-old',
 'white',
 'female',
 'presents',
 'with',
 'complaint',
 'of',
 'allergies',
 '.']

In [76]:
# Collect the word2vec vector of every token the pretrained model knows.
# Out-of-vocabulary tokens are skipped with an explicit membership test
# instead of a bare `except` that would also hide unrelated errors.
known_vectors = [model[word] for word in mt_token if word in model]
# Build the (n_known_tokens, 200) float matrix in one step rather than
# re-copying it with np.vstack on every iteration; the reshape keeps the
# (0, 200) shape when no token is in the vocabulary.
word2vec_ls = np.array(known_vectors, dtype=float).reshape(-1, 200)

In [78]:
word2vec_ls.shape

(8, 200)

In [81]:
# Average the token vectors column-wise into a single (1, 200) document vector.
desc_vec = word2vec_ls.mean(axis=0, keepdims=True)

In [84]:
desc_vec

array([[ 0.01867389,  0.12473023,  0.10854291, -0.04492506, -0.02215727,
        -0.14056439, -0.07894486, -0.08806848,  0.16190123, -0.0493898 ,
        -0.34483707, -0.23999839,  0.14983151,  0.00619245, -0.10985174,
        -0.02837002, -0.17335408,  0.13068732,  0.18553935, -0.10612127,
        -0.02104247, -0.14069731, -0.17757733, -0.11081255, -0.30001435,
        -0.12864111, -0.15953308,  0.03164702, -0.02219148, -0.28668132,
         0.14696995,  0.09242844, -0.06853393, -0.33831785, -0.24024644,
         0.04466899, -0.10996589,  0.00901568, -0.16010646, -0.01784864,
        -0.05463978,  0.18479542, -0.09630808,  0.05056364, -0.10360411,
        -0.19180575,  0.24349   ,  0.22455727, -0.05323398,  0.03099537,
        -0.18743683, -0.20968967, -0.11088259, -0.11511871,  0.1873517 ,
         0.10901155, -0.10123763,  0.24117097, -0.03599029,  0.17218466,
        -0.28693257,  0.15717313,  0.04817914, -0.08625916,  0.00517046,
         0.24834696,  0.24700316,  0.21340912,  0.1

### Keep only the nouns from the token list

In [85]:
# POS-tag the first description and keep only the tokens tagged as nouns
# (NN, NNP, NNS, NNPS).
mt_token_pos = nltk.pos_tag(nltk.word_tokenize(mt["description"][0]))
nouns = [token for token, tag in mt_token_pos if tag in ("NN", "NNP", "NNS", "NNPS")]
mt_token_nouns = list(nouns)

In [86]:
mt_token_nouns

['female', 'presents', 'complaint', 'allergies']

### Create a vector representation (by averaging) for each of the ~5000 medical transcripts

In [87]:
# Build one 200-dimensional vector per transcript by averaging the word2vec
# vectors of its noun tokens. Transcripts with no in-vocabulary noun get an
# all-zero row.
# NOTE(review): lookups use each token's original casing -- TODO confirm
# whether lowercasing is wanted, since some transcripts end up all-zero.
NOUN_TAGS = {"NN", "NNP", "NNS", "NNPS"}
doc_vectors = []

for description in mt["description"]:
    mt_token_pos = nltk.pos_tag(nltk.word_tokenize(description))
    # Keep only nouns the pretrained model knows; the explicit membership
    # test replaces a bare `except` that would also hide unrelated errors.
    noun_vectors = [
        model[word]
        for word, pos in mt_token_pos
        if pos in NOUN_TAGS and word in model
    ]
    # reshape keeps a (0, 200) matrix when no noun is in the vocabulary.
    word2vec_ls = np.array(noun_vectors, dtype=float).reshape(-1, 200)
    if len(word2vec_ls) == 0:
        desc_vec = np.zeros((1, 200))
    else:
        desc_vec = np.mean(word2vec_ls, axis=0).reshape(1, 200)
    doc_vectors.append(desc_vec)

# Stack once at the end instead of growing the array row by row, which
# re-copies the whole matrix on every iteration.
description_vec = np.vstack(doc_vectors) if doc_vectors else np.empty((0, 200), float)

In [93]:
pd.DataFrame(description_vec)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.060857,0.003464,0.233486,-0.013175,0.019383,-0.183592,-0.058973,-0.015472,0.245538,-0.032230,...,0.071063,-0.200934,0.042387,-0.198681,-0.314611,-0.072867,0.023844,-0.100806,0.247444,-0.232486
1,0.417814,0.528424,0.156247,0.314276,-0.028756,-0.003994,-0.286784,-0.238518,-0.251542,-0.158756,...,-0.388704,0.119747,-0.123295,-0.476210,-0.097604,-0.526065,0.843057,0.243901,0.396211,-0.175522
2,0.417814,0.528424,0.156247,0.314276,-0.028756,-0.003994,-0.286784,-0.238518,-0.251542,-0.158756,...,-0.388704,0.119747,-0.123295,-0.476210,-0.097604,-0.526065,0.843057,0.243901,0.396211,-0.175522
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4994,-0.104748,-0.060461,-0.052439,0.084831,-0.274088,-0.157743,-0.047372,0.080325,0.572402,-0.068053,...,0.058013,-0.263365,0.169904,-0.279541,-0.334117,-0.225967,0.158993,-0.102792,0.106748,0.047518
4995,-0.060467,0.012673,0.034065,0.005690,-0.305405,-0.240361,-0.073215,0.128778,0.404935,0.081726,...,-0.093086,-0.309709,-0.195536,-0.262030,-0.377853,-0.129781,0.019788,0.009593,0.171874,0.007599
4996,-0.090798,-0.157026,0.135663,0.357849,-0.090305,-0.012042,-0.088013,-0.153324,0.289086,-0.089122,...,0.090011,-0.462647,0.174449,-0.257873,-0.150761,-0.038270,0.123594,-0.161759,0.263964,0.062692
4997,-0.380668,0.425155,0.201445,-0.038759,-0.270541,0.086544,0.083630,-0.508574,0.049840,-0.290579,...,0.496502,-0.402577,-0.475059,-0.208941,0.263061,-0.545444,-0.014066,-0.176127,0.609830,0.329959


## Document Classification

In [94]:
from sklearn import preprocessing
from sklearn import svm
from sklearn.metrics import confusion_matrix
# Wrap the per-document vectors in a DataFrame: one row per transcript,
# one column per embedding dimension.
Doc2VecRep=pd.DataFrame(description_vec)

In [98]:
len(np.unique(mt['medical_specialty']))

40

In [99]:
# Fit a label encoder that maps each medical-specialty name to an integer class id.
le = preprocessing.LabelEncoder()
le.fit(mt['medical_specialty'])

LabelEncoder()

In [100]:
# Integer class id for every transcript, in the same row order as `mt`.
y=le.transform(mt['medical_specialty'])

In [101]:
np.unique(y)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39])

In [105]:
len(y)

4999

In [106]:
y[10]

2

In [108]:
le.inverse_transform([30])

array([' Physical Medicine - Rehab'], dtype=object)

In [109]:
# Append the encoded specialty as column "y"; the 200 embedding columns
# remain in positions 0-199, which later cells select with iloc[:, :200].
Doc2VecRep["y"] = y

In [110]:
Doc2VecRep

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,191,192,193,194,195,196,197,198,199,y
0,0.060857,0.003464,0.233486,-0.013175,0.019383,-0.183592,-0.058973,-0.015472,0.245538,-0.032230,...,-0.200934,0.042387,-0.198681,-0.314611,-0.072867,0.023844,-0.100806,0.247444,-0.232486,0
1,0.417814,0.528424,0.156247,0.314276,-0.028756,-0.003994,-0.286784,-0.238518,-0.251542,-0.158756,...,0.119747,-0.123295,-0.476210,-0.097604,-0.526065,0.843057,0.243901,0.396211,-0.175522,2
2,0.417814,0.528424,0.156247,0.314276,-0.028756,-0.003994,-0.286784,-0.238518,-0.251542,-0.158756,...,0.119747,-0.123295,-0.476210,-0.097604,-0.526065,0.843057,0.243901,0.396211,-0.175522,2
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4994,-0.104748,-0.060461,-0.052439,0.084831,-0.274088,-0.157743,-0.047372,0.080325,0.572402,-0.068053,...,-0.263365,0.169904,-0.279541,-0.334117,-0.225967,0.158993,-0.102792,0.106748,0.047518,0
4995,-0.060467,0.012673,0.034065,0.005690,-0.305405,-0.240361,-0.073215,0.128778,0.404935,0.081726,...,-0.309709,-0.195536,-0.262030,-0.377853,-0.129781,0.019788,0.009593,0.171874,0.007599,0
4996,-0.090798,-0.157026,0.135663,0.357849,-0.090305,-0.012042,-0.088013,-0.153324,0.289086,-0.089122,...,-0.462647,0.174449,-0.257873,-0.150761,-0.038270,0.123594,-0.161759,0.263964,0.062692,0
4997,-0.380668,0.425155,0.201445,-0.038759,-0.270541,0.086544,0.083630,-0.508574,0.049840,-0.290579,...,-0.402577,-0.475059,-0.208941,0.263061,-0.545444,-0.014066,-0.176127,0.609830,0.329959,0


## In-sample prediction

In [111]:
# Fit a linear-kernel SVM on every document: the first 200 columns are the
# features and column "y" is the target. No data is held out.
clf = svm.SVC(kernel='linear')
clf.fit(Doc2VecRep.iloc[:, : 200], Doc2VecRep["y"])

SVC(kernel='linear')

In [112]:
# Predict on the same rows the classifier was fitted on (in-sample), so the
# resulting scores measure fit to the training data, not generalisation.
y_pred=clf.predict(Doc2VecRep.iloc[:, : 200])

In [113]:
y_pred[0:10]

array([ 0,  2,  2, 33, 33,  2, 38, 33,  6, 33])

In [114]:
y[0:10]

array([0, 2, 2, 3, 3, 2, 2, 3, 2, 3])

In [119]:
le.inverse_transform([3])

array([' Cardiovascular / Pulmonary'], dtype=object)

In [118]:
le.inverse_transform([33])

array([' Radiology'], dtype=object)

In [120]:
# NOTE: the predictions are passed first and the true labels second, whereas
# sklearn's signature is confusion_matrix(y_true, y_pred). As a result the
# ROWS of ConfusionM index the predicted class and the COLUMNS index the
# true class (the transpose of sklearn's usual layout).
ConfusionM=confusion_matrix(np.array(y_pred), np.array(Doc2VecRep["y"]))

In [122]:
ConfusionM.shape

(40, 40)

In [123]:
pd.DataFrame(ConfusionM)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,0,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,14,0,0,9,0,0,0,0,...,0,0,0,0,0,1,0,0,2,0
3,0,0,0,215,0,44,0,0,0,0,...,0,0,0,49,0,12,0,0,91,1
4,0,0,0,0,4,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
5,3,0,1,31,1,276,0,4,5,9,...,8,4,25,14,6,46,3,2,18,10
6,0,0,1,0,0,1,3,0,0,0,...,0,0,0,0,0,0,0,0,2,0
7,0,0,0,0,0,1,0,9,0,0,...,0,0,0,1,0,0,0,0,7,0
8,0,0,0,0,0,1,0,0,7,0,...,0,0,0,0,0,3,0,0,2,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [125]:
# Column totals. ConfusionM was built with the predictions as the first
# argument, so columns index the true class: this is the number of
# documents that truly belong to each class.
np.sum(ConfusionM,  axis=0)

array([   7,    8,   18,  372,   14,  516,   27,   27,   29,   10,  108,
         98,   75,   19,  230,  259,   90,    6,   16,    8,   23,   81,
        223,   94,  160,   51,   83,  355,   62,   70,   21,   47,   53,
        273,   10,  166,   20,    9, 1103,  158])

In [132]:
# Row totals. ConfusionM was built with the predictions as the first
# argument, so rows index the predicted class: this is the number of
# documents assigned to each class by the classifier.
np.sum(ConfusionM,  axis=1)

array([   2,    8,   30,  485,    8,  845,    7,   22,   16,    0,   17,
         71,   50,    8,  245,  144,   76,    4,    8,    0,    7,   48,
        207,    6,  155,    4,   74,  454,   56,   29,    0,   32,   42,
        250,    3,   84,   15,   12, 1327,  148])

In [133]:
np.sum(np.diag(ConfusionM))

2145

In [135]:
# Overall in-sample accuracy: correctly classified documents (the diagonal)
# divided by the total number of documents.
np.sum(np.diag(ConfusionM))/np.sum(ConfusionM)

0.4290858171634327

In [136]:
# Per-class recall: correct predictions for each class divided by the column
# total. Columns index the true class because confusion_matrix was called
# with the predictions first, so the denominator is the true class size.
np.diag(ConfusionM)/np.sum(ConfusionM, axis=0)

array([0.14285714, 1.        , 0.77777778, 0.57795699, 0.28571429,
       0.53488372, 0.11111111, 0.33333333, 0.24137931, 0.        ,
       0.07407407, 0.35714286, 0.24      , 0.15789474, 0.49130435,
       0.23552124, 0.35555556, 0.66666667, 0.1875    , 0.        ,
       0.17391304, 0.28395062, 0.41255605, 0.0212766 , 0.46875   ,
       0.03921569, 0.44578313, 0.53239437, 0.70967742, 0.15714286,
       0.        , 0.29787234, 0.43396226, 0.32600733, 0.1       ,
       0.21084337, 0.45      , 0.66666667, 0.5494107 , 0.43670886])