###  Solving Patient database using NLP Unsupervised Learning

#### We are going to use LDA ( *Latent Dirichlet Allocation* ) - Classify text in a document to a particular topic

### Load Dataset

In [1]:
import pandas as pd
data = pd.read_csv("Patient_Details.csv")
data["index"] = data.index

In [2]:
document = data
print(len(document))

499


In [4]:
document.head()

Unnamed: 0,TEXT,index
0,Right side of epiglottis swelled up and hinder...,0
1,Approximately 30 min post vaccination administ...,1
2,"About 15 minutes after receiving the vaccine, ...",2
3,"extreme fatigue, dizziness,. could not lift my...",3
4,"Injection site swelling, redness, warm to the ...",4


### Data Preprocessing
1.  Tokenization, sentences to word , Lower_case, remove_punchutation
2.  Words that are fewer that character are removed
3.  All stopwords are removed
4.  Words are lemmatized

In [7]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(400)
import nltk

In [9]:
## Lemmatizer Example:
print(WordNetLemmatizer().lemmatize("went",pos="v"))

go


In [10]:
## Stemmer Example:
stemmer = SnowballStemmer("english")
original_word = ["cares","files","dies","defined","died"]
singles=[stemmer.stem(plural) for plural in original_word]
pd.DataFrame(data={"orginal_data":original_word,"Stemmed_word":singles})

Unnamed: 0,orginal_data,Stemmed_word
0,cares,care
1,files,file
2,dies,die
3,defined,defin
4,died,die


### Creating a Function to do Pre-Processing step on Entire Dataset

In [25]:
def lemmatizing_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text,pos="v"))

# Tokenize and Lemmatize

def preprocessing(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            #apply lemmatizing_stemming() on the token, ten add to that result list
            result.append(lemmatizing_stemming(token))
    return result
 

### Preview the document after Preprocessing

In [26]:
document_number = 100
doc_sample = document[document["index"]== document_number].values[0][0]
print("Original document: ")
words=[]
for i in doc_sample.split(" "):
    words.append(i)
print(words)
print("\n \n Tokenized and Lemmatized document : ")
preprocessing(doc_sample)

Original document: 
['Moderna', 'COVID?19', 'Vaccine', 'EUA', '', 'Arm', 'severe', 'pain,', 'redness,', 'hard', 'hives', 'that', 'keep', 'worsening.', '', 'Fever', 'Chills', 'Body', 'aches', 'Headache', 'Nausea', 'and', 'vomiting', 'Tachycardia', 'with', 'hypertension', 'Dizzy', 'Sweats', '', 'I', 'felt', 'worst', 'after', 'vaxx', 'than', 'I', 'ever', 'did', 'with', 'COVID.']

 
 Tokenized and Lemmatized document : 


['moderna',
 'covid',
 'vaccin',
 'sever',
 'pain',
 'red',
 'hard',
 'hive',
 'worsen',
 'fever',
 'chill',
 'bodi',
 'ach',
 'headach',
 'nausea',
 'vomit',
 'tachycardia',
 'hypertens',
 'dizzi',
 'sweat',
 'felt',
 'worst',
 'vaxx',
 'covid']

In [27]:
document

Unnamed: 0,TEXT,index
0,Right side of epiglottis swelled up and hinder...,0
1,Approximately 30 min post vaccination administ...,1
2,"About 15 minutes after receiving the vaccine, ...",2
3,"extreme fatigue, dizziness,. could not lift my...",3
4,"Injection site swelling, redness, warm to the ...",4
...,...,...
494,"Tachycardia with a heart rate in the 120-140s,...",494
495,"Strong chills, with uncontrollable and vigorou...",495
496,Fever chills severe myalgia headache,496
497,"Middle of the night woke up shivering, chills,...",497


###  Now Preprocess all the Text we have, to do that lets use map function

In [29]:
preprocess_doc = document["TEXT"]. map(preprocessing)
preprocess_doc[:10]

0    [right, epiglotti, swell, hinder, swallow, pic...
1    [approxim, post, vaccin, administr, patient, d...
2    [minut, receiv, vaccin, patient, complain, lea...
3            [extrem, fatigu, dizzi, lift, leav, hour]
4       [inject, site, swell, red, warm, touch, itchi]
5      [patient, call, state, throat, swell, benadryl]
6    [sever, chill, approxim, hour, receiv, vaccin,...
7                           [nasal, congest, diarrhea]
8    [follow, vaccin, notic, rais, itchi, patch, va...
9             [hive, rash, bodi, go, away, day, begin]
Name: TEXT, dtype: object

### Bag of words on the dataset

#### Create a dictionary from "preprocess_doc" containing the number of times a word appears in the training set. To do that lets pass preprocess_docs to **gensim.corpora.Dictionary()**

In [80]:
dictionary = gensim.corpora.Dictionary(preprocess_doc)  ## Special purpose Dictionary for NLP task
tokenized_reviews = preprocess_doc
doc_term_matrix = [dictionary.doc2bow(rev) for rev in tokenized_reviews]

#### Checking dictionary created

In [32]:
count = 0
for k,v in dictionary.iteritems():
    print(k,v)
    count+=1
    if count > 10:
        break

0 benadryl
1 epiglotti
2 hinder
3 pictur
4 right
5 swallow
6 swell
7 take
8 tylenol
9 administ
10 administr


### Fliter out the words that are extreme and Not comman
1. no_below = 5
2. no_above = 0.5
3. keep_n = 100000

In [33]:
dictionary.filter_extremes(no_below=5,no_above=0.5,keep_n=100000)

### Converts list of words into bag of words

In [35]:
bow_corpus = [dictionary.doc2bow(doc) for doc in preprocess_doc]
bow_corpus[document_number]

[(25, 1),
 (43, 1),
 (49, 1),
 (54, 1),
 (56, 1),
 (59, 1),
 (61, 1),
 (74, 1),
 (77, 1),
 (79, 1),
 (82, 1),
 (84, 1),
 (91, 1),
 (119, 2),
 (144, 1),
 (151, 1),
 (165, 1),
 (177, 1),
 (183, 1),
 (233, 1)]

### Preview Bag of Words for our Sample preprocessed document

In [38]:
bow_doc_100 = bow_corpus[document_number]
for i in range(len(bow_doc_100)):
    print("word{} (\"{}\") appears {} times".format(bow_doc_100[i][0],dictionary[bow_doc_100[i][0]],bow_doc_100[i][1]))

word25 ("vaccin") appears 1 times
word43 ("dizzi") appears 1 times
word49 ("red") appears 1 times
word54 ("chill") appears 1 times
word56 ("fever") appears 1 times
word59 ("pain") appears 1 times
word61 ("sever") appears 1 times
word74 ("bodi") appears 1 times
word77 ("hive") appears 1 times
word79 ("ach") appears 1 times
word82 ("felt") appears 1 times
word84 ("headach") appears 1 times
word91 ("nausea") appears 1 times
word119 ("covid") appears 2 times
word144 ("tachycardia") appears 1 times
word151 ("hard") appears 1 times
word165 ("worsen") appears 1 times
word177 ("vomit") appears 1 times
word183 ("sweat") appears 1 times
word233 ("moderna") appears 1 times


###  TF - IDF ( Term Frequency, Inverse Document Frequency) on the dataset. Giving weights to word on how many times it occurs in document

### Create tf-idf model using model.TfidfModel on "bow_corpus"

In [46]:
from gensim import corpora,models
tfidf = models.TfidfModel(bow_corpus)
print(tfidf)

TfidfModel(num_docs=499, num_nnz=6619)


### Apply transformation to entire Corpus

In [48]:
corpus_tfidf= tfidf[bow_corpus]
print(corpus_tfidf[1])

[(6, 0.17430971975028317), (7, 0.18076565842755962), (8, 0.29002599050411054), (9, 0.1685726803737903), (10, 0.22802810834630363), (11, 0.12765769340388383), (12, 0.20711414108771944), (13, 0.10130921074372404), (14, 0.1485716606624681), (15, 0.06898379841108326), (16, 0.22802810834630363), (17, 0.34272295582443013), (18, 0.4874242306538179), (19, 0.28336817215674964), (20, 0.15239416393643548), (21, 0.18432432383797243), (22, 0.17430971975028317), (23, 0.1385570565747788), (24, 0.17430971975028317), (25, 0.056460309399546084), (26, 0.22007698701347678)]


### Preview TF-IDF score for the first document

In [49]:
from pprint import pprint
for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.320698703854761),
 (1, 0.33015986313200013),
 (2, 0.5581792126555412),
 (3, 0.2142732968037797),
 (4, 0.5542319254783102),
 (5, 0.35142097225296565)]


### Run LDA using Bag of Words

# Create a model
### Hyper_Parameters
1. num_topics
2. id2words
3. workers
4. passes
5. alpha and eta

#### Train your LDA model using gensim.model.LdaMulticore and save it to "lda_model"

In [63]:
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                      num_topics= 10, 
                                      id2word = dictionary,
                                      passes=2,
                                      workers=2
                                     )

### For each topic explore the words occuring in that topic and its relative weights

In [53]:
for idx,topic in lda_model.print_topics(-1):
    print("Topic: {} \n words: {}".format(idx,topic))
    print("\n")

Topic: 0 
 words: 0.064*"patient" + 0.039*"vaccin" + 0.034*"leav" + 0.024*"covid" + 0.018*"report" + 0.016*"note" + 0.015*"swell" + 0.014*"minut" + 0.013*"headach" + 0.013*"pain"


Topic: 1 
 words: 0.056*"pain" + 0.037*"inject" + 0.034*"site" + 0.032*"headach" + 0.031*"sore" + 0.025*"vaccin" + 0.024*"chill" + 0.022*"fatigu" + 0.022*"muscl" + 0.022*"leav"


Topic: 2 
 words: 0.057*"headach" + 0.037*"hour" + 0.036*"chill" + 0.035*"fever" + 0.032*"fatigu" + 0.032*"ach" + 0.028*"nausea" + 0.026*"day" + 0.025*"bodi" + 0.022*"vaccin"


Topic: 3 
 words: 0.048*"inject" + 0.035*"site" + 0.021*"start" + 0.020*"benadryl" + 0.020*"sore" + 0.020*"itch" + 0.019*"day" + 0.018*"feel" + 0.018*"take" + 0.018*"hour"


Topic: 4 
 words: 0.028*"patient" + 0.026*"headach" + 0.023*"chill" + 0.023*"leav" + 0.023*"feel" + 0.022*"vaccin" + 0.021*"nausea" + 0.019*"felt" + 0.019*"pain" + 0.016*"swell"


Topic: 5 
 words: 0.053*"vaccin" + 0.032*"hour" + 0.029*"symptom" + 0.023*"chill" + 0.021*"receiv" + 0.018*"p

### Running LDA using TF-IDF

In [56]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics = 10, id2word = dictionary, passes = 2, workers = 4)
for idx,topic in lda_model_tfidf.print_topics(-1):
    print("Topic: {} \n words: {}".format(idx,topic))
    print("\n")

Topic: 0 
 words: 0.023*"inject" + 0.021*"sore" + 0.019*"leav" + 0.019*"site" + 0.017*"swell" + 0.016*"itch" + 0.016*"pain" + 0.016*"day" + 0.014*"headach" + 0.013*"muscl"


Topic: 1 
 words: 0.031*"dizzi" + 0.028*"fever" + 0.024*"chill" + 0.023*"myalgia" + 0.020*"nausea" + 0.020*"headach" + 0.018*"vomit" + 0.016*"hive" + 0.016*"sever" + 0.011*"ach"


Topic: 2 
 words: 0.018*"leav" + 0.016*"hour" + 0.015*"fever" + 0.014*"felt" + 0.013*"ach" + 0.013*"feel" + 0.011*"symptom" + 0.011*"vaccin" + 0.011*"heavi" + 0.011*"resolv"


Topic: 3 
 words: 0.022*"patient" + 0.013*"temp" + 0.013*"vomit" + 0.013*"report" + 0.012*"nausea" + 0.012*"advis" + 0.012*"give" + 0.012*"take" + 0.011*"start" + 0.011*"loss"


Topic: 4 
 words: 0.017*"slight" + 0.017*"pain" + 0.017*"headach" + 0.013*"lip" + 0.013*"area" + 0.013*"symptom" + 0.012*"fatigu" + 0.012*"shoot" + 0.012*"swell" + 0.011*"hour"


Topic: 5 
 words: 0.021*"vaccin" + 0.018*"chest" + 0.016*"patient" + 0.013*"covid" + 0.012*"increas" + 0.011*"sor

###  Perform Evalution by Classifying the sample document using LDA Bag of Words model

In [59]:
preprocess_doc[100]

['moderna',
 'covid',
 'vaccin',
 'sever',
 'pain',
 'red',
 'hard',
 'hive',
 'worsen',
 'fever',
 'chill',
 'bodi',
 'ach',
 'headach',
 'nausea',
 'vomit',
 'tachycardia',
 'hypertens',
 'dizzi',
 'sweat',
 'felt',
 'worst',
 'vaxx',
 'covid']

### Check which topic our test document belongs to using LDA Bag of Word Model

In [66]:
for index,score in sorted(lda_model[bow_corpus[document_number]],key=lambda tup:-1 * tup[1]):
                            print("\n Score : {} \t \n Topic: {}".format(score,lda_model.print_topic(index,10)))


 Score : 0.6654343605041504 	 
 Topic: 0.073*"ach" + 0.057*"bodi" + 0.045*"nausea" + 0.043*"headach" + 0.038*"chill" + 0.032*"fever" + 0.030*"sore" + 0.024*"fatigu" + 0.019*"start" + 0.018*"pain"

 Score : 0.2981916069984436 	 
 Topic: 0.037*"vaccin" + 0.035*"inject" + 0.030*"pain" + 0.025*"headach" + 0.024*"felt" + 0.024*"site" + 0.023*"hour" + 0.022*"symptom" + 0.019*"go" + 0.019*"swell"


### Check using LDA TF-IDF model

In [67]:
for index,score in sorted(lda_model_tfidf[bow_corpus[document_number]],key=lambda tup:-1 * tup[1]):
                            print("\n Score : {} \t \n Topic: {}".format(score,lda_model_tfidf.print_topic(index,10)))


 Score : 0.7769221663475037 	 
 Topic: 0.026*"fatigu" + 0.024*"chill" + 0.022*"headach" + 0.021*"inject" + 0.021*"hour" + 0.021*"pain" + 0.020*"fever" + 0.019*"ach" + 0.019*"bodi" + 0.019*"sore"

 Score : 0.18670441210269928 	 
 Topic: 0.031*"dizzi" + 0.028*"fever" + 0.024*"chill" + 0.023*"myalgia" + 0.020*"nausea" + 0.020*"headach" + 0.018*"vomit" + 0.016*"hive" + 0.016*"sever" + 0.011*"ach"


### Testing the Model on Unseen Document

In [77]:
unseen_document= "I have cold"
bow_vector = dictionary.doc2bow(preprocessing(unseen_document))
for index,score in sorted(lda_model[bow_vector],key = lambda tup:-1*tup[1]):
                          print("score : {} \t Topic : {}".format(score,lda_model.print_topic (index,5)))

score : 0.5499391555786133 	 Topic : 0.047*"hour" + 0.037*"vaccin" + 0.035*"headach" + 0.031*"leav" + 0.023*"swell"
score : 0.050014909356832504 	 Topic : 0.043*"sore" + 0.027*"hour" + 0.023*"take" + 0.020*"felt" + 0.020*"vaccin"
score : 0.050013381987810135 	 Topic : 0.073*"ach" + 0.057*"bodi" + 0.045*"nausea" + 0.043*"headach" + 0.038*"chill"
score : 0.05000939965248108 	 Topic : 0.047*"pain" + 0.034*"vaccin" + 0.030*"leav" + 0.026*"headach" + 0.021*"feel"
score : 0.05000882223248482 	 Topic : 0.037*"vaccin" + 0.035*"inject" + 0.030*"pain" + 0.025*"headach" + 0.024*"felt"
score : 0.05000732094049454 	 Topic : 0.062*"chill" + 0.042*"fever" + 0.041*"ach" + 0.037*"swell" + 0.036*"hour"
score : 0.050003424286842346 	 Topic : 0.056*"inject" + 0.051*"hour" + 0.035*"site" + 0.024*"vaccin" + 0.023*"fever"
score : 0.05000119283795357 	 Topic : 0.058*"pain" + 0.045*"right" + 0.039*"chill" + 0.029*"sever" + 0.029*"headach"
score : 0.05000118911266327 	 Topic : 0.064*"site" + 0.053*"inject" + 0.