# Analyzing Medical Questions using Natural Language Processing

This notebook analyzes questions asked by health care providers ('providers') and by the general public ('patients') to glean insight from both the provider and the patient viewpoints.

## Preprocessing
At this point, questions have been manually spell-checked for consistency.

In [None]:
import nltk
from nltk import FreqDist
from nltk.collocations import *
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [None]:
# Read datasets: one text file per category, one question per line
path = 'data/'
filePrefix = ''
categories = ['patients','providers']
dataset = {}        # category -> nltk.Text over all tokens of that category
dataset_raw = {}    # category -> lower-cased question lines (newlines kept)
allFeatures = set()
questions = 0       # total number of question lines across all categories
question_count = {}

corpus = []
text = ""

N={} # Number of questions in each corpus

for category in categories:
    fileName = path + filePrefix + category.lower() + '.txt'
    # `with` closes the handle even if reading fails
    with open(fileName,'r') as f:
        lines = f.readlines()

    questions += len(lines)
    question_count[category] = len(lines)
    N[category] = len(lines)
    dataset_raw[category] = [line.lower() for line in lines]

    # One lower-cased string per category; every newline becomes a space
    text = ''.join(line.replace('\n',' ').lower() for line in lines)

    #create tokens
    tokens = nltk.word_tokenize(text)
    dataset[category] = nltk.Text(tokens)


## Removing Punctuation & Stopwording

In [None]:
# Frequency distributions over the raw (not yet cleaned) tokens
providersFD = FreqDist(dataset['providers'])
patientsFD = FreqDist(dataset['patients'])

# Each punctuation character appears exactly once. Only the double quote and
# the backslash need escaping in a normal string literal; regex-style escapes
# such as \^ are invalid here and would add stray backslashes.
punctuations = ".,\"-\\/#!?$%^&*;:{}=_'~()"
print ('Punctuation FD[providers] FD[patients]')
for punct in punctuations:
    print ('   {}  {:3d}   {:3d}'.format(punct,providersFD[punct], patientsFD[punct]))

In [None]:
from nltk.corpus import stopwords
dsCleaned = {} #cleaned dataset: category -> tokens left after punctuation removal and stopwording

def removePunctuation(corpus):
    """Return the tokens of ``corpus`` that are not pure punctuation.

    A token is dropped when every one of its characters is a punctuation
    mark, so multi-character tokens such as ``...`` or ``--`` are removed as
    well as single marks. Empty-string tokens are dropped too. Tokens that
    merely contain punctuation (``don't``, ``anti-inflammatory``) are kept.

    Args:
        corpus: iterable of string tokens.

    Returns:
        list: the surviving tokens, in their original order.
    """
    # A set of characters gives exact per-character membership; testing
    # `token in "<string>"` would be a substring search instead.
    punctuations = frozenset(".,\"-\\/#!?$%^&*;:{}=_'~()")
    filteredCorpus = [token for token in corpus
                      if not all(char in punctuations for char in token)]
    return filteredCorpus

def stopwording(corpus, min_len):
    """Drop English stopwords and short tokens from ``corpus``.

    Args:
        corpus: iterable of string tokens.
        min_len: tokens are kept only when strictly longer than this many
            characters.

    Returns:
        list: tokens that are not NLTK English stopwords and are longer
        than ``min_len``, in their original order.
    """
    # Imported locally under a private alias: a later cell rebinds the global
    # name `stopwords` to a plain list, which has no `.words()` method.
    from nltk.corpus import stopwords as nltk_stopwords

    # Load the word list once per call (not once per token) and use a set
    # for constant-time membership tests.
    english_stopwords = frozenset(nltk_stopwords.words('english'))
    filteredCorpus = [token for token in corpus
                      if token not in english_stopwords and len(token) > min_len]
    return filteredCorpus

In [None]:
# Strip punctuation first, then drop stopwords and tokens of 3 characters or fewer
for category in categories:
    print ('Processing %s' % category)
    without_punctuation = removePunctuation(dataset[category])
    cleaned_tokens = stopwording(without_punctuation, 3)
    dsCleaned[category] = cleaned_tokens
    print (cleaned_tokens)

## Lemmatization

In [None]:
# Uncomment on first use to download the WordNet corpus
#nltk.download('wordnet')
dsFinal={} #working dataset (final): category -> lemmatized tokens

# def stemming(corpus):
#     stemmer = nltk.PorterStemmer()
#     normalized_corpus = [stemmer.stem(token) for token in corpus]
#     return normalized_corpus

def lemmatization(corpus):
    """Lemmatize each token of ``corpus`` with NLTK's WordNet lemmatizer.

    Args:
        corpus: iterable of string tokens.

    Returns:
        list: one lemmatized token per input token, in the same order.
    """
    lemmatize = nltk.WordNetLemmatizer().lemmatize
    return list(map(lemmatize, corpus))

# Lemmatize the cleaned tokens of every category
for category in categories:
    print ('Processing %s' % category)
    lemmas = lemmatization(dsCleaned[category])
    dsFinal[category] = lemmas
    print (lemmas)

## Simple Analysis
### Get Lexical Diversity

In [None]:
# Vocabulary = set of distinct tokens per category. It must be built here:
# no earlier cell defines it.
vocabulary = {name: set(dsFinal[name]) for name in categories}

print ('Patient questions have a total of %s tokens and a vocabulary size of %s' % (len(dsFinal['patients']), len(vocabulary['patients'])))
print ('Provider questions have a total of %s tokens and a vocabulary size of %s' % (len(dsFinal['providers']), len(vocabulary['providers'])))

def lexical_diversity(text):
    """Return the ratio of total tokens to distinct tokens in ``text``.

    This is the average number of times each distinct token occurs, so 1.0
    means no token is repeated.

    Args:
        text: sequence of hashable tokens.

    Returns:
        float: ``len(text) / len(set(text))``, or 0.0 for an empty sequence
        (avoids dividing by zero).
    """
    distinct = len(set(text))
    if distinct == 0:
        return 0.0
    return len(text)*1.0/distinct

# Tokens-per-distinct-token ratio of every category
lexDiversity = {}
for category in categories:
    diversity = lexical_diversity(dsFinal[category])
    lexDiversity[category] = diversity
    print ('Lexical Diversity in %s = %s' % (category, diversity))

### Counting Words

In [None]:
# print ('# time "disease" is used by health care providers %s' % ds['providers'].count('disease'))
# print ('# time "disease" is used by patients %s' % ds['patients'].count('disease'))

In [None]:
# Occurrences of every distinct provider token, most frequent first.
# FreqDist counts all tokens in a single pass and does not depend on a
# separately built vocabulary.
count = FreqDist(dsFinal['providers'])

for w, occurrences in count.most_common():
    print (w, occurrences)

In [None]:
# Occurrences of every distinct patient token, most frequent first.
# FreqDist counts all tokens in a single pass and does not depend on a
# separately built vocabulary.
count = FreqDist(dsFinal['patients'])

for w, occurrences in count.most_common():
    print (w, occurrences)

### Frequency Distribution

In [None]:
def getFrequent(freq, n):
    """Return the ``n`` highest-count entries of ``freq``.

    Args:
        freq: mapping of item -> count (anything offering ``.get`` and
            ``[]``, e.g. a dict or an ``nltk.FreqDist``).
        n: maximum number of entries to return.

    Returns:
        dict: item -> count for the top ``n`` items, inserted in descending
        count order. Holds every item when ``freq`` has fewer than ``n``
        entries, and is empty when ``n`` is zero or negative.
    """
    if n <= 0:
        return {}
    ranked = sorted(freq, key = freq.get, reverse = True)
    return {item: freq[item] for item in ranked[:n]}

# Top 50 tokens of each category. Looping explicitly avoids depending on
# whatever value `category` was left holding by an earlier cell.
for category in categories:
    frequency = nltk.FreqDist(dsFinal[category])
    topTokens = getFrequent(frequency, 50)
    print (category)
    print(topTokens)

### Finding Important Words

In [None]:
# Rebuild both distributions, this time over the cleaned and lemmatized tokens
patientsFD = FreqDist(dsFinal['patients'])
providersFD = FreqDist(dsFinal['providers'])

In [None]:
# Print the distribution summary of every category, not just the one
# `category` was left holding by an earlier loop
for category in categories:
    print(category, FreqDist(dsFinal[category]))

In [None]:
# Plot the 20 most frequent tokens of each distribution (non-cumulative)
for distribution, plot_title in ((providersFD, 'Provider Tokens'), (patientsFD, 'Patient Tokens')):
    distribution.plot(20, cumulative = False, title = plot_title)

In [None]:
# Patient tokens of 8+ characters with their frequency, most frequent first.
# Walking the distribution (distinct tokens) prints each token once instead
# of once per occurrence.
for token, occurrences in patientsFD.most_common():
    if (len(token) >= 8):
        print ('%s [%s]' % (token, occurrences))

### Collocations, 2-grams & Co-Occurrences

In [None]:
# Wrap the patient tokens in an nltk.Text (stored back into dsFinal) and
# list its collocations
patient_text = nltk.Text(dsFinal['patients'])
dsFinal['patients'] = patient_text
patient_text.collocation_list()

In [None]:
# Wrap the provider tokens in an nltk.Text (stored back into dsFinal) and
# list its collocations
provider_text = nltk.Text(dsFinal['providers'])
dsFinal['providers'] = provider_text
provider_text.collocation_list()

In [None]:
from nltk.collocations import *
from nltk.util import ngrams

# Pair up adjacent patient tokens (n = 2) and print every pair.
# NOTE(review): this prints one line per bigram, so the output grows with
# the corpus size.
print ('Generating bigrams')
bigrams = ngrams(dsFinal['patients'],2)
for bigram in bigrams:
    print (bigram)

In [None]:
bigram = nltk.collocations.BigramAssocMeasures()
trigram = nltk.collocations.TrigramAssocMeasures()

# One row per n-gram size: heading to print, finder class, scoring measure.
# Candidates seen fewer than 5 times are filtered out, then the 20 best by
# PMI are printed for every category.
ngram_passes = (
    ('Finding frequent 2-grams', BigramCollocationFinder, bigram.pmi),
    ('\nFinding frequent 3-grams', TrigramCollocationFinder, trigram.pmi),
)

for heading, finder_class, measure in ngram_passes:
    print (heading)
    for category in categories:
        finder = finder_class.from_words(dsFinal[category])
        finder.apply_freq_filter(5)
        tokens = finder.nbest(measure, 20)
        print (tokens)

### Lexical Resource

In [None]:
def proportion_cleantext(corpus, language):
    """Return the fraction of tokens in ``corpus`` that are not stopwords.

    Args:
        corpus: sized iterable of string tokens.
        language: language name passed to ``nltk.corpus.stopwords.words``.

    Returns:
        float: non-stopword tokens divided by all tokens, or 0.0 for an
        empty corpus (avoids dividing by zero).
    """
    total = len(corpus)
    if total == 0:
        return 0.0
    # A set makes each membership test constant-time
    stop_set = set(nltk.corpus.stopwords.words(language))
    kept = sum(1 for token in corpus if token not in stop_set)
    return kept*1.0/total

# Share of non-stopword tokens in the raw token stream of each category
language='english'
for category in categories:
    clean_share = proportion_cleantext(dataset[category],language)
    print ("Proportion of clean terms in the [%s] is %s" % (category, clean_share))

## Topic Modeling
### Topical Discovery and Latent Dirichlet Allocation
Identify the overall topic of discussion

Identify topics brought forward by Patients

Identify topics brought forward by Providers

In [None]:
# Ignore deprecation warnings
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning) 

In [None]:
import re
import gensim
from nltk.corpus import wordnet
from nltk.collocations import *
from gensim import corpora
import pyLDAvis.gensim_models

In [None]:
def remove_utf(text):
    """Replace every non-ASCII character of ``text`` with a single space.

    Characters with code points 0-127 are kept unchanged, so the result has
    the same length as the input.
    """
    return ''.join(char if ord(char) < 128 else ' ' for char in text)

def load_stopwords():
    """Read the custom stopword list from ``data/stopwords.txt``.

    Returns:
        list: one entry per line of the file, without the trailing newline.
    """
    path="data/stopwords.txt"
    # `with` closes the file even if reading fails
    with open (path,"r") as file_input:
        # rstrip('\n') removes only the line terminator. Slicing off the last
        # character would truncate a final line that has no trailing newline.
        swords = [line.rstrip('\n') for line in file_input]
    return swords

def loadCorpus(path):
    """Read a text file into a list of normalized lines.

    Each line has its trailing newline removed, is lower-cased and is then
    passed through ``remove_utf``.

    Args:
        path: path of the text file to read.

    Returns:
        list: one normalized string per line of the file.
    """
    # `with` closes the file even if reading fails
    with open (path,"r") as file_input:
        # rstrip('\n') removes only the line terminator. Slicing off the last
        # character would truncate a final line that has no trailing newline.
        data = [remove_utf(line.rstrip('\n').lower()) for line in file_input]
    return data

# NOTE(review): from here on the global name `stopwords` is the custom word
# list returned by load_stopwords(), no longer the `nltk.corpus.stopwords`
# object imported earlier; `path` is likewise rebound from the data
# directory to a single file. Cells that call `stopwords.words(...)` will
# fail if re-run after this one.
stopwords = load_stopwords()
path = "data/splitcombo.txt"
allQuestions = loadCorpus(path)
#print (allQuestions)

In [None]:
# Split the combined corpus into per-category question lists. A line that
# starts with a category marker switches the current bucket; that line and
# every following one go into it until the next marker.
questionsAsked = {'patients:':[],'providers:':[]}

keys = questionsAsked.keys()

current = ""
for line in allQuestions:
    if len(line)>5:
        for key in keys:
            if line.startswith(key):
                current = key
        # Lines seen before the first marker have no bucket yet: skip them
        # rather than failing with KeyError on the empty key
        if current:
            questionsAsked[current].append(line)

### More preprocessing for LDA topic modeling (builds on the cleaning, stopwording and lemmatization steps)

In [None]:
def apply_stopwording(corpus, min_len):
    """Filter ``corpus`` against the global ``stopwords`` and a black list.

    A token is kept when it is not in ``stopwords``, is not one of the
    category names ('patients', 'providers'), and is strictly longer than
    ``min_len`` characters.

    Returns:
        list: the kept tokens, in their original order.
    """
    black_list = ['patients','providers']
    filteredCorpus = []
    for token in corpus:
        if token in stopwords or token in black_list:
            continue
        if len(token) <= min_len:
            continue
        filteredCorpus.append(token)
    return filteredCorpus

def getCollocations(text, min_freq, coll_num):
    """Return the ``coll_num`` best bigram collocations of ``text`` by PMI.

    Args:
        text: iterable of tokens.
        min_freq: value passed to the finder's frequency filter.
        coll_num: number of collocations to return.

    Returns:
        The result of the finder's ``nbest`` call scored with the PMI measure.
    """
    collocation_finder = BigramCollocationFinder.from_words(text)
    collocation_finder.apply_freq_filter(min_freq)
    pmi_measure = nltk.collocations.BigramAssocMeasures().pmi
    return collocation_finder.nbest(pmi_measure, coll_num)

def replaceCollocationsInText(text,collocations):
    """Merge adjacent tokens that form a known collocation into one token.

    The text is scanned left to right. When the current token and the next
    one form a pair listed in ``collocations`` they are replaced by a single
    ``first_second`` token; otherwise the current token is copied unchanged.
    No token is ever dropped.

    Args:
        text: iterable of string tokens.
        collocations: iterable of (first, second) token pairs.

    Returns:
        list: the tokens of ``text`` with collocations joined by ``_``.
    """
    # A set of pairs matches every collocation, including several that share
    # the same first word
    pairs = {(t[0], t[1]) for t in collocations}
    tokens = list(text)

    dtokens = []
    i = 0
    while i < len(tokens):
        if i + 1 < len(tokens) and (tokens[i], tokens[i+1]) in pairs:
            dtokens.append(tokens[i]+"_"+tokens[i+1])
            i = i + 2  # both halves of the collocation are consumed
        else:
            dtokens.append(tokens[i])
            i = i + 1
    return dtokens

In [None]:
def processCorpus(corpusData, min_frequency=3, num_of_collocations=100):
    """Turn raw text lines into token documents ready for topic modeling.

    Each line is tokenized, stripped of punctuation, stopworded (tokens of
    3 characters or fewer are dropped) and lemmatized. Bigram collocations
    are then found over all cleaned tokens and merged into single
    ``first_second`` tokens inside every document.

    Args:
        corpusData: iterable of text lines, one document per line.
        min_frequency: frequency filter value passed to ``getCollocations``.
        num_of_collocations: number of collocations requested from
            ``getCollocations``.

    Returns:
        list: one token list per document; documents left empty after
        cleaning are omitted.
    """
    corpus=[]   # one cleaned nltk.Text per input line
    token =[]   # every cleaned token, in order, for collocation discovery

    #Extract corpus and preprocess data
    for line in corpusData:
        doc = nltk.Text(nltk.word_tokenize(line))
        doc_clean = nltk.Text(lemmatization(apply_stopwording(removePunctuation(doc), 3)))
        corpus.append(doc_clean)
        token.extend(doc_clean.tokens)

    #Identify collocations from this corpus' own tokens (the local `token`
    #list), not from whatever an unrelated global `tokens` happens to hold
    collocations = getCollocations(token,min_frequency,num_of_collocations)
    docs = []
    for doc in corpus:
        merged = replaceCollocationsInText(doc,collocations)
        if (len(merged)>0):
            docs.append(merged)
    return docs

In [None]:
# Preprocess every question, then show the document count and the first ten documents
docs = processCorpus(allQuestions)
document_total = len(docs)
print(document_total)
print (docs[:10])

In [None]:
# Build the dictionary and the bag-of-words corpus, then fit LDA on all questions
k = 10
iterations = 40  # handed to LdaModel as `passes`

dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
# A fixed random_state makes the topics reproducible across kernel restarts
topic_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=k, id2word = dictionary, passes = iterations, random_state = 42)
lda_vis = pyLDAvis.gensim_models.prepare(topic_model,corpus,dictionary,sort_topics=False)
pyLDAvis.display(lda_vis)

## Visualizing Topics by Category

In [None]:
## Visualizing provider topics

k = 5

docs = processCorpus(questionsAsked["providers:"])
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
# A fixed random_state makes the topics reproducible across kernel restarts
topic_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=k, id2word = dictionary, passes = iterations, random_state = 42)
lda_vis = pyLDAvis.gensim_models.prepare(topic_model,corpus,dictionary,sort_topics=False)
pyLDAvis.display(lda_vis)

In [None]:
## Visualizing patient topics

k = 4

docs = processCorpus(questionsAsked["patients:"])
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(doc) for doc in docs]
# A fixed random_state makes the topics reproducible across kernel restarts
topic_model = gensim.models.ldamodel.LdaModel(corpus, num_topics=k, id2word = dictionary, passes = iterations, random_state = 42)
lda_vis = pyLDAvis.gensim_models.prepare(topic_model,corpus,dictionary,sort_topics=False)
pyLDAvis.display(lda_vis)