#### Latent Dirichlet Allocation

In [52]:
# Gensim
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.models import TfidfModel

# NLTK
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# Numpy 
import numpy as np

# Pandas
import pandas as pd

#vis
import pyLDAvis.gensim

In [53]:
# Input file for this run. Alternative input files (swap the path to use one):
#   data_csv/soojeong-han-query-filter-regex-results.csv
#   data_csv/kang-query-pain-assessment-ts-results.csv
df = pd.read_csv('data_csv/kang-query-pain-management-ts-results.csv')

# Keep only the thread id, title and body, and expose the positional
# row index as an explicit 'index_id' column.
documents = (
    df.loc[:, ['_id', 'title', 'body']]
    .reset_index()
    .rename(columns={'index': 'index_id'})
)

# Total number of documents
print(len(documents))

# Preview the first rows of the dataset
documents.head()

35


Unnamed: 0,index_id,_id,title,body
0,0,6210247411c2270d5f1e4872,Looking for doctor resources,We are in the north DFW Plano/Allen/McKinney a...
1,1,6210253911c2270d5f1e4ff0,Pain Management(1),My mom who is 96 has been having trouble with ...
2,2,62102fd7c9f833c0c9214f5a,Dementia and Cancer and Hospice! Oh My!,The last 10 days have been a nightmare. Dad (...
3,3,621030cdc9f833c0c9215724,My Teeth are all bad,My mother was under the care of a family Denti...
4,4,621031a5c9f833c0c9215e05,Does anyone have their loved one using a Fenta...,I thought upgrading Mom from the heavy Tylenol...


#### Step 2: Data Preprocessing 
- Tokenization: Split the text into sentences and the sentences into words. Lowercase the words and remove punctuation.
- Lemmatization: Words in third person are changed to first person and verbs in past and future tenses are changed into present.
- Stemming: Words are reduced to their root form.
- Stopwords: All stopwords are removed.
- Short words: tokens with 5 or fewer characters are removed (only tokens longer than 5 characters are kept).

In [54]:
# Lemmatizer and stemmer are created once and reused for every token,
# instead of building a new WordNetLemmatizer on each call.
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

def lemmatize_stemming(text):
    """Lemmatize a single token as a verb, then stem the lemma.

    Parameters
    ----------
    text : str
        One token (word).

    Returns
    -------
    str
        The English Snowball stem of the verb lemma of ``text``.
    """
    return stemmer.stem(lemmatizer.lemmatize(text, pos='v'))

def preprocess(text, min_token_len=6):
    """Tokenize a document and return its filtered, lemmatized and stemmed tokens.

    Parameters
    ----------
    text : str
        Raw document text. Any non-string value (for example a NaN coming
        from a missing cell in a DataFrame) is treated as an empty document.
    min_token_len : int, optional
        Minimum number of characters a token must have to be kept. The
        default of 6 keeps exactly the tokens longer than 5 characters.

    Returns
    -------
    list of str
        Processed tokens in their original order.
    """
    # Missing bodies would otherwise make simple_preprocess raise.
    if not isinstance(text, str):
        return []

    stopwords = gensim.parsing.preprocessing.STOPWORDS
    return [
        lemmatize_stemming(token)
        for token in gensim.utils.simple_preprocess(text)
        if token not in stopwords and len(token) >= min_token_len
    ]

In [55]:
# Preview the current state of `documents` (first rows only, not the whole frame)
documents.head(10)

Unnamed: 0,index_id,_id,title,body
0,0,6210247411c2270d5f1e4872,Looking for doctor resources,We are in the north DFW Plano/Allen/McKinney a...
1,1,6210253911c2270d5f1e4ff0,Pain Management(1),My mom who is 96 has been having trouble with ...
2,2,62102fd7c9f833c0c9214f5a,Dementia and Cancer and Hospice! Oh My!,The last 10 days have been a nightmare. Dad (...
3,3,621030cdc9f833c0c9215724,My Teeth are all bad,My mother was under the care of a family Denti...
4,4,621031a5c9f833c0c9215e05,Does anyone have their loved one using a Fenta...,I thought upgrading Mom from the heavy Tylenol...
5,5,62103703c9f833c0c92186ca,Pain management issues,I am trying to figure out pain management for ...
6,6,62103f817b65de9a5b65b2e0,How to help her recover from broken wrist?,Three weeks ago Mother fell and fractured her ...
7,7,62104d987b65de9a5b660e53,Mother under hospice care but needs special wo...,My 91 year old mother is currently living in a...
8,8,62104ee57b65de9a5b66160f,Hospice &amp; pain medication,So my mom has been on hospice for 4 weeks. The...
9,9,62104f977b65de9a5b661a06,Pain Management or lack thereof,My mother was just release from a behavioral h...


In [56]:
# Run every thread body through `preprocess`; the result is a Series of
# token lists indexed by thread id.
bodies_by_thread = documents.set_index('_id')['body'].rename_axis('thread_id')
processed_docs = bodies_by_thread.map(preprocess)

In [57]:
# How many documents were processed
print(len(processed_docs))

# Preview the first processed documents
processed_docs.head()

35


thread_id
6210247411c2270d5f1e4872    [mckinney, tri, primari, retir, neurologist, t...
6210253911c2270d5f1e4ff0    [have, troubl, sever, arthriti, morn, caregiv,...
62102fd7c9f833c0c9214f5a    [nightmar, moder, dementia, rush, horribl, can...
621030cdc9f833c0c9215724    [mother, famili, dentist, bond, unexpect, chip...
621031a5c9f833c0c9215e05    [think, upgrad, tylenol, tramadol, manag, bett...
Name: body, dtype: object

#### Step 3: Bigrams and Trigrams

In [58]:
# Train a phrase model that detects two-token collocations in the
# preprocessed documents.
bigrams_phrases = gensim.models.Phrases(processed_docs, min_count=5, threshold=100)

# Phraser wraps the trained model for applying it to documents.
bigrams = gensim.models.phrases.Phraser(bigrams_phrases)

def make_bigrams(texts):
    """Return each document in ``texts`` with detected bigrams merged into one token."""
    return [bigrams[doc] for doc in texts]

# Full documents with bigrams merged. This variable is kept intact; the
# phrase-only view used for inspection below lives in its own variable.
data_bigram = make_bigrams(processed_docs)

# Merged phrases are joined with '_', so keep only tokens containing it.
bigram_phrases_only = [[word for word in doc if '_' in word] for doc in data_bigram]

# Print the detected bigrams of the first 10 documents (skipping empty ones)
for doc in bigram_phrases_only[:10]:
    if doc:
        print(doc)



In [59]:
# The trigram model is trained on the bigram-transformed documents, so it
# learns to join an already merged bigram with a neighbouring token.
trigram_phrases = gensim.models.Phrases(bigrams_phrases[processed_docs], threshold=100)

trigram = gensim.models.phrases.Phraser(trigram_phrases)

def make_trigrams(texts):
    """Return each document in ``texts`` with bigrams and then trigrams merged.

    The bigram model is applied first because the trigram model was trained
    on bigram-transformed text; applying it to raw tokens would not match
    the input it was trained on.
    """
    return [trigram[bigrams[doc]] for doc in texts]

data_bigram_trigram = make_trigrams(processed_docs)

# Print the merged bigrams and trigrams of the first 10 documents
# (merged tokens are joined with '_'; documents without any are skipped)
for merged_phrases in [[word for word in doc if '_' in word] for doc in data_bigram_trigram][:10]:
    if merged_phrases:
        print(merged_phrases)

#### Step 4: TF-IDF REMOVAL

In [60]:
# Build the vocabulary and the bag-of-words corpus from the phrase-merged documents.
id2word = corpora.Dictionary(data_bigram_trigram)

texts = data_bigram_trigram

corpus = [id2word.doc2bow(text) for text in texts]

# Fit TF-IDF on the unfiltered corpus; the scores decide which terms to drop.
tfidf = TfidfModel(corpus, id2word=id2word)

low_value = 0.03  # terms scoring below this in a document are dropped from it
words = []  # every dropped term (as a string), accumulated over all documents
words_missing_in_tfidf = []
for i, bow in enumerate(corpus):
    doc_tfidf = tfidf[bow]  # computed once per document
    tfidf_ids = {term_id for term_id, _ in doc_tfidf}

    low_value_words = [term_id for term_id, score in doc_tfidf if score < low_value]
    # Terms present in the bag-of-words but absent from the TF-IDF output
    # (a TF-IDF score of 0). Computed BEFORE building `drops` so that the
    # list belongs to the current document rather than the previous one.
    words_missing_in_tfidf = [term_id for term_id, _ in bow if term_id not in tfidf_ids]

    drops = low_value_words + words_missing_in_tfidf
    words.extend(id2word[term_id] for term_id in drops)

    # Replace the document with its filtered bag-of-words.
    drop_ids = set(drops)
    corpus[i] = [(term_id, count) for term_id, count in bow if term_id not in drop_ids]


In [61]:
# Training configuration for the LDA topic model.
# NOTE(review): corpus[:-1] leaves the last document out of training —
# presumably a held-out document; confirm this is intended.
lda_params = dict(
    corpus=corpus[:-1],
    id2word=id2word,
    num_topics=8,
    random_state=100,
    update_every=5,
    chunksize=100,
    passes=50,
    alpha="auto",
    eta="auto",
)
lda_model = gensim.models.ldamodel.LdaModel(**lda_params)

# Show the 10 highest-weighted terms of each topic
lda_model.print_topics(num_words=10)


[(0,
  '0.020*"doctor" + 0.017*"experi" + 0.010*"live" + 0.009*"husband" + 0.008*"suffer" + 0.007*"procedur" + 0.007*"concern" + 0.007*"longer" + 0.007*"wonder" + 0.007*"suggest"'),
 (1,
  '0.040*"mother" + 0.030*"brother" + 0.017*"sister" + 0.014*"call" + 0.010*"doctor" + 0.010*"issu" + 0.010*"have" + 0.007*"health" + 0.007*"nurs" + 0.007*"question"'),
 (2,
  '0.061*"hospic" + 0.024*"patient" + 0.018*"peopl" + 0.017*"provid" + 0.013*"program" + 0.012*"dentist" + 0.012*"boomer" + 0.009*"receiv" + 0.008*"servic" + 0.007*"recommend"'),
 (3,
  '0.027*"doctor" + 0.017*"think" + 0.015*"nurs" + 0.015*"issu" + 0.013*"dementia" + 0.011*"therapist" + 0.011*"get" + 0.011*"physic" + 0.008*"primari" + 0.008*"experi"'),
 (4,
  '0.014*"medicaid" + 0.011*"hospic" + 0.011*"facil" + 0.011*"experi" + 0.011*"wonder" + 0.011*"pray" + 0.011*"tonight" + 0.007*"nurs" + 0.007*"month" + 0.007*"tramadol"'),
 (5,
  '0.029*"hospic" + 0.019*"mother" + 0.019*"seizur" + 0.015*"medic" + 0.015*"discharg" + 0.012*"hosp

In [62]:
# Render the interactive topic visualisation inline in the notebook.
pyLDAvis.enable_notebook()

# NOTE(review): the full `corpus` is passed here, while the model above was
# trained on corpus[:-1] — confirm that including the last document is intended.
vis = pyLDAvis.gensim.prepare(
    lda_model,
    corpus,
    id2word,
    mds="mmds",
    R=10,
)
vis