# Topic modeling

In [1]:
from nltk.corpus import brown

In [20]:
data = []
for fileid in brown.fileids():
    document = ' '.join(brown.words(fileid))
    data.append(document)
NO_DOCUMENTS = len(data)
print(NO_DOCUMENTS)
print(data[:5])

500


In [22]:
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords

In [23]:
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')

def clean_text(text):
    tokenized_text = word_tokenize(text.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}',t)]
    return cleaned_text
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in data:
    tokenized_data.append(clean_text(text))
    
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])

[(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2), (44, 2), (45, 2), (46, 2), (47, 2), (49, 1), (50, 1), (53, 1), (56, 1), (59, 1), (60, 1), (66, 1), (75, 1), (80, 1), (98, 1), (101, 1), (106, 1), (117, 1), (129, 1), (130, 2), (132, 2), (135, 2), (140, 1), (141, 2), (143, 4), (144, 2), (145, 2), (166, 1), (195, 1), (198, 3), (219, 1), (220, 4), (221, 3), (223, 1), (229, 4), (230, 4), (231, 2), (235, 1), (236, 1), (242, 2), (246, 2), (255, 1), (263, 1), (269, 1), (270, 5), (271, 2), (275, 5), (276, 1), (278, 4), (280, 2), (281, 1), (307, 2), (310, 1), (311, 3), (313, 1), (314, 5), (318, 4), (322, 1), (336, 1), (338, 3), (339, 1), (340, 1), (341, 1), (345, 1), (346, 1), (351, 1), (354, 1), (355, 1), (366, 3), (368, 13), (370, 1), (372, 1), (374, 3), (377, 3), (381, 3), (386, 1), (392, 6), (396, 1), (401, 1), (412, 2), (426, 2), (428, 2), (431, 2), (434, 2), (439, 2), (444, 1), (450, 1), (452, 1), (462, 1), (465, 1), (467, 1), (470, 1), (478, 1), (483, 1), (

In [24]:
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

In [25]:
print('LDA Model:')

for idx in range(NUM_TOPICS):
    # print the first 10 most representative topics
    print('Topic #%s:' % idx, lda_model.print_topic(idx, 10))
print('='*20)
print('LSI Model')

for idx in range(NUM_TOPICS):
    print('Topic #%s:' % idx, lsi_model.print_topic(idx, 10))
print('='*20)

LDA Model:
Topic #0: 0.004*"said" + 0.004*"one" + 0.004*"would" + 0.003*"new" + 0.003*"time" + 0.003*"two" + 0.003*"like" + 0.003*"even" + 0.002*"could" + 0.002*"man"
Topic #1: 0.007*"one" + 0.005*"would" + 0.004*"said" + 0.004*"time" + 0.004*"could" + 0.003*"new" + 0.003*"like" + 0.003*"man" + 0.003*"first" + 0.002*"two"
Topic #2: 0.005*"could" + 0.005*"would" + 0.005*"one" + 0.003*"said" + 0.003*"may" + 0.003*"first" + 0.002*"time" + 0.002*"two" + 0.002*"years" + 0.002*"new"
Topic #3: 0.006*"one" + 0.004*"would" + 0.003*"new" + 0.003*"time" + 0.003*"state" + 0.002*"man" + 0.002*"two" + 0.002*"people" + 0.002*"first" + 0.002*"also"
Topic #4: 0.005*"would" + 0.005*"said" + 0.005*"one" + 0.004*"could" + 0.003*"man" + 0.003*"time" + 0.003*"like" + 0.003*"even" + 0.002*"new" + 0.002*"made"
Topic #5: 0.006*"one" + 0.006*"said" + 0.005*"would" + 0.004*"could" + 0.003*"new" + 0.003*"first" + 0.002*"even" + 0.002*"like" + 0.002*"two" + 0.002*"time"
Topic #6: 0.007*"one" + 0.005*"would" + 0.00

In [12]:
text = 'The economy is working better thatn ever'
bow = dictionary.doc2bow(clean_text(text))

print(lsi_model[bow])
print(lda_model[bow])

[(0, 0.09161704509983884), (1, 0.008697252856175898), (2, 0.015836558942566713), (3, -0.04076500772695546), (4, 0.01614299890304545), (5, 0.011510490412815481), (6, 0.029287964371884717), (7, 0.020117362735880072), (8, -0.05589004055915413), (9, -0.027149081351805403)]
[(0, 0.020013448), (1, 0.020012842), (2, 0.020011842), (3, 0.81988263), (4, 0.020013053), (5, 0.02001287), (6, 0.020014398), (7, 0.020013291), (8, 0.020012608), (9, 0.020013044)]


In [13]:
from gensim import similarities

In [17]:
lda_index = similarities.MatrixSimilarity(lda_model[corpus])
 
# Let's perform some queries
similarities = lda_index[lda_model[bow]]
# Sort the similarities
similarities = sorted(enumerate(similarities), key=lambda item: -item[1])
 
# Top most similar documents:
print(similarities[:10])
# [(104, 0.87591344), (178, 0.86124849), (31, 0.8604598), (77, 0.84932965), (85, 0.84843522), (135, 0.84421808), (215, 0.84184396), (353, 0.84038532), (254, 0.83498049), (13, 0.82832891)]
 
# Let's see what's the most similar document
document_id, similarity = similarities[0]
print(data[document_id][:1000])


AttributeError: 'numpy.ndarray' object has no attribute 'MatrixSimilarity'

# Using skikit-learn for topic modeling

In [26]:
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer

In [27]:
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)



(500, 10)
(500, 10)
(500, 10)


In [28]:
# Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])

[1.05599130e-04 1.05611492e-04 9.80398872e-01 1.05619689e-04
 1.05602797e-04 1.05596716e-04 1.05611206e-04 1.05596733e-04
 1.05625225e-04 1.87562652e-02]
[0.         0.         2.11586206 0.07690596 0.         0.5430071
 1.06850316 0.         0.         0.24612067]
[ 23.30684381   1.59364315  21.86529375   0.14663324   0.69465875
  10.52134941   4.36606605  -2.24335254   1.26218832 -12.1946835 ]


In [29]:
def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1]])
            
                       
print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

LDA Model:
Topic 0:
Topic 1:


Topic 2:


Topic 3:


Topic 4:


Topic 5:


Topic 6:


Topic 7:


Topic 8:


Topic 9:


NMF Model:
Topic 0:


Topic 1:


KeyboardInterrupt: 

In [30]:
text = 'The economy is working better than ever'
x = nmf_model.transform(vectorizer.transform([text]))[0]
print(x)

[0.0029002  0.         0.         0.         0.         0.00438908
 0.         0.         0.         0.00466415]


In [31]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    dists = euclidean_distances(x.reshape(1, -1), Z)
    pairs = enumerate(dists[0])
    most_similar = sorted(pairs, key=lambda item: item[1])[:top_n]
    return most_similar
 
similarities = most_similar(x, nmf_Z)
document_id, similarity = similarities[0]
print(data[document_id][:1000])

Livery stable -- J. Vernon , prop. '' . Coaching had declined considerably by 1905 , but the sign was still there , near the old Wells Fargo building in San Francisco , creaking in the fog as it had for thirty years . John Vernon had had all the patronage he cared for -- he had prospered , but he could not retire from horsedom . Coaching was in his blood . He had two interests in life : the pleasures of the table and driving . Twice a week he drove his tallyho over the Santa Cruz road , upland and through the redwood forest , with orchards below him at one hand , and glimpses of the Pacific at the other . The journey back he made along the coast road , traveling hell-for-leather , every lantern of the tallyho ablaze . The southward route was the classic run in California , and the most fashionable . His patronage on this stretch was made up largely of San Franciscans -- regulars , most of them , and trenchermen like himself . They did not complain at the inhuman hour of starting ( seve

# Plotting words and documents in 2D with SVD

In [34]:
import pandas as pd
from bokeh.io import push_notebook, show, output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource, LabelSet
output_notebook()

In [35]:
svd = TruncatedSVD(n_components=2)
documents_2d = svd.fit_transform(data_vectorized)
 
df = pd.DataFrame(columns=['x', 'y', 'document'])
df['x'], df['y'], df['document'] = documents_2d[:,0], documents_2d[:,1], range(len(data))
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="document", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

In [36]:
svd = TruncatedSVD(n_components=2)
words_2d = svd.fit_transform(data_vectorized.T)
 
df = pd.DataFrame(columns=['x', 'y', 'word'])
df['x'], df['y'], df['word'] = words_2d[:,0], words_2d[:,1], vectorizer.get_feature_names()
 
source = ColumnDataSource(ColumnDataSource.from_df(df))
labels = LabelSet(x="x", y="y", text="word", y_offset=8,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
 
plot = figure(plot_width=600, plot_height=600)
plot.circle("x", "y", size=12, source=source, line_color="black", fill_alpha=0.8)
plot.add_layout(labels)
show(plot, notebook_handle=True)

# More about LDA

In [37]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
 
text = "The economy is working better than ever"
x = lda_model.transform(vectorizer.transform([text]))[0]
print(x, x.sum())



[0.02500418 0.02500523 0.02500803 0.02500707 0.02500001 0.02500611
 0.02500002 0.02500433 0.774965   0.02500002] 1.0


In [40]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

# LDA to CHW notes

In [41]:
notes = pd.read_csv('CHW_Notes.csv')
notes

Unnamed: 0,patient_id,text
0,2,A healthy lifestyle is important because hav...
1,6,"Diabetes runs in family - grandmother, mothe..."
2,13,"Prior to bariatric surgery, I was having lot..."
3,14,Healthy lifestyle is important to me because...
4,19,I had endometrial cancer 4 years ago when I ...
...,...,...
76,304,I am colon cancer survivor and Buddhist. I e...
77,314,"Living a good, happy life with my kids. .Fee..."
78,319,to enjoy my family without worrying about be...
79,323,....Importance of maintaining physical activ...


In [49]:
notes_text = notes['text'].to_list()
patient_id = notes['patient_id'].to_list()

In [50]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 3
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(notes_text)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_topics=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)

  token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')


In [51]:
panel2 = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel2

# References:
http://stanford.edu/~cpiech/cs221/handouts/kmeans.html
<br>
https://nlpforhackers.io/topic-modeling/
<br>
http://brandonrose.org/clustering
<br>
https://en.wikipedia.org/wiki/K-means_clustering
<br>
https://en.wikipedia.org/wiki/Tf%E2%80%93idf
<br>
http://www.tfidf.com/
<br>
https://www.elephate.com/blog/what-is-tf-idf/
<br>
https://www.datacamp.com/community/tutorials/wordcloud-python
<br>
https://d3js.org/
<br>
https://www.aclweb.org/anthology/W14-3110.pdf