# Topic Modeling #



Load functions from 'topics.py'.

In [1]:
import topics as tp

Read text files from a folder.

In [2]:
# Load the corpus: the glob pattern selects every .txt file in ./test_corpus.
documents = tp.readCorpus("./test_corpus/*.txt")

Alternatively, create a test data set (no example cell for this is included here).

## Preprocessing ##

Tokenize texts.

In [3]:
# Tokenize the loaded documents; `texts` is rebound by the two filtering cells that follow.
texts = tp.tokenize(documents)

Remove hapax legomena (words that occur only once).

In [4]:
# Rebind `texts` to the result of the hapax-legomena filter.
texts = tp.removeHapaxLeg(texts)

Remove stopwords.

In [5]:
# Rebind `texts` to the stopword-filtered result.
# 'en' presumably selects an English stop list -- confirm in topics.py.
texts = tp.removeStopWords(texts, stoplist='en')

## Model creation ##

In [6]:
# Build a model with 5 topics; with ldaSource='gensim' the returned object
# is a gensim LdaModel (see the repr echoed further down in this notebook).
model = tp.getTopics(texts, topics=5, ldaSource='gensim')

## Visualization ##

Basic text output using gensim functions:

In [13]:
# Show all 5 topics with their highest-weighted terms.
model.show_topics(num_topics=5)

['0.001*s + 0.001*sir + 0.001*holmes + 0.001*said + 0.001*man + 0.001*night + 0.000*did + 0.000*come + 0.000*eyes + 0.000*think',
 '0.001*said + 0.001*s + 0.001*man + 0.001*holmes + 0.001*sir + 0.001*know + 0.001*time + 0.001*night + 0.001*little + 0.001*came',
 '0.012*s + 0.010*conan + 0.008*like + 0.007*olivia + 0.006*eyes + 0.005*sword + 0.005*saw + 0.004*man + 0.004*lay + 0.004*feet',
 '0.013*said + 0.010*s + 0.009*man + 0.008*holmes + 0.005*little + 0.005*small + 0.005*night + 0.005*sholto + 0.005*treasure + 0.004*way',
 '0.016*sir + 0.011*said + 0.010*s + 0.010*man + 0.009*holmes + 0.007*moor + 0.007*henry + 0.005*know + 0.005*watson + 0.005*baskerville']

In [18]:
# Bare expression so the notebook echoes the object's repr.
model


<gensim.models.ldamodel.LdaModel at 0x7f9f711d7a20>

In [None]:
from gensim.corpora import MmCorpus
from gensim.models import LdaModel
import numpy as np
import matplotlib.pyplot as plt
import sys, os


# Draw a document/topic heatmap for a saved gensim LDA model.
#
# Usage: <script> path/to/<corpusname>.lda
# Files read from the model's directory:
#   <corpusname>_doclabels.txt   one document label per line
#   <corpusname>.mm              the bag-of-words corpus (Matrix Market)
# File written to the same directory:
#   <corpusname>_heatmap.png

if len(sys.argv) < 2:
    print("usage: {0} [path to model.lda]\n".format(sys.argv[0]))
    sys.exit(1)


path, file = os.path.split(sys.argv[1])
corpusname = file.split(".")[0]     # text before the first dot names the sibling files


# load labels, corpus and model

# os.path.join keeps the paths relative when the model path has no directory
# part; plain concatenation with "/" would point at the filesystem root.
with open(os.path.join(path, corpusname + "_doclabels.txt"), "r") as f:
    # Strip the line terminator so it does not end up inside the tick labels.
    doc_labels = [line.rstrip("\r\n") for line in f]

corpus = MmCorpus(os.path.join(path, corpusname + ".mm"))
model = LdaModel.load(sys.argv[1])

no_of_topics = model.num_topics
no_of_docs = len(doc_labels)


# get doc-topic matrix (rows: documents, columns: topics)

doc_topic = np.zeros((no_of_docs, no_of_topics))

# zip() with range() stops after no_of_docs rows even if the corpus is longer.
for i, doc in zip(range(no_of_docs), corpus):
    # model[doc] yields (topic_id, topic_prob) tuples for the document's bag of
    # words; topics that are not returned keep probability 0.
    for topic_id, topic_prob in model[doc]:
        doc_topic[i, topic_id] = topic_prob


# get plot labels: the three top terms of every topic

topic_labels = []
for i in range(no_of_topics):
    # The tuple order of show_topic() differs between gensim releases
    # ((prob, word) in older ones, (word, prob) in newer ones), so pick the
    # string element instead of relying on a fixed position.
    topic_terms = [first if isinstance(first, str) else second
                   for first, second in model.show_topic(i, topn=3)]
    topic_labels.append(" ".join(topic_terms))


# cf. https://de.dariah.eu/tatom/topic_model_visualization.html

if no_of_docs > 20 or no_of_topics > 20:
    plt.figure(figsize=(20, 20))    # if many items, enlarge figure
heatmap = plt.pcolor(doc_topic, norm=None, cmap='Reds')
# Cell k spans [k, k+1] on both axes, so +0.5 centres each tick on its cell.
plt.yticks(np.arange(doc_topic.shape[0]) + 0.5, doc_labels)
# rotation must be a number (or 'vertical'/'horizontal'), not the string '90'.
plt.xticks(np.arange(doc_topic.shape[1]) + 0.5, topic_labels, rotation=90)
plt.gca().invert_yaxis()            # first document at the top
plt.colorbar(heatmap)               # the colorbar takes its colormap from the heatmap
plt.tight_layout()

plt.savefig(os.path.join(path, corpusname + "_heatmap.png"))