/
workflow.py
75 lines (64 loc) · 2.24 KB
/
workflow.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
import utils
import lda
import database
import json
import cophi
import numpy as np
import pandas as pd
def wrapper():
    """Run the topic modeling workflow from start to finish.

    Requests the ``corpus``, ``topics``, ``iterations``, ``stopwords`` and
    ``mfw`` entries through ``utils.get_data``, hands the corpus to
    ``database.insert_into`` under ``"textfiles"``, and then runs the four
    workflow steps: preprocessing, model creation, output extraction and
    similarity calculation. The serialized results are finally passed to
    ``database.insert_into`` under ``"model"``.
    """
    params = utils.get_data(
        "corpus", "topics", "iterations", "stopwords", "mfw"
    )
    database.insert_into("textfiles", params["corpus"])

    # Step 1: turn the stored text files into a document-term matrix.
    dtm = preprocess(params)

    # Step 2: fit the topic model with the requested settings.
    model = create_model(dtm, params["topics"], params["iterations"])

    # Step 3: extract topics and the document-topic distribution.
    # The descriptors are returned as well but are not needed here.
    topics, _, doc_topic = get_model_output(model, dtm)

    # Step 4: similarities between topics and between documents.
    topic_sim, doc_sim = get_similarities(doc_topic)

    # Serialize everything to JSON strings without escaping non-ASCII
    # characters, keeping the key order of the stored record stable.
    record = {}
    record["doc_topic"] = doc_topic.to_json(force_ascii=False)
    record["topics"] = json.dumps(topics, ensure_ascii=False)
    record["doc_sim"] = doc_sim.to_json(force_ascii=False)
    record["topic_sim"] = topic_sim.to_json(force_ascii=False)
    database.insert_into("model", record)
def preprocess(data):
    """Build a cleaned document-term matrix from the stored text files.

    Args:
        data: The workflow parameters; forwarded unchanged to
            ``utils.get_stopwords`` together with the corpus.

    Returns:
        The result of ``corpus.drop`` applied to the corpus'
        document-term matrix and the set of features to remove.
    """
    # Assemble the corpus from whatever is selected under "textfiles".
    stored_files = database.select("textfiles")
    corpus = cophi.model.Corpus(utils.get_documents(stored_files))

    # Features to discard: the stopwords plus the corpus' ``hapax`` entries.
    stopword_list = utils.get_stopwords(data, corpus)
    hapax_list = corpus.hapax
    unwanted = set(stopword_list) | set(hapax_list)

    return corpus.drop(corpus.dtm, unwanted)
def create_model(dtm, topics, iterations):
    """Instantiate an ``lda.LDA`` model and fit it.

    Args:
        dtm: Document-term matrix; its ``values`` attribute is what the
            model is fitted on.
        topics: Passed to ``lda.LDA`` as ``n_topics``.
        iterations: Passed to ``lda.LDA`` as ``n_iter``.

    Returns:
        The ``lda.LDA`` instance after ``fit`` has been called on it.
    """
    settings = {"n_topics": topics, "n_iter": iterations}
    fitted = lda.LDA(**settings)
    fitted.fit(dtm.values)
    return fitted
def get_model_output(model, dtm):
    """Collect topics, their descriptors and the document-topic distribution.

    Args:
        model: The topic model, forwarded to the ``utils`` helpers.
        dtm: Document-term matrix; its ``columns`` are passed to
            ``utils.get_topics`` and its ``index`` to
            ``utils.get_doc_topic``.

    Returns:
        A tuple ``(topics, descriptors, doc_topic)`` where the first two
        items are lists and the third is whatever
        ``utils.get_doc_topic`` returns.
    """
    # Materialize both helper results as lists, since they are reused below.
    topic_list = list(utils.get_topics(model, dtm.columns))
    descriptor_list = list(utils.get_topic_descriptors(topic_list))

    # The descriptors are handed on to label the distribution.
    distribution = utils.get_doc_topic(model, dtm.index, descriptor_list)

    return topic_list, descriptor_list, distribution
def get_similarities(doc_topic):
    """Compute topic and document similarities via ``utils.get_cosine``.

    Args:
        doc_topic: Document-topic distribution exposing ``values``,
            ``columns``, ``index`` and ``T``.

    Returns:
        A tuple ``(topic_similarities, document_similarities)``: the
        first is computed from the matrix as given and labelled with its
        columns, the second from the transposed matrix and labelled with
        its index.
    """
    by_topic = utils.get_cosine(doc_topic.values, doc_topic.columns)

    # The document comparison works on the transposed distribution.
    transposed = doc_topic.T
    by_document = utils.get_cosine(transposed.values, doc_topic.index)

    return by_topic, by_document