Update demonstrator
Severin Simmler committed Feb 20, 2017
1 parent 84bbf9b commit cf977a6
Showing 2 changed files with 18 additions and 12 deletions.
28 changes: 17 additions & 11 deletions demonstrator/demonstrator.py
@@ -6,6 +6,8 @@
This module demonstrates the joy of Topic Modeling, wrapped in a user-friendly
web application provided by `DARIAH-DE`_.
Todo: Replace print statements with logging (which is currently not working).
.. _DARIAH-DE:
https://de.dariah.eu
https://github.com/DARIAH-DE
@@ -26,19 +28,20 @@

__author__ = "Severin Simmler"
__email__ = "severin.simmler@stud-mail.uni-wuerzburg.de"
__date__ = "2017-02-17"
__date__ = "2017-02-20"

app = Flask(__name__)

@app.route('/')
def index():
print("Rendering index.html ...")
return render_template('index.html')

@app.route('/upload', methods=['POST'])
def upload_file():
    # Open all files, tokenize, and save in pd.Series():
    files = request.files.getlist('files')
    corpus = pd.Series()
    print("Accessing and tokenizing files ...")
    for file in files:
        filename, extension = os.path.splitext(secure_filename(file.filename))
        if extension == '.txt':
@@ -51,35 +54,37 @@ def upload_file():
text = "".join(text.xpath('.//text()'))
file.flush()
else:
print("File format is not supported.") # Todo: Replace with Flask flash
print("File format is not supported.")
tokens = list(preprocessing.tokenize(text))
label = filename
corpus[label] = tokens
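    # At this point corpus maps each filename (the Series label) to that
    # document's token list.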

    # Create bag-of-words:
    print("Creating bag-of-words model ...")
    id_types, doc_ids = preprocessing.create_dictionaries(corpus.index.tolist(), corpus.tolist())
    sparse_bow = preprocessing.create_mm(corpus.index.tolist(), corpus.tolist(), id_types, doc_ids)
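    # sparse_bow holds the term frequencies indexed by (doc_id, token_id) --
    # inferred from the index levels used for the Matrix Market header below.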

    # Remove stopwords and hapax legomena:
    stopwords = request.files.get('stoplist', None)  # .get() avoids a KeyError if the field is missing
    if stopwords:
        print("Accessing external stopword list and cleaning corpus ...")
        words = stopwords.read().decode('utf-8')
        words = set(preprocessing.tokenize(words))
        clean_term_frequency = preprocessing.remove_features(sparse_bow, id_types, words)
        stopwords.flush()
    else:
        threshold = int(request.form['mfws'])
        print("Accessing", threshold, "most frequent words and cleaning corpus ...")
        stopwords = preprocessing.find_stopwords(sparse_bow, id_types, threshold)
        hapax = preprocessing.find_hapax(sparse_bow, id_types)
        feature_list = set(stopwords).union(hapax)
        clean_term_frequency = preprocessing.remove_features(sparse_bow, id_types, feature_list)
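        # Hapax legomena -- types occurring only once in the corpus -- are
        # presumably dropped here together with the threshold most frequent words.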

    # Create Matrix Market:
    print("Creating matrix market model ...")
    num_docs = max(clean_term_frequency.index.get_level_values("doc_id"))
    num_types = max(clean_term_frequency.index.get_level_values("token_id"))
    sum_counts = sum(clean_term_frequency[0])
    header_string = str(num_docs) + " " + str(num_types) + " " + str(sum_counts) + "\n"
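    # A Matrix Market coordinate file opens with a banner line followed by a size
    # line '<rows> <columns> <entries>'. Strictly, the spec's third field is the
    # number of nonzero entries; sum_counts is the total of all counts, which
    # gensim's streaming reader appears to tolerate (assumption, not verified here).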

print("Saving matrix market model to matrixmarket.mm ...")
with open("matrixmarket.mm", 'w+', encoding = "utf-8") as f:
f.write("%%MatrixMarket matrix coordinate real general\n")
f.write(header_string)
@@ -90,23 +95,24 @@ def upload_file():
    doc2id = {value: key for key, value in doc_ids.items()}
    type2id = {value: key for key, value in id_types.items()}
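    # Inverted mappings (id -> document label, id -> type); type2id is the
    # id-to-word mapping that gensim's id2word parameter expects below.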

    # LDA:
    num_topics = int(request.form['number_topics'])
    passes = int(request.form['passes'])
    print("Training LDA with", num_topics, "topics and", passes, "passes ...")
    model = LdaModel(corpus=mm, id2word=type2id, num_topics=num_topics, passes=passes)
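    # mm is the corpus loaded back from matrixmarket.mm in the lines elided
    # above (presumably via gensim.corpora.MmCorpus).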

    # Visualization:
    print("Visualizing document-topic matrix and saving as heatmap.png ...")
    doc_topic = visualization.create_doc_topic(mm, model, corpus.index.tolist())
    heatmap = visualization.doc_topic_heatmap(doc_topic)
    heatmap.savefig("./static/heatmap.png")
    heatmap.savefig('./static/heatmap.png')

    # Topic-Term-Matrix for HTML (todo: replace by DataFrame.to_html()):
    # Todo: replace by DataFrame.to_html():
    print("Accessing topics for HTML table ...")
    import regex
    pattern = regex.compile(r'\p{L}+\p{P}?\p{L}+')
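    # The regex package (unlike re) supports Unicode properties like \p{L}; the
    # pattern pulls just the word tokens -- runs of letters, optionally with one
    # internal punctuation character -- out of gensim's weighted topic strings,
    # e.g. '0.015*"word" + 0.013*"other"' -> ['word', 'other'] (illustrative).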
    topics = []
    for n, topic in enumerate(model.show_topics()):
        topics.append((n+1, pattern.findall(topic[1])))

    print("Rendering result.html ...")
    return render_template('result.html', topics=topics, documents=corpus.index.tolist())
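# For local testing one would typically start Flask's development server -- a
# minimal sketch (assumed; the actual entry point may live in the truncated
# part of the file):
#
#     if __name__ == '__main__':
#         app.run(debug=True)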

@app.after_request
2 changes: 1 addition & 1 deletion demonstrator/templates/index.html
@@ -111,7 +111,7 @@
<h1>Demonstrator: Topic Modeling</h1>
<div id="contentInner" style="text-align:justify">
<form action="/upload" method="POST" enctype="multipart/form-data">
<p>The text mining technique <b>Topic Modeling</b> has become a popular statistical method for clustering documents. This web application introduces a user-friendly workflow covering data pre-processing, an implementation of the prototypic topic model <b>Latent Dirichlet Allocation</b> (LDA), which learns the relationships between words, topics, and documents, as well as a visualization to explore the trained LDA model.</p>
<p>The text mining technique <b>Topic Modeling</b> has become a popular statistical method for clustering documents. This web application introduces a user-friendly workflow covering data preprocessing, an implementation of the prototypic topic model <b>Latent Dirichlet Allocation</b> (LDA), which learns the relationships between words, topics, and documents, as well as a visualization to explore the trained LDA model.</p>
<h2>1. Preprocessing</h2>
<h3>1.1 Reading a corpus of documents</h3>
<p>Select plain text (<b>.txt</b>) or TEI encoded XML files (<b>.xml</b>).</p>
