Commit a51363d
Refactoring
severinsimmler committed Apr 15, 2018
1 parent 7d72efe
Showing 55 changed files with 267 additions and 192 deletions.
3 changes: 3 additions & 0 deletions application/__init__.py
@@ -0,0 +1,3 @@
from application import web
from application import gui
from application import utils
Binary file added application/__pycache__/__init__.cpython-36.pyc
Binary file added application/__pycache__/gui.cpython-36.pyc
Binary file added application/__pycache__/utils.cpython-36.pyc
Binary file added application/__pycache__/web.cpython-36.pyc
22 changes: 22 additions & 0 deletions application/config.py
@@ -0,0 +1,22 @@
#!/usr/bin/env python3

import pathlib
import sys
import flask

def create_app(**kwargs):
    """
    Creates a Flask app and determines the path for bokeh resources. If the
    scripts were frozen with PyInstaller, the paths are adjusted accordingly.
    """
    if getattr(sys, 'frozen', False):
        root = pathlib.Path(sys._MEIPASS)
        app = flask.Flask(import_name=__name__,
                          template_folder=str(pathlib.Path(root, 'templates')),
                          static_folder=str(pathlib.Path(root, 'static')),
                          **kwargs)
        bokeh_resources = str(pathlib.Path(root, 'static', 'bokeh_templates'))
    else:
        app = flask.Flask(import_name=__name__, **kwargs)
        bokeh_resources = str(pathlib.Path('static', 'bokeh_templates'))
    return app, bokeh_resources
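
A minimal sketch of how create_app might be consumed by the web module (application/web.py is not shown in this commit, so the names below are assumptions):

# Hypothetical consumer of create_app; application/web.py is not part of this excerpt.
import flask
from application.config import create_app

app, bokeh_resources = create_app()

@app.route('/')
def index():
    return flask.render_template('index.html')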
56 changes: 56 additions & 0 deletions application/gui.py
@@ -0,0 +1,56 @@
#!/usr/bin/env python3

import sys
import pathlib
import PyQt5.QtGui
import PyQt5.QtWidgets
import PyQt5.QtWebEngineWidgets
import PyQt5.QtCore
import application.web


PORT = 5000
ROOT_URL = 'http://localhost:{port}'.format(port=PORT)


class FlaskThread(PyQt5.QtCore.QThread):
    """Runs the Flask application in a Qt thread alongside the GUI event loop."""

    def __init__(self, application):
        PyQt5.QtCore.QThread.__init__(self)
        self.application = application

    def __del__(self):
        self.wait()

    def run(self):
        self.application.run(port=PORT)


def provide_gui(application):
    """
    Opens a QtWebEngine window, runs the Flask application, and renders the
    index.html page.
    """
    title = 'Topics Explorer'
    icon = str(pathlib.Path('static', 'img', 'page_icon.png'))
    width = 1200
    height = 660

    qtapp = PyQt5.QtWidgets.QApplication(sys.argv)

    webapp = FlaskThread(application)
    webapp.start()

    qtapp.aboutToQuit.connect(webapp.terminate)

    webview = PyQt5.QtWebEngineWidgets.QWebEngineView()
    webview.resize(width, height)
    webview.setWindowTitle(title)
    webview.setWindowIcon(PyQt5.QtGui.QIcon(icon))

    webview.load(PyQt5.QtCore.QUrl(ROOT_URL))
    webview.show()
    return qtapp.exec_()


def run():
    sys.exit(provide_gui(application.web.app))
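
Presumably the desktop app is then launched from a small entry-point script along these lines (a sketch; the actual launcher is not part of this diff):

# Hypothetical entry point, e.g. topicsexplorer.py at the repository root.
import application.gui

if __name__ == '__main__':
    application.gui.run()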
124 changes: 10 additions & 114 deletions webapp.py → application/modeling.py 100755 → 100644
@@ -1,70 +1,10 @@
#!/usr/bin/env python3

import pathlib
import time
import sys
import shutil
import logging
import tempfile
import utils
import dariah_topics
import flask
import pandas as pd
import numpy as np
import bokeh.plotting
import bokeh.embed
import werkzeug.utils


TEMPDIR = tempfile.mkdtemp() # Storing logfile, dumping temporary data, etc.
NUM_KEYS = 8 # The number of topic keys for the topics table
# These messages are displayed during modeling:
INFO_2A = "FYI: This might take a while..."
INFO_3A = "In the meanwhile, have a look at"
INFO_4A = "our Jupyter notebook introducing"
INFO_5A = "topic modeling with MALLET."


if getattr(sys, 'frozen', False):
# If the script is frozen by PyInstaller
root = pathlib.Path(sys._MEIPASS)
app = flask.Flask(import_name=__name__,
template_folder=str(pathlib.Path(root, 'templates')),
static_folder=str(pathlib.Path(root, 'static')))
bokeh_resources = str(pathlib.Path(root, 'static', 'bokeh_templates'))
else:
app = flask.Flask(import_name=__name__)
bokeh_resources = str(pathlib.Path('static', 'bokeh_templates'))


@app.route('/')
def index():
"""
Renders the main page. A warning pops up if the machine is not
connected to the internet.
"""
if utils.is_connected():
return flask.render_template('index.html')
else:
return flask.render_template('index.html', internet='warning')


@app.route('/help')
def help():
"""
Renders the help page.
"""
return flask.render_template('help.html')


@app.route('/modeling', methods=['POST'])
def modeling():
"""
Streams the modeling page, printing useful information to the screen.
The generated data will be dumped into the TEMPDIR (specified above).
"""
@flask.stream_with_context
def create_model():
start = time.time()
try:
user_input = {'files': flask.request.files.getlist('files'),
@@ -90,7 +30,7 @@ def create_model():
if filename.suffix == '.txt':
text = file.read().decode('utf-8')
else:
text = utils.process_xml(file)
text = application.utils.process_xml(file)
tokens = list(dariah_topics.preprocessing.tokenize(text))
tokenized_corpus[filename.stem] = tokens
parameter['Corpus size (raw), in tokens'] += len(tokens)
@@ -140,15 +80,15 @@ def create_model():
INFO_5B = INFO_5B.format(parameter['Number of topics'])

yield "running", "Initializing LDA topic model ...", INFO_2B, INFO_3B, INFO_4B, INFO_5B
model = utils.enthread(target=utils.lda_modeling,
model = application.utils.enthread(target=application.utils.lda_modeling,
args=(document_term_arr,
user_input['num_topics'],
user_input['num_iterations'],
TEMPDIR))
while True:
# During modeling the logfile is read continuously and the newest
# line is sent to the browser as information for the user:
msg = utils.read_logfile(str(pathlib.Path(TEMPDIR, 'topicmodeling.log')))
msg = application.utils.read_logfile(str(pathlib.Path(TEMPDIR, 'topicmodeling.log')))
if msg is None:
# When modeling is done, get the model:
model = model.get()
@@ -161,8 +101,8 @@ def create_model():
yield "running", "Accessing topics ...", INFO_2B, INFO_3B, INFO_4B, INFO_5B
topics = dariah_topics.postprocessing.show_topics(model=model,
vocabulary=vocabulary,
num_keys=NUM_KEYS)
topics.columns = ['Key {0}'.format(i) for i in range(1, NUM_KEYS + 1)]
num_keys=8)
topics.columns = ['Key {0}'.format(i) for i in range(1, 9)]
topics.index = ['Topic {0}'.format(i) for i in range(1, user_input['num_topics'] + 1)]

yield "running", "Accessing document topics distributions ...", INFO_2B, INFO_3B, INFO_4B, INFO_5B
@@ -195,7 +135,7 @@ def create_model():

heatmap_script, heatmap_div = bokeh.embed.components(heatmap)

corpus_boxplot = utils.boxplot(corpus_stats)
corpus_boxplot = application.utils.boxplot(corpus_stats)
corpus_boxplot_script, corpus_boxplot_div = bokeh.embed.components(corpus_boxplot)
bokeh.plotting.output_file(str(pathlib.Path(TEMPDIR, 'corpus_statistics.html')))
bokeh.plotting.save(corpus_boxplot)
@@ -204,7 +144,7 @@ def create_model():
height = 10 * 18
else:
height = document_topics.shape[1] * 18
topics_barchart = utils.barchart(document_topics, height=height, topics=topics)
topics_barchart = application.utils.barchart(document_topics, height=height, topics=topics)
topics_script, topics_div = bokeh.embed.components(topics_barchart)
bokeh.plotting.output_file(str(pathlib.Path(TEMPDIR, 'topics_barchart.html')))
bokeh.plotting.save(topics_barchart)
@@ -213,7 +153,7 @@ def create_model():
height = 10 * 18
else:
height = document_topics.shape[0] * 18
documents_barchart = utils.barchart(document_topics.T, height=height)
documents_barchart = application.utils.barchart(document_topics.T, height=height)
documents_script, documents_div = bokeh.embed.components(documents_barchart)
bokeh.plotting.output_file(str(pathlib.Path(TEMPDIR, 'document_topics_barchart.html')))
bokeh.plotting.save(documents_barchart)
@@ -251,51 +191,7 @@ def create_model():
'corpus_boxplot_script': corpus_boxplot_script,
'corpus_boxplot_div': corpus_boxplot_div,
'cwd': cwd}
utils.compress(data, str(pathlib.Path(TEMPDIR, 'data.pickle')))
application.utils.compress(data, str(pathlib.Path(TEMPDIR, 'data.pickle')))
yield 'done', '', '', '', '', ''
except Exception as error:
yield 'error', str(error), '', '', '', ''

progress = create_model()

def stream_template(template_name, **context):
app.update_template_context(context)
t = app.jinja_env.get_template(template_name)
return t.stream(context)
return flask.Response(stream_template('modeling.html', info=progress))
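
The pattern above (a generator wrapped in flask.stream_with_context, rendered through a streamed Jinja template) pushes each yielded status tuple to the browser as soon as it is produced. A minimal self-contained sketch of the same idea, with hypothetical names independent of this app:

import time

import flask

app = flask.Flask(__name__)

@app.route('/progress')
def progress():
    @flask.stream_with_context
    def generate():
        for step in ('preprocessing', 'modeling', 'visualizing', 'done'):
            time.sleep(1)        # stand-in for the actual work
            yield step + '\n'    # flushed to the client incrementally
    return flask.Response(generate(), mimetype='text/plain')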


@app.route('/model')
def model():
"""
Reads the dumped data and renders the output page.
"""
data_path = str(pathlib.Path(TEMPDIR, 'data.pickle'))
parameter_path = str(pathlib.Path(TEMPDIR, 'parameter.csv'))
topics_path = str(pathlib.Path(TEMPDIR, 'topics.csv'))

data = utils.decompress(data_path)
parameter = pd.read_csv(parameter_path, index_col=0, encoding='utf-8')
parameter.columns = [''] # remove column names
topics = pd.read_csv(topics_path, index_col=0, encoding='utf-8')

data['parameter'] = [parameter.to_html(classes='parameter', border=0)]
data['topics'] = [topics.to_html(classes='topics')]
return flask.render_template('model.html', **data)


@app.after_request
def add_header(r):
"""
Handles the cache.
"""
r.headers['Cache-Control'] = 'no-cache, no-store, must-revalidate'
r.headers['Pragma'] = 'no-cache'
r.headers['Expires'] = '0'
r.headers['Cache-Control'] = 'public, max-age=0'
return r


if __name__ == '__main__':
app.debug = True
app.run()
36 files renamed without changes.
68 changes: 68 additions & 0 deletions application/templates/index.html
@@ -0,0 +1,68 @@
{% extends "layout.html" %}
{% block navigation %}
<ul class="nav pull-right">
<li>
<a href="{{ url_for('help') }}"><i class="icon-question-sign icon-white"></i> Help</a>
</li>
</ul>
{% endblock %}
{% block content %}
<h1>Topics – Easy Topic Modeling</h1>
<div id="contentInner">
<form action="/modeling" method="POST" enctype="multipart/form-data">
<p>The text mining technique <b>Topic Modeling</b> has become a popular statistical method for clustering documents. This application introduces a user-friendly workflow that covers data preprocessing, the actual modeling using <b>latent Dirichlet allocation</b> (LDA), and various interactive visualizations to explore the model.</p>
<p>LDA, introduced in the context of text analysis in 2003, is an instance of a more general class of models called <b>mixed-membership models</b>. It involves a number of distributions and parameters; inference is typically performed using <b>Gibbs sampling</b> with conjugate priors, and it is based purely on word frequencies.</p>
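<!-- Illustrative note (an assumption added for clarity, not part of the app): the collapsed Gibbs sampler
     commonly used for LDA resamples the topic assignment z_i of each token according to
         p(z_i = k | z_-i, w)  ∝  (n_(d,k) + alpha) * (n_(k,w_i) + beta) / (n_k + V * beta),
     where n_(d,k) counts tokens in document d assigned to topic k, n_(k,w) counts assignments
     of word w to topic k, n_k is the total number of tokens assigned to topic k, V is the
     vocabulary size, and all counts exclude the current token i. -->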
<div class="alert alert-block">
<button type="button" class="close" data-dismiss="alert">&times;</button>
<i class="fa fa-exclamation-circle"></i> This application is designed to introduce the technique in a gentle way and aims for simplicity. If you have a <b>very large corpus</b> (let's say more than 200 documents with more than 5000 words per document),
you may wish to use more sophisticated models such as those implemented in <b>MALLET</b>, which is known to be more robust than standard LDA. Have a look at our Jupyter notebook introducing topic modeling with MALLET, available via <a href="https://github.com/DARIAH-DE/Topics">GitHub</a>.
</div>
<br>
<h2>1. Preprocessing</h2>
<p>A lot of information that is harmful, at least for LDA, hides in your raw text collection. This is why preprocessing is a crucial step in this workflow, and in <i>natural language processing</i> in general. First of all, your corpus will
be <b>tokenized</b>. This is the process of splitting a text into individual words (so-called <i>tokens</i>). Token frequencies are typical units of analysis when working with text corpora. It may come as a surprise that reducing a book to a list
of token frequencies retains useful information, but practice has shown this to be the case. Normally, the most frequent tokens of a document tend to be <b>semantically insignificant words</b> (like <i>the</i> or <i>and</i>, for instance). Because
you are trying to uncover the hidden semantic structures of a text collection, you have to get rid of those insignificant words before modeling. This is done during preprocessing.</p>
<h3>1.1. Reading a Corpus of Documents</h3>
<p>For this workflow, you will need a corpus (a set of texts) as plain text (<b>.txt</b>) or XML (<b>.xml</b>). TEI-encoded XML is fully supported; only the text part will be processed. Use the button below to select multiple text files. For better results,
<b>choose at least five documents</b> (the more the better).
<div class="alert alert-info">
<button type="button" class="close" data-dismiss="alert">&times;</button>
<b>Tip:</b> The <a href="https://textgridrep.org">TextGrid Repository</a> is a great place to start searching for text data. It's Open Access and provides a lot of literary texts in valid and well-formed TEI XML.
</div>
</p>
<input type="file" name="files" accept=".txt, .xml" multiple required/><br><br>
<h3>1.2. Tokenization</h3>
<p>An important preprocessing step is tokenization. Without identifying tokens, it is difficult to extract necessary information, such as token frequencies in general, or <b>most frequent words</b>, also known as <i>stopwords</i>. In this application,
one token consists of one or more characters, optionally followed by exactly one punctuation character (a hyphen or an apostrophe, for instance) and one or more further characters. For example, the phrase “her father's arm-chair” will be tokenized as <code>["her", "father's", "arm-chair"]</code>.</p>
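<!-- Illustrative sketch (an assumption, not the app's actual code): the rule described above
     corresponds roughly to this regular expression; the real implementation is
     dariah_topics.preprocessing.tokenize.
         import re
         def tokenize(text):
             return re.findall(r"\w+(?:[-']\w+)?", text.lower())
         tokenize("her father's arm-chair")  # ["her", "father's", "arm-chair"]
-->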
<h3>1.3. Cleaning the Corpus</h3>
<p>Stopwords are harmful for LDA and have to be removed from the corpus. In case you want to <b>determine stopwords individually</b> based on your corpus, define a threshold for most frequent words in the line below.</p>
<div class="alert alert-info">
<button type="button" class="close" data-dismiss="alert">&times;</button>
<b>Tip:</b> Be careful with removing most frequent words – you might remove words that are quite important for LDA. To gain better results, it is highly recommended to use an <b>external stopwords list</b>. This application ships with stopword
lists for English, German, Spanish, and French.
</div>
<input type="number" name="mfw_threshold" value="150" min="1">
<p>Alternatively, upload your own tokens-to-remove list here:</p>
<input type="file" name="stopword_list"><br><br>
<h2>2. Modeling</h2>
<p>In this workflow, we rely on an implementation by Allen Riddell, which is lightweight, fast, and provides basic LDA. You have to specify some <b>model parameters</b> in this section, first of all the number of topics. The best number depends
on what you are looking for in the model. The default will provide a <b>broad overview</b> of the contents of the corpus. The number of topics should also depend to some degree on the size of the text collection, but 100 to 200 topics will produce reasonably
<b>fine-grained results</b>.</p>
<input type="number" name="num_topics" value="10" min="1" required>
<p>LDA works iteratively: the sampler repeats the same inference step over and over, refining the model with each pass. The number of sampling iterations should therefore be a <b>trade-off</b> between the time taken to complete sampling and the quality of
the model. The default value produces quite good results, but feel free to increase the number of iterations.</p>
<input type="number" name="num_iterations" value="200" min="10" required><br>
<br>
<h2>3. Visualizing</h2>
<p>When using LDA to explore text collections, we are typically interested in examining texts in terms of their <b>constituent topics</b> (instead of word frequencies). Because the number of topics is so much smaller than the number of unique vocabulary
elements (say, 10 versus 10,000), a range of data visualization methods becomes available. As you will see, all of the provided visualizations are <b>interactive</b>.</p>
<br>
<div class="center_button">
<button class="button" type="submit"><b>Train<br>Topic Model</b></button>
</div>
<br>
</form>
</div>
{% endblock %}

3 files renamed without changes.
14 changes: 14 additions & 0 deletions utils.py → application/utils.py
@@ -220,3 +220,17 @@ def is_connected(host='8.8.8.8', port=53, timeout=3):
        return True
    except:
        return False


def load_data(TEMPDIR):
    """
    Reads the data dumped during modeling and prepares it for rendering.
    """
    data_path = str(pathlib.Path(TEMPDIR, 'data.pickle'))
    parameter_path = str(pathlib.Path(TEMPDIR, 'parameter.csv'))
    topics_path = str(pathlib.Path(TEMPDIR, 'topics.csv'))

    data = decompress(data_path)  # decompress() is defined earlier in this module
    parameter = pd.read_csv(parameter_path, index_col=0, encoding='utf-8')
    parameter.columns = ['']  # remove column names
    topics = pd.read_csv(topics_path, index_col=0, encoding='utf-8')

    data['parameter'] = [parameter.to_html(classes='parameter', border=0)]
    data['topics'] = [topics.to_html(classes='topics')]
    return data
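
A sketch of how a route might consume load_data, mirroring the /model route removed from webapp.py above (application/web.py itself is not shown in this commit):

# Hypothetical route in application/web.py, shown for illustration only.
@app.route('/model')
def model():
    data = load_data(TEMPDIR)
    return flask.render_template('model.html', **data)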
