Commit a51363d
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
1 parent: 7d72efe
Showing 55 changed files with 267 additions and 192 deletions.
New file, 3 lines added:

```python
from application import web
from application import gui
from application import utils
```
4 binary files not shown.
New file, 22 lines added:

```python
#!/usr/bin/env python3

import pathlib
import sys

import flask


def create_app(**kwargs):
    """
    Creates a Flask app and determines the path for bokeh resources. If the
    scripts were frozen with PyInstaller, the paths are adjusted accordingly.
    """
    if getattr(sys, 'frozen', False):
        # PyInstaller unpacks bundled data files into a temporary directory
        # and stores its path in sys._MEIPASS.
        root = pathlib.Path(sys._MEIPASS)
        app = flask.Flask(import_name=__name__,
                          template_folder=str(pathlib.Path(root, 'templates')),
                          static_folder=str(pathlib.Path(root, 'static')),
                          **kwargs)
        bokeh_resources = str(pathlib.Path(root, 'static', 'bokeh_templates'))
    else:
        app = flask.Flask(import_name=__name__, **kwargs)
        bokeh_resources = str(pathlib.Path('static', 'bokeh_templates'))
    return app, bokeh_resources
```
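The frozen-vs.-unfrozen branch above can be exercised on its own. This is a minimal sketch of the same PyInstaller check; the plain `'.'` fallback root is an illustration (the real code lets Flask fall back to its default folders):

```python
import pathlib
import sys


def resource_root():
    # Frozen PyInstaller bundles set sys.frozen and unpack their data files
    # into a temporary directory exposed as sys._MEIPASS; a plain interpreter
    # has neither attribute, so getattr() defaults to False.
    if getattr(sys, 'frozen', False):
        return pathlib.Path(sys._MEIPASS)
    return pathlib.Path('.')


print(resource_root() / 'static' / 'bokeh_templates')
```

Run under a normal interpreter, this resolves resources relative to the working directory; inside a frozen bundle, the same call resolves them inside the unpacked bundle directory.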
New file, 56 lines added:

```python
#!/usr/bin/env python3

import sys
import pathlib

import PyQt5.QtGui
import PyQt5.QtWidgets
import PyQt5.QtWebEngineWidgets
import PyQt5.QtCore

import application.web


PORT = 5000
ROOT_URL = 'http://localhost:{port}'.format(port=PORT)


class FlaskThread(PyQt5.QtCore.QThread):
    """Runs the Flask application in a background thread so the Qt event
    loop in the main thread is not blocked."""

    def __init__(self, application):
        PyQt5.QtCore.QThread.__init__(self)
        self.application = application

    def __del__(self):
        self.wait()

    def run(self):
        self.application.run(port=PORT)


def provide_gui(application):
    """
    Opens a QtWebEngine window, runs the Flask application, and renders the
    index.html page.
    """
    title = 'Topics Explorer'
    icon = str(pathlib.Path('static', 'img', 'page_icon.png'))
    width = 1200
    height = 660

    qtapp = PyQt5.QtWidgets.QApplication(sys.argv)

    # Serve the Flask app in the background.
    webapp = FlaskThread(application)
    webapp.start()

    # Stop the server thread when the window is closed.
    qtapp.aboutToQuit.connect(webapp.terminate)

    webview = PyQt5.QtWebEngineWidgets.QWebEngineView()
    webview.resize(width, height)
    webview.setWindowTitle(title)
    webview.setWindowIcon(PyQt5.QtGui.QIcon(icon))

    webview.load(PyQt5.QtCore.QUrl(ROOT_URL))
    webview.show()
    return qtapp.exec_()


def run():
    # Expects application.web to expose the Flask app as a module-level `app`.
    sys.exit(provide_gui(application.web.app))
```
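`FlaskThread` exists so the blocking Flask server does not stall the Qt event loop. The same pattern can be sketched with only the standard library, with `http.server` standing in for Flask and a plain daemon thread standing in for `QThread`:

```python
import http.server
import threading
import urllib.request

# Run a blocking HTTP server in a background daemon thread, as FlaskThread
# does for the Flask app, leaving the main thread free for the GUI loop.
server = http.server.HTTPServer(('127.0.0.1', 0),
                                http.server.SimpleHTTPRequestHandler)
thread = threading.Thread(target=server.serve_forever, daemon=True)
thread.start()

# The main thread can now talk to the server, just as the QWebEngineView
# loads ROOT_URL from the Flask thread.
with urllib.request.urlopen('http://127.0.0.1:%d/' % server.server_port) as response:
    status = response.status

server.shutdown()
print(status)
```

Using port `0` asks the OS for a free ephemeral port; the real application instead fixes `PORT = 5000` so the web view knows which URL to load.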
40 files renamed without changes.
New file, 68 lines added:

```html
{% extends "layout.html" %}
{% block navigation %}
<ul class="nav pull-right">
  <li>
    <a href="{{ url_for('help') }}"><i class="icon-question-sign icon-white"></i> Help</a>
  </li>
</ul>
{% endblock %}
{% block content %}
<h1>Topics – Easy Topic Modeling</h1>
<div id="contentInner">
  <form action="/modeling" method="POST" enctype="multipart/form-data">
    <p>The text mining technique <b>Topic Modeling</b> has become a popular statistical method for clustering documents. This application introduces a user-friendly workflow comprising data preprocessing, the actual modeling using <b>latent Dirichlet allocation</b> (LDA), and various interactive visualizations to explore the model.</p>
    <p>LDA, introduced in the context of text analysis in 2003, is an instance of a more general class of models called <b>mixed-membership models</b>. It involves a number of distributions and parameters; inference is typically performed using <b>Gibbs sampling</b> with conjugate priors, and the model is based purely on word frequencies.</p>
    <div class="alert alert-block">
      <button type="button" class="close" data-dismiss="alert">×</button>
      <i class="fa fa-exclamation-circle"></i> This application is designed to introduce the technique in a gentle way and aims for simplicity. If you have a <b>very large corpus</b> (say, more than 200 documents with more than 5,000 words per document), you may wish to use more sophisticated models such as those implemented in <b>MALLET</b>, which is known to be more robust than standard LDA. Have a look at our Jupyter notebook introducing topic modeling with MALLET, available via <a href="https://github.com/DARIAH-DE/Topics">GitHub</a>.
    </div>
    <br>
    <h2>1. Preprocessing</h2>
    <p>A lot of information that is harmful, at least for LDA, hides in your raw text collection. This is why preprocessing is a crucial step in this workflow, and in <i>natural language processing</i> in general. First of all, your corpus will be <b>tokenized</b>. This is the process of splitting a text into individual words (so-called <i>tokens</i>). Token frequencies are typical units of analysis when working with text corpora. It may come as a surprise that reducing a book to a list of token frequencies retains useful information, but practice has shown this to be the case. Normally, the most frequent tokens of a document tend to be <b>semantically insignificant words</b> (like <i>the</i> or <i>and</i>, for instance). Because you are trying to uncover hidden semantic structures of a text collection, you have to get rid of those insignificant words before modeling. This is done during preprocessing.</p>
    <h3>1.1. Reading a Corpus of Documents</h3>
    <p>For this workflow, you will need a corpus (a set of texts) as plain text (<b>.txt</b>) or XML (<b>.xml</b>). TEI-encoded XML is fully supported; only the text part will be processed. Use the button below to select multiple text files. To gain better results, <b>choose at least five documents</b> (but the more the better).</p>
    <div class="alert alert-info">
      <button type="button" class="close" data-dismiss="alert">×</button>
      <b>Tip:</b> The <a href="https://textgridrep.org">TextGrid Repository</a> is a great place to start searching for text data. It is Open Access and provides a lot of literary texts in valid and well-formed TEI XML.
    </div>
    <input type="file" name="files" accept=".txt, .xml" multiple required/><br><br>
    <h3>1.2. Tokenization</h3>
    <p>An important preprocessing step is tokenization. Without identifying tokens, it is difficult to extract necessary information such as token frequencies in general, or <b>most frequent words</b>, also known as <i>stopwords</i>. In this application, one token consists of one or more characters, optionally followed by exactly one punctuation mark (such as a hyphen or apostrophe), followed by one or more characters. For example, the phrase “her father's arm-chair” will be tokenized as <code>["her", "father's", "arm-chair"]</code>.</p>
    <h3>1.3. Cleaning the Corpus</h3>
    <p>Stopwords are harmful for LDA and have to be removed from the corpus. If you want to <b>determine stopwords individually</b> based on your corpus, define a threshold for the most frequent words in the field below.</p>
    <div class="alert alert-info">
      <button type="button" class="close" data-dismiss="alert">×</button>
      <b>Tip:</b> Be careful with removing most frequent words – you might remove words that are quite important for LDA. To gain better results, it is highly recommended to use an <b>external stopwords list</b>. This application ships with stopword lists for English, German, Spanish, and French.
    </div>
    <input type="number" name="mfw_threshold" value="150" min="1">
    <p>Alternatively, upload your own list of tokens to remove here:</p>
    <input type="file" name="stopword_list"><br><br>
    <h2>2. Modeling</h2>
    <p>This workflow relies on an implementation by Allen Riddell, which is lightweight, fast, and provides basic LDA. You have to specify some <b>model parameters</b> in this section, first of all the number of topics. The best number depends on what you are looking for in the model. The default provides a <b>broad overview</b> of the contents of the corpus. The number of topics should also depend to some degree on the size of the text collection; 100 to 200 topics will produce reasonably <b>fine-grained results</b>.</p>
    <input type="number" name="num_topics" value="10" min="1" required>
    <p>LDA is an iterative sampling procedure. The number of sampling iterations is a <b>trade-off</b> between the time taken to complete sampling and the quality of the model. The default value produces quite good results, but feel free to increase the number of iterations.</p>
    <input type="number" name="num_iterations" value="200" min="10" required><br>
    <br>
    <h2>3. Visualizing</h2>
    <p>When using LDA to explore text collections, we are typically interested in examining texts in terms of their <b>constituent topics</b> (instead of word frequencies). Because the number of topics is so much smaller than the number of unique vocabulary elements (say, 10 versus 10,000), a range of data visualization methods becomes available. As you will see, all of the provided visualizations are <b>interactive</b>.</p>
    <br>
    <div class="center_button">
      <button class="button" type="submit"><b>Train<br>Topic Model</b></button>
    </div>
    <br>
  </form>
</div>
{% endblock %}
```
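The tokenization rule the template describes (one or more characters, optionally exactly one internal punctuation mark, then one or more characters) can be sketched as a regular expression. This pattern is an illustration, not necessarily the one the application actually uses:

```python
import re

# One or more word characters, optionally followed by exactly one internal
# apostrophe or hyphen plus one or more further word characters.
TOKEN = re.compile(r"\w+(?:['-]\w+)?")


def tokenize(text):
    """Split text into lowercase tokens, keeping internal ' and -."""
    return TOKEN.findall(text.lower())


print(tokenize("Her father's arm-chair"))
# ['her', "father's", 'arm-chair']
```

Because the punctuation mark must be followed by more word characters, trailing punctuation such as sentence-final periods is dropped rather than attached to the preceding token.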
3 files renamed without changes.