Refactoring
severinsimmler committed Apr 16, 2018
1 parent 4d5d433 commit b15e385
Showing 4 changed files with 24 additions and 22 deletions.
13 changes: 10 additions & 3 deletions application/modeling.py
@@ -7,6 +7,7 @@
import time
import flask
import shutil
import sys
import numpy as np
import pandas as pd
import bokeh.plotting
@@ -84,9 +85,11 @@ def workflow(tempdir, bokeh_resources):
yield "running", "Removing stopwords and hapax legomena from corpus ...", INFO_2A, INFO_3A, INFO_4A, INFO_5A
try:
stopwords = dariah_topics.preprocessing.find_stopwords(document_term_matrix, user_input['mfw'])
cleaning = "removed the <b>{0} most frequent words</b>, based on a threshold".format(len(stopwords))
except KeyError:
stopwords = user_input['stopwords'].read().decode('utf-8')
stopwords = dariah_topics.preprocessing.tokenize(stopwords)
cleaning = "removed the <b>{0} most frequent words</b>, based on an external stopwords list".format(len(stopwords))
hapax_legomena = dariah_topics.preprocessing.find_hapax_legomena(document_term_matrix)
features = set(stopwords).union(hapax_legomena)
features = [token for token in features if token in document_term_matrix.columns]
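The hunk above combines the most frequent words (or an external stopwords list) with hapax legomena into one feature set before dropping them. A minimal sketch of the same idea on a plain pandas document-term matrix; the function names mirror the `dariah_topics` calls but are illustrative reimplementations, not the library's API:

```python
import pandas as pd

def find_stopwords(dtm, mfw):
    """Return the mfw most frequent tokens, by corpus-wide column sums."""
    return dtm.sum(axis=0).sort_values(ascending=False).head(mfw).index.tolist()

def find_hapax_legomena(dtm):
    """Return tokens that occur exactly once in the whole corpus."""
    totals = dtm.sum(axis=0)
    return totals[totals == 1].index.tolist()

# Toy document-term matrix: two documents, three token types.
dtm = pd.DataFrame(
    {'the': [5, 7], 'topic': [2, 3], 'zeugma': [1, 0]},
    index=['doc1', 'doc2'])

features = set(find_stopwords(dtm, 1)).union(find_hapax_legomena(dtm))
features = [token for token in features if token in dtm.columns]
cleaned = dtm.drop(features, axis=1)
```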
@@ -110,7 +113,7 @@ def workflow(tempdir, bokeh_resources):
INFO_2B = "You have selected {0} text files,"
INFO_3B = "containing {0} tokens,"
INFO_4B = "and {0} unique types"
INFO_5B = "to discover {0} topics."
INFO_5B = "to uncover {0} topics."
INFO_2B = INFO_2B.format(parameter['Corpus size, in documents'])
INFO_3B = INFO_3B.format(parameter['Corpus size (raw), in tokens'])
INFO_4B = INFO_4B.format(parameter['Size of vocabulary, in tokens'])
@@ -214,10 +217,14 @@ def workflow(tempdir, bokeh_resources):
topics.to_csv(str(pathlib.Path(tempdir, 'topics.csv')), encoding='utf-8')
document_topics.to_csv(str(pathlib.Path(tempdir, 'document_topics.csv')), encoding='utf-8')
parameter.to_csv(str(pathlib.Path(tempdir, 'parameter.csv')), encoding='utf-8')
cwd = str(pathlib.Path(*pathlib.Path.cwd().parts[:-1]))
if getattr(sys, 'frozen', False):
cwd = str(pathlib.Path(*pathlib.Path.cwd().parts[:-1]))
else:
cwd = str(pathlib.Path.cwd())
shutil.make_archive(str(pathlib.Path(cwd, 'topicmodeling')), 'zip', tempdir)

data = {'heatmap_script': heatmap_script,
data = {'cleaning': cleaning,
'heatmap_script': heatmap_script,
'heatmap_div': heatmap_div,
'topics_script': topics_script,
'topics_div': topics_div,
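The new branch distinguishes a frozen executable (PyInstaller and similar bundlers set `sys.frozen`) from a normal interpreter run. A self-contained sketch of the check, with the directory choice factored into a hypothetical helper:

```python
import pathlib
import sys

def output_directory():
    """Pick the directory the ZIP archive should land in.

    Frozen executables set sys.frozen; there, the process runs one
    level below the directory the user cares about, hence the parent
    path in that branch. Otherwise the plain working directory is used.
    """
    if getattr(sys, 'frozen', False):
        return str(pathlib.Path(*pathlib.Path.cwd().parts[:-1]))
    return str(pathlib.Path.cwd())
```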
5 changes: 2 additions & 3 deletions application/templates/index.html
@@ -11,16 +11,15 @@ <h1>Topics – Easy Topic Modeling</h1>
<div id="contentInner">
<form action="/modeling" method="POST" enctype="multipart/form-data">
<p>The text mining technique <b>Topic Modeling</b> has become a popular statistical method for clustering documents. This application introduces a user-friendly workflow, basically containing data preprocessing, the actual modeling using <b>latent Dirichlet allocation</b> (LDA), as well as various interactive visualizations to explore the model.</p>
<p>LDA, introduced in the context of text analysis in 2003, is an instance of a more general class of models called <b>mixed-membership models</b>. Involving a number of distributions and parameters, the model is typically performed using <b>Gibbs sampling</b> with conjugate priors and is purely based on word frequencies.</p>
<p>LDA, introduced in the context of text analysis in 2003, is an instance of a more general class of models called <b>mixed-membership models</b>. Involving a number of distributions and parameters, inference for the model is typically performed using <b>Gibbs sampling</b> with conjugate priors, and is purely based on word frequencies.</p>
<div class="alert alert-block">
<button type="button" class="close" data-dismiss="alert">&times;</button>
<i class="fa fa-exclamation-circle"></i> This application is designed to introduce the technique in a gentle way and aims for simplicity. If you have a <b>very large corpus</b> (let's say more than 200 documents with more than 5000 words per document),
you may wish to use more sophisticated models such as those implemented in <b>MALLET</b>, which is known to be more robust than standard LDA. Have a look at our Jupyter notebook introducing topic modeling with MALLET, available via <a href="https://github.com/DARIAH-DE/Topics">GitHub</a>.
</div>
<br>
<h2>1. Preprocessing</h2>
<p>A lot of harmful information, at least harmful for LDA, is sticking in your raw text collection. This is why preprocessing is a very crucial step for this workflow, and for <i>natural language processing</i> in general. First of all, your corpus will
be <b>tokenized</b>. This is the process of splitting a text into individual words (so-called <i>tokens</i>). Token frequencies are typical units of analysis when working with text corpora. It may come as a surprise that reducing a book to a list
<p>First of all, your corpus will be <b>tokenized</b>. This is the process of splitting a text into individual words (so-called <i>tokens</i>). Token frequencies are typical units of analysis when working with text corpora. It may come as a surprise that reducing a book to a list
of token frequencies retains useful information, but practice has shown this to be the case. Normally, the most frequent tokens of a document tend to be <b>semantically insignificant words</b> (like <i>the</i> or <i>and</i>, for instance). Because
you are trying to uncover hidden semantic structures of a text collection, you have to get rid of those insignificant words before modeling. This will be done while preprocessing.</p>
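The tokenization step described above can be sketched with a simple regular expression (the actual dariah_topics tokenizer may behave differently, e.g. regarding punctuation, hyphenation, and case):

```python
import re
from collections import Counter

def tokenize(text):
    """Split a text into lowercase word tokens."""
    return re.findall(r"\w+", text.lower())

tokens = tokenize("The sun shone, having no alternative, on the nothing new.")
# Token frequencies are the unit of analysis LDA actually sees:
frequencies = Counter(tokens)
```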
<h3>1.1. Reading a Corpus of Documents</h3>
20 changes: 10 additions & 10 deletions application/templates/model.html
@@ -65,7 +65,7 @@ <h1>Topics – Easy Topic Modeling</h1>
<h2>1. Corpus and Parameter Summary</h2>
<p>All parameters, including some corpus statistics, are summed up in the following table. This kind of information might be useful if you create more than one topic model and want to compare the results. The most common way to evaluate a probabilistic model is to measure the log-likelihood (if you are interested in the evaluation of probabilistic models, have a look at <i>Wallach et al. 2009: Evaluation Methods for Topic Models</i>, a mathematical approach). If you increase the number of iterations, your model gets better, and you will see that the log-likelihood also increases <b>until a certain point</b>. This is how you might find the ideal number of iterations.
{% for table in parameter %} {{ table|safe }} {% endfor %}<br>
As you can see, your corpus is much smaller after cleaning. You either defined a threshold for the most frequent words or selected an external stopwords list. In addition, so-called <i>hapax legomena</i> have been removed. In corpus linguistics, a hapax legomenon is a word that occurs only once within a context. So, if a word occurs only once in a document, it is very likely that the word is semantically insignificant – meaning not useful for the topic modeling algorithm.<br><br>
As you can see, your corpus is much smaller after cleaning. You {{ cleaning|safe }}. In addition, so-called <i>hapax legomena</i> have been removed. In corpus linguistics, a hapax legomenon is a word that occurs only once within a context. So, if a word occurs only once in a document, it is very likely that the word is semantically insignificant – meaning not useful for the topic modeling algorithm.<br><br>
<center>
{{ corpus_boxplot_div|safe }}</center>
</p><br>
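The "until a certain point" heuristic for choosing the iteration count can be sketched as follows; the log-likelihood trace values are made up, and this helper is an illustration, not part of the application:

```python
def plateau_iteration(loglikelihoods, tolerance=1.0):
    """Return the index after which the log-likelihood stops improving
    by more than `tolerance` -- a rough way to pick the iteration count."""
    for i in range(1, len(loglikelihoods)):
        if loglikelihoods[i] - loglikelihoods[i - 1] < tolerance:
            return i
    return len(loglikelihoods) - 1

# Hypothetical log-likelihood trace, recorded every 100 iterations:
trace = [-9000.0, -7500.0, -7000.0, -6900.0, -6899.5]
```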
@@ -74,32 +74,32 @@ <h2>1. Corpus and Parameter Summary</h2>
<b>FYI:</b> All of the generated data (tables and graphics) was automatically saved in a ZIP archive (<b>topicmodeling.zip</b>) in your current working directory: <b>{{ cwd|safe }}</b>.
</div>
<h2>2. Inspecting the Topic Model</h2>
<p>Topic Models are unsupervised. It is called <i>unsupervised</i>, because you did not have any labels describing the semantic structures or anything related, but only pure word frequencies. Since the examples given to the algorithm are unlabeled,
there is no evaluation of the accuracy, or how <i>good</i> your model is. So, it is up to you now by inspecting the model to decide whether you are satisfied with its performance or not.
<p>Topic models are unsupervised. It is called <i>unsupervised</i>, because you did not have any labels describing the semantic structures or anything related, but only pure word frequencies. Since the examples given to the algorithm are unlabeled,
there is no evaluation of the accuracy, or how <i>good</i> your model is. So, it is up to you by inspecting the model to decide whether you are satisfied with its performance or not.
<div class="alert alert-info">
<button type="button" class="close" data-dismiss="alert">&times;</button>
<b>Tip:</b> The quantitative evaluation of topics (meaning a list of words as seen below) is a very challenging task. <b>Pointwise Mutual Information</b> (PMI) is one possibility to evaluate the semantic coherence of topics. We implemented
two variants of PMI in the programming language Python, which is available via GitHub (https://github.com/DARIAH-DE/Topics/dariah_topics/evaluation.py).
two variants of PMI in the programming language Python, which is available via <a href="https://github.com/DARIAH-DE/Topics/dariah_topics/evaluation.py">GitHub</a>.
</div>
</p>
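The pointwise mutual information mentioned in the tip can be computed for a word pair from document counts; this is a simplified sketch of the general formula, not the dariah_topics implementation:

```python
import math

def pmi(cooccurrence, count_a, count_b, n_docs):
    """PMI(a, b) = log( p(a, b) / (p(a) * p(b)) ), estimated from
    document frequencies. Positive values mean the words co-occur
    more often than chance -- a hint of semantic coherence."""
    p_ab = cooccurrence / n_docs
    p_a = count_a / n_docs
    p_b = count_b / n_docs
    return math.log(p_ab / (p_a * p_b))

# Toy corpus of 100 documents: 'topic' occurs in 20, 'model' in 25,
# both together in 10.
score = pmi(cooccurrence=10, count_a=20, count_b=25, n_docs=100)
```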
<h3>2.1. Topics</h3>
<p>Each topic is a probability distribution over the vocabulary of words found in the corpus. The top words (so-called <i>keys</i>) shown in the table below are those words most probable to be found in each topic and describe the semantic structures
of your corpus – ideally in a meaningful way. Basically, lists of the top keys associated with each topic are often all that is needed when the corpus is large and the inferred topics make sense in light of prior knowledge of the corpus.</p><br> {% for table in topics %} {{ table|safe }} {% endfor %}
of your corpus – ideally in a meaningful way. Lists of the top keys associated with each topic are often all that is needed when the corpus is large and the inferred topics make sense in light of prior knowledge of the corpus.</p><br> {% for table in topics %} {{ table|safe }} {% endfor %}
<br>
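The keys shown in the table come from the topic-word distribution; a minimal sketch of how such top keys can be extracted from a topic-word matrix (vocabulary and probabilities here are made up):

```python
def top_keys(topic_word, vocabulary, n=3):
    """Return the n most probable words per topic."""
    keys = []
    for distribution in topic_word:
        ranked = sorted(range(len(vocabulary)),
                        key=distribution.__getitem__, reverse=True)
        keys.append([vocabulary[i] for i in ranked[:n]])
    return keys

vocabulary = ['night', 'sea', 'love', 'ship', 'heart']
topic_word = [
    [0.05, 0.35, 0.02, 0.48, 0.10],  # a "seafaring" topic
    [0.10, 0.02, 0.45, 0.03, 0.40],  # a "romance" topic
]
keys = top_keys(topic_word, vocabulary)
```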
<h3>2.2. Topics and Documents</h3>
<p>Each topic has proportions per document, which can be visualized in heatmap. This option displays the kind of information that is probably most useful to literary scholars. Going beyond pure exploration, this visualization can be used to show
<p>Each topic has proportions per document, which can be visualized in a heatmap. This displays the kind of information that is probably most useful to literary scholars. Going beyond pure exploration, this visualization can be used to show
thematic developments over a set of texts as well as a single text, akin to a dynamic topic model. What can also become apparent here is that some topics correlate highly with a specific author or group of authors, while other topics correlate
highly with a specific text or group of texts. All in all, this displays two of LDA's properties – its use as a distant reading tool that aims to get at text meaning, and its use as a provider of data that can be further used in computational
analysis, such as document classification or authorship attribution.</p><br> {{ heatmap_div|safe }}<br><br>
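The heatmap is driven by a document-topic matrix of proportions; a toy sketch with invented documents and values, showing how the dominant topic per document can be read off:

```python
import pandas as pd

# Hypothetical document-topic proportions; each row sums to 1.
document_topics = pd.DataFrame(
    [[0.7, 0.2, 0.1],
     [0.1, 0.6, 0.3]],
    index=['effi_briest', 'buddenbrooks'],
    columns=['Topic 0', 'Topic 1', 'Topic 2'])

# The dominant topic per document -- the strongest signal in the heatmap:
dominant = document_topics.idxmax(axis=1)
```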
<h3>2.3. Distribution of Topics</h3>
<p>In the following graphic, you can access <i>one</i> dimension of the information displayed in the heatmap above. This might be a clearer approach if you are interested in a specific topic, or, more precisely, how the topic is distributed over the documents of your corpus. Use the dropdown menu to select a topic. <b>The proportions you can see by default are based on the first topic.</b></p>
<p>In the following graphic, you can access <i>one</i> dimension of the information displayed in the heatmap above. This might be a clearer approach if you are interested in a specific topic, or, more precisely, how the topic is distributed over the documents of your corpus. Use the dropdown menu to select a topic. <b>The default distribution you can see here is that of the first topic.</b></p>
{{ topics_div|safe }}<br>
<h3>2.4. Distribution of Documents</h3>
<p>Similar to the above, you can access the <i>other</i> dimension displayed in the heatmap. So, if you are interested in a specific <i>document</i>, you can select it via the dropdown menu and inspect its proportions. <b>The bars displayed by default are based on the first document.</b></p>
<p>Similar to the above, you can access the <i>other</i> dimension displayed in the heatmap. So, if you are interested in a specific <i>document</i>, you can select it via the dropdown menu and inspect its proportions. <b>The default distribution you can see here is that of the first document.</b></p>
{{ documents_div|safe }}<br>
<h2>2. Diving Deeper into Topic Modeling</h2>
<h2>2. Delving Deeper into Topic Modeling</h2>
<p>We want to empower users with little or no previous experience and programming skills to create custom workflows mostly using predefined functions within a familiar environment. So, if this practical introduction aroused your interest and
you want to <b>dive deeper into the technical parts</b>, we provide another convenient, modular workflow that can be entirely controlled from within a well documented Jupyter notebook, integrating a total of three popular LDA implementations.</p>
you want to <b>delve deeper into the technical parts</b>, we provide the same convenient, modular workflow that can be entirely controlled from within a well documented <a href="http://jupyter.org/">Jupyter notebook</a>, integrating a total of three popular LDA implementations.</p>
<p>All resources are available via <a href="https://github.com/DARIAH-DE/Topics">GitHub</a>.</p>
</div>
</div>
8 changes: 2 additions & 6 deletions application/utils.py
@@ -1,14 +1,10 @@
import lzma
import pickle
import time
import regex as re
import application
import pathlib
import logging
import bokeh.plotting
import bokeh.models
import bokeh.layouts
import lda
import pandas as pd
import threading
import lxml
@@ -60,7 +56,7 @@ def load_data(tempdir):
parameter_path = str(pathlib.Path(tempdir, 'parameter.csv'))
topics_path = str(pathlib.Path(tempdir, 'topics.csv'))

data = application.utils.decompress(data_path)
data = decompress(data_path)
parameter = pd.read_csv(parameter_path, index_col=0, encoding='utf-8')
parameter.columns = [''] # remove column names
topics = pd.read_csv(topics_path, index_col=0, encoding='utf-8')
@@ -72,7 +68,7 @@ def load_data(tempdir):

def remove_markup(content):
"""
Removes markup from text.
Removes markup from text. If lxml fails, a simple regex is used.
"""
try:
parser = lxml.etree.XMLParser(recover=True)
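The hunk is truncated at this point; a sketch of the try/except pattern the new docstring describes, with the regex fallback body being an assumption about the elided code rather than the actual implementation:

```python
import re
import lxml.etree

def remove_markup(content):
    """Removes markup from text. If lxml fails, a simple regex is used."""
    try:
        # recover=True lets lxml parse even slightly broken XML/HTML.
        parser = lxml.etree.XMLParser(recover=True)
        tree = lxml.etree.fromstring(content, parser=parser)
        return ''.join(tree.itertext())
    except lxml.etree.XMLSyntaxError:
        # Fallback: crudely strip anything that looks like a tag.
        return re.sub(r'<[^>]+>', '', content)

text = remove_markup('<p>Buddenbrooks, <i>Verfall einer Familie</i></p>')
```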
