Update demonstrator
Severin Simmler committed Feb 20, 2017
1 parent 84bbf9b commit cf977a6
Showing 2 changed files with 18 additions and 12 deletions.
28 changes: 17 additions & 11 deletions demonstrator/demonstrator.py
@@ -6,6 +6,8 @@
This module demonstrates the joy of Topic Modeling, wrapped in a user-friendly
web application provided by `DARIAH-DE`_.
Todo: Replace print statements with logging (which is currently not working).
.. _DARIAH-DE:
https://de.dariah.eu
https://github.com/DARIAH-DE
@@ -26,19 +28,20 @@

__author__ = "Severin Simmler"
__email__ = "severin.simmler@stud-mail.uni-wuerzburg.de"
__date__ = "2017-02-17"
__date__ = "2017-02-20"

app = Flask(__name__)

@app.route('/')
def index():
print("Rendering index.html ...")
return render_template('index.html')

@app.route('/upload', methods=['POST'])
def upload_file():
    # Open all files, tokenize, and save in pd.Series():
    files = request.files.getlist('files')
    corpus = pd.Series()
    print("Accessing and tokenizing files ...")
    for file in files:
        filename, extension = os.path.splitext(secure_filename(file.filename))
        if extension == '.txt':
@@ -51,35 +54,37 @@ def upload_file():
text = "".join(text.xpath('.//text()'))
file.flush()
else:
print("File format is not supported.") # Todo: Replace with Flask flash
print("File format is not supported.")
tokens = list(preprocessing.tokenize(text))
label = filename
corpus[label] = tokens
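    # At this point corpus maps each filename (the Series label) to that
    # document's token list.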

    # Create bag-of-words:
    print("Creating bag-of-words model ...")
    id_types, doc_ids = preprocessing.create_dictionaries(corpus.index.tolist(), corpus.tolist())
    sparse_bow = preprocessing.create_mm(corpus.index.tolist(), corpus.tolist(), id_types, doc_ids)
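    # sparse_bow holds the term frequencies indexed by (doc_id, token_id) --
    # inferred from the index levels used for the Matrix Market header below.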

    # Remove stopwords and hapax legomena:
    stopwords = request.files.get('stoplist', None)  # .get() avoids a KeyError if the field is missing
    if stopwords:
        print("Accessing external stopword list and cleaning corpus ...")
        words = stopwords.read().decode('utf-8')
        words = set(preprocessing.tokenize(words))
        clean_term_frequency = preprocessing.remove_features(sparse_bow, id_types, words)
        stopwords.flush()
    else:
        threshold = int(request.form['mfws'])
        print("Accessing", threshold, "most frequent words and cleaning corpus ...")
        stopwords = preprocessing.find_stopwords(sparse_bow, id_types, threshold)
        hapax = preprocessing.find_hapax(sparse_bow, id_types)
        feature_list = set(stopwords).union(hapax)
        clean_term_frequency = preprocessing.remove_features(sparse_bow, id_types, feature_list)
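        # Hapax legomena -- types occurring only once in the corpus -- are
        # presumably dropped here together with the threshold most frequent words.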

    # Create Matrix Market:
    print("Creating matrix market model ...")
    num_docs = max(clean_term_frequency.index.get_level_values("doc_id"))
    num_types = max(clean_term_frequency.index.get_level_values("token_id"))
    sum_counts = sum(clean_term_frequency[0])
    header_string = str(num_docs) + " " + str(num_types) + " " + str(sum_counts) + "\n"
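    # A Matrix Market coordinate file opens with a banner line followed by a size
    # line '<rows> <columns> <entries>'. Strictly, the spec's third field is the
    # number of nonzero entries; sum_counts is the total of all counts, which
    # gensim's streaming reader appears to tolerate (assumption, not verified here).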

print("Saving matrix market model to matrixmarket.mm ...")
with open("matrixmarket.mm", 'w+', encoding = "utf-8") as f:
f.write("%%MatrixMarket matrix coordinate real general\n")
f.write(header_string)
@@ -90,23 +95,24 @@ def upload_file():
    doc2id = {value: key for key, value in doc_ids.items()}
    type2id = {value: key for key, value in id_types.items()}
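    # Inverted mappings (id -> document label, id -> type); type2id is the
    # id-to-word mapping that gensim's id2word parameter expects below.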

    # LDA:
    num_topics = int(request.form['number_topics'])
    passes = int(request.form['passes'])
    print("Training LDA with", num_topics, "topics and", passes, "passes ...")
    model = LdaModel(corpus=mm, id2word=type2id, num_topics=num_topics, passes=passes)
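    # mm is the corpus loaded back from matrixmarket.mm in the lines elided
    # above (presumably via gensim.corpora.MmCorpus).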

    # Visualization:
    print("Visualizing document-topic matrix and saving as heatmap.png ...")
    doc_topic = visualization.create_doc_topic(mm, model, corpus.index.tolist())
    heatmap = visualization.doc_topic_heatmap(doc_topic)
    heatmap.savefig("./static/heatmap.png")
    heatmap.savefig('./static/heatmap.png')

    # Topic-Term-Matrix for HTML (todo: replace by DataFrame.to_html()):
    # Todo: replace by DataFrame.to_html():
    print("Accessing topics for HTML table ...")
    import regex
    pattern = regex.compile(r'\p{L}+\p{P}?\p{L}+')
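    # The regex package (unlike re) supports Unicode properties like \p{L}; the
    # pattern pulls just the word tokens -- runs of letters, optionally with one
    # internal punctuation character -- out of gensim's weighted topic strings,
    # e.g. '0.015*"word" + 0.013*"other"' -> ['word', 'other'] (illustrative).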
    topics = []
    for n, topic in enumerate(model.show_topics()):
        topics.append((n+1, pattern.findall(topic[1])))

    print("Rendering result.html ...")
    return render_template('result.html', topics=topics, documents=corpus.index.tolist())
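# For local testing one would typically start Flask's development server -- a
# minimal sketch (assumed; the actual entry point may live in the truncated
# part of the file):
#
#     if __name__ == '__main__':
#         app.run(debug=True)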

@app.after_request
2 changes: 1 addition & 1 deletion demonstrator/templates/index.html
@@ -111,7 +111,7 @@
<h1>Demonstrator: Topic Modeling</h1>
<div id="contentInner" style="text-align:justify">
<form action="/upload" method="POST" enctype="multipart/form-data">
<p>The text mining technique <b>Topic Modeling</b> has become a popular statistical method for clustering documents. This web application introduces a user-friendly workflow covering data pre-processing, an implementation of the prototypic topic model <b>Latent Dirichlet Allocation</b> (LDA), which learns the relationships between words, topics, and documents, as well as a visualization to explore the trained LDA model.</p>
<p>The text mining technique <b>Topic Modeling</b> has become a popular statistical method for clustering documents. This web application introduces a user-friendly workflow covering data preprocessing, an implementation of the prototypic topic model <b>Latent Dirichlet Allocation</b> (LDA), which learns the relationships between words, topics, and documents, as well as a visualization to explore the trained LDA model.</p>
<h2>1. Preprocessing</h2>
<h3>1.1 Reading a corpus of documents</h3>
<p>Select plain text (<b>.txt</b>) or TEI encoded XML files (<b>.xml</b>).</p>
