feat: add test

severinsimmler committed Oct 17, 2018
1 parent 42f3f6b commit 6620879

Showing 4 changed files with 52 additions and 30 deletions.
9 changes: 5 additions & 4 deletions application/templates/modeling.html
@@ -20,10 +20,11 @@ <h1>This may take a while...</h1>
<img src="{{url_for('static', filename='img/logos/dariah-rotate.gif')}}" class="dariah-flower">
<span id="status">Just started topic modeling...</span>
</p>
<p>In the meantime you might want to check out some <a href="#">Jupyter notebooks</a>, where the same workflow as in
this application is explained step by step – but a bit more technically in the programming language Python. This makes
you more flexible with everything and allows you to use more sophisticated topic models. You can
experiment with an example corpus directly in the browser on <a href="#">Binder</a> without installing anything.</p>
<p>In the meantime you might want to check out some <a href="https://github.com/DARIAH-DE/Topics/tree/master/notebooks">Jupyter
notebooks</a>, where the same workflow as in this application is explained step by step – but a bit more
technically, using the programming language Python. This gives you more flexibility and allows you to
use more sophisticated topic models. You can experiment with an example corpus directly in the browser on <a href="https://mybinder.org/v2/gh/DARIAH-DE/Topics/master?filepath=notebooks%2FIntroducingLda.ipynb">Binder</a>
without installing anything.</p>
<blockquote>With recent scientific advances in support of unsupervised machine learning, topic models promise to be an
important component for summarizing and understanding our growing digitized archive of information.<footer>
<cite>
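The notebooks linked above walk through the same steps in plain Python. A minimal sketch of that workflow, using the same lda package that application/testing.py imports below (the toy corpus values here are illustrative, not from the repository):

import numpy as np
import lda

# Toy document-term matrix: 4 documents x 6 vocabulary terms (raw counts).
dtm = np.array([[2, 0, 1, 0, 0, 3],
                [0, 3, 0, 1, 2, 0],
                [1, 0, 2, 0, 0, 2],
                [0, 2, 0, 3, 1, 0]])
vocabulary = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta"]

# Same model class as in application/testing.py below:
model = lda.LDA(n_topics=2, n_iter=500, random_state=1)
model.fit(dtm)

# topic_word_ holds one distribution over the vocabulary per topic;
# print the three most probable words of each topic:
for i, distribution in enumerate(model.topic_word_):
    top3 = [vocabulary[j] for j in distribution.argsort()[::-1][:3]]
    print(f"Topic {i}: {', '.join(top3)}")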
52 changes: 27 additions & 25 deletions application/testing.py
@@ -4,7 +4,7 @@
import pathlib
import logging
import sqlite3
import threading
import multiprocessing

import flask
import pandas as pd
@@ -15,16 +15,22 @@


app = flask.Flask("topicsexplorer")
global process
process = multiprocessing.Process()


@app.route("/")
def index():
"""Set up database and render home page.
"""Render home page.
Note:
Calling this function will drop all tables
in the database – if any.
"""
# Kill modeling process, if any:
if process.is_alive():
logging.info("Restarting topic modeling...")
process.terminate()
# Initialize logging:
utils.init_logging()
# Initialize database and create tables:
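One detail worth noting about the restart check above: a multiprocessing.Process() that was never started reports is_alive() as False, so index() can run the same terminate logic on the very first request and on later restarts. A self-contained sketch of the pattern (the long_job name is illustrative):

import time
import multiprocessing

def long_job():
    time.sleep(60)

if __name__ == "__main__":
    process = multiprocessing.Process()    # unstarted sentinel
    print(process.is_alive())              # False: nothing to terminate yet

    process = multiprocessing.Process(target=long_job)
    process.start()
    print(process.is_alive())              # True: a job is running

    process.terminate()                    # what index() does on a restart
    process.join()
    print(process.is_alive())              # False again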
@@ -37,25 +43,29 @@ def index():
def modeling():
    """Create topic model and render status page.
    """
    process = multiprocessing.Process(target=workflow)
    process.start()
    return flask.render_template("modeling.html")
    return flask.render_template("topic-presence.html", presence=relevance)

def get_topic_descriptors(topics):
    for topic in topics:
        yield ", ".join(topic[:3])


def workflow():
    # Get input data:
    data = utils.get_data("corpus", "topics", "iterations", "stopwords", "mfw")
    data = utils.get_data("corpus",
                          "topics",
                          "iterations",
                          "stopwords",
                          "mfw")
    # Insert data into textfiles table:
    utils.insert_into_textfiles(data["corpus"])
    # Preprocess data:
    dtm, vocabulary, titles, sizes = utils.preprocess(data)
    # Initialize topic model:
    logging.info("NICE")
    model = lda.LDA(n_topics=data["topics"], n_iter=data["iterations"])
    # Fit model:
    x = StoppableThread(target=model.fit, args=(dtm,))
    x.start()
    print(x.isAlive())
    import time
    time.sleep(5)
    x.stop()
    print(x.isAlive())

    """
    model.fit(dtm)
    # Get topics generator:
    topics = utils.get_topics(model, vocabulary)
@@ -72,15 +82,6 @@ def modeling():
    descriptors = list(get_topic_descriptors(topics))
    relevance = pd.Series(topic_weights_s, index=descriptors).to_dict().items()
    relevance = sorted(relevance, key=operator.itemgetter(1), reverse=True)
    """
    return flask.render_template("modeling.html")
    return flask.render_template("topic-presence.html", presence=relevance)

def get_topic_descriptors(topics):
    for topic in topics:
        yield ", ".join(topic[:3])




@app.after_request
@@ -107,8 +108,9 @@ def help():



@app.route("/topic-presence")
def topic_presence():
@app.route("/topic-presence/<topic>")
def topic_presence(topic):
print(topic)
return flask.render_template("topic-presence.html")
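The route now takes the topic descriptor as a path segment. A quick way to see the URL the new rule expects (assuming the application package is importable under that name):

import flask
from application.testing import app  # assumption: importable as a package

with app.test_request_context():
    print(flask.url_for("topic_presence", topic="topic-0"))
    # -> /topic-presence/topic-0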


17 changes: 16 additions & 1 deletion application/utils.py
@@ -95,35 +95,42 @@ def init_db(app):
def get_data(corpus, topics, iterations, stopwords, mfw):
    """Get input data.
    """
    logging.info("Fetching corpus and parameters...")
    # Get text files, number of topics and number of iterations:
    data = {"corpus": flask.request.files.getlist("corpus"),
            "topics": int(flask.request.form["topics"]),
            "iterations": int(flask.request.form["iterations"])}
    # Get stopword list, if user selected one:
    if flask.request.files.get("stopwords", None):
        logging.info("Fetching external stopwords list...")
        data["stopwords"] = flask.request.files["stopwords"]
    # Use most frequent words threshold otherwise:
    else:
        logging.info("Fetching threshold value for most frequent words...")
        data["mfw"] = int(flask.request.form["mfw"])
    return data
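Because get_data() reads everything from flask.request rather than from its arguments, it can be exercised without a running server through a test request context. A throwaway check, assuming the app object from testing.py is importable as application.testing and using the form field names above:

from application.testing import app   # assumption: importable as a package
from application.utils import get_data

with app.test_request_context("/modeling", method="POST",
                              data={"topics": "10",
                                    "iterations": "100",
                                    "mfw": "150"}):
    data = get_data("corpus", "topics", "iterations", "stopwords", "mfw")
    assert data["topics"] == 10 and data["mfw"] == 150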


def insert_into_textfiles(values):
    """Insert text files into table.
    """
    logging.info("Connecting to database...")
    # Connect to database:
    db = get_db()
    # Insert values into table:
    for textfile in values:
        # Get title and text:
        title, text = load_textfile(textfile)
        logging.info(f"Loading '{title}'...")
        # Execute SQL:
        db.execute("""
                   INSERT INTO textfiles (title, text)
                   VALUES(?, ?);
                   """,
                   [title, text])
    logging.info("Committing to database...")
    db.commit()
    logging.info("Closing connection to database...")
    close_db()
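The INSERT above presupposes a textfiles table whose CREATE TABLE statement is not part of this diff. A self-contained approximation against an in-memory database (the two-column schema is an assumption):

import sqlite3

db = sqlite3.connect(":memory:")
# Assumed schema; the real CREATE TABLE is not shown in this commit:
db.execute("CREATE TABLE textfiles (title TEXT, text TEXT)")
db.execute("""
           INSERT INTO textfiles (title, text)
           VALUES(?, ?);
           """,
           ["genesis", "In the beginning..."])
db.commit()
print(db.execute("SELECT title FROM textfiles").fetchall())  # [('genesis',)]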


@@ -153,6 +160,7 @@ def load_textfile(textfile):
    text = textfile.read().decode("utf-8")
    # If suffix implies any markup, remove it:
    if suffix in {".xml", ".html"}:
        logging.info("Removing markup...")
        text = remove_markup(text)
    return title, text

@@ -168,25 +176,32 @@ def remove_markup(text):

def get_stopwords(data, corpus):
    if "stopwords" in data:
        _, stopwords = load_textfile(data["stopwords"]).split("\n")
        _, stopwords = load_textfile(data["stopwords"])
        stopwords = stopwords.split("\n")
    else:
        stopwords = corpus.mfw(data["mfw"])
    return stopwords


def preprocess(data):
    logging.info("Querying corpus from database...")
    # Query text files:
    textfiles = select_textfiles()
    logging.info("Constructing document objects...")
    # Get cophi.model.Document object:
    documents = get_documents(textfiles)
    logging.info("Constructing corpus object...")
    # Create cophi.model.Corpus object:
    corpus = cophi.model.Corpus(documents)
    logging.info("Fetching stopwords...")
    # Get stopwords:
    stopwords = get_stopwords(data, corpus)
    logging.info("Fetching hapax legomena...")
    # Get hapax legomena:
    hapax = corpus.hapax
    # Join both lists:
    features = set(stopwords).union(set(hapax))
    logging.info("Cleaning corpus...")
    # Clean document-term matrix:
    dtm = corpus.drop(corpus.dtm, features)
    # Get sizes:
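The tail of preprocess() unions stopwords with hapax legomena and drops the result from the document-term matrix. Assuming corpus.drop(corpus.dtm, features) removes those columns (the cophi implementation is not shown here), the operation reduces to a pandas column drop:

import pandas as pd

# Toy stand-in for corpus.dtm: one row per document, one column per word type.
dtm = pd.DataFrame({"the": [9, 7], "whale": [4, 0],
                    "zeugma": [1, 0], "sea": [2, 3]},
                   index=["doc1", "doc2"])

stopwords = ["the"]                        # or corpus.mfw(data["mfw"])
hapax = ["zeugma"]                         # types occurring exactly once
features = set(stopwords).union(set(hapax))  # the same union preprocess() builds

clean = dtm.drop(columns=list(features))
print(clean.columns.tolist())              # ['whale', 'sea']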
4 changes: 4 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,4 @@
import pytest

def test_test():
    assert True
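The placeholder only proves the pytest wiring works. A natural next test in this file would pin down real behavior, for instance the suffix check from utils.load_textfile, mirrored here without touching the database (the test name and cases are suggestions, not part of the commit):

import pytest

@pytest.mark.parametrize("suffix, markup_expected", [
    (".xml", True),
    (".html", True),
    (".txt", False),
])
def test_markup_suffixes(suffix, markup_expected):
    # Mirrors the condition load_textfile uses to decide on remove_markup():
    assert (suffix in {".xml", ".html"}) == markup_expected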
