feat: add test

severinsimmler committed Oct 17, 2018
1 parent 42f3f6b commit 6620879

Showing 4 changed files with 52 additions and 30 deletions.
9 changes: 5 additions & 4 deletions application/templates/modeling.html
@@ -20,10 +20,11 @@ <h1>This may take a while...</h1>
<img src="{{url_for('static', filename='img/logos/dariah-rotate.gif')}}" class="dariah-flower">
<span id="status">Just started topic modeling...</span>
</p>
<p>In the meantime you might want to check out some <a href="#">Jupyter notebooks</a>, where the same workflow as in
this application is explained step by step – but a bit more technically in the programming language Python. This makes
you more flexible with everything and allows you to use more sophisticated topic models. You can
experiment with an example corpus directly in the browser on <a href="#">Binder</a> without installing anything.</p>
<p>In the meantime you might want to check out some <a href="https://github.com/DARIAH-DE/Topics/tree/master/notebooks">Jupyter
notebooks</a>, where the same workflow as in this application is explained step by step – but a bit more
technically, using the programming language Python. This gives you more flexibility and allows you to
use more sophisticated topic models. You can experiment with an example corpus directly in the browser on <a href="https://mybinder.org/v2/gh/DARIAH-DE/Topics/master?filepath=notebooks%2FIntroducingLda.ipynb">Binder</a>
without installing anything.</p>
<blockquote>With recent scientific advances in support of unsupervised machine learning, topic models promise to be an
important component for summarizing and understanding our growing digitized archive of information.<footer>
<cite>
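The notebooks linked above walk through the same steps in plain Python. A minimal sketch of that workflow, using the same lda package that application/testing.py imports below (the toy corpus values here are illustrative, not from the repository):

import numpy as np
import lda

# Toy document-term matrix: 4 documents x 6 vocabulary terms (raw counts).
dtm = np.array([[2, 0, 1, 0, 0, 3],
                [0, 3, 0, 1, 2, 0],
                [1, 0, 2, 0, 0, 2],
                [0, 2, 0, 3, 1, 0]])
vocabulary = ["alpha", "beta", "gamma", "delta", "epsilon", "zeta"]

# Same model class as in application/testing.py below:
model = lda.LDA(n_topics=2, n_iter=500, random_state=1)
model.fit(dtm)

# topic_word_ holds one distribution over the vocabulary per topic;
# print the three most probable words of each topic:
for i, distribution in enumerate(model.topic_word_):
    top3 = [vocabulary[j] for j in distribution.argsort()[::-1][:3]]
    print(f"Topic {i}: {', '.join(top3)}")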
52 changes: 27 additions & 25 deletions application/testing.py
@@ -4,7 +4,7 @@
import pathlib
import logging
import sqlite3
import threading
import multiprocessing

import flask
import pandas as pd
@@ -15,16 +15,22 @@


app = flask.Flask("topicsexplorer")
global process
process = multiprocessing.Process()


@app.route("/")
def index():
"""Set up database and render home page.
"""Render home page.
Note:
Calling this function will drop all tables
in the database – if any.
"""
# Kill modeling process, if any:
if process.is_alive():
logging.info("Restarting topic modeling...")
process.terminate()
# Initialize logging:
utils.init_logging()
# Initialize database and create tables:
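One detail worth noting about the restart check above: a multiprocessing.Process() that was never started reports is_alive() as False, so index() can run the same terminate logic on the very first request and on later restarts. A self-contained sketch of the pattern (the long_job name is illustrative):

import time
import multiprocessing

def long_job():
    time.sleep(60)

if __name__ == "__main__":
    process = multiprocessing.Process()    # unstarted sentinel
    print(process.is_alive())              # False: nothing to terminate yet

    process = multiprocessing.Process(target=long_job)
    process.start()
    print(process.is_alive())              # True: a job is running

    process.terminate()                    # what index() does on a restart
    process.join()
    print(process.is_alive())              # False again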
@@ -37,25 +43,29 @@ def index():
def modeling():
    """Create topic model and render status page.
    """
    process = multiprocessing.Process(target=workflow)
    process.start()
    return flask.render_template("modeling.html")
    return flask.render_template("topic-presence.html", presence=relevance)

def get_topic_descriptors(topics):
    for topic in topics:
        yield ", ".join(topic[:3])


def workflow():
    # Get input data:
    data = utils.get_data("corpus", "topics", "iterations", "stopwords", "mfw")
    data = utils.get_data("corpus",
                          "topics",
                          "iterations",
                          "stopwords",
                          "mfw")
    # Insert data into textfiles table:
    utils.insert_into_textfiles(data["corpus"])
    # Preprocess data:
    dtm, vocabulary, titles, sizes = utils.preprocess(data)
    # Initialize topic model:
    logging.info("NICE")
    model = lda.LDA(n_topics=data["topics"], n_iter=data["iterations"])
    # Fit model:
    x = StoppableThread(target=model.fit, args=(dtm,))
    x.start()
    print(x.isAlive())
    import time
    time.sleep(5)
    x.stop()
    print(x.isAlive())

    """
    model.fit(dtm)
    # Get topics generator:
    topics = utils.get_topics(model, vocabulary)
@@ -72,15 +82,6 @@ def modeling():
    descriptors = list(get_topic_descriptors(topics))
    relevance = pd.Series(topic_weights_s, index=descriptors).to_dict().items()
    relevance = sorted(relevance, key=operator.itemgetter(1), reverse=True)
    """
    return flask.render_template("modeling.html")
    return flask.render_template("topic-presence.html", presence=relevance)

def get_topic_descriptors(topics):
    for topic in topics:
        yield ", ".join(topic[:3])




@app.after_request
@@ -107,8 +108,9 @@ def help():



@app.route("/topic-presence")
def topic_presence():
@app.route("/topic-presence/<topic>")
def topic_presence(topic):
print(topic)
return flask.render_template("topic-presence.html")
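The route now takes the topic descriptor as a path segment. A quick way to see the URL the new rule expects (assuming the application package is importable under that name):

import flask
from application.testing import app  # assumption: importable as a package

with app.test_request_context():
    print(flask.url_for("topic_presence", topic="topic-0"))
    # -> /topic-presence/topic-0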


17 changes: 16 additions & 1 deletion application/utils.py
@@ -95,35 +95,42 @@ def init_db(app):
def get_data(corpus, topics, iterations, stopwords, mfw):
    """Get input data.
    """
    logging.info("Fetching corpus and parameters...")
    # Get text files, number of topics and number of iterations:
    data = {"corpus": flask.request.files.getlist("corpus"),
            "topics": int(flask.request.form["topics"]),
            "iterations": int(flask.request.form["iterations"])}
    # Get stopword list, if user selected one:
    if flask.request.files.get("stopwords", None):
        logging.info("Fetching external stopwords list...")
        data["stopwords"] = flask.request.files["stopwords"]
    # Use most frequent words threshold otherwise:
    else:
        logging.info("Fetching threshold value for most frequent words...")
        data["mfw"] = int(flask.request.form["mfw"])
    return data
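Because get_data() reads everything from flask.request rather than from its arguments, it can be exercised without a running server through a test request context. A throwaway check, assuming the app object from testing.py is importable as application.testing and using the form field names above:

from application.testing import app   # assumption: importable as a package
from application.utils import get_data

with app.test_request_context("/modeling", method="POST",
                              data={"topics": "10",
                                    "iterations": "100",
                                    "mfw": "150"}):
    data = get_data("corpus", "topics", "iterations", "stopwords", "mfw")
    assert data["topics"] == 10 and data["mfw"] == 150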


def insert_into_textfiles(values):
    """Insert text files into table.
    """
    logging.info("Connecting to database...")
    # Connect to database:
    db = get_db()
    # Insert values into table:
    for textfile in values:
        # Get title and text:
        title, text = load_textfile(textfile)
        logging.info(f"Loading '{title}'...")
        # Execute SQL:
        db.execute("""
                   INSERT INTO textfiles (title, text)
                   VALUES(?, ?);
                   """,
                   [title, text])
    logging.info("Committing to database...")
    db.commit()
    logging.info("Closing connection to database...")
    close_db()
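The INSERT above presupposes a textfiles table whose CREATE TABLE statement is not part of this diff. A self-contained approximation against an in-memory database (the two-column schema is an assumption):

import sqlite3

db = sqlite3.connect(":memory:")
# Assumed schema; the real CREATE TABLE is not shown in this commit:
db.execute("CREATE TABLE textfiles (title TEXT, text TEXT)")
db.execute("""
           INSERT INTO textfiles (title, text)
           VALUES(?, ?);
           """,
           ["genesis", "In the beginning..."])
db.commit()
print(db.execute("SELECT title FROM textfiles").fetchall())  # [('genesis',)]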


@@ -153,6 +160,7 @@ def load_textfile(textfile):
    text = textfile.read().decode("utf-8")
    # If suffix implies any markup, remove it:
    if suffix in {".xml", ".html"}:
        logging.info("Removing markup...")
        text = remove_markup(text)
    return title, text

@@ -168,25 +176,32 @@ def remove_markup(text):

def get_stopwords(data, corpus):
    if "stopwords" in data:
        _, stopwords = load_textfile(data["stopwords"]).split("\n")
        _, stopwords = load_textfile(data["stopwords"])
        stopwords = stopwords.split("\n")
    else:
        stopwords = corpus.mfw(data["mfw"])
    return stopwords


def preprocess(data):
    logging.info("Querying corpus from database...")
    # Query text files:
    textfiles = select_textfiles()
    logging.info("Constructing document objects...")
    # Get cophi.model.Document object:
    documents = get_documents(textfiles)
    logging.info("Constructing corpus object...")
    # Create cophi.model.Corpus object:
    corpus = cophi.model.Corpus(documents)
    logging.info("Fetching stopwords...")
    # Get stopwords:
    stopwords = get_stopwords(data, corpus)
    logging.info("Fetching hapax legomena...")
    # Get hapax legomena:
    hapax = corpus.hapax
    # Join both lists:
    features = set(stopwords).union(set(hapax))
    logging.info("Cleaning corpus...")
    # Clean document-term matrix:
    dtm = corpus.drop(corpus.dtm, features)
    # Get sizes:
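The tail of preprocess() unions stopwords with hapax legomena and drops the result from the document-term matrix. Assuming corpus.drop(corpus.dtm, features) removes those columns (the cophi implementation is not shown here), the operation reduces to a pandas column drop:

import pandas as pd

# Toy stand-in for corpus.dtm: one row per document, one column per word type.
dtm = pd.DataFrame({"the": [9, 7], "whale": [4, 0],
                    "zeugma": [1, 0], "sea": [2, 3]},
                   index=["doc1", "doc2"])

stopwords = ["the"]                        # or corpus.mfw(data["mfw"])
hapax = ["zeugma"]                         # types occurring exactly once
features = set(stopwords).union(set(hapax))  # the same union preprocess() builds

clean = dtm.drop(columns=list(features))
print(clean.columns.tolist())              # ['whale', 'sea']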
4 changes: 4 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,4 @@
import pytest

def test_test():
    assert True
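The placeholder only proves the pytest wiring works. A natural next test in this file would pin down real behavior, for instance the suffix check from utils.load_textfile, mirrored here without touching the database (the test name and cases are suggestions, not part of the commit):

import pytest

@pytest.mark.parametrize("suffix, markup_expected", [
    (".xml", True),
    (".html", True),
    (".txt", False),
])
def test_markup_suffixes(suffix, markup_expected):
    # Mirrors the condition load_textfile uses to decide on remove_markup():
    assert (suffix in {".xml", ".html"}) == markup_expected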
