Skip to content

Commit

Permalink
Update modeling page
Browse files: browse the repository at this point in the history
  • Loading branch information
severinsimmler committed May 29, 2018
1 parent e0bddaa commit aa9299c
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 13 deletions.
24 changes: 12 additions & 12 deletions application/modeling.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ def workflow(tempdir, archive_dir):
yield "running", "Collecting external stopwords list ...", "", "", "", "", ""
user_input["stopwords"] = flask.request.files["stopword_list"]
else:
yield "running", "Collecting threshold value for stopwords ...", "", "", "", "", ""
yield "running", "Collecting threshold for stopwords ...", "", "", "", "", ""
user_input["mfw"] = int(flask.request.form["mfw_threshold"])

parameter = pd.Series()
Expand All @@ -69,13 +69,14 @@ def workflow(tempdir, archive_dir):
tokens = list(dariah_topics.preprocessing.tokenize(text))
tokenized_corpus[filename.stem] = tokens
parameter["Corpus size (raw), in tokens"] += len(tokens)
file.flush()

excerpt_int = random.randint(0, len(tokenized_corpus) - 1)
excerpt = tokenized_corpus.iloc[excerpt_int]
token_int = random.randint(1, len(excerpt) - 71)
excerpt = "..." + " ".join(excerpt[token_int:token_int + 70]) + "..."

350

yield "running", "Creating document-term matrix ...", excerpt, "", "", "", ""
document_labels = tokenized_corpus.index
document_term_matrix = dariah_topics.preprocessing.create_document_term_matrix(tokenized_corpus, document_labels)
Expand All @@ -85,14 +86,13 @@ def workflow(tempdir, archive_dir):
corpus_stats = pd.DataFrame({"score": np.array(document_term_matrix.sum(axis=1)),
"group": group})

corpus_size = len(user_input["files"])
token_size = parameter["Corpus size (raw), in tokens"]
topic_size = user_input["num_topics"]
iteration_size = user_input["num_iterations"]
corpus_size = str(len(user_input["files"]))
token_size = str(parameter["Corpus size (raw), in tokens"])
topic_size = str(user_input["num_topics"])
iteration_size = str(user_input["num_iterations"])

yield "running", "Removing stopwords and hapax legomena from corpus ...", excerpt, corpus_size, token_size, topic_size, iteration_size
try:
yield "running", "Determining {0} most frequent words from corpus ...".format(user_input["mfw"]), "", "", "", "", ""
yield "running", "Determining {0} most frequent words ...".format(user_input["mfw"]), "", "", "", "", ""
stopwords = dariah_topics.preprocessing.find_stopwords(document_term_matrix, user_input["mfw"])
cleaning = "removed the <b>{0} most frequent words</b>, based on a threshold value".format(user_input["mfw"])
except KeyError:
Expand All @@ -114,16 +114,16 @@ def workflow(tempdir, archive_dir):
"group": group}))
parameter["Corpus size (clean), in tokens"] = int(document_term_matrix.values.sum())

yield "running", "Accessing the values of the document-term matrix ...", "", "", "", "", ""
yield "running", "Accessing document-term matrix ...", "", "", "", "", ""
document_term_arr = document_term_matrix.values.astype(int)
yield "running", "Accessing the vocabulary of the corpus ...", "", "", "", "", ""
yield "running", "Accessing vocabulary of the corpus ...", "", "", "", "", ""
vocabulary = document_term_matrix.columns

parameter["Size of vocabulary, in tokens"] = len(vocabulary)
parameter["Number of topics"] = user_input["num_topics"]
parameter["Number of iterations"] = user_input["num_iterations"]

yield "running", "Initializing LDA topic model (this step might take a while) ...", "", "", "", "", ""
yield "running", "Initializing LDA topic model ...", "", "", "", "", ""
model = application.utils.enthread(target=lda_modeling,
args=(document_term_arr,
user_input["num_topics"],
Expand Down Expand Up @@ -237,6 +237,6 @@ def workflow(tempdir, archive_dir):
"first_document": list(document_topics.columns)[0]}
yield "running", "Everything went well! The results page is currently being created ...", "", "", "", "", ""
application.utils.compress(data, str(pathlib.Path(tempdir, "data.pickle")))
yield "done"
yield "done", "", "", "", "", "", ""
except Exception as error:
yield "error", str(error)
2 changes: 1 addition & 1 deletion application/templates/modeling.html
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@
{% for identifier, state, excerpt, documents, tokens, topics, iterations in stream %}
<script>

var corpus = "If you are interested in some facts and figures: Your corpus consists of {} documents, or {} tokens. You have selected {} topics at {} iterations."
var corpus = "If you are interested in some facts and figures: Your corpus consists of <b>{} documents</b>, or <b>{} tokens</b>. You have selected <b>{} topics</b> at <b>{} iterations</b>."

String.prototype.format = function () {
var i = 0,
Expand Down

0 comments on commit aa9299c

Please sign in to comment.