Skip to content

Commit

Permalink
chore: final touch
Browse files Browse the repository at this point in the history
  • Loading branch information
severinsimmler committed Nov 23, 2018
1 parent 895045d commit 34f0393
Show file tree
Hide file tree
Showing 16 changed files with 131 additions and 120 deletions.
1 change: 1 addition & 0 deletions application/__init__.py
@@ -1 +1,2 @@
from application import views
from application import gui
64 changes: 26 additions & 38 deletions application/database.py
@@ -1,9 +1,7 @@
import logging
import json
import sqlite3

import flask
import pandas as pd

from application import utils

Expand All @@ -27,8 +25,6 @@ def close_db(e=None):


def _insert_into_textfiles(db, data):
"""Insert data into textfiles table.
"""
for textfile in data:
title, content = utils.load_textfile(textfile)
logging.info("Insert '{}' into database...".format(title))
Expand All @@ -37,13 +33,15 @@ def _insert_into_textfiles(db, data):
VALUES(?, ?);
""", [title, content])


def _insert_into_token_freqs(db, data):
logging.info("Insert token frequencies into database...")
db.execute("""
INSERT INTO token_freqs (content)
VALUES(?);
""", [data])


def insert_into(table, data):
"""Insert data into database.
"""
Expand All @@ -61,7 +59,10 @@ def insert_into(table, data):
db.commit()
close_db()


def update(table, data):
"""Update table in database.
"""
db = get_db()
if table in {"textfiles"}:
_update_textfile_sizes(db, data)
Expand All @@ -85,13 +86,10 @@ def _insert_into_parameters(db, data):
db.execute("""
INSERT INTO parameters (content)
VALUES(?);
""",
[data])
""", [data])


def _insert_into_model(db, data):
"""Insert data into model table.
"""
logging.info("Insert topic model output into database...")
db.execute("""
INSERT INTO model (document_topic, topics, document_similarities, topic_similarities)
Expand All @@ -102,14 +100,11 @@ def _insert_into_model(db, data):


def _insert_into_stopwords(db, data):
"""Insert data into stopwords table.
"""
logging.info("Insert stopwords into database...")
db.execute("""
INSERT INTO stopwords (content)
VALUES(?);
""",
[data])
""", [data])


def select(value, **kwargs):
Expand Down Expand Up @@ -140,101 +135,94 @@ def select(value, **kwargs):
elif value in {"textfile_sizes"}:
return _select_textfile_sizes(cursor)


def _select_textfile_sizes(cursor):
logging.info("Selecting textfile sizes from database...")
logging.info("Select textfile sizes from database...")
return cursor.execute("""
SELECT title, size
FROM textfiles;
""").fetchall()


def _select_parameters(cursor):
logging.info("Selecting parameters from database...")
logging.info("Select parameters from database...")
return cursor.execute("""
SELECT content
FROM parameters;
""").fetchone()


def _select_stopwords(cursor):
logging.info("Selecting stopwords from database...")
logging.info("Select stopwords from database...")
return cursor.execute("""
SELECT content
FROM stopwords;
""").fetchone()[0]


def _select_document_similarities(cursor):
logging.info("Selecting document similarity matrix from database...")
logging.info("Select document similarity matrix from database...")
return cursor.execute("""
SELECT document_similarities
FROM model;
""").fetchone()[0]


def _select_topic_similarities(cursor):
logging.info("Selecting topic similarity matrix from database...")
logging.info("Select topic similarity matrix from database...")
return cursor.execute("""
SELECT topic_similarities
FROM model;
""").fetchone()[0]


def _select_token_freqs(cursor):
logging.info("Selecting token frequencies from database...")
logging.info("Select token frequencies from database...")
return cursor.execute("""
SELECT content
FROM token_freqs;
""").fetchone()[0]


def _select_textfiles(cursor):
"""Select textfiles from database.
"""
logging.info("Selecting textfiles from database...")
cursor.execute("""
logging.info("Select textfiles from database...")
return cursor.execute("""
SELECT title, content
FROM textfiles;
""")
return cursor.fetchall()
""").fetchall()


def _select_document_topic_distributions(cursor):
"""Select document-topic matrix form database.
"""
logging.info("Selecting document-topic distributions from database...")
logging.info("Select document-topic distributions from database...")
return cursor.execute("""
SELECT document_topic
FROM model;
""").fetchone()[0]


def _select_topics(cursor):
logging.info("Selecting topics from database...")
logging.info("Select topics from database...")
return cursor.execute("""
SELECT topics
FROM model;
""").fetchone()[0]


def _select_textfile(cursor, title):
logging.info("Selecting '{}' from database...".format(title))
logging.info("Select '{}' from database...".format(title))
return cursor.execute("""
SELECT content
FROM textfiles
WHERE title = ?;
""", [title]).fetchone()[0]


def _select_data_export(cursor):
"""Select model output from database.
"""
logging.info("Selecting stopwords from database...")
stopwords = cursor.execute("""
SELECT content
FROM stopwords;
""").fetchone()[0]
stopwords = _select_stopwords(cursor)

logging.info("Selecting model output from database...")
logging.info("Select model output from database...")
model = cursor.execute("""
SELECT document_topic, topics, document_similarities, topic_similarities
FROM model;
""").fetchone()
return model, stopwords
return model, stopwords
20 changes: 10 additions & 10 deletions application/templates/detail-document.html
Expand Up @@ -4,35 +4,35 @@
<main class="main">
<div class="main_content">
<h1>{{ title }}</h1>
<p>

Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquid ex ea commodi consequat. Quis aute iure reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint obcaecat cupiditat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>Here you can see the distribution of all topics in the document – just hover the bar. In addition, the
original text, the {{ top_topics[0] }} topics, and the three most similar documents are displayed. For the
latter, the <a onclick="window.open('https://en.wikipedia.org/wiki/Cosine_similarity', 'Cosine Similarity', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">cosine similarity</a> between all <i>document vectors</i> was calculated and ranked.</p>
<h4>Topic Distribution</h4>
<p class="chart">
{% for topic, value in distribution %}
<span data-balloon="{{ topic }}" data-balloon-pos="down" class="block" style="width: {{ value }}%;"></span>
<span data-balloon="{{ topic }}" data-balloon-pos="down" class="block" style="width: {{ value }}%;"></span>
{% endfor %}
</p>
<div class="row -narrow" style="border-top: none;">
<div class="column">
<h4 style=" margin-top: 0px;">Top 10: Related Topics</h4>
<h4 style=" margin-top: 0px;">{{ top_topics[1] }}: Related Topics</h4>
{% for topic in related_topics %}
<p><a class="main_button" style="width: 100%;" href="{{ url_for('topics', topic=topic) }}">{{ topic }}</a></p>
<p><a class="main_button" style="width: 100%;" href="{{ url_for('topics', topic=topic) }}">{{ topic }}</a></p>
{% endfor %}
<h4>Top 3: Similar Documents</h4>
{% for title in similar_documents %}
<p><a class="main_button" style="width: 100%;" href="{{ url_for('documents', title=title) }}">{{ title }}</a></p>
<p><a class="main_button" style="width: 100%;" href="{{ url_for('documents', title=title) }}">{{ title
}}</a></p>
{% endfor %}
</div>
<div class="column -wide">
<h4 style="margin-top: 0px;">Original Text</h4>
{% for paragraph in text %}
<p style="text-align: justify;">{{ paragraph }}</p>
<p style="text-align: justify;">{{ paragraph }}</p>
{% endfor %}
</div>
</div>
</div>
</div>
</main>
{% endblock %}
13 changes: 5 additions & 8 deletions application/templates/detail-topic.html
Expand Up @@ -4,10 +4,10 @@
<main class="main">
<div class="main_content">
<h1>{{ topic }}</h1>
<p>

Lorem ipsum dolor sit amet, consectetur adipisici elit, sed eiusmod tempor incidunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquid ex ea commodi consequat. Quis aute iure reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint obcaecat cupiditat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</p>
<p>On this page you can find the 15 most relevant words for this topic, as well as the 10 most relevant
documents, whose bar width indicates the respective weight, and the three most similar topics, where the <a
onclick="window.open('https://en.wikipedia.org/wiki/Cosine_similarity', 'Cosine Similarity', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">cosine similarity</a> between all <i>topic vectors</i> was calculated and ranked.</p>
<div class="row -narrow" style="border-top: none;">
<div class="column">
<h4 style=" margin-top: 0px;">Top 15: Related Words</h4>
Expand All @@ -26,13 +26,10 @@ <h4>Top 3: Similar Topics</h4>
<div class="column -wide">
<h4 style="margin-top: 0px;">Top 10: Related Documents</h4>
{% for title, proportion in related_documents %}
<p><a class="main_button" style="width: {{ proportion }}%;" href="{{ url_for('documents', title=title) }}">{{ title
}}</a></p>
<p><a class="main_button" style="width: {{ proportion }}%;" href="{{ url_for('documents', title=title) }}">{{ title }}</a></p>
{% endfor %}
</div>
</div>
</div>
</div>
</main>

{% endblock %}
25 changes: 14 additions & 11 deletions application/templates/document-topic-distributions.html
Expand Up @@ -5,30 +5,34 @@
<div class="main_content">
<h1>Document-Topic Distributions</h1>
<p>Each document consists to a certain extent of each topic, which is one of the theoretical assumptions of
topic models. Visualizing those proportions in a heatmap displays the kind of information that is probably
most useful to literary scholars. Going beyond pure exploration, it can be used to show thematic
developments over a set of texts as well as a single text, akin to a dynamic topic model.</p>
topic models. Although some values are <i>too small</i> to be visualized here (and have therefore been
rounded to zero), they <i>are</i> actually greater than zero. Just export the data in the menu bar and take
a look at the document-topic matrix.
</p>
<p>Visualizing the document-topic proportions in a heatmap displays the kind of information that is probably
most useful. Going beyond pure exploration, it can be used to show thematic developments over a set of
texts, akin to a dynamic topic model.</p>
<p id="document-topic-heatmap"></p>
</div>
<script>
function formatData(data) {
var series = [];
for (var document in data) {
var values = {
let series = [];
for (let document in data) {
let values = {
name: document,
data: []
};
for (var topic in data[document]) {
for (let topic in data[document]) {
values['data'].push({
x: topic,
y: data[document][topic]
});
}
};
series.push(values);
}
};
return series;
}
};

function getHeight(rows) {
if (rows.length < 20) {
Expand All @@ -43,7 +47,6 @@ <h1>Document-Topic Distributions</h1>
$.getJSON("{{ url_for('get_document_topic_distributions') }}", function (data) {
const series = formatData(data);
const height = getHeight(series);
console.log(height)
const options = {
chart: {
toolbar: {
Expand Down
3 changes: 2 additions & 1 deletion application/templates/error.html
Expand Up @@ -4,10 +4,11 @@
<main class="main">
<div class="main_content">
<h1>Something went wrong...</h1>
<p>It looks like something didn't work out the way it should. Below you see the last few lines of the logfile, maybe
<p>It looks like something didn't work out the way it should. Below you see the last few lines of the logfile, maybe
you can solve the problem on your own. If not, open a new issue on <a onclick="window.open('https://github.com/DARIAH-DE/TopicsExplorer/issues', 'Issues · DARIAH-DE/TopicsExplorer', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">GitHub</a>.</p>
<pre>{{ log }}</pre>
<p>In case you want to check the whole logfile, it is located in the directory <code>{{ tempdir }}</code>.</p>
</div>
</main>
{% endblock %}
19 changes: 10 additions & 9 deletions application/templates/help.html
Expand Up @@ -6,19 +6,20 @@
<h1>Help on Topics Explorer</h1>
<h2>About</h2>
<p>This application is designed to introduce topic modeling particularly gently (e.g. for educational purposes).
If you have a very large text corpus, you may wish to use more sophisticated models such as those
implemented in <a onclick="window.open('http://mallet.cs.umass.edu/topics.php', 'Topic Modeling', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">MALLET</a>, which is known to be more robust
than standard LDA. You might want to check out some <a onclick="window.open('https://github.com/DARIAH-DE/Topics/tree/master/notebooks', 'Topics/notebooks at master · DARIAH-DE/Topics', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">Jupyter notebooks</a> – experimenting with an example corpus on
If you have a very large text corpus, you may wish to use more <i>powerful</i> tools like <a onclick="window.open('http://mallet.cs.umass.edu/topics.php', 'Topic Modeling', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">MALLET</a>, which is written in Java and can be completely controlled from the command-line.
The topic modeling algorithm used in this application, <i>latent Dirichlet allocation</i>, was implemented
by <a onclick="window.open('https://www.ariddell.org/', 'Allen B. Riddell', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">Allen B. Riddell</a> using collapsed Gibbs sampling as described in <a onclick="window.open('http://www.genetics.org/content/155/2/945.full', 'Inference of Population Structure Using Multilocus Genotype Data', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">Pritchard et al. (2000)</a>.</p>
<p>You might want to check out some <a onclick="window.open('https://github.com/DARIAH-DE/Topics/tree/master/notebooks', 'Topics/notebooks at master · DARIAH-DE/Topics', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">Jupyter notebooks</a> for topic modeling in Python – experimenting with an example corpus on
<a onclick="window.open('https://mybinder.org/v2/gh/DARIAH-DE/Topics/master?filepath=notebooks%2FIntroducingLda.ipynb', 'Topic Modeling', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">Binder</a> does not require any software on your local machine.</p>
<h2>Issues</h2>
<p>Please use the project’s <a onclick="window.open('https://github.com/DARIAH-DE/TopicsExplorer/issues', 'Issues · DARIAH-DE/TopicsExplorer', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">issue tracker</a> on
GitHub, but consider first checking out the <a onclick="window.open('https://dariah-de.github.io/TopicsExplorer/#troubleshooting', 'DARIAH-DE Topics Explorer', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">troubleshooting
section</a> on the application’s website.</p>
href="#">issue tracker</a> on GitHub, but consider first checking out the <a onclick="window.open('https://dariah-de.github.io/TopicsExplorer/#troubleshooting', 'DARIAH-DE Topics Explorer', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
href="#">troubleshooting section</a> on the application’s website.</p>
<h2>What is topic modeling?</h2>
<ul>
<li><b>David M. Blei</b>, <a onclick="window.open('http://www.cs.columbia.edu/~blei/papers/Blei2012.pdf', 'David M. Blei: Probabilistic Topic Models', 'location=yes,height=550,width=1120,scrollbars=yes,status=yes');"
Expand Down

0 comments on commit 34f0393

Please sign in to comment.