Skip to content

Commit

Permalink
pep8
Browse files Browse the repository at this point in the history
  • Loading branch information
severinsimmler committed Dec 21, 2018
1 parent 3fddeae commit 30cfe21
Show file tree
Hide file tree
Showing 6 changed files with 80 additions and 55 deletions.
30 changes: 15 additions & 15 deletions application/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ def _insert_into_textfiles(db, data):
if content:
logging.info("Insert '{}' into database...".format(title))
db.execute("""
INSERT INTO textfiles (title, content)
INSERT INTO textfiles (title, content)
VALUES(?, ?);
""", [title, content])


def _insert_into_token_freqs(db, data):
    """Insert token frequencies into the ``token_freqs`` table.

    Parameters:
        db: Object offering ``execute`` (database connection or cursor).
        data: Value written to the ``content`` column.
    """
    logging.info("Insert token frequencies into database...")
    db.execute("""
               INSERT INTO token_freqs (content)
               VALUES(?);
               """, [data])

Expand Down Expand Up @@ -75,9 +75,9 @@ def _update_textfile_sizes(db, data):
logging.info("Update textfile sizes in database...")
for title, size in data.items():
db.execute("""
UPDATE textfiles
SET size = ?
WHERE title = ?;
UPDATE textfiles
SET size = ?
WHERE title = ?;
""",
[size, title])

Expand Down Expand Up @@ -148,71 +148,71 @@ def _select_textfile_sizes(cursor):
def _select_parameters(cursor):
    """Fetch the first row of the ``parameters`` table.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The whole row as returned by ``fetchone`` (not only its first
        column), or ``None`` if the table is empty.
    """
    logging.info("Select parameters from database...")
    return cursor.execute("""
                          SELECT content
                          FROM parameters;
                          """).fetchone()


def _select_stopwords(cursor):
    """Fetch the ``content`` value of the first row in ``stopwords``.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The first column of the first row.

    Raises:
        TypeError: If the table is empty (``fetchone`` gives ``None``).
    """
    logging.info("Select stopwords from database...")
    return cursor.execute("""
                          SELECT content
                          FROM stopwords;
                          """).fetchone()[0]


def _select_document_similarities(cursor):
    """Fetch ``document_similarities`` from the first row of ``model``.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The stored value of the ``document_similarities`` column.

    Raises:
        TypeError: If the table is empty (``fetchone`` gives ``None``).
    """
    logging.info("Select document similarity matrix from database...")
    return cursor.execute("""
                          SELECT document_similarities
                          FROM model;
                          """).fetchone()[0]


def _select_topic_similarities(cursor):
    """Fetch ``topic_similarities`` from the first row of ``model``.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The stored value of the ``topic_similarities`` column.

    Raises:
        TypeError: If the table is empty (``fetchone`` gives ``None``).
    """
    logging.info("Select topic similarity matrix from database...")
    return cursor.execute("""
                          SELECT topic_similarities
                          FROM model;
                          """).fetchone()[0]


def _select_token_freqs(cursor):
    """Fetch the ``content`` value of the first row in ``token_freqs``.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The first column of the first row.

    Raises:
        TypeError: If the table is empty (``fetchone`` gives ``None``).
    """
    logging.info("Select token frequencies from database...")
    return cursor.execute("""
                          SELECT content
                          FROM token_freqs;
                          """).fetchone()[0]


def _select_textfiles(cursor):
    """Fetch title and content of every row in ``textfiles``.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The result of ``fetchall``: one ``(title, content)`` row per
        textfile, empty if the table has no rows.
    """
    logging.info("Select textfiles from database...")
    return cursor.execute("""
                          SELECT title, content
                          FROM textfiles;
                          """).fetchall()


def _select_document_topic_distributions(cursor):
    """Fetch ``document_topic`` from the first row of ``model``.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The stored value of the ``document_topic`` column.

    Raises:
        TypeError: If the table is empty (``fetchone`` gives ``None``).
    """
    logging.info("Select document-topic distributions from database...")
    return cursor.execute("""
                          SELECT document_topic
                          FROM model;
                          """).fetchone()[0]


def _select_topics(cursor):
    """Fetch ``topics`` from the first row of ``model``.

    Parameters:
        cursor: Database cursor offering ``execute``.

    Returns:
        The stored value of the ``topics`` column.

    Raises:
        TypeError: If the table is empty (``fetchone`` gives ``None``).
    """
    logging.info("Select topics from database...")
    return cursor.execute("""
                          SELECT topics
                          FROM model;
                          """).fetchone()[0]


def _select_textfile(cursor, title):
    """Fetch the content of the textfile with the given title.

    Parameters:
        cursor: Database cursor offering ``execute``.
        title: Title to look up; passed as a bound SQL parameter.

    Returns:
        The ``content`` column of the first matching row.

    Raises:
        TypeError: If no row matches (``fetchone`` gives ``None``).
    """
    logging.info("Select '{}' from database...".format(title))
    return cursor.execute("""
                          SELECT content
                          FROM textfiles
                          WHERE title = ?;
                          """, [title]).fetchone()[0]
Expand All @@ -223,7 +223,7 @@ def _select_data_export(cursor):

logging.info("Select model output from database...")
model = cursor.execute("""
SELECT document_topic, topics, document_similarities, topic_similarities
SELECT document_topic, topics, document_similarities, topic_similarities
FROM model;
""").fetchone()
return model, stopwords
47 changes: 24 additions & 23 deletions application/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,30 +16,31 @@

# This is for high DPI scaling. Each attribute is only set when the
# installed Qt exposes it:
if hasattr(QtCore.Qt, "AA_EnableHighDpiScaling"):
    QtWidgets.QApplication.setAttribute(
        QtCore.Qt.AA_EnableHighDpiScaling, True)
if hasattr(QtCore.Qt, "AA_UseHighDpiPixmaps"):
    QtWidgets.QApplication.setAttribute(QtCore.Qt.AA_UseHighDpiPixmaps, True)


def download_request(item):
    """Opens a file dialog to choose where a download is saved.

    The file extension is derived from the item's MIME type
    (".png" for "octet-stream", ".svg", ".zip", otherwise none) and
    appended to the path picked in the dialog.

    Parameters:
        item: Download item offering ``mimeType``, ``setPath``,
            ``accept`` and ``cancel``.
    """
    mimetype = item.mimeType()
    if "octet-stream" in mimetype:
        ext = ".png"
    elif "svg" in mimetype:
        ext = ".svg"
    elif "zip" in mimetype:
        ext = ".zip"
    else:
        ext = ""

    path = QtWidgets.QFileDialog.getSaveFileName(
        None,
        "Select destination folder and file name",
        "",
        "")[0]
    if not path:
        # Dialog dismissed: accepting now would save to a file named
        # after the bare extension, so drop the download instead.
        item.cancel()
        return
    item.setPath("{path}{ext}".format(path=path, ext=ext))
    item.accept()


class ApplicationThread(QtCore.QThread):
Expand Down Expand Up @@ -83,15 +84,15 @@ def init_gui(application, port=PORT, argv=None, title=TITLE, icon=ICON):
qtapp = QtWidgets.QApplication(argv)
web = ApplicationThread(application, port)
web.start()

def kill(application=web):
"""Kill the Flask process.
"""
application.terminate()

qtapp.aboutToQuit.connect(kill)

# Setting width and height individually based on the
# Setting width and height individually based on the
# screen resolution: 93% of the screen for width,
# 80% for height:
screen = qtapp.primaryScreen()
Expand All @@ -104,11 +105,11 @@ def kill(application=web):
webview.resize(width, height)
webview.setWindowTitle(title)
webview.setWindowIcon(QtGui.QIcon(icon))

page = WebPage('http://localhost:{}'.format(port))
page.home()
webview.setPage(page)

# If the user clicks a download button, a window pops up:
webview.page().profile().downloadRequested.connect(download_request)

Expand Down
14 changes: 9 additions & 5 deletions application/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,8 @@ def get_topics(model, vocabulary, maximum=100):
"""
logging.info("Fetching topics from topic model...")
for distribution in model.topic_word_:
words = list(np.array(vocabulary)[np.argsort(distribution)][:-maximum-1:-1])
words = list(np.array(vocabulary)[
np.argsort(distribution)][:-maximum - 1:-1])
yield "{}, ...".format(", ".join(words[:3])), words


Expand Down Expand Up @@ -203,17 +204,20 @@ def export_data():

logging.info("Preparing document-topic distributions...")
document_topic = pd.read_json(document_topic, orient="index")
document_topic.columns = [col.replace(",", "").replace(" ...", "") for col in document_topic.columns]
document_topic.columns = [col.replace(",", "").replace(
" ...", "") for col in document_topic.columns]

logging.info("Preparing topics...")
topics = pd.read_json(topics, orient="index")
topics.index = ["Topic {}".format(n) for n in range(topics.shape[0])]
topics.columns = ["Word {}".format(n) for n in range(topics.shape[1])]
topics.columns = ["Word {}".format(n) for n in range(topics.shape[1])]

logging.info("Preparing topic similarity matrix...")
topic_similarities = pd.read_json(topic_similarities)
topic_similarities.columns = [col.replace(",", "").replace(" ...", "") for col in topic_similarities.columns]
topic_similarities.index = [ix.replace(",", "").replace(" ...", "") for ix in topic_similarities.index]
topic_similarities.columns = [col.replace(",", "").replace(
" ...", "") for col in topic_similarities.columns]
topic_similarities.index = [ix.replace(",", "").replace(
" ...", "") for ix in topic_similarities.index]

logging.info("Preparing document similarity matrix...")
document_similarities = pd.read_json(document_similarities)
Expand Down
28 changes: 19 additions & 9 deletions application/views.py
Original file line number Diff line number Diff line change
Expand Up @@ -172,18 +172,22 @@ def topics(topic):
logging.info("Get topics...")
topics = json.loads(get_topics())
logging.info("Get document-topic distributions...")
document_topic = pd.read_json(get_document_topic_distributions(), orient="index")
document_topic = pd.read_json(
get_document_topic_distributions(), orient="index")
logging.info("Get topic similarity matrix...")
topic_similarites = pd.read_json(get_topic_similarities())

logging.info("Get related documents...")
related_docs = document_topic[topic].sort_values(ascending=False)[:10]
related_docs_proportions = utils.scale(related_docs, minimum=70)
related_docs_proportions = pd.Series(related_docs_proportions, index=related_docs.index)
related_docs_proportions = related_docs_proportions.sort_values(ascending=False)
related_docs_proportions = pd.Series(
related_docs_proportions, index=related_docs.index)
related_docs_proportions = related_docs_proportions.sort_values(
ascending=False)

# Convert pandas.Series to a 2-D array:
related_docs_proportions = list(utils.series2array(related_docs_proportions))
related_docs_proportions = list(
utils.series2array(related_docs_proportions))

logging.info("Get related words...")
related_words = topics[topic][:15]
Expand Down Expand Up @@ -213,19 +217,24 @@ def documents(title):
logging.info("Get textfiles...")
text = get_textfile(title)
logging.info("Get document-topics distributions...")
document_topic = pd.read_json(get_document_topic_distributions(), orient="index")
document_topic = pd.read_json(
get_document_topic_distributions(), orient="index")
logging.info("Get document similarity matrix...")
document_similarites = pd.read_json(get_document_similarities())

logging.info("Get related topics...")
related_topics = document_topic.loc[title].sort_values(ascending=False) * 100
related_topics = document_topic.loc[title].sort_values(
ascending=False) * 100
distribution = list(related_topics.to_dict().items())

logging.info("Get similar documents...")
similar_docs = document_similarites[title].sort_values(ascending=False)[1:4]
similar_docs = document_similarites[title].sort_values(ascending=False)[
1:4]

logging.debug("Use only the first 10000 characters (or less) from document...")
text = text if len(text) < 10000 else "{}... This was an excerpt of the original text.".format(text[:10000])
logging.debug(
"Use only the first 10000 characters (or less) from document...")
text = text if len(
text) < 10000 else "{}... This was an excerpt of the original text.".format(text[:10000])

logging.debug("Split paragraphs...")
text = text.split("\n\n")
Expand Down Expand Up @@ -379,6 +388,7 @@ def handle_http_exception(e):
"""
return error()


# Register the same handler for every default werkzeug HTTP exception:
for code in werkzeug.exceptions.default_exceptions:
    web.errorhandler(code)(handle_http_exception)

Expand Down
6 changes: 4 additions & 2 deletions application/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,8 +43,10 @@ def wrapper():
topics, descriptors, document_topic = get_model_output(model, dtm)
logging.info("Got model output.")
# 4. Calculate similarities:
topic_similarities, document_similarities = get_similarities(document_topic)
logging.info("Successfully calculated topic and document similarities.")
topic_similarities, document_similarities = get_similarities(
document_topic)
logging.info(
"Successfully calculated topic and document similarities.")

data = {"document_topic": document_topic.to_json(orient="index", force_ascii=False),
"topics": json.dumps(topics, ensure_ascii=False),
Expand Down
10 changes: 9 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,17 @@ def test_init_app():
app = utils.init_app(TEST_STRING)
assert app.name == TEST_STRING


def test_init_logging():
    """Placeholder: no assertions have been written yet."""
    # TODO
    pass


def test_init_db():
    """Placeholder: no assertions have been written yet."""
    # TODO
    pass


def test_format_logging():
a = "n_documents: 1"
b = "vocab_size: 1"
Expand Down Expand Up @@ -55,32 +58,37 @@ def test_remove_markup():
text = "<tag>{}</anothertag>".format(TEST_STRING)
utils.remove_markup(text)


def test_get_documents():
    """Check that ``utils.get_documents`` keeps title and text of a file."""
    textfiles = [("A", "This is a document.")]
    documents = list(utils.get_documents(textfiles))
    # Guard against a vacuous pass: with no documents the loop below
    # would never run a single assertion.
    assert documents
    for document in documents:
        assert document.title == "A"
        assert document.text == "This is a document."


def test_get_stopwords():
    """Placeholder: no assertions have been written yet."""
    # TODO
    pass


def test_get_data():
    """Placeholder: no assertions have been written yet."""
    # TODO
    pass


def test_get_topics():
    """Placeholder: no assertions have been written yet."""
    # TODO
    pass


def test_get_document_topic():
    """Placeholder: no assertions have been written yet."""
    # TODO
    pass


def test_get_cosine():
    """Check the summed result of ``utils.get_cosine`` for a 2x2 matrix."""
    matrix = np.array([[1, 2], [1, 3]])
    descriptors = ["A", "B"]
    similarites = utils.get_cosine(matrix, descriptors)
    # Compare with a tolerance: exact equality on a computed float sum
    # can break across platforms and library versions.
    assert np.isclose(similarites.sum().sum(), 3.9611613513818402)

0 comments on commit 30cfe21

Please sign in to comment.