Skip to content

Commit

Permalink
adding postgres, updating mysql support. +small style changes
Browse files Browse the repository at this point in the history
  • Loading branch information
jaredly committed Mar 4, 2013
1 parent 9062f82 commit a0f6ea2
Show file tree
Hide file tree
Showing 24 changed files with 112 additions and 69 deletions.
17 changes: 17 additions & 0 deletions README.rst
Expand Up @@ -70,6 +70,23 @@ Start up the web server with the following command::


and then open a web browser and navigate to http://localhost:8000/.


POSTGRESQL
==========

Using PostgreSQL can be dramatically faster. Because it took some hunting to
get it working properly, here is how to set it up on Fedora::

    sudo yum install postgresql*
    sudo service postgresql initdb
    sudo service postgresql start
sudo -u postgres createuser --superuser $USER
sudo -u postgres psql
## here type `\password $USER` and then type in a new password
## then type `\q` to exit
createdb topicalguide
## based off of https://help.ubuntu.com/community/PostgreSQL

Contributing Contributing
============ ============


Expand Down
12 changes: 12 additions & 0 deletions commentary.rst
@@ -1,3 +1,15 @@

Performance:
- postgresql
- 1:49 for analysis import
- :13 for dataset import
- :14 document pairwise topic analysis
- 50 SECONDS for chord diagram reload
- mysql
- :16 for dataset import
- 2:10 for analysis import


Backend Running Sequence: Backend Running Sequence:


-- extract_data -- extract_data
Expand Down
17 changes: 10 additions & 7 deletions import_tool/backend.py
Expand Up @@ -41,16 +41,20 @@
# Allow specification of multiple num_topics # Allow specification of multiple num_topics
# #


if __name__ == "__main__":
raise Exception("This file is only meant to be run by doit. "
"use ./run_import.py to run the backend import")

import codecs import codecs
import datetime import datetime
import hashlib import hashlib
import os import os
import sys import sys


from topic_modeling.tools import setup_logging, logging
setup_logging()
logger = logging.getLogger('root')

if __name__ == "__main__":
logger.warn("This file is only meant to be run by doit. "
"use ./run_import.py to run the backend import")

from collections import defaultdict from collections import defaultdict
from subprocess import Popen, PIPE from subprocess import Popen, PIPE


Expand Down Expand Up @@ -82,9 +86,6 @@
from topic_modeling.visualize.models import PairwiseDocumentMetric from topic_modeling.visualize.models import PairwiseDocumentMetric
from topic_modeling.visualize.models import TopicNameScheme from topic_modeling.visualize.models import TopicNameScheme


from topic_modeling.tools import setup_logging
setup_logging()

try: try:
from import_tool.local_settings import LOCAL_DIR, build from import_tool.local_settings import LOCAL_DIR, build
except ImportError: except ImportError:
Expand Down Expand Up @@ -869,6 +870,7 @@ def task_compile_java():
clean = ['rm -rf ' + c['java_bin']] clean = ['rm -rf ' + c['java_bin']]
return {'actions':actions, 'result_dep':result_deps, 'clean':clean} return {'actions':actions, 'result_dep':result_deps, 'clean':clean}


'''
if 'task_graphs' not in locals(): if 'task_graphs' not in locals():
def task_graphs(): def task_graphs():
classpath = '{0}:{1}/lib/gephi-toolkit.jar:{1}/lib/statnlp-rev562.jar:{1}/lib/{2}'.format(c['java_bin'], c['java_base'], c['db_jar']) classpath = '{0}:{1}/lib/gephi-toolkit.jar:{1}/lib/statnlp-rev562.jar:{1}/lib/{2}'.format(c['java_bin'], c['java_base'], c['db_jar'])
Expand All @@ -894,6 +896,7 @@ def utd(_task, _values): return os.path.exists(graphs_img_dir)
task['name'] = ns.scheme_name() task['name'] = ns.scheme_name()
task['uptodate'] = [utd] task['uptodate'] = [utd]
yield task yield task
'''


# #
#def task_reset_db(): #def task_reset_db():
Expand Down
6 changes: 5 additions & 1 deletion import_tool/config.py
Expand Up @@ -164,7 +164,11 @@ def create_config(build_script):
c.default('db_jar', 'mysql-connector-java-5.1.18-bin.jar') c.default('db_jar', 'mysql-connector-java-5.1.18-bin.jar')
c.default('jdbc_path', 'jdbc:mysql://%s/%s?user=%s\&password=%s' c.default('jdbc_path', 'jdbc:mysql://%s/%s?user=%s\&password=%s'
% (c['mysql_server'], c['mysql_db'], c['mysql_user'], c['mysql_password'])) % (c['mysql_server'], c['mysql_db'], c['mysql_user'], c['mysql_password']))
else: raise Exception("Unknown database type '" + settings.DBTYPE + "'") elif settings.DBTYPE == 'postgres':
# do we really need stuff here?
pass
else:
raise Exception("Unknown database type '" + settings.DBTYPE + "'")
return c return c


build_script = get_buildscript(build) build_script = get_buildscript(build)
Expand Down
4 changes: 2 additions & 2 deletions import_tool/import_scripts/analysis_import.py
Expand Up @@ -30,15 +30,15 @@
from django.db import connection, transaction from django.db import connection, transaction


from topic_modeling.visualize.models import Analysis, Dataset, Topic from topic_modeling.visualize.models import Analysis, Dataset, Topic
from import_scripts.metadata import Metadata from import_tool.import_scripts.metadata import Metadata
from topic_modeling import settings from topic_modeling import settings
from topic_modeling.visualize.models import Document from topic_modeling.visualize.models import Document
from topic_modeling.tools import TimeLongThing from topic_modeling.tools import TimeLongThing
import logging import logging


from import_tool.config import config from import_tool.config import config


logger = logging.getLogger('console') logger = logging.getLogger('root')


def check_analysis(analysis_name, dataset_name): def check_analysis(analysis_name, dataset_name):
try: try:
Expand Down
4 changes: 2 additions & 2 deletions import_tool/import_scripts/dataset_import.py
Expand Up @@ -36,7 +36,7 @@
from topic_modeling import settings from topic_modeling import settings
import logging import logging


logger = logging.getLogger('console') logger = logging.getLogger('root')


def check_dataset(name): def check_dataset(name):
try: try:
Expand All @@ -47,7 +47,7 @@ def check_dataset(name):
return False return False
for document in dataset.documents.all(): for document in dataset.documents.all():
if not document.tokens.count(): if not document.tokens.count():
logging.warn('Dataset present, but not all documents are populated: %s %d' % (document.filename, document.pk)) logger.warn('Dataset present, but not all documents are populated: %s %d' % (document.filename, document.pk))
return False return False


def import_dataset(name, readable_name, description, metadata_filenames, def import_dataset(name, readable_name, description, metadata_filenames,
Expand Down
2 changes: 1 addition & 1 deletion import_tool/import_scripts/metadata.py
Expand Up @@ -13,7 +13,7 @@
from topic_modeling.tools import TimeLongThing from topic_modeling.tools import TimeLongThing
import logging import logging


logger = logging.getLogger('console') logger = logging.getLogger('root')


datetime_format = "%Y-%m-%dT%H:%M:%S" datetime_format = "%Y-%m-%dT%H:%M:%S"


Expand Down
4 changes: 2 additions & 2 deletions import_tool/metric_scripts/analyses/__init__.py
Expand Up @@ -20,9 +20,9 @@
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL, # contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu. # Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.


from metric_scripts import MetricSet from .. import MetricSet


import entropy import entropy


metrics = MetricSet() metrics = MetricSet()
metrics['entropy'] = entropy metrics['entropy'] = entropy
2 changes: 1 addition & 1 deletion import_tool/metric_scripts/analyses/entropy.py
Expand Up @@ -24,7 +24,7 @@
from topic_modeling.visualize.models import AnalysisMetric, AnalysisMetricValue from topic_modeling.visualize.models import AnalysisMetric, AnalysisMetricValue


import logging import logging
logger = logging.getLogger('console') logger = logging.getLogger('root')


def add_metric(analysis): def add_metric(analysis):
metric, _ = AnalysisMetric.objects.get_or_create(name="Topic Entropy") metric, _ = AnalysisMetric.objects.get_or_create(name="Topic Entropy")
Expand Down
4 changes: 2 additions & 2 deletions import_tool/metric_scripts/datasets/__init__.py
Expand Up @@ -20,9 +20,9 @@
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL, # contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu. # Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.


from metric_scripts import MetricSet from .. import MetricSet


import counts import counts


metrics = MetricSet() metrics = MetricSet()
metrics['counts'] = counts metrics['counts'] = counts
2 changes: 1 addition & 1 deletion import_tool/metric_scripts/documents/__init__.py
Expand Up @@ -21,7 +21,7 @@
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu. # Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.




from metric_scripts import MetricSet from .. import MetricSet


import token_count import token_count
import type_count import type_count
Expand Down
6 changes: 3 additions & 3 deletions import_tool/metric_scripts/documents/pairwise/__init__.py
Expand Up @@ -20,9 +20,9 @@
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL, # contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu. # Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.


from metric_scripts import MetricSet from ... import MetricSet
from metric_scripts.documents.pairwise import topic_correlation, word_correlation from ...documents.pairwise import topic_correlation, word_correlation


metrics = MetricSet() metrics = MetricSet()
metrics['topic_correlation'] = topic_correlation metrics['topic_correlation'] = topic_correlation
metrics['word_correlation'] = word_correlation metrics['word_correlation'] = word_correlation
35 changes: 27 additions & 8 deletions import_tool/metric_scripts/documents/pairwise/topic_correlation.py
Expand Up @@ -37,17 +37,21 @@
from topic_modeling.visualize.models import PairwiseDocumentMetricValue from topic_modeling.visualize.models import PairwiseDocumentMetricValue


from topic_modeling.tools import TimeLongThing from topic_modeling.tools import TimeLongThing
import logging
logger = logging.getLogger('root')


metric_name = "Topic Correlation" metric_name = "Topic Correlation"


# @transaction.commit_manually # @transaction.commit_manually
def add_metric(dataset, analysis): def add_metric(dataset, analysis):
print 'begginning metric'
sys.stdout.flush()
logger.info('beginning metric: document > pairwise > topic correlation')
try: try:
dataset = Dataset.objects.get(name=dataset) dataset = Dataset.objects.get(name=dataset)
analysis = Analysis.objects.get(dataset=dataset, name=analysis) analysis = Analysis.objects.get(dataset=dataset, name=analysis)
metric,created = PairwiseDocumentMetric.objects.get_or_create(name=metric_name, analysis=analysis) metric,created = PairwiseDocumentMetric.objects.get_or_create(name=metric_name, analysis=analysis)
if not created and PairwiseDocumentMetricValue.objects.filter(metric=metric).count(): if not created and PairwiseDocumentMetricValue.objects.filter(metric=metric).count():
# transaction.rollback()
raise RuntimeError("%s is already in the database for this" raise RuntimeError("%s is already in the database for this"
" analysis" % metric_name) " analysis" % metric_name)


Expand All @@ -56,19 +60,34 @@ def add_metric(dataset, analysis):
for i, topic in enumerate(topics): for i, topic in enumerate(topics):
topic_idx[topic] = i topic_idx[topic] = i


documents = dataset.documents.all() documents = list(dataset.documents.all())
doctopicvectors = [document_topic_vector(doc, topic_idx) for doc in documents] logger.info('Generating document topic vectors')
print 'gen t vectors'
sys.stdout.flush()
num_docs = len(documents)

timer = TimeLongThing(num_docs, 1, .05)
doctopicvectors = []
for doc in documents:
timer.inc()
doctopicvectors.append(document_topic_vector(doc, topic_idx))

vectornorms = [norm(vector) for vector in doctopicvectors] vectornorms = [norm(vector) for vector in doctopicvectors]


# start = datetime.now() logger.info('Comparing the vectors')
print 'compare vectors'
sys.stdout.flush()
timer = TimeLongThing(len(documents)**2, 5, 100)
for i, doc1 in enumerate(documents): for i, doc1 in enumerate(documents):
write('.')
# print >> sys.stderr, 'Working on document', i, 'out of', num_docs # print >> sys.stderr, 'Working on document', i, 'out of', num_docs
# print >> sys.stderr, 'Time for last document:', datetime.now() - start # print >> sys.stderr, 'Time for last document:', datetime.now() - start
# start = datetime.now() # start = datetime.now()
logger.info('Working on document %d out of %d' % (i, num_docs))
doc1_topic_vals = doctopicvectors[i] doc1_topic_vals = doctopicvectors[i]
doc1_norm = vectornorms[i] doc1_norm = vectornorms[i]
for j, doc2 in enumerate(documents): for j, doc2 in enumerate(documents):
timer.inc()
sys.stderr.flush()
doc2_topic_vals = doctopicvectors[j] doc2_topic_vals = doctopicvectors[j]
doc2_norm = vectornorms[j] doc2_norm = vectornorms[j]
correlation_coeff = pmcc(doc1_topic_vals, doc2_topic_vals, correlation_coeff = pmcc(doc1_topic_vals, doc2_topic_vals,
Expand All @@ -89,8 +108,8 @@ def metric_names_generated(_dataset, _analysis):
return [metric_name] return [metric_name]


def write(s): def write(s):
sys.stdout.write(s) sys.stderr.write(s)
sys.stdout.flush() sys.stderr.flush()


def pmcc(doc1_topic_vals, doc2_topic_vals, doc1_norm, doc2_norm): def pmcc(doc1_topic_vals, doc2_topic_vals, doc1_norm, doc2_norm):
return float(dot(doc1_topic_vals, doc2_topic_vals) / return float(dot(doc1_topic_vals, doc2_topic_vals) /
Expand Down
2 changes: 1 addition & 1 deletion import_tool/metric_scripts/topics/__init__.py
Expand Up @@ -21,7 +21,7 @@
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu. # Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.




from metric_scripts import MetricSet from .. import MetricSet


import alpha import alpha
import attribute_entropy import attribute_entropy
Expand Down
4 changes: 2 additions & 2 deletions import_tool/metric_scripts/topics/pairwise/__init__.py
Expand Up @@ -19,8 +19,8 @@
# If you have inquiries regarding any further use of the Topical Guide, please # If you have inquiries regarding any further use of the Topical Guide, please
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL, # contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu. # Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.
from metric_scripts.metric_set import MetricSet from ...metric_set import MetricSet
from metric_scripts.topics.pairwise import document_correlation, pairwise_coherence, word_correlation import document_correlation, pairwise_coherence, word_correlation


metrics = MetricSet() metrics = MetricSet()
metrics['document_correlation'] = document_correlation metrics['document_correlation'] = document_correlation
Expand Down
Expand Up @@ -37,7 +37,7 @@


from optparse import OptionParser from optparse import OptionParser


from metric_scripts.topics.coherence import compute_pmi from ...topics.coherence import compute_pmi
from topic_modeling.visualize.models import Analysis from topic_modeling.visualize.models import Analysis
from topic_modeling.visualize.models import PairwiseTopicMetric from topic_modeling.visualize.models import PairwiseTopicMetric
from topic_modeling.visualize.models import PairwiseTopicMetricValue from topic_modeling.visualize.models import PairwiseTopicMetricValue
Expand Down
Expand Up @@ -59,8 +59,8 @@ def add_metric(dataset, analysis):
for j, topic2 in enumerate(topics): for j, topic2 in enumerate(topics):
topic2_word_vals = topicwordvectors[j] topic2_word_vals = topicwordvectors[j]
correlation_coeff = pmcc(topic1_word_vals, topic2_word_vals) correlation_coeff = pmcc(topic1_word_vals, topic2_word_vals)
if not correlation_coeff or numpy.isnan(correlation_coeff): # if not correlation_coeff or numpy.isnan(correlation_coeff):
raise Exception('Null correlation? %s %s %s %s' % (topic1, topic2, topic1_word_vals, topic2_word_vals)) # raise Exception('Null correlation? %s %s %s %s' % (topic1, topic2, topic1_word_vals, topic2_word_vals))
PairwiseTopicMetricValue.objects.create(topic1=topic1, PairwiseTopicMetricValue.objects.create(topic1=topic1,
topic2=topic2, metric=metric, value=correlation_coeff) topic2=topic2, metric=metric, value=correlation_coeff)
# transaction.commit() # transaction.commit()
Expand Down
17 changes: 9 additions & 8 deletions topic_modeling/logging.conf
Expand Up @@ -10,18 +10,19 @@
"class": "logging.StreamHandler", "class": "logging.StreamHandler",
"level": "DEBUG", "level": "DEBUG",
"formatter": "simple", "formatter": "simple",
"stream": "ext://sys.stdout" "stream": "ext://sys.stderr"
},
"outfile": {
"class": "logging.FileHandler",
"level": "DEBUG",
"formatter": "simple",
"filename": "tg-debug.log"
} }
}, },
"loggers": { "loggers": {
"console": { "root": {
"level": "DEBUG", "level": "DEBUG",
"handlers": ["console"], "handlers": ["console", "outfile"]
"propagate": "no",
"root": {
"level": "DEBUG",
"handlers": ["console"]
}
} }
} }
} }
2 changes: 1 addition & 1 deletion topic_modeling/media/scripts/charts/circle.js
Expand Up @@ -106,7 +106,7 @@ var CircleControls = Backbone.View.extend({
/** /**
* This is a Circle visualization * This is a Circle visualization
*/ */
var CircleViewer = MainView.add(ZoomableView, { var CircleViewer = MainView.add({
name: 'circle-topics', name: 'circle-topics',
title: 'Circle Diagram', title: 'Circle Diagram',
menu_class: CircleMenu, menu_class: CircleMenu,
Expand Down
4 changes: 2 additions & 2 deletions topic_modeling/media/scripts/charts/visualization.js
Expand Up @@ -222,8 +222,8 @@ var MainView = Backbone.View.extend({
*/ */
var VisualizationView = Backbone.View.extend({ var VisualizationView = Backbone.View.extend({
base_defaults: { base_defaults: {
width: 720, width: 630,
height: 720 height: 630
}, },
menu_class: null, menu_class: null,
info_class: null, info_class: null,
Expand Down
4 changes: 2 additions & 2 deletions topic_modeling/media/styles/fancy.css
Expand Up @@ -22,7 +22,7 @@ ul.nav li, ul.nav li a {


#main { #main {
float: left; float: left;
width: 720px; width: 630px;
position: relative; position: relative;
} }


Expand Down Expand Up @@ -85,7 +85,7 @@ body.loading #right-bar * {
#right-bar { #right-bar {
position: relative; position: relative;
float: right; float: right;
width: 210px; width: 300px;
} }


#right-bar .bar-item { #right-bar .bar-item {
Expand Down

0 comments on commit a0f6ea2

Please sign in to comment.