Permalink
Browse files

adding postgres, updating mysql support. +small style changes

  • Loading branch information...
1 parent 9062f82 commit a0f6ea20dda8f519f48fbdb37245007c3a981015 @jaredly jaredly committed Mar 4, 2013
View
17 README.rst
@@ -70,6 +70,23 @@ Start up the web server with the following command::
and then open a web browser and navigate to http://localhost:8000/.
+POSTGRESQL
+==========
+
+Using PostgreSQL as the database backend can be much faster. Because it took me
+a bit of hunting to get it working, here's how to set it up on Fedora::
+
+ sudo yum install postgres*
+ sudo service postgresql initdb
+ sudo service postgresql start
+ sudo -u postgres createuser --superuser $USER
+ sudo -u postgres psql
+ ## at the psql prompt, type `\password <your username>` (psql does not expand $USER) and enter a new password
+ ## then type `\q` to exit psql
+ createdb topicalguide
+
+ ## based on https://help.ubuntu.com/community/PostgreSQL
+
Contributing
============
View
12 commentary.rst
@@ -1,3 +1,15 @@
+
+Performance:
+ - postgresql
+ - 1:49 for analysis import
+ - :13 for dataset import
+ - :14 document pairwise topic analysis
+ - 50 SECONDS for chord diagram reload
+ - mysql
+ - :16 for dataset import
+ - 2:10 for analysis import
+
+
Backend Running Sequence:
-- extract_data
View
17 import_tool/backend.py
@@ -41,16 +41,20 @@
# Allow specification of multiple num_topics
#
-if __name__ == "__main__":
- raise Exception("This file is only meant to be run by doit. "
- "use ./run_import.py to run the backend import")
-
import codecs
import datetime
import hashlib
import os
import sys
+from topic_modeling.tools import setup_logging, logging
+setup_logging()
+logger = logging.getLogger('root')
+
+if __name__ == "__main__":
+ logger.warn("This file is only meant to be run by doit. "
+ "use ./run_import.py to run the backend import")
+
from collections import defaultdict
from subprocess import Popen, PIPE
@@ -82,9 +86,6 @@
from topic_modeling.visualize.models import PairwiseDocumentMetric
from topic_modeling.visualize.models import TopicNameScheme
-from topic_modeling.tools import setup_logging
-setup_logging()
-
try:
from import_tool.local_settings import LOCAL_DIR, build
except ImportError:
@@ -869,6 +870,7 @@ def task_compile_java():
clean = ['rm -rf ' + c['java_bin']]
return {'actions':actions, 'result_dep':result_deps, 'clean':clean}
+'''
if 'task_graphs' not in locals():
def task_graphs():
classpath = '{0}:{1}/lib/gephi-toolkit.jar:{1}/lib/statnlp-rev562.jar:{1}/lib/{2}'.format(c['java_bin'], c['java_base'], c['db_jar'])
@@ -894,6 +896,7 @@ def utd(_task, _values): return os.path.exists(graphs_img_dir)
task['name'] = ns.scheme_name()
task['uptodate'] = [utd]
yield task
+'''
#
#def task_reset_db():
View
6 import_tool/config.py
@@ -164,7 +164,11 @@ def create_config(build_script):
c.default('db_jar', 'mysql-connector-java-5.1.18-bin.jar')
c.default('jdbc_path', 'jdbc:mysql://%s/%s?user=%s\&password=%s'
% (c['mysql_server'], c['mysql_db'], c['mysql_user'], c['mysql_password']))
- else: raise Exception("Unknown database type '" + settings.DBTYPE + "'")
+ elif settings.DBTYPE == 'postgres':
+ # do we really need stuff here?
+ pass
+ else:
+ raise Exception("Unknown database type '" + settings.DBTYPE + "'")
return c
build_script = get_buildscript(build)
View
4 import_tool/import_scripts/analysis_import.py
@@ -30,15 +30,15 @@
from django.db import connection, transaction
from topic_modeling.visualize.models import Analysis, Dataset, Topic
-from import_scripts.metadata import Metadata
+from import_tool.import_scripts.metadata import Metadata
from topic_modeling import settings
from topic_modeling.visualize.models import Document
from topic_modeling.tools import TimeLongThing
import logging
from import_tool.config import config
-logger = logging.getLogger('console')
+logger = logging.getLogger('root')
def check_analysis(analysis_name, dataset_name):
try:
View
4 import_tool/import_scripts/dataset_import.py
@@ -36,7 +36,7 @@
from topic_modeling import settings
import logging
-logger = logging.getLogger('console')
+logger = logging.getLogger('root')
def check_dataset(name):
try:
@@ -47,7 +47,7 @@ def check_dataset(name):
return False
for document in dataset.documents.all():
if not document.tokens.count():
- logging.warn('Dataset present, but not all documents are populated: %s %d' % (document.filename, document.pk))
+ logger.warn('Dataset present, but not all documents are populated: %s %d' % (document.filename, document.pk))
return False
def import_dataset(name, readable_name, description, metadata_filenames,
View
2 import_tool/import_scripts/metadata.py
@@ -13,7 +13,7 @@
from topic_modeling.tools import TimeLongThing
import logging
-logger = logging.getLogger('console')
+logger = logging.getLogger('root')
datetime_format = "%Y-%m-%dT%H:%M:%S"
View
4 import_tool/metric_scripts/analyses/__init__.py
@@ -20,9 +20,9 @@
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.
-from metric_scripts import MetricSet
+from .. import MetricSet
import entropy
metrics = MetricSet()
-metrics['entropy'] = entropy
+metrics['entropy'] = entropy
View
2 import_tool/metric_scripts/analyses/entropy.py
@@ -24,7 +24,7 @@
from topic_modeling.visualize.models import AnalysisMetric, AnalysisMetricValue
import logging
-logger = logging.getLogger('console')
+logger = logging.getLogger('root')
def add_metric(analysis):
metric, _ = AnalysisMetric.objects.get_or_create(name="Topic Entropy")
View
4 import_tool/metric_scripts/datasets/__init__.py
@@ -20,9 +20,9 @@
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.
-from metric_scripts import MetricSet
+from .. import MetricSet
import counts
metrics = MetricSet()
-metrics['counts'] = counts
+metrics['counts'] = counts
View
2 import_tool/metric_scripts/documents/__init__.py
@@ -21,7 +21,7 @@
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.
-from metric_scripts import MetricSet
+from .. import MetricSet
import token_count
import type_count
View
6 import_tool/metric_scripts/documents/pairwise/__init__.py
@@ -20,9 +20,9 @@
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.
-from metric_scripts import MetricSet
-from metric_scripts.documents.pairwise import topic_correlation, word_correlation
+from ... import MetricSet
+from ...documents.pairwise import topic_correlation, word_correlation
metrics = MetricSet()
metrics['topic_correlation'] = topic_correlation
-metrics['word_correlation'] = word_correlation
+metrics['word_correlation'] = word_correlation
View
35 import_tool/metric_scripts/documents/pairwise/topic_correlation.py
@@ -37,17 +37,21 @@
from topic_modeling.visualize.models import PairwiseDocumentMetricValue
from topic_modeling.tools import TimeLongThing
+import logging
+logger = logging.getLogger('root')
metric_name = "Topic Correlation"
# @transaction.commit_manually
def add_metric(dataset, analysis):
+ print 'begginning metric'
+ sys.stdout.flush()
+ logger.info('beginning metric: document > pairwise > topic correlation')
try:
dataset = Dataset.objects.get(name=dataset)
analysis = Analysis.objects.get(dataset=dataset, name=analysis)
metric,created = PairwiseDocumentMetric.objects.get_or_create(name=metric_name, analysis=analysis)
if not created and PairwiseDocumentMetricValue.objects.filter(metric=metric).count():
- # transaction.rollback()
raise RuntimeError("%s is already in the database for this"
" analysis" % metric_name)
@@ -56,19 +60,34 @@ def add_metric(dataset, analysis):
for i, topic in enumerate(topics):
topic_idx[topic] = i
- documents = dataset.documents.all()
- doctopicvectors = [document_topic_vector(doc, topic_idx) for doc in documents]
+ documents = list(dataset.documents.all())
+ logger.info('Generating document topic vectors')
+ print 'gen t vectors'
+ sys.stdout.flush()
+ num_docs = len(documents)
+
+ timer = TimeLongThing(num_docs, 1, .05)
+ doctopicvectors = []
+ for doc in documents:
+ timer.inc()
+ doctopicvectors.append(document_topic_vector(doc, topic_idx))
+
vectornorms = [norm(vector) for vector in doctopicvectors]
-
- # start = datetime.now()
+
+ logger.info('Comparing the vectors')
+ print 'compare vectors'
+ sys.stdout.flush()
+ timer = TimeLongThing(len(documents)**2, 5, 100)
for i, doc1 in enumerate(documents):
- write('.')
# print >> sys.stderr, 'Working on document', i, 'out of', num_docs
# print >> sys.stderr, 'Time for last document:', datetime.now() - start
# start = datetime.now()
+ logger.info('Working on document %d out of %d' % (i, num_docs))
doc1_topic_vals = doctopicvectors[i]
doc1_norm = vectornorms[i]
for j, doc2 in enumerate(documents):
+ timer.inc()
+ sys.stderr.flush()
doc2_topic_vals = doctopicvectors[j]
doc2_norm = vectornorms[j]
correlation_coeff = pmcc(doc1_topic_vals, doc2_topic_vals,
@@ -89,8 +108,8 @@ def metric_names_generated(_dataset, _analysis):
return [metric_name]
def write(s):
- sys.stdout.write(s)
- sys.stdout.flush()
+ sys.stderr.write(s)
+ sys.stderr.flush()
def pmcc(doc1_topic_vals, doc2_topic_vals, doc1_norm, doc2_norm):
return float(dot(doc1_topic_vals, doc2_topic_vals) /
View
2 import_tool/metric_scripts/topics/__init__.py
@@ -21,7 +21,7 @@
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.
-from metric_scripts import MetricSet
+from .. import MetricSet
import alpha
import attribute_entropy
View
4 import_tool/metric_scripts/topics/pairwise/__init__.py
@@ -19,8 +19,8 @@
# If you have inquiries regarding any further use of the Topical Guide, please
# contact the Copyright Licensing Office, Brigham Young University, 3760 HBLL,
# Provo, UT 84602, (801) 422-9339 or 422-3821, e-mail copyright@byu.edu.
-from metric_scripts.metric_set import MetricSet
-from metric_scripts.topics.pairwise import document_correlation, pairwise_coherence, word_correlation
+from ...metric_set import MetricSet
+import document_correlation, pairwise_coherence, word_correlation
metrics = MetricSet()
metrics['document_correlation'] = document_correlation
View
2 import_tool/metric_scripts/topics/pairwise/pairwise_coherence.py
@@ -37,7 +37,7 @@
from optparse import OptionParser
-from metric_scripts.topics.coherence import compute_pmi
+from ...topics.coherence import compute_pmi
from topic_modeling.visualize.models import Analysis
from topic_modeling.visualize.models import PairwiseTopicMetric
from topic_modeling.visualize.models import PairwiseTopicMetricValue
View
4 import_tool/metric_scripts/topics/pairwise/word_correlation.py
@@ -59,8 +59,8 @@ def add_metric(dataset, analysis):
for j, topic2 in enumerate(topics):
topic2_word_vals = topicwordvectors[j]
correlation_coeff = pmcc(topic1_word_vals, topic2_word_vals)
- if not correlation_coeff or numpy.isnan(correlation_coeff):
- raise Exception('Null correlation? %s %s %s %s' % (topic1, topic2, topic1_word_vals, topic2_word_vals))
+ # if not correlation_coeff or numpy.isnan(correlation_coeff):
+ # raise Exception('Null correlation? %s %s %s %s' % (topic1, topic2, topic1_word_vals, topic2_word_vals))
PairwiseTopicMetricValue.objects.create(topic1=topic1,
topic2=topic2, metric=metric, value=correlation_coeff)
# transaction.commit()
View
17 topic_modeling/logging.conf
@@ -10,18 +10,19 @@
"class": "logging.StreamHandler",
"level": "DEBUG",
"formatter": "simple",
- "stream": "ext://sys.stdout"
+ "stream": "ext://sys.stderr"
+ },
+ "outfile": {
+ "class": "logging.FileHandler",
+ "level": "DEBUG",
+ "formatter": "simple",
+ "filename": "tg-debug.log"
}
},
"loggers": {
- "console": {
+ "root": {
"level": "DEBUG",
- "handlers": ["console"],
- "propagate": "no",
- "root": {
- "level": "DEBUG",
- "handlers": ["console"]
- }
+ "handlers": ["console", "outfile"]
}
}
}
View
2 topic_modeling/media/scripts/charts/circle.js
@@ -106,7 +106,7 @@ var CircleControls = Backbone.View.extend({
/**
* This is a Circle visualization
*/
-var CircleViewer = MainView.add(ZoomableView, {
+var CircleViewer = MainView.add({
name: 'circle-topics',
title: 'Circle Diagram',
menu_class: CircleMenu,
View
4 topic_modeling/media/scripts/charts/visualization.js
@@ -222,8 +222,8 @@ var MainView = Backbone.View.extend({
*/
var VisualizationView = Backbone.View.extend({
base_defaults: {
- width: 720,
- height: 720
+ width: 630,
+ height: 630
},
menu_class: null,
info_class: null,
View
4 topic_modeling/media/styles/fancy.css
@@ -22,7 +22,7 @@ ul.nav li, ul.nav li a {
#main {
float: left;
- width: 720px;
+ width: 630px;
position: relative;
}
@@ -85,7 +85,7 @@ body.loading #right-bar * {
#right-bar {
position: relative;
float: right;
- width: 210px;
+ width: 300px;
}
#right-bar .bar-item {
View
20 topic_modeling/settings.py
@@ -42,30 +42,16 @@
raise Exception("You need to set up your import_tool/local_settings.py")
sys.path.append(BASE_DIR)
try:
- from import_tool.local_settings import DB_FILE
+ from import_tool.local_settings import DB_CONFIG, DBTYPE, SQLITE_CONFIG, MYSQL_CONFIG
except ImportError as e:
raise Exception("Error imporing import_tool/local_settings.py: %s" % e)
MANAGERS = ADMINS
-DBTYPE = 'sqlite3'
-#DBTYPE = 'mysql'
-
-SQLITE_CONFIG = {
- 'ENGINE': 'django.db.backends.sqlite3',
- 'NAME': DB_FILE
-}
-
-MYSQL_CONFIG = {
- 'ENGINE': 'django.db.backends.mysql',
- 'USER': 'topicalguide',
- 'SERVER': 'localhost',
- 'PASSWORD': 'topicalguide',
- 'NAME': 'topicalguide_newimport'
+DATABASES = {
+ 'default': DB_CONFIG
}
-DATABASES = {'default': SQLITE_CONFIG if DBTYPE=='sqlite3' else MYSQL_CONFIG}
-
def database_type():
return DBTYPE
View
6 topic_modeling/templates/fancy.html
@@ -129,7 +129,7 @@ <h3 class="topic-name">Click a Topic</h3>
</div>
<div id="info-force-topics">
<h3 class="topic-name">Click a Topic</h3>
- <table class="metrics table-striped">
+ <table class="metrics table table-condensed table-striped">
<thead>
<th>Metric</th>
<th>Value</th>
@@ -140,7 +140,7 @@ <h3 class="topic-name">Click a Topic</h3>
</tbody>
</table>
<h3>Top Documents</h3>
- <table class="documents table-striped">
+ <table class="documents table table-condensed table-striped">
<thead>
<th>Name</th>
<th>Count</th>
@@ -149,7 +149,7 @@ <h3 class="topic-name">Click a Topic</h3>
</tbody>
</table>
<h3>Top Words</h3>
- <table class="words table-striped">
+ <table class="words table table-condensed table-striped">
<thead>
<th>Name</th>
<th>Count</th>
View
1 topic_modeling/tools.py
@@ -3,6 +3,7 @@
import sys
import os
import logging
+import logging.config
import json
class BackendError(Exception):

0 comments on commit a0f6ea2

Please sign in to comment.