Bug 587616:
- Added happy/sad info to clusters (SCHEMA CHANGE)
- Generate distinct clusters for happy/sad even for the 'both' search setting.
- Added migrations for sites.

Bug 587588:
- Fixed a foreign-key integrity failure when running generate_sites
  (SCHEMA CHANGE)

Bug 586915:
- Added clusters for the last day, in addition to the last week and latest beta

Allow output of sites directly to a SQL file without writing to the DB first.
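
A minimal usage sketch of the new offline mode, assuming standard Django management-command wiring (the sites.sql target name in the comment is just an example):

    # Roughly equivalent to running:
    #   ./manage.py generate_sites --offline > sites.sql
    # and later piping the dump into the mysql client for the website_issues database.
    from django.core.management import call_command

    # With offline=True the command writes MySQL INSERT statements to stdout
    # instead of saving each object through the Django ORM.
    call_command("generate_sites", offline=True)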
x1B committed Aug 16, 2010
1 parent 7626483 commit aeddd3a
Showing 6 changed files with 223 additions and 78 deletions.
230 changes: 178 additions & 52 deletions apps/website_issues/management/commands/generate_sites.py
@@ -1,9 +1,11 @@
from datetime import datetime, timedelta
from itertools import count
from optparse import make_option

from django.core.management.base import BaseCommand, CommandError
from django.conf import settings
from django.db.models import Q
from django.db.models.sql import InsertQuery
from django.db import transaction

from textcluster.cluster import Corpus
@@ -15,6 +17,7 @@
from website_issues.management.utils import normalize_url
from website_issues.helpers import without_protocol

DB_ALIAS = "website_issues"

class Command(BaseCommand):
"""
@@ -30,7 +33,7 @@ class Command(BaseCommand):
depending on whether there are comments matching the search
criteria.
The websites_issues database can contain multiple entries
The website_issues database can contain multiple entries
("summaries") for each logical site: one for each combinations of
search criteria that yields a result. Along with the criteria, the
number of matching comments is stored.
@@ -67,14 +70,25 @@ class Command(BaseCommand):
site.
"""

option_list = BaseCommand.option_list + (
make_option('--offline',
action='store_true',
dest='offline',
default=False,
help='Output the results as a MySQL dump to stdout rather'
'than loading them into the sites database.'),
)

@transaction.commit_manually
def handle(self, *args, **options):
# forwards compatible with django dev
try:
err = self.stderr.write
out = self.stdout.write
except AttributeError:
import sys
err = sys.stderr.write
out = sys.stdout.write

# Pregroup comments by key criteria.
self.collect_groups(err)
@@ -91,8 +105,9 @@ def handle(self, *args, **options):
sorted_sites = SiteGroup.sorted(err)

err("Generating clusters...\n")
i = 0
with DatabaseStorage(err) as storage:
context = MysqlDumpStorage(err, out) if options['offline'] \
else DatabaseStorage(err)
with context as storage:
for i, group in enumerate(sorted_sites):
self.generate_clusters_for(err, storage, group)
# The first clusters take longest, report more often
@@ -104,6 +119,7 @@ def handle(self, *args, **options):
def collect_groups(self, err):
now = datetime.now()
seven_days_ago = now - timedelta(days=7)
one_day_ago = now - timedelta(days=1)
latest_version = LATEST_BETAS[FIREFOX]
err("Collecting groups...\n")
def add(opinion, **kwargs):
@@ -125,22 +141,30 @@ def add(opinion, **kwargs):
keypart = dict(version="<week>", url=site_url)
add(opinion, positive=opinion.positive, **keypart)
add(opinion, positive=None, **keypart)
if opinion.created > one_day_ago:
keypart = dict(version="<day>", url=site_url)
add(opinion, positive=opinion.positive, **keypart)
add(opinion, positive=None, **keypart)

if i % 1000 == 0: err(" ... %i comments\n" % i)
err("%i site summaries for %i comments.\n" % (len(SiteGroup.all), i))

def add_singleton_cluster(self, storage, site_summary, opinion):
cluster = Cluster(pk=self.cluster_id.next(),
site_summary=site_summary,
primary_description=opinion.description,
primary_comment=None,
positive=opinion.positive,
size=1)
storage.save(cluster)
comment = Comment(pk=self.comment_id.next(),
description=opinion.description,
opinion_id=opinion.id,
cluster=cluster,
score=1.0)
cluster = Cluster(pk=self.cluster_id.next(),
site_summary=site_summary,
primary_description=comment.description,
primary_comment=comment,
size=1)
storage.save_cluster(cluster)
comment.cluster = cluster
storage.save_comment(comment)
storage.save(comment)
cluster.primary_comment = comment
storage.save(cluster)

def generate_clusters_for(self, err, storage, group):
num_clusters = 0
@@ -149,42 +173,151 @@ def generate_clusters_for(self, err, storage, group):
issues_count=group.positive_counts[0],
praise_count=group.positive_counts[1],
**group.key)
storage.save_site_summary(site_summary)
storage.save(site_summary)
group_positive = group.key["positive"]

# Handle single-comment case:
if site_summary.size == 1:
opinion = Opinion.objects.get(pk=group.opinion_pks[0])
self.add_singleton_cluster(storage, site_summary, opinion)
return

# Handle cluster case:
corpus = Corpus()
remaining_opinions = { }
for opinion in Opinion.objects.filter(pk__in=group.opinion_pks):
remaining_opinions[opinion.id] = opinion
corpus.add(opinion, str=unicode(opinion.description))
clusters = corpus.cluster()
for next in clusters:
primary = {"object": next.primary, "similarity": 1.0}
comments = [Comment(pk=self.comment_id.next(),
description=doc["object"].description,
opinion_id=doc["object"].id,
score=doc["similarity"])
for doc in [primary] + next.similars]
cluster = Cluster(pk=self.cluster_id.next(),
site_summary=site_summary,
primary_description=comments[0].description,
primary_comment=comments[0],
size=len(comments))
storage.save_cluster(cluster)
for comment in comments:
del remaining_opinions[comment.opinion_id]
comment.cluster = cluster
storage.save_comment(comment)

# Add singletons for remaining opinions
for opinion in remaining_opinions.values():
self.add_singleton_cluster(storage, site_summary, opinion)
opinions = Opinion.objects.filter(pk__in=group.opinion_pks)

# Handle cluster case, make one corpus for positive, one for negative.
for positive in (0,1):
if group_positive is not None and positive != group_positive:
continue
corpus = Corpus()
remaining_opinions = { }
for opinion in opinions:
if opinion.positive != positive: continue
remaining_opinions[opinion.id] = opinion
corpus.add(opinion, str=unicode(opinion.description))
clusters = corpus.cluster()
for next in clusters:
primary = {"object": next.primary, "similarity": 1.0}
comments = [Comment(pk=self.comment_id.next(),
description=doc["object"].description,
opinion_id=doc["object"].id,
score=doc["similarity"])
for doc in [primary] + next.similars]
cluster = Cluster(pk=self.cluster_id.next(),
site_summary=site_summary,
primary_description=comments[0].description,
primary_comment=None,
positive=positive,
size=len(comments))
storage.save(cluster)
for comment in comments:
del remaining_opinions[comment.opinion_id]
comment.cluster = cluster
storage.save(comment)
cluster.primary_comment=comments[0]
cluster.save()

# Add singletons for remaining opinions
for opinion in remaining_opinions.values():
self.add_singleton_cluster(storage, site_summary, opinion)


class MysqlDumpStorage(object):
"""Storage that dumps objects into MySQL syntax insert statements.
These are probably going to be compatible with most database
implementations. We need to access the MySQL driver directly because
django does not expose an API for escaping parameters using the configured
database connection.
"""
def __init__(self, err, out):
self.err = err
self.out = out
# We need entirely different imports if this storage is used.
from django.db import connections
from MySQLdb import connect
self.connection = connections[DB_ALIAS]
db = settings.DATABASES[DB_ALIAS]
connection = connect(db=db["NAME"], host=db["HOST"], user=db["USER"],
passwd=db["PASSWORD"], charset="utf8",
use_unicode=True)
self.mysql_escape_string = connection.escape_string
self.out("""
/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
/*!40101 SET NAMES utf8 */;
/*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */;
/*!40103 SET TIME_ZONE='+00:00' */;
/*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */;
/*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
/*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
/*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
""")

def __enter__(self):
return self

def __exit__(self, type, value, traceback):
if type is not None:
self.err("Unexpected error occurred. Re-raising.")
return False
self.out("""
/*!40000 ALTER TABLE `website_issues_sitesummary` ENABLE KEYS */;
UNLOCK TABLES;
/*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */;
/*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
/*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
/*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */;
/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
/*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
""")

def placeholder(self, field, val):
"""Copied from the django query compiler (private api)."""
if field is None:
# A field value of None means the value is raw.
return val
elif hasattr(field, 'get_placeholder'):
# Some fields (e.g. geo fields) need special munging before
# they can be inserted.
return field.get_placeholder(val, self.connection)
else:
# Return the common case for the placeholder
return '%s'

def escape(self, v):
"""Encode value to utf-8 and escape it for MySQL insertion."""
escape = self.mysql_escape_string
if v is None: return "NULL"
elif type(v) == unicode: return "'%s'" % escape(v.encode("utf-8"))
elif type(v) == str: return "'%s'" % escape(v)
elif type(v) in (bool, int, float): return "%s" % v
return "'%s'" % escape(str(v))

def quote(self, name):
"""Encode value to utf-8 and quote it as a MySQL name."""
return self.connection.ops.quote_name(name)

def save(self, model):
"""Adapted from the django query compiler (private api)."""
if isinstance(model, Cluster) and model.primary_comment is None:
return
query = InsertQuery(model)
meta = query.model._meta
values = [(f, f.get_db_prep_save(f.pre_save(model, True),
connection=self.connection))
for f in meta.local_fields]
query.insert_values(values)
result = [
'INSERT INTO %s' % self.quote(meta.db_table),
'(%s)' % ', '.join([self.quote(c) for c in query.columns]),
"VALUES (%s)" % ", ".join([self.escape(p) for p in query.params])
]
self.out(' '.join(result))
self.out(";\n")


class DatabaseStorage(object):
@@ -193,10 +326,9 @@ class DatabaseStorage(object):
This storage inserts the objects into the database configured for their
model, replacing the current contents in one transaction.
Another possible storage might just directly generate a SQL script from
the objects it receives. That would be *much* faster as currently there is
one read and one write for every save with a pregenerated pk hogging the
tubes."""
For offline processing, use MysqlDumpStorage instead which is *much*
faster, as currently as DatabaseStorage needs one read and one write for
every save, even with the pregenerated PK."""

def __init__(self, err):
self.err = err
@@ -214,14 +346,8 @@ def __exit__(self, type, value, traceback):
return False
transaction.commit()

def save_cluster(self, cluster):
cluster.save()

def save_site_summary(self, sitesummary):
sitesummary.save()

def save_comment(self, comment):
comment.save()
def save(self, model):
model.save()


class frozendict(dict):
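
Both storage back ends now share the same context-manager-plus-save(model) interface, which is what lets handle() flip between them on the --offline flag. A rough sketch of driving them directly, assuming the project settings are configured and that SiteSummary lives in website_issues.models (not shown in this diff):

    import sys

    from website_issues.management.commands.generate_sites import (
        DatabaseStorage, MysqlDumpStorage)
    from website_issues.models import SiteSummary  # assumed model location

    err, out = sys.stderr.write, sys.stdout.write
    offline = True  # what the new --offline option toggles

    storage = MysqlDumpStorage(err, out) if offline else DatabaseStorage(err)
    with storage as s:
        for summary in SiteSummary.objects.all()[:10]:
            # Offline: one INSERT statement on stdout; otherwise a normal ORM save.
            s.save(summary)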
2 changes: 2 additions & 0 deletions apps/website_issues/models.py
@@ -22,7 +22,9 @@ class Cluster(ModelBase):
size = models.PositiveIntegerField()
primary_description = models.TextField()
primary_comment = models.ForeignKey("Comment",
null=True,
related_name="defined_cluster")
positive = models.BooleanField(default=False)

class Meta:
ordering = ['-size']
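
The nullable primary_comment together with the new positive flag is what lets a single site summary carry separate happy and sad clusters. A hedged query sketch (the site_summary id is illustrative):

    from website_issues.models import Cluster

    # Praise and issue clusters can now be fetched separately, even when the
    # summary was generated with the 'both' search setting.
    praise = Cluster.objects.filter(site_summary__id=1, positive=True)
    issues = Cluster.objects.filter(site_summary__id=1, positive=False)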
24 changes: 24 additions & 0 deletions migrations/__init__.py
@@ -0,0 +1,24 @@
import manage
from django.conf import settings

def db_command(name):
config = settings.DATABASES[name]
config['HOST'] = config.get('HOST', 'localhost')
config['PORT'] = config.get('PORT', '3306')

if not config['HOST'] or config['HOST'].endswith('.sock'):
""" Oh you meant 'localhost'! """
config['HOST'] = 'localhost'

s = 'mysql --silent {NAME} -h{HOST} -u{USER}'

if config['PASSWORD']:
s += ' -p{PASSWORD}'
else:
del config['PASSWORD']
if config['PORT']:
s += ' -P{PORT}'
else:
del config['PORT']

return s.format(**config)
30 changes: 4 additions & 26 deletions migrations/schematic_settings.py
@@ -1,30 +1,8 @@
import sys
import os
from os.path import dirname, abspath

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
sys.path.insert(0, dirname(dirname(abspath(__file__))))
from migrations import db_command

# Set up zamboni.
import manage
from django.conf import settings

config = settings.DATABASES['default']
config['HOST'] = config.get('HOST', 'localhost')
config['PORT'] = config.get('PORT', '3306')

if not config['HOST'] or config['HOST'].endswith('.sock'):
""" Oh you meant 'localhost'! """
config['HOST'] = 'localhost'

s = 'mysql --silent {NAME} -h{HOST} -u{USER}'

if config['PASSWORD']:
s += ' -p{PASSWORD}'
else:
del config['PASSWORD']
if config['PORT']:
s += ' -P{PORT}'
else:
del config['PORT']

db = s.format(**config)
db = db_command('default')
table = 'schema_version'
