From aeddd3a6dc1b2d8e8df93c9c15487f583eeae2e9 Mon Sep 17 00:00:00 2001 From: michaelku Date: Mon, 16 Aug 2010 15:29:58 -0700 Subject: [PATCH] Bug 587616: - Added happy/sad info to clusters (SCHEMA CHANGE) - Generate distinct clusters for happy/sad even for 'both' search setting. - Added migrations for sites. Bug 587588: - Fixed foreign key integrity fail bug when running generate_sites (SCHEMA CHANGE) Bug 586915: - Added Cluster for last day in addition to last week and latest beta Allow output of sites directly to a SQL file without writing to DB first. --- .../management/commands/generate_sites.py | 230 ++++++++++++++---- apps/website_issues/models.py | 2 + migrations/__init__.py | 24 ++ migrations/schematic_settings.py | 30 +-- migrations/sites/01-add-cluster-positive.sql | 7 + migrations/sites/schematic_settings.py | 8 + 6 files changed, 223 insertions(+), 78 deletions(-) create mode 100644 migrations/__init__.py create mode 100644 migrations/sites/01-add-cluster-positive.sql create mode 100644 migrations/sites/schematic_settings.py diff --git a/apps/website_issues/management/commands/generate_sites.py b/apps/website_issues/management/commands/generate_sites.py index 5608179e..2de69f13 100644 --- a/apps/website_issues/management/commands/generate_sites.py +++ b/apps/website_issues/management/commands/generate_sites.py @@ -1,9 +1,11 @@ from datetime import datetime, timedelta from itertools import count +from optparse import make_option from django.core.management.base import BaseCommand, CommandError from django.conf import settings from django.db.models import Q +from django.db.models.sql import InsertQuery from django.db import transaction from textcluster.cluster import Corpus @@ -15,6 +17,7 @@ from website_issues.management.utils import normalize_url from website_issues.helpers import without_protocol +DB_ALIAS = "website_issues" class Command(BaseCommand): """ @@ -30,7 +33,7 @@ class Command(BaseCommand): depending on whether there are comments matching 
the search criteria. - The websites_issues database can contain multiple entries + The website_issues database can contain multiple entries ("summaries") for each logical site: one for each combinations of search criteria that yields a result. Along with the criteria, the number of matching comments is stored. @@ -67,14 +70,25 @@ class Command(BaseCommand): site. """ + option_list = BaseCommand.option_list + ( + make_option('--offline', + action='store_true', + dest='offline', + default=False, + help='Output the results as a MySQL dump to stdout rather' + 'than loading them into the sites database.'), + ) + @transaction.commit_manually def handle(self, *args, **options): # forwards compatible with django dev try: err = self.stderr.write + out = self.stdout.write except AttributeError: import sys err = sys.stderr.write + out = sys.stdout.write # Pregroup comments by key criteria. self.collect_groups(err) @@ -91,8 +105,9 @@ def handle(self, *args, **options): sorted_sites = SiteGroup.sorted(err) err("Generating clusters...\n") - i = 0 - with DatabaseStorage(err) as storage: + context = MysqlDumpStorage(err, out) if options['offline'] \ + else DatabaseStorage(err) + with context as storage: for i, group in enumerate(sorted_sites): self.generate_clusters_for(err, storage, group) # The first clusters take longest, report more often @@ -104,6 +119,7 @@ def handle(self, *args, **options): def collect_groups(self, err): now = datetime.now() seven_days_ago = now - timedelta(days=7) + one_day_ago = now - timedelta(days=1) latest_version = LATEST_BETAS[FIREFOX] err("Collecting groups...\n") def add(opinion, **kwargs): @@ -125,22 +141,30 @@ def add(opinion, **kwargs): keypart = dict(version="", url=site_url) add(opinion, positive=opinion.positive, **keypart) add(opinion, positive=None, **keypart) + if opinion.created > one_day_ago: + keypart = dict(version="", url=site_url) + add(opinion, positive=opinion.positive, **keypart) + add(opinion, positive=None, **keypart) + if i % 
1000 == 0: err(" ... %i comments\n" % i) err("%i site summaries for %i comments.\n" % (len(SiteGroup.all), i)) def add_singleton_cluster(self, storage, site_summary, opinion): + cluster = Cluster(pk=self.cluster_id.next(), + site_summary=site_summary, + primary_description=opinion.description, + primary_comment=None, + positive=opinion.positive, + size=1) + storage.save(cluster) comment = Comment(pk=self.comment_id.next(), description=opinion.description, opinion_id=opinion.id, + cluster=cluster, score=1.0) - cluster = Cluster(pk=self.cluster_id.next(), - site_summary=site_summary, - primary_description=comment.description, - primary_comment=comment, - size=1) - storage.save_cluster(cluster) - comment.cluster = cluster - storage.save_comment(comment) + storage.save(comment) + cluster.primary_comment = comment + storage.save(cluster) def generate_clusters_for(self, err, storage, group): num_clusters = 0 @@ -149,7 +173,8 @@ def generate_clusters_for(self, err, storage, group): issues_count=group.positive_counts[0], praise_count=group.positive_counts[1], **group.key) - storage.save_site_summary(site_summary) + storage.save(site_summary) + group_positive = group.key["positive"] # Handle single-comment case: if site_summary.size == 1: @@ -157,34 +182,142 @@ def generate_clusters_for(self, err, storage, group): self.add_singleton_cluster(storage, site_summary, opinion) return - # Handle cluster case: - corpus = Corpus() - remaining_opinions = { } - for opinion in Opinion.objects.filter(pk__in=group.opinion_pks): - remaining_opinions[opinion.id] = opinion - corpus.add(opinion, str=unicode(opinion.description)) - clusters = corpus.cluster() - for next in clusters: - primary = {"object": next.primary, "similarity": 1.0} - comments = [Comment(pk=self.comment_id.next(), - description=doc["object"].description, - opinion_id=doc["object"].id, - score=doc["similarity"]) - for doc in [primary] + next.similars] - cluster = Cluster(pk=self.cluster_id.next(), - 
site_summary=site_summary, - primary_description=comments[0].description, - primary_comment=comments[0], - size=len(comments)) - storage.save_cluster(cluster) - for comment in comments: - del remaining_opinions[comment.opinion_id] - comment.cluster = cluster - storage.save_comment(comment) - - # Add singletons for remaining opinions - for opinion in remaining_opinions.values(): - self.add_singleton_cluster(storage, site_summary, opinion) + opinions = Opinion.objects.filter(pk__in=group.opinion_pks) + + # Handle cluster case, make one corpus for positive, one for negative. + for positive in (0,1): + if group_positive is not None and positive != group_positive: + continue + corpus = Corpus() + remaining_opinions = { } + for opinion in opinions: + if opinion.positive != positive: continue + remaining_opinions[opinion.id] = opinion + corpus.add(opinion, str=unicode(opinion.description)) + clusters = corpus.cluster() + for next in clusters: + primary = {"object": next.primary, "similarity": 1.0} + comments = [Comment(pk=self.comment_id.next(), + description=doc["object"].description, + opinion_id=doc["object"].id, + score=doc["similarity"]) + for doc in [primary] + next.similars] + cluster = Cluster(pk=self.cluster_id.next(), + site_summary=site_summary, + primary_description=comments[0].description, + primary_comment=None, + positive=positive, + size=len(comments)) + storage.save(cluster) + for comment in comments: + del remaining_opinions[comment.opinion_id] + comment.cluster = cluster + storage.save(comment) + cluster.primary_comment=comments[0] + cluster.save() + + # Add singletons for remaining opinions + for opinion in remaining_opinions.values(): + self.add_singleton_cluster(storage, site_summary, opinion) + + +class MysqlDumpStorage(object): + """Storage that dumps objects into MySQL syntax insert statements. + + These are probably going to be compatible with most database + implementations. 
We need to access the MySQL driver directly because + django does not expose an API for escaping parameters using the configured + database connection. + """ + def __init__(self, err, out): + self.err = err + self.out = out + # We need entirely different imports if this storage is used. + from django.db import connections + from MySQLdb import connect + self.connection = connections[DB_ALIAS] + db = settings.DATABASES[DB_ALIAS] + connection = connect(db=db["NAME"], host=db["HOST"], user=db["USER"], + passwd=db["PASSWORD"], charset="utf8", + use_unicode=True) + self.mysql_escape_string = connection.escape_string + self.out(""" + /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */; + /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */; + /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */; + /*!40101 SET NAMES utf8 */; + /*!40103 SET @OLD_TIME_ZONE=@@TIME_ZONE */; + /*!40103 SET TIME_ZONE='+00:00' */; + /*!40014 SET @OLD_UNIQUE_CHECKS=@@UNIQUE_CHECKS, UNIQUE_CHECKS=0 */; + /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */; + /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */; + /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */; + """) + + def __enter__(self): + return self + + def __exit__(self, type, value, traceback): + if type is not None: + self.err("Unexpected error occurred. 
Re-raising.") + return False + self.out(""" + /*!40000 ALTER TABLE `website_issues_sitesummary` ENABLE KEYS */; + UNLOCK TABLES; + /*!40103 SET TIME_ZONE=@OLD_TIME_ZONE */; + + /*!40101 SET SQL_MODE=@OLD_SQL_MODE */; + /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */; + /*!40014 SET UNIQUE_CHECKS=@OLD_UNIQUE_CHECKS */; + /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */; + /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */; + /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */; + /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */; + """) + + def placeholder(self, field, val): + """Copied from the django query compiler (private api).""" + if field is None: + # A field value of None means the value is raw. + return val + elif hasattr(field, 'get_placeholder'): + # Some fields (e.g. geo fields) need special munging before + # they can be inserted. + return field.get_placeholder(val, self.connection) + else: + # Return the common case for the placeholder + return '%s' + + def escape(self, v): + """Encode value to utf-8 and escape it for MySQL insertion.""" + escape = self.mysql_escape_string + if v is None: return "NULL" + elif type(v) == unicode: return "'%s'" % escape(v.encode("utf-8")) + elif type(v) == str: return "'%s'" % escape(v) + elif type(v) in (bool, int, float): return "%s" % v + return "'%s'" % escape(str(v)) + + def quote(self, name): + """Quote a table or column name for the configured database.""" + return self.connection.ops.quote_name(name) + + def save(self, model): + """Adapted from the django query compiler (private api).""" + if isinstance(model, Cluster) and model.primary_comment is None: + return + query = InsertQuery(model) + meta = query.model._meta + values = [(f, f.get_db_prep_save(f.pre_save(model, True), + connection=self.connection)) + for f in meta.local_fields] + query.insert_values(values) + result = [ + 'INSERT INTO %s' % self.quote(meta.db_table), + '(%s)' % ', '.join([self.quote(c) for c in 
query.columns]), + "VALUES (%s)" % ", ".join([self.escape(p) for p in query.params]) + ] + self.out(' '.join(result)) + self.out(";\n") class DatabaseStorage(object): @@ -193,10 +326,9 @@ class DatabaseStorage(object): This storage inserts the objects into the database configured for their model, replacing the current contents in one transaction. - Another possible storage might just directly generate a SQL script from - the objects it receives. That would be *much* faster as currently there is - one read and one write for every save with a pregenerated pk hogging the - tubes.""" + For offline processing, use MysqlDumpStorage instead, which is *much* + faster, as DatabaseStorage currently needs one read and one write for + every save, even with the pregenerated PK.""" def __init__(self, err): self.err = err @@ -214,14 +346,8 @@ def __exit__(self, type, value, traceback): return False transaction.commit() - def save_cluster(self, cluster): - cluster.save() - - def save_site_summary(self, sitesummary): - sitesummary.save() - - def save_comment(self, comment): - comment.save() + def save(self, model): + model.save() class frozendict(dict): diff --git a/apps/website_issues/models.py b/apps/website_issues/models.py index c4f6b02f..5f643a99 100644 --- a/apps/website_issues/models.py +++ b/apps/website_issues/models.py @@ -22,7 +22,9 @@ class Cluster(ModelBase): size = models.PositiveIntegerField() primary_description = models.TextField() primary_comment = models.ForeignKey("Comment", + null=True, related_name="defined_cluster") + positive = models.BooleanField(default=False) class Meta: ordering = ['-size'] diff --git a/migrations/__init__.py b/migrations/__init__.py new file mode 100644 index 00000000..3e5fe966 --- /dev/null +++ b/migrations/__init__.py @@ -0,0 +1,24 @@ +import manage +from django.conf import settings + +def db_command(name): + config = settings.DATABASES[name] + config['HOST'] = config.get('HOST', 'localhost') + config['PORT'] = config.get('PORT', 
'3306') + + if not config['HOST'] or config['HOST'].endswith('.sock'): + """ Oh you meant 'localhost'! """ + config['HOST'] = 'localhost' + + s = 'mysql --silent {NAME} -h{HOST} -u{USER}' + + if config['PASSWORD']: + s += ' -p{PASSWORD}' + else: + del config['PASSWORD'] + if config['PORT']: + s += ' -P{PORT}' + else: + del config['PORT'] + + return s.format(**config) diff --git a/migrations/schematic_settings.py b/migrations/schematic_settings.py index ff7ceba3..d5421427 100644 --- a/migrations/schematic_settings.py +++ b/migrations/schematic_settings.py @@ -1,30 +1,8 @@ import sys -import os +from os.path import dirname, abspath -sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +sys.path.insert(0, dirname(dirname(abspath(__file__)))) +from migrations import db_command -# Set up zamboni. -import manage -from django.conf import settings - -config = settings.DATABASES['default'] -config['HOST'] = config.get('HOST', 'localhost') -config['PORT'] = config.get('PORT', '3306') - -if not config['HOST'] or config['HOST'].endswith('.sock'): - """ Oh you meant 'localhost'! """ - config['HOST'] = 'localhost' - -s = 'mysql --silent {NAME} -h{HOST} -u{USER}' - -if config['PASSWORD']: - s += ' -p{PASSWORD}' -else: - del config['PASSWORD'] -if config['PORT']: - s += ' -P{PORT}' -else: - del config['PORT'] - -db = s.format(**config) +db = db_command('default') table = 'schema_version' diff --git a/migrations/sites/01-add-cluster-positive.sql b/migrations/sites/01-add-cluster-positive.sql new file mode 100644 index 00000000..3a8e4abb --- /dev/null +++ b/migrations/sites/01-add-cluster-positive.sql @@ -0,0 +1,7 @@ +ALTER TABLE `website_issues_cluster` + ADD COLUMN `positive` tinyint(1) NOT NULL DEFAULT 0; + +-- Get rid of circular constraint so we can add rows using django ORM. +-- We do not know the name of the constraint, so we recreate the column. 
+ALTER TABLE `website_issues_cluster` + MODIFY COLUMN `primary_comment_id` int(11) DEFAULT NULL; diff --git a/migrations/sites/schematic_settings.py b/migrations/sites/schematic_settings.py new file mode 100644 index 00000000..c2cf01fe --- /dev/null +++ b/migrations/sites/schematic_settings.py @@ -0,0 +1,8 @@ +import sys +from os.path import dirname, abspath + +sys.path.insert(0, dirname(dirname(dirname(abspath(__file__))))) +from migrations import db_command + +db = db_command('website_issues') +table = 'schema_version'