Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Collect tweets regarding Firefox for Customer Care page. Bug 591929.
  • Loading branch information
Fred Wenzel committed Sep 17, 2010
1 parent c207f79 commit 7b83bc1
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 0 deletions.
105 changes: 105 additions & 0 deletions apps/customercare/cron.py
@@ -0,0 +1,105 @@
import calendar
from datetime import datetime
import json
import logging
import re
import rfc822
import urllib

from django.conf import settings

import cronjobs

from .models import Tweet


# Endpoint of Twitter's JSON search API, queried by collect_tweets().
SEARCH_URL = 'http://search.twitter.com/search.json'

# Patterns used by _filter_tweet() to discard unwanted tweets.  They are raw
# strings so that the regex escapes (\: and \W) reach the regex engine
# untouched instead of being treated as (invalid) string escapes.
# 'http:' or 'https:' anywhere in the text, case-insensitive.
LINK_REGEX = re.compile(r'https?\:', re.IGNORECASE)
# An '@' at the very start of the text or right after a non-word character.
MENTION_REGEX = re.compile(r'(^|\W)@')
# Text beginning with 'rt' followed by a non-word character, case-insensitive.
RT_REGEX = re.compile(r'^rt\W', re.IGNORECASE)

log = logging.getLogger('k.twitter')


@cronjobs.register
def collect_tweets():
    """Collect new tweets about Firefox and store them as Tweet rows.

    Queries the Twitter search API at SEARCH_URL for 'firefox', restricted to
    tweets newer than the most recent one already stored, passes every result
    through _filter_tweet(), saves the ones that survive and finally trims
    the table to (approximately) settings.CC_MAX_TWEETS rows.

    Returns None.  A failed request or an empty result set is logged and ends
    the run before anything is written to the database.
    """
    search_options = {
        'q': 'firefox',
        # Items per page.  Taken from settings so that the request and the
        # log message below always agree on the number.
        'rpp': settings.CC_TWEETS_PERPAGE,
        'result_type': 'recent',  # Retrieve tweets by date.
    }

    # If we already have some tweets, collect nothing older than what we have.
    # Tweet is ordered newest-first by default, so index 0 is the latest row;
    # an empty table raises IndexError.
    try:
        latest_tweet = Tweet.objects.all()[0]
    except IndexError:
        log.debug('No existing tweets. Retrieving %d tweets from search.',
                  settings.CC_TWEETS_PERPAGE)
    else:
        search_options['since_id'] = latest_tweet.tweet_id
        # since_id is exclusive: only tweets with a greater id come back.
        log.debug('Retrieving tweets with id > %s', latest_tweet.tweet_id)

    # Retrieve tweets.  Any failure (network, HTTP, malformed JSON) is logged
    # and ends this run.
    try:
        response = urllib.urlopen('%s?%s' % (
            SEARCH_URL, urllib.urlencode(search_options)))
        try:
            raw_data = json.load(response)
        finally:
            # Always release the connection, even if the body is not JSON.
            response.close()
    except Exception as e:
        log.warning('Twitter request failed: %s', e)
        return

    if not ('results' in raw_data and raw_data['results']):
        log.info('Twitter returned 0 results.')
        return

    # Drop tweets into DB
    for result in raw_data['results']:
        log.debug('Handling tweet %d: %s...',
                  result['id'], result['text'][:50])
        # Apply filters to tweet before saving
        item = _filter_tweet(result)
        if not item:
            continue

        # parsedate() ignores any zone offset in the string, so this is a
        # naive UTC datetime only if the API reports times in +0000 --
        # presumably it does; TODO confirm.
        created_date = datetime.utcfromtimestamp(calendar.timegm(
            rfc822.parsedate(item['created_at'])))

        tweet = Tweet(tweet_id=item['id'], raw_json=json.dumps(item),
                      created=created_date)
        tweet.save()
        log.debug('Tweet %d saved.', item['id'])

    # When all is done, truncate list of tweets to (approx.) maximum number.
    # The row at index CC_MAX_TWEETS is the first one beyond the limit; it
    # and everything created at or before it is removed.
    try:
        keep_tweet = Tweet.objects.all()[settings.CC_MAX_TWEETS]
    except IndexError:
        # Fewer rows than the limit: nothing to trim.
        pass
    else:
        log.debug('Truncating tweet list: Removing tweets older than %s.',
                  keep_tweet.created)
        Tweet.objects.filter(created__lte=keep_tweet.created).delete()


def _filter_tweet(item):
    """
    Apply some filters to an incoming tweet.

    `item` is one entry of the search response's 'results' list: a dict that
    must provide 'id' and 'text' and may provide 'to_user_id'.

    May modify tweet (none of the current filters do). If None is returned,
    tweet will be discarded; otherwise the tweet dict itself is returned.
    Used to exclude replies and such from incoming tweets.
    """
    tweet_id = item['id']
    text = item['text']

    # No replies, no mentions.  A result without a 'to_user_id' key is
    # treated the same as one whose value is empty: not a reply.
    if item.get('to_user_id') or MENTION_REGEX.search(text):
        log.debug('Tweet %d discarded (reply).', tweet_id)
        return None

    # No retweets: either the 'RT ...' prefix or a '(via ...' attribution.
    if RT_REGEX.search(text) or '(via ' in text:
        log.debug('Tweet %d discarded (retweet).', tweet_id)
        return None

    # No links
    if LINK_REGEX.search(text):
        log.debug('Tweet %d discarded (link).', tweet_id)
        return None

    return item
15 changes: 15 additions & 0 deletions apps/customercare/models.py
@@ -0,0 +1,15 @@
from datetime import datetime

from django.db import models

from sumo.models import ModelBase


class Tweet(ModelBase):
    """An entry on twitter.

    One row per collected tweet. The default ordering is newest first, so
    ``Tweet.objects.all()[0]`` is the most recently created tweet.
    """
    # Twitter's numeric id for the tweet. Deliberately not indexed: per the
    # review discussion on this commit it is only read off the newest row to
    # populate the search's since_id, never used for lookups.
    tweet_id = models.BigIntegerField()
    # The search-result item for this tweet, serialised as JSON.
    raw_json = models.TextField()
    # Creation time of the tweet; indexed because ordering and truncation
    # both go through this column.
    # NOTE(review): the default is local time (datetime.now) while the
    # collector passes a value built with utcfromtimestamp() -- confirm that
    # rows relying on the default are meant to mix with those.
    created = models.DateTimeField(default=datetime.now, db_index=True)

    class Meta:
        # Newest tweets first.
        ordering = ('-created',)
10 changes: 10 additions & 0 deletions migrations/41-customercare-tweets.sql
@@ -0,0 +1,10 @@
-- Migration 41: create the table behind the customercare Tweet model.
BEGIN;
CREATE TABLE `customercare_tweet` (
`id` integer AUTO_INCREMENT NOT NULL PRIMARY KEY,
-- Twitter's id for the tweet (BigIntegerField on the model).
`tweet_id` bigint NOT NULL,
-- JSON-serialised tweet as stored by the collector.
`raw_json` longtext NOT NULL,
-- Creation time of the tweet; indexed below.
`created` datetime NOT NULL
)
;
-- Matches db_index=True on Tweet.created; the model orders by this column.
CREATE INDEX `customercare_tweet_3216ff68` ON `customercare_tweet` (`created`);
COMMIT;
5 changes: 5 additions & 0 deletions settings.py
Expand Up @@ -188,6 +188,7 @@
'product_details',
'wiki',
'gallery',
'customercare',
)

# Extra apps for testing
Expand Down Expand Up @@ -418,3 +419,7 @@ def JINJA_CONFIG():
GALLERY_VIDEO_THUMBNAIL_PATH = 'uploads/gallery/videos/thumbnails/'
THUMBNAIL_PROGRESS_URL = MEDIA_URL + 'img/wait-trans.gif'
VIDEO_MAX_FILESIZE = 16777216 # 16 megabytes, in bytes

# Customer care tweet collection settings
CC_MAX_TWEETS = 500 # Max. no. of tweets in DB
CC_TWEETS_PERPAGE = 100 # How many tweets to collect in one go. Max: 100.

0 comments on commit 7b83bc1

Please sign in to comment.