
Showing publisher info even if it comes from Google News.

1 parent cda21b3 commit 57b419fce5de009da4a38c222e2d35364f333b60 joshuaeckroth committed Aug 23, 2011
Showing with 37 additions and 34 deletions.
  1. +26 −0 AINewsCorpus.py
  2. +6 −0 AINewsCrawler.py
  3. +2 −12 AINewsDuplicates.py
  4. +3 −22 AINewsPublisher.py
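
The gist of the change: publisher strings can now read "Some Site via Google News", and the relevance lookup normalizes any such string back to the single 'GoogleNews' row in the sources table. A minimal standalone sketch of that lookup, with a hypothetical sources dict standing in for the database rows:

import re

# Hypothetical relevance values; the real ones come from the sources table.
sources = {'GoogleNews': 50, 'NPR': 80}

def get_relevance(publisher):
    # Any "X via Google News" publisher falls back to the GoogleNews entry.
    if re.search(r'via Google News', publisher):
        publisher = 'GoogleNews'
    return sources[publisher]

print get_relevance('TechCrunch via Google News')  # -> 50
print get_relevance('NPR')                         # -> 80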
@@ -10,6 +10,7 @@
import random
import math
import operator
+import re
from itertools import izip
from AINewsConfig import config, paths
from AINewsDB import AINewsDB
@@ -31,9 +32,34 @@ def __init__(self):
"Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
"Reasoning","Representation", "Robots","ScienceFiction",\
"Speech", "Systems","Vision"]
+
+ self.sources = {}
+ rows = self.db.selectall("select parser, relevance from sources")
+ for row in rows:
+ self.sources[row[0].split('::')[0]] = int(row[1])
self.restore_corpus()
+ def get_relevance(self, publisher):
+ if re.search(r'via Google News', publisher):
+ publisher = 'GoogleNews'
+ return self.sources[publisher]
+
+ def compare_articles(self, article1, article2):
+ dupcount1 = len(article1['duplicates'])
+ dupcount2 = len(article2['duplicates'])
+ relevance1 = self.get_relevance(article1['publisher'])
+ relevance2 = self.get_relevance(article2['publisher'])
+ cat_count1 = len(article1['categories'])
+ cat_count2 = len(article2['categories'])
+ if cmp(dupcount1, dupcount2) == 0:
+ if cmp(relevance1, relevance2) == 0:
+ return cmp(cat_count1, cat_count2)
+ else:
+ return cmp(relevance1, relevance2)
+ else:
+ return cmp(dupcount1, dupcount2)
+
def get_tfidf(self, urlid, wordfreq):
"""
Helper function to retrieve the tfidf of each word based on the urlid.
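
compare_articles now lives on the corpus and orders articles by duplicate count, then source relevance, then category count, each via Python 2's cmp. A sketch of using it as a sort comparator, assuming a corpus instance is available (its constructor reads the sources table) and with hypothetical article dicts that fill in only the keys the comparator reads:

from AINewsCorpus import AINewsCorpus

# Hypothetical articles; only the keys compare_articles reads are filled in.
a = {'duplicates': [101, 102], 'publisher': 'NPR', 'categories': ['AIOverview']}
b = {'duplicates': [103, 104], 'publisher': 'NPR', 'categories': ['Robots', 'Vision']}

corpus = AINewsCorpus()
# Equal duplicate counts and equal relevance, so category count breaks the tie.
best = sorted([a, b], cmp=corpus.compare_articles, reverse=True)[0]
print len(best['categories'])  # -> 2, i.e. article b wins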
@@ -77,7 +77,13 @@ def crawl(self):
for candidate in parser.candidates:
if len(candidate) != 4: continue
url = candidate[0].encode('utf-8')
+ print "Fetching", url
title = convert_to_printable(ents.convert((re.sub(r'\s+', ' ', candidate[1])))).strip()
+ # if publisher is GoogleNews, extract true publisher from title
+ if publisher == "GoogleNews":
+ true_publisher = re.match(r'^.*\- ([^\-]+)$', title).group(1)
+ publisher = "%s via Google News" % true_publisher
+
# removing site title like " - NPR"
title = re.sub(r'\s+[:-]\s+.*$', '', title)
pubdate = candidate[2]
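
For Google News feeds the true publisher is recovered from the title's trailing " - Publisher" suffix before that suffix is stripped. A standalone sketch of both regexes on a made-up title; note the committed code calls .group(1) without checking for a match, so a title lacking the suffix would raise AttributeError, which the sketch guards against:

import re

title = 'Robots Learn to Grasp - TechCrunch'  # hypothetical Google News title
match = re.match(r'^.*\- ([^\-]+)$', title)
if match:
    publisher = '%s via Google News' % match.group(1)
# Strip the trailing site name, e.g. " - TechCrunch".
title = re.sub(r'\s+[:-]\s+.*$', '', title)
print publisher  # -> 'TechCrunch via Google News'
print title      # -> 'Robots Learn to Grasp'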
@@ -24,21 +24,11 @@ def add_to_duplicates(duplicates, urlid1, urlid2):
dupset.add(urlid2)
duplicates.append(dupset)
-def compare_articles(article1, article2, sources):
- relevance1 = sources[article1['publisher']]
- relevance2 = sources[article2['publisher']]
- cat_count1 = len(article1['categories'])
- cat_count2 = len(article2['categories'])
- if cmp(relevance1, relevance2) == 0:
- return cmp(cat_count1, cat_count2)
- else:
- return cmp(relevance1, relevance2)
-
class AINewsDuplicates:
def __init__(self):
self.corpus = AINewsCorpus()
- def filter_duplicates(self, articles, sources):
+ def filter_duplicates(self, articles):
date_start = date.today() - timedelta(days = int(config['duplicates.days_back']))
date_end = date.today()
cutoff = float(config['duplicates.threshold'])
@@ -84,7 +74,7 @@ def filter_duplicates(self, articles, sources):
filter(lambda u: u in articles and (u,urlid) in similarities, dupset2))
sorted_dups = sorted(filter(lambda u: u in articles and articles[u]['publish'], dupset),
- cmp=lambda x,y: compare_articles(articles[x], articles[y], sources),
+ cmp=lambda x,y: self.corpus.compare_articles(articles[x], articles[y]),
reverse = True)
if(len(sorted_dups) > 1):
# first in sorted set is chosen; rest are dumped
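
Within each duplicate set the comparator picks a single survivor: the set is sorted best-first and every article after the first has its 'publish' flag cleared. A minimal sketch of that pattern, with hypothetical articles keyed by urlid and corpus an AINewsCorpus instance as in the sketch above:

from AINewsCorpus import AINewsCorpus

corpus = AINewsCorpus()  # needs the sources table, as above
# Hypothetical duplicate set: urlids 1 and 2 point at the same story.
articles = {
    1: {'publish': True, 'duplicates': [2], 'publisher': 'NPR', 'categories': ['Robots']},
    2: {'publish': True, 'duplicates': [1], 'publisher': 'NPR', 'categories': []},
}
sorted_dups = sorted([1, 2],
                     cmp=lambda x, y: corpus.compare_articles(articles[x], articles[y]),
                     reverse=True)
for urlid in sorted_dups[1:]:  # first is kept; the rest are dumped
    articles[urlid]['publish'] = False
print articles[2]['publish']  # -> False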
@@ -43,11 +43,6 @@ def __init__(self):
self.txtpro = AINewsTextProcessor()
self.summarizer = AINewsSummarizer()
- self.sources = {}
- rows = self.db.selectall("select parser, relevance from sources")
- for row in rows:
- self.sources[row[0].split('::')[0]] = int(row[1])
-
self.articles = {}
self.published_articles = []
@@ -118,7 +113,7 @@ def filter_and_process(self):
# filter out duplicates; some articles may have 'publish' set to False
# by this function
- self.duplicates.filter_duplicates(self.articles, self.sources)
+ self.duplicates.filter_duplicates(self.articles)
# add article summaries
self.summarizer.summarize(self.corpus, self.articles)
@@ -142,7 +137,7 @@ def filter_and_process(self):
# then by number of categories (more = better)
unpublished_articles = sorted(
filter(lambda x: x['publish'], self.articles.values()),
- cmp=lambda x,y: self.compare_articles(x, y),
+ cmp=lambda x,y: self.corpus.compare_articles(x, y),
reverse = True)
max_cat_count = int(config['publisher.max_cat_count'])
@@ -175,21 +170,6 @@ def filter_and_process(self):
self.semiauto_email_output = ""
- def compare_articles(self, article1, article2):
- dupcount1 = len(article1['duplicates'])
- dupcount2 = len(article2['duplicates'])
- relevance1 = self.sources[article1['publisher']]
- relevance2 = self.sources[article2['publisher']]
- cat_count1 = len(article1['categories'])
- cat_count2 = len(article2['categories'])
- if cmp(dupcount1, dupcount2) == 0:
- if cmp(relevance1, relevance2) == 0:
- return cmp(cat_count1, cat_count2)
- else:
- return cmp(relevance1, relevance2)
- else:
- return cmp(dupcount1, dupcount2)
-
def update_db(self, article):
self.db.execute("delete from categories where urlid = %s", article['urlid'])
for cat in article['categories']:
@@ -210,6 +190,7 @@ def generate_email_output(self):
"""
email = LatestNewsEmail()
email.date = self.today.strftime("%B %d, %Y")
+ email.year = self.today.strftime("%Y")
email.news = self.published_articles
email.aitopic_urls = aitopic_urls
email.topicids = self.topicids
