# This file is part of NewsFinder.
# https://github.com/joshuaeckroth/AINews
#
# Copyright (c) 2011 by the Association for the Advancement of
# Artificial Intelligence. This program and parts of it may be used and
# distributed without charge for non-commercial purposes as long as this
# notice is included.
import os
import sys
import re
import feedparser
import cgi
from subprocess import *
from datetime import date, timedelta
import codecs
import string
import csv
import urllib2
from AINewsConfig import config, paths, blacklist_words
from AINewsTools import trunc, convert_to_printable
from AINewsDB import AINewsDB
from AINewsSummarizer import AINewsSummarizer

class AINewsCrawler:
    def __init__(self):
        self.today = date.today()
        self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
        self.db = AINewsDB()
        self.summarizer = AINewsSummarizer()
        self.articles = []

    def get_sources(self, opts):
        """
        Get the news source list.
        """
        sources = []
        csv_file = csv.reader(urllib2.urlopen(paths['ainews.sources_csv']))
        header = True
        for row in csv_file:
            if header:
                header = False
                continue
            if len(opts) == 0 or (opts[0][0] == '--source' and opts[0][1] == row[1]):
                sources.append({'source_id': row[0],
                                'title': row[1],
                                'link': row[2],
                                'parser': row[3],
                                'relevance': int(row[4])})
        return sources

    def fetch_all_sources(self, opts):
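        """
        Read each source's RSS/Atom feed and queue recent, apparently
        relevant entries (skipping blogs, job postings, press releases,
        and previously crawled URLs) for later fetching.
        """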
        for source in self.get_sources(opts):
            print "CRAWL: Crawling \"%s\"..." % source['title']
            try:
                f = feedparser.parse(source['link'])
            except Exception, e:
                print "Exception while parsing feed: %s" % (source['link'],)
                print e
                continue
            for entry in f.entries:
                d = None
                try:
                    if hasattr(entry, 'published_parsed'):
                        d = date(entry.published_parsed[0],
                                 entry.published_parsed[1],
                                 entry.published_parsed[2])
                    else:
                        d = date(entry.updated_parsed[0],
                                 entry.updated_parsed[1],
                                 entry.updated_parsed[2])
                except Exception, e:
                    print e
                    print entry
                    print "Setting date as today; could not parse date for feed", \
                        source['link']
                    d = self.today
                if d > self.today or d < self.earliest_date: continue
                if entry.title[-6:] == '(blog)' \
                        or entry.title[-15:] == '(press release)':
                    print "Blog or press release in title. (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                try:
                    url = urllib2.urlopen(entry.link).geturl()
                except KeyboardInterrupt:
                    print "Quitting early due to keyboard interrupt."
                    sys.exit()
                except: continue
                # attempt to skip blogs
                if re.match('^.*blog.*$', url):
                    print "'blog' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # attempt to skip job postings
                if re.match('^.*job.*$', url):
                    print "'job' in url (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                # skip urls we have already crawled
                if self.db.crawled(url):
                    print "Seen this url before (%s) (%s)" % \
                        (entry.link, entry.title)
                    continue
                title = cgi.escape(convert_to_printable(entry.title)).strip()
                # if source is Google News, extract true source from title
                if re.match(r'^.*Google News.*$', source['title']):
                    m = re.match(r'^(.*) - (.+)$', title)
                    if m:
                        true_source = "%s via Google News" % m.group(2)
                        title = m.group(1)
                    else:
                        true_source = source['title']
                else: true_source = source['title']
                self.articles.append({'url': url, 'title': title, 'pubdate': d,
                                      'source': true_source, 'source_id': source['source_id'],
                                      'source_relevance': source['relevance']})

    def fetch_all_articles(self):
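        """
        Fetch the full text of each queued article with Goose, clean and
        summarize it, and store the result in the database.
        """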
        try:
            os.makedirs(paths['ainews.content_tmp'])
        except: pass
        f = open("%surllist.txt" % paths['ainews.content_tmp'], 'w')
        for article in self.articles:
            f.write("%s\n" % article['url'])
        f.close()
        goose_cmd = "cd %s/goose; MAVEN_OPTS=\"-Xms256m -Xmx800m\" %s exec:java -Dexec.mainClass=com.gravity.goose.FetchMany -Dexec.args=\"%s\" -q" % \
            (paths['libraries.tools'], paths['libraries.maven'], paths['ainews.content_tmp'])
        Popen(goose_cmd, shell = True).communicate()
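        # FetchMany is expected to write one file per URL into content_tmp,
        # named by each URL's 0-based position in urllist.txt. Judging from
        # how the files are read below, each file holds the extracted article
        # text followed by the main image URL on its second-to-last line.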
        i = 0
        for article in self.articles:
            print "READ: Opening", ("%s%d" % (paths['ainews.content_tmp'], i))
            f = codecs.open("%s%d" % (paths['ainews.content_tmp'], i), encoding='utf-8')
            rows = f.read().split("\n")
            f.close()
            #os.remove("%s%d" % (paths['ainews.content_tmp'], i))
            # don't move this; have to ensure the increment occurs!
            i += 1
            if self.db.crawled(article['url']):
                print "READ: .. Ignoring; already in crawled database."
                continue
            if len(rows) < 3:
                print "FETCH: .. Ignoring; not enough lines in Goose output: URL=%s, ROWS=%s" % \
                    (article['url'], rows)
                continue
            self.db.set_crawled(article['url'])
            content = ' '.join(rows[:-2])
            content = convert_to_printable(cgi.escape(re.sub(r'\s+', ' ', content))).strip()
            content = re.sub("%s\\s*-?\\s*" % re.escape(article['title']), '', content)
            content = re.sub(r'\s*Share this\s*', '', content)
            content = re.sub(r'\s+,\s+', ', ', content)
            content = re.sub(r'\s+\.', '.', content)
            # shorten content to (presumably) ignore article comments
            content = trunc(content, max_pos=5000)
            article['content'] = content
            article['summary'] = self.summarizer.summarize_first_two_sentences(article['content'])
            print "SUMRY: ..", article['summary']
            article['image_url'] = convert_to_printable(rows[-2]).strip()
            if len(article['title']) < 5 or len(article['content']) < 1000:
                print "CRAWL: -- Ignoring. Content or title too short. Title = {%s}; Content = {%s}" % \
                    (article['title'], article['content'])
                continue
            # remove content with blacklisted words
            found_blacklist_word = False
            for word in blacklist_words:
                if re.search("\W%s\W" % word, article['content'], re.IGNORECASE) != None:
                    print "CRAWL: -- Ignoring. Found blacklisted word \"%s\", ignoring article." % word
                    found_blacklist_word = True
                    break
            if found_blacklist_word:
                continue
            urlid = self.put_in_db(article)
            if urlid == None: continue
            print "CRAWL: ++ {ID:%d/%d} %s (%s, %s)" % \
                (urlid, i, article['title'], str(article['pubdate']), article['source'])

    def put_in_db(self, article):
        """
        Save the article into the database.
        """
        try:
            urlid = self.db.execute("""insert into urllist (url, pubdate, crawldate,
                source, source_id, source_relevance, title, content, summary, image_url)
                values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
                (article['url'], str(article['pubdate']), str(self.today),
                 article['source'], article['source_id'], article['source_relevance'],
                 article['title'], article['content'], article['summary'], article['image_url']))
            return urlid
        except KeyboardInterrupt:
            print "Quitting early due to keyboard interrupt."
            sys.exit()
        except Exception, e:
            print "ERROR: can't add article to db:", e
            return None
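
# Minimal usage sketch (an assumption; the project's real entry point lives
# elsewhere): fetch_all_sources() expects getopt-style (option, value) pairs,
# so an optional "--source <title>" restricts the crawl to a single feed.
if __name__ == "__main__":
    import getopt
    opts, args = getopt.getopt(sys.argv[1:], "", ["source="])
    crawler = AINewsCrawler()
    crawler.fetch_all_sources(opts)
    crawler.fetch_all_articles()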