Various improvements to text handling and summarizing.

1 parent 875daf8 commit 05d0e141321799b8f247407c7dcef956fcd943e3, joshuaeckroth committed Feb 8, 2013
Showing with 81 additions and 19 deletions.
  1. +7 −6 AINewsCrawler.py
  2. +17 −9 AINewsPublisher.py
  3. +6 −0 AINewsSummarizer.py
  4. +51 −4 AINewsTools.py
AINewsCrawler.py
@@ -55,7 +55,6 @@ def fetch_all_sources(self, opts):
f = feedparser.parse(source['link'])
for entry in f.entries:
d = None
- error = False
try:
if hasattr(entry, 'published_parsed'):
d = date(entry.published_parsed[0], entry.published_parsed[1], entry.published_parsed[2])
@@ -64,9 +63,8 @@ def fetch_all_sources(self, opts):
except Exception, e:
print e
print entry
- print "Could not parse date for feed", source['link']
- error = True
- if error: continue
+ print "Setting date as today; could not parse date for feed", source['link']
+ d = self.today
if d > self.today or d < self.earliest_date: continue
if entry.title[-6:] == '(blog)' \
or entry.title[-15:] == '(press release)': continue
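
Rather than skipping a feed entry whose date cannot be parsed, the crawler now falls back to today's date. A minimal sketch of that behaviour as a standalone helper (the helper name is invented, and the real loop also handles entries that lack published_parsed):

    from datetime import date

    def entry_pubdate(entry, today):
        # Hypothetical helper mirroring the new fallback: trust the feed's
        # published_parsed tuple when it parses, otherwise assume the entry
        # was published today instead of dropping it.
        try:
            parsed = entry.published_parsed  # feedparser 9-tuple; may be absent
            return date(parsed[0], parsed[1], parsed[2])
        except (AttributeError, TypeError, ValueError) as e:
            print "Setting date as today; could not parse date:", e
            return today
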
@@ -104,6 +102,9 @@ def fetch_all_sources(self, opts):
'source_relevance': source['relevance']})
def fetch_all_articles(self):
+ try:
+ os.makedirs(paths['ainews.content_tmp'])
+ except: pass
f = open("%surllist.txt" % paths['ainews.content_tmp'], 'w')
for article in self.articles:
f.write("%s\n" % article['url'])
@@ -141,8 +142,8 @@ def fetch_all_articles(self):
content = trunc(content, max_pos=5000)
article['content'] = content
- print "SUMRY: ..", article['title']
- article['summary'] = self.summarizer.summarize_single_ots(article['content'])
+ article['summary'] = self.summarizer.summarize_first_two_sentences(article['content'])
+ print "SUMRY: ..", article['summary']
article['image_url'] = convert_to_printable(rows[-2]).strip()
if len(article['title']) < 5 or len(article['content']) < 1000:
AINewsPublisher.py
@@ -17,7 +17,7 @@
from random import shuffle
from subprocess import *
from datetime import date, datetime, timedelta
-from AINewsTools import savefile
+from AINewsTools import savefile, convert_to_printable
from AINewsConfig import config, paths, aitopic_urls, blacklist_urls
from AINewsDB import AINewsDB
from AINewsCorpus import AINewsCorpus
@@ -63,6 +63,7 @@ def filter_and_process(self):
self.articles[urlid]['transcript'] = []
# filter by date
+ print "Filtering by date..."
for urlid in self.articles:
if self.articles[urlid]['pubdate'] == None:
# give a meaningful pubdate so that other code doesn't crash
@@ -78,6 +79,7 @@ def filter_and_process(self):
self.articles[urlid]['pubdate'].strftime('%F')))
# filter by blacklist (for urls)
+ print "Filtering by blacklist..."
for urlid in self.articles:
for black in blacklist_urls:
if re.search(black, self.articles[urlid]['url']):
@@ -87,6 +89,7 @@ def filter_and_process(self):
break
# filter by whitelist
+ print "Filtering by whitelist..."
for urlid in self.articles:
white_wordfreq = self.txtpro.whiteprocess(urlid,
self.articles[urlid]['content'])
@@ -101,6 +104,7 @@ def filter_and_process(self):
'Rejected due to only one or no whitelisted terms')
# update categories based on SVM classifier predictions
+ print "Classifying..."
self.svm_classifier.predict(self.articles)
# drop articles classified as 'NotRelated' unless the article
@@ -113,6 +117,7 @@ def filter_and_process(self):
'Rejected due to NotRelated classification')
# drop articles with no categories (even if user-submitted)
+ print "Dropping articles with no categories..."
for urlid in self.articles:
if len(self.articles[urlid]['categories']) == 0:
self.articles[urlid]['publish'] = False
@@ -121,6 +126,7 @@ def filter_and_process(self):
# filter out duplicates; some articles may have 'publish' set to False
# by this function
+ print "Filtering duplicates..."
self.duplicates.filter_duplicates(self.articles)
for urlid in self.articles:
@@ -130,6 +136,7 @@ def filter_and_process(self):
self.articles[urlid]['summary']
print
+ print "Grabbing images..."
for urlid in self.articles:
# grab and convert article image (if it exists)
self.grab_convert_image(self.articles[urlid])
@@ -138,6 +145,7 @@ def filter_and_process(self):
self.update_db(self.articles[urlid])
# mark each as processed
+ print "Marking as processed."
self.corpus.mark_processed(self.articles.itervalues())
# save sorted list of articles to be read by AINewsPublisher; sort by
@@ -188,13 +196,14 @@ def grab_convert_image(self, article):
img.write(f.read())
img.close()
# produces [urlid].jpg
- Popen("%s -format jpg -gravity Center -thumbnail 100x100 %s%s" % \
+ Popen("%s -format jpg -gravity Center -thumbnail 200x200 %s%s" % \
(paths['imagemagick.mogrify'], paths['ainews.image_dir'], str(article['urlid'])),
shell = True).communicate()
# remove [urlid] file (with no extension)
remove("%s%s" % (paths['ainews.image_dir'], str(article['urlid'])))
article['image_path'] = "public://newsfinder_images/%s.jpg" % article['urlid']
- except:
+ except Exception as e:
+ print "Failed converting image for %d: %s" % (article['urlid'], e)
article['image_path'] = ''
def update_db(self, article):
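
For reference, a sketch of the thumbnail step with the new 200x200 bound, passing an argument list instead of a shell string (the helper and its parameters are illustrative, not the publisher's actual structure):

    from subprocess import Popen

    def make_thumbnail(mogrify_path, image_dir, urlid):
        # Hypothetical sketch of the mogrify call: write <urlid>.jpg, centred
        # and bounded to 200x200, from the downloaded image file <urlid>.
        cmd = [mogrify_path, '-format', 'jpg', '-gravity', 'Center',
               '-thumbnail', '200x200', '%s%s' % (image_dir, str(urlid))]
        Popen(cmd).communicate()
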
@@ -209,13 +218,12 @@ def generate_feed_import(self):
"""
xml = FeedImport()
for article in self.articles.values():
+ article['source'] = re.sub(r'&', '&amp;', article['source'])
cats_fixed = []
for cat in article['categories']:
if cat == "Agents": continue
if cat == "AIOverview":
cat = "AI Overview"
- if cat == "Applications":
- cat = "Application Areas"
if cat == "CognitiveScience":
cat = "Cognitive Science"
if cat == "Education": continue
@@ -228,9 +236,9 @@ def generate_feed_import(self):
if cat == "NaturalLanguage":
cat = "Natural Language"
if cat == "Reasoning":
- cat = "Reasoning &amp; Representation"
+ cat = "Representation &amp; Reasoning"
if cat == "Representation":
- cat = "Reasining &amp; Representation"
+ cat = "Representation &amp; Reasoning"
if cat == "ScienceFiction":
cat = "Science Fiction"
if cat == "Systems":
@@ -254,10 +262,10 @@ def generate_email_output(self):
'pubdate': date(int(published[0:4]),
int(published[5:7]),
int(published[8:10])),
- 'summary': node.findtext("Body"),
+ 'summary': re.sub(r'</p>(</blockquote>)?$', '', re.sub(r'^(<blockquote>)?<p>', '', convert_to_printable(node.findtext("Body")))),
'url': node.findtext("Original_link"),
'link': re.sub(r'/news/', 'http://aitopics.org/news/', node.findtext("Link")),
- 'image': re.sub(r'<img', '<img align="left" style="margin: 5px 5px 5px 0;" ',
+ 'image': re.sub(r'<img', '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ',
node.findtext("Representative_image"))})
except Exception, e:
print e
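
The nested re.sub calls that clean the Body text before it goes into the email read more easily as a small helper; an equivalent sketch (helper name invented):

    import re

    def strip_wrapping_tags(body):
        # Drop one leading (<blockquote>)?<p> and one trailing
        # </p>(</blockquote>)? so the email template can style the summary.
        body = re.sub(r'^(<blockquote>)?<p>', '', body)
        return re.sub(r'</p>(</blockquote>)?$', '', body)
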
AINewsSummarizer.py
@@ -10,6 +10,7 @@
from subprocess import *
import sys
import os
+import re
import nltk
from AINewsConfig import stopwords, paths
@@ -18,6 +19,11 @@ class AINewsSummarizer:
def __init__(self):
self.sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
+ def summarize_first_two_sentences(self, content):
+ content = re.sub(r'^\s*[\(\[].*?[\)\]]\s*', '', content)
+ sentences = self.sent_detector.tokenize(content)
+ return " ".join(sentences[:2]).strip()
+
def summarize_single_ots(self, content):
f = open(paths['ainews.output'] + 'content.txt', 'w')
f.write(content)
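
summarize_first_two_sentences strips a leading parenthetical or bracketed tag (a wire-service credit, for example) and returns the first two Punkt-detected sentences. A usage sketch with invented text, assuming the NLTK punkt model is installed:

    from AINewsSummarizer import AINewsSummarizer

    summarizer = AINewsSummarizer()
    content = ("(Reuters) Researchers described a new planning system. "
               "It reasons about long-horizon goals. A demo is planned.")
    print summarizer.summarize_first_two_sentences(content)
    # -> Researchers described a new planning system. It reasons about long-horizon goals.
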
AINewsTools.py
@@ -18,12 +18,59 @@
import pickle
import locale
import ConfigParser
+from unidecode import unidecode
+
+# from http://www.jamesmurty.com/2011/12/30/python-code-utf8-to-latin1/
+def encode_utf8_to_iso88591(utf8_text):
+ '''
+ Encode and return the given UTF-8 text as ISO-8859-1 (latin1) with
+ unsupported characters replaced by '?', except for common special
+ characters like smart quotes and symbols that we handle as well as we can.
+ For example, the copyright symbol => '(c)' etc.
+
+ If the given value is not a string it is returned unchanged.
+
+ References:
+ en.wikipedia.org/wiki/Quotation_mark_glyphs#Quotation_marks_in_Unicode
+ en.wikipedia.org/wiki/Copyright_symbol
+ en.wikipedia.org/wiki/Registered_trademark_symbol
+ en.wikipedia.org/wiki/Sound_recording_copyright_symbol
+ en.wikipedia.org/wiki/Service_mark_symbol
+ en.wikipedia.org/wiki/Trademark_symbol
+ '''
+ if not isinstance(utf8_text, basestring):
+ return utf8_text
+ # Replace "smart" and other single-quote like things
+ utf8_text = re.sub(
+ u'[\u02bc\u2018\u2019\u201a\u201b\u2039\u203a\u300c\u300d(\xe2\x80\x99)\xe2]',
+ "'", utf8_text)
+ # Replace "smart" and other double-quote like things
+ utf8_text = re.sub(
+ u'[\u00ab\u00bb\u201c\u201d\u201e\u201f\u300e\u300f]',
+ '"', utf8_text)
+ # Replace copyright symbol
+ utf8_text = re.sub(u'[\u00a9\u24b8\u24d2]', '(c)', utf8_text)
+ # Replace registered trademark symbol
+ utf8_text = re.sub(u'[\u00ae\u24c7]', '(r)', utf8_text)
+ # Replace sound recording copyright symbol
+ utf8_text = re.sub(u'[\u2117\u24c5\u24df]', '(p)', utf8_text)
+ # Replace service mark symbol
+ utf8_text = re.sub(u'[\u2120]', '(sm)', utf8_text)
+ # Replace trademark symbol
+ utf8_text = re.sub(u'[\u2122]', '(tm)', utf8_text)
+ # Replace mdash
+ utf8_text = re.sub(u'[\xe2]', '---', utf8_text)
+ # Replace umlaut e
+ utf8_text = re.sub(u'[\xc3]', 'e', utf8_text)
+
+ return utf8_text.encode('ISO-8859-1', 'replace')
def convert_to_printable(text):
- result = ""
- for c in text:
- if c in string.printable: result += str(c)
- return result
+ return unidecode(text)
+ #result = ""
+ #for c in text:
+ # if c in string.printable: result += str(c)
+ #return result
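
convert_to_printable now transliterates non-ASCII text with unidecode instead of silently dropping anything outside string.printable. A quick illustration with an invented string (requires the unidecode package):

    from unidecode import unidecode

    text = u'\u201cR\u00e9sum\u00e9\u201d \u2014 na\u00efve'
    print unidecode(text)   # "Resume" -- naive
    # The old character-by-character filter would have returned 'Rsum  nave',
    # dropping the quotes, accents and dash rather than transliterating them.
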
def savefile(filename, content):
"""
