
Removed a lot of unneeded code.

commit f0162bb48690fed69696a82ffd5b2f063087809b (1 parent: cf40550)
Joshua Eckroth (joshuaeckroth) authored
38 AINews.py
@@ -14,29 +14,23 @@
from AINewsConfig import config, paths
from AINewsCrawler import AINewsCrawler
from AINewsPublisher import AINewsPublisher
-from AINewsSVMClassifier import AINewsSVMClassifier
def usage():
"""
Print out the command-line usage of AINews.py.
"""
- usage = """ AINews Finder
+ usage = """ NewsFinder
USAGE:
python AINews.py COMMAND [OPTION]
COMMAND:
(1) crawl:
- crawl latest news from outside web.
+ Crawl latest news from outside web.
- (2) train:
- train news classifiers based on human rates.
-
- (3) publish:
- publish news from output files to Pmwiki site and send emails.
- It is weekly publish to the public.
-
- View Latest news at:
- http://www.aaai.org/AITopics/AINews
-
+ (2) prepare:
+ Filter and process the news, and create an XML export.
+
+ (3) email:
+ Generate an email form for submitting the weekly alert.
"""
print usage
@@ -47,10 +41,6 @@ def crawl(opts):
crawler.fetch_all_sources(opts)
crawler.fetch_all_articles()
-def train():
- svm = AINewsSVMClassifier()
- svm.train('db:cat_corpus:cat_corpus_cats')
-
def prepare():
publisher = AINewsPublisher()
publisher.filter_and_process()
@@ -61,29 +51,24 @@ def email():
publisher.publish_email_semiauto()
def main():
- """
- Main function of AINews.py
- """
# Set en_US, UTF8
locale.setlocale(locale.LC_ALL,'en_US.UTF-8')
- commands_list = ("train", "crawl", "prepare", "email", "help")
+ commands_list = ("crawl", "prepare", "email", "help")
try:
if len(sys.argv) < 2 or sys.argv[1] not in commands_list:
usage()
sys.exit()
command = sys.argv[1]
- opts, args = getopt.getopt(sys.argv[2:], 'rf:u:s:', ['url=', 'file=', 'rss', 'source='])
+ opts, args = getopt.getopt(sys.argv[2:], 'rf:u:s:',
+ ['url=', 'file=', 'rss', 'source='])
except getopt.GetoptError, err:
# print help information and exit:
print str(err) # will print something like "option -a not recognized"
usage()
sys.exit(2)
- if command == "train":
- train()
-
- elif command == "crawl":
+ if command == "crawl":
crawl(opts)
elif command == "prepare":
@@ -94,4 +79,3 @@ def main():
if __name__ == "__main__":
main()
-
23 AINewsConfig.py
@@ -51,29 +51,6 @@
stopwords.add(word.rstrip())
file.close()
-# aitopic_urls is used to assign each news story to a category by comparing
-# its similarity with the following webpages.
-aitopic_urls = [
- "http://aaai.org/AITopics/AIOverview",
- "http://aaai.org/AITopics/Agents",
- "http://aaai.org/AITopics/Applications",
- "http://aaai.org/AITopics/CognitiveScience",
- "http://aaai.org/AITopics/Education",
- "http://aaai.org/AITopics/Ethics",
- "http://aaai.org/AITopics/Games",
- "http://aaai.org/AITopics/History",
- "http://aaai.org/AITopics/Interfaces",
- "http://aaai.org/AITopics/MachineLearning",
- "http://aaai.org/AITopics/NaturalLanguage",
- "http://aaai.org/AITopics/Philosophy",
- "http://aaai.org/AITopics/Reasoning",
- "http://aaai.org/AITopics/Representation",
- "http://aaai.org/AITopics/Robots",
- "http://aaai.org/AITopics/ScienceFiction",
- "http://aaai.org/AITopics/Speech",
- "http://aaai.org/AITopics/Systems",
- "http://aaai.org/AITopics/Vision"
- ]
"""
Regular expression used to extract the date from text
44 AINewsEmail.php
@@ -1,44 +0,0 @@
-<?php
-/*
- AINewsEmail.php is used to send the twice-monthly AINews notification to
- subscribers.
- It is called from the publish step in AINews.py.
-*/
-
-$paths = parse_ini_file("config/paths.ini", true);
-$config = parse_ini_file("config/config.ini", true);
-
-$AINEWS_DIR = $paths['ainews']['ainews_root'];
-$PMWIKI_DIR = $paths['pmwiki']['dir'];
-$OUTPUT_DIR = $paths['ainews']['output'];
-$filename = $OUTPUT_DIR."email_output.txt";
-
-$handle = fopen($filename, "r");
-// message
-$message = fread($handle, filesize($filename));
-fclose($handle);
-
-// multiple recipients
-$subscribers = $config['email']['subscribers'];
-$sub_array = explode(":", $subscribers);
-$to = "";
-
-foreach($sub_array as $sub){
- $to .= "$sub, ";
-}
-
-// subject
-$today = date("D, F j, Y");
-$subject = "Weekly AI Alert, $today";
-
-// To send HTML mail, the Content-type header must be set
-$headers = 'MIME-Version: 1.0' . "\r\n";
-$headers .= 'Content-type: text/html; charset=utf-8' . "\r\n";
-
-// Additional headers
-$headers .= 'From: AI Alert<admin11@aaai.org>' . "\r\n";
-
-
-// Mail it
-print mail($to, $subject, $message, $headers);
-?>
340 AINewsParser.py
@@ -1,340 +0,0 @@
-# This file is part of NewsFinder.
-# https://github.com/joshuaeckroth/AINews
-#
-# Copyright (c) 2011 by the Association for the Advancement of
-# Artificial Intelligence. This program and parts of it may be used and
-# distributed without charge for non-commercial purposes as long as this
-# notice is included.
-
-"""
-The base parser class for extracting the text of a general news story.
-urllib2 and urlparse are used to download the HTML pages from the website.
-It utilizes the BeautifulSoup library for HTML parsing. It extracts the creation
-date, title, description, and text from the HTML content.
-
-The general noise-removal algorithm is based on the ratio of the number of
-hyperlinked words to the total number of words. If the ratio is greater than a
-certain threshold, the HTML block is considered a noisy block, such as a menu
-or advertisement.
-
-It is the base class for the specific parsers in AINewsSourceParser.py.
-"""
-
-import re
-import sys
-import time
-import urllib2
-import urlparse
-from datetime import date, datetime, timedelta
-from BeautifulSoup import BeautifulSoup, Comment, BeautifulStoneSoup, \
- NavigableString, Declaration, ProcessingInstruction
-from AINewsDB import AINewsDB
-from AINewsSummarizer import AINewsSummarizer
-from AINewsConfig import config, paths, dateformat_regexps
-
-sys.path.append(paths['libraries.tools'])
-import justext
-
-class AINewsParser:
- def __init__(self):
- self.today = date.today()
- self.earliest_date = self.today - timedelta(days = int(config['ainews.period']))
- self.link_density = config['parser.link_density_ratio']
- self.debug = config['ainews.debug']
- self.db = AINewsDB()
- self.clear()
- self.candidates = []
- self.articles = []
- self.summarizer = AINewsSummarizer()
-
- def justext_extract(self, html):
- good_pars = []
- pars = justext.justext(html, justext.get_stoplist('English'))
- for par in pars:
- if par['class'] == 'good':
- good_pars.append(par['text'])
- return "\n".join(good_pars)
-
- def clear(self):
- self.url = ""
- self.html = None
- self.soup = None
- self.title = ""
- self.description = ""
- self.text = ""
- self.pubdate = None
-
- def parse_url(self, url):
- """
- Use urllib2 to fetch the HTML code from the given url.
- @param url: URL of the target news story to be parsed.
- @type url: C{string}
- """
- self.clear()
- agent = config['ainews.agent_name']+'/'+config['ainews.version']
- try:
- request = urllib2.Request(url)
- request.add_header('User-Agent', agent)
- opener = urllib2.build_opener()
- response = opener.open(request)
- except urllib2.HTTPError, error:
- if self.debug:
- if error.code == 404:
- print >> sys.stderr, "HTTPERROR: %s -> %s" % (error, error.url)
- elif error.code == 403:
- print >> sys.stderr, "HTTPERROR: %s -> %s" % (error, error.url)
- else :
- print >> sys.stderr, "HTTPERROR: %s" % error
- return False
- except urllib2.URLError, error:
- if self.debug: print >> sys.stderr, "URLERROR: %s" % error
- return False
- except Exception, error:
- if self.debug: print >> sys.stderr, "ERROR: %s" % error
- return False
-
- url = response.geturl()
- self.url = url.split('#')[0] # Remove in-page anchor link
- self.html = response.read()
- return True
-
- def extract_content(self, extractdate = False):
- """
- Using BeautifulSoup to parse HTML and extract metadata and
- news text.
- @param extractdate: Flag for whether to run the date-extraction part
- @type extractdate: C{boolean}
- """
- try:
- self.soup = BeautifulSoup(self.html, \
- convertEntities = BeautifulStoneSoup.HTML_ENTITIES)
- except Exception, error:
- #if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
-
- ################################################################
- # Extract title from HTML code
- ################################################################
- head = self.soup.find('head')
- if head != None:
- title = head.find('title')
- if title != None:
- title = (title.string).encode('utf-8')
- self.title = re.sub(r'\s+', ' ', title)
-
- ################################################################
- # Extract meta description from HTML code
- ################################################################
- description = self.soup.find('meta', {'name':'description'})
- if description != None and description.has_key('content'):
- desc = (description['content']).encode('utf-8')
- desc = re.sub(r'<!--.*?-->', ' ', desc)
- desc = re.sub(r'<.*?>', ' ', desc)
- desc = re.sub(r'&.*?;', ' ', desc)
- self.description = re.sub(r'\s+', ' ', desc)
-
- ################################################################
- # Extract meta published (created) date from HTML code
- ################################################################
- if extractdate:
- self.pubdate = None
- metas = self.soup.findAll('meta')
- for meta in metas:
- if meta.has_key('name') \
- and re.search('date|create|time', meta['name'], \
- re.IGNORECASE)!= None:
- self.pubdate = self.extract_date(meta['content'])
- if self.pubdate != None: break
-
- if self.pubdate == None:
- self.pubdate = self.extract_date(self.html)
- if self.pubdate == None:
- self.pubdate = date.today()
-
- ################################################################
- # Remove all the comments, javascripts, css styles, iframes...
- ################################################################
-
- comments = self.soup.findAll(text=lambda text:isinstance(text, Comment))
- [comment.extract() for comment in comments]
- declarations = self.soup.findAll(text=lambda \
- text:isinstance(text, Declaration))
- [declaration.extract() for declaration in declarations]
- instructions = self.soup.findAll(text=lambda \
- text:isinstance(text, ProcessingInstruction))
- [instruction.extract() for instruction in instructions]
-
- headers = self.soup.findAll('head')
- [header.extract() for header in headers]
-
- scripts = self.soup.findAll('script')
- [script.extract() for script in scripts]
- noscripts = self.soup.findAll('noscript')
- [noscript.extract() for noscript in noscripts]
-
- styles = self.soup.findAll('style')
- [style.extract() for style in styles]
-
- links = self.soup.findAll('link')
- [link.extract() for link in links]
-
- iframes = self.soup.findAll('iframe')
- [iframe.extract() for iframe in iframes]
-
- selects = self.soup.findAll('select')
- [select.extract() for select in selects]
-
- doctypes = self.soup.findAll('!DOCTYPE')
- [doctype.extract() for doctype in doctypes]
-
- labels = self.soup.findAll('label')
- [label.extract() for label in labels]
-
- # Remove embedded video and audio
- objects = self.soup.findAll('object')
- [object.extract() for object in objects]
-
- # Remove images
- imgs = self.soup.findAll('img')
- [img.extract() for img in imgs]
-
- ################################################################
- # Extract the major news text content from HTML code by
- # calling traverse() to remove noisy HTML blocks by link density
- ################################################################
- self.traverse(self.soup)
-
- text = self.extract_genenraltext(self.soup)
-
- # Use regular expression to filter extra comments and tags
- text = re.sub(r'<!--.*?-->', ' ', text)
- text = re.sub(r'<.*?>', ' ', text)
-
- text = re.sub(r'\s+', ' ', text)
- self.text = text.encode('utf-8')
-
- return True
-
- def extract_genenraltext(self, mysoup):
- """
- Recursively extract text from a BeautifulSoup object.
- @param mysoup: the target BeautifulSoup object to be extracted
- @type mysoup: C{BeautifulSoup}
- """
- if mysoup == None: return "\n"
- if isinstance(mysoup,NavigableString):
- return mysoup.string
- else:
- text = ""
- for subelement in mysoup.contents:
- text += self.extract_genenraltext(subelement)
- return text
- '''
- def extract_genenraltext(self, mysoup):
- """
- Recursively extract text from a BeautifulSoup object.
- @param mysoup: the target BeautifulSoup object to be extracted
- @type mysoup: C{BeautifulSoup}
- """
- if mysoup == None: return ""
- if type(mysoup) == NavigableString:
- return mysoup.string.strip()
- else:
- text = ""
- for subelement in mysoup.contents:
- text += self.extract_genenraltext(subelement)+' '
- return text.strip()
- '''
- def extract_linktext(self, mysoup):
- """
- Extract all the text that is hyperlinked in a BeautifulSoup object.
- @param mysoup: the target BeautifulSoup object to be extracted
- @type mysoup: C{BeautifulSoup}
- """
- text = ""
- if type(mysoup) == NavigableString:
- return ""
- else:
- hyperlinks = mysoup.findAll('a')
- for hyperlink in hyperlinks:
- text += self.extract_genenraltext(hyperlink)+' '
- return text.strip()
-
- def getwords(self, raw):
- if raw == "": return []
- return re.split(r'\W+',raw)
-
- def traverse(self, mysoup):
- """
- Traverse the BeautifulSoup tree and iteratively remove noisy HTML blocks
- based on link-word density.
- @param mysoup: the target BeautifulSoup object
- @type mysoup: C{BeautifulSoup}
- """
- if type(mysoup) != NavigableString and len(mysoup.contents)>0:
- subelement = mysoup.contents[0]
- while(subelement != None):
- gt = self.extract_genenraltext(subelement)
- word_count = len(self.getwords(gt))
- if word_count == 0:
- next = subelement.nextSibling
- subelement.extract() # Remove subelement from the soup
- subelement = next
- continue
- lt = self.extract_linktext(subelement)
- linkword_count = len(self.getwords(lt))
- ratio = 1.0*linkword_count/word_count
-
- if ratio >= self.link_density:
- next = subelement.nextSibling
- subelement.extract() # Remove subelement from the soup
- subelement = next
- else:
- self.traverse(subelement)
- subelement = subelement.nextSibling
-
- def extract_date(self, text):
- """
- Given a text, try all the date formats and extract the first
- matching date found in the text.
- @param text: Target text
- @type text: C{string}
- """
- today = date.today()
- for dateformat in dateformat_regexps:
- regexp = dateformat_regexps[dateformat][0]
- res = re.search(regexp, text, re.IGNORECASE)
- if res == None:
- continue
- else:
- date_str = res.group(0)
- t = time.strptime(date_str,dateformat_regexps[dateformat][1])
- d = date(t[0], t[1], t[2])
- if d > today: continue
- else:return d
- return None
-
- def parse_date(self, date_str, dateformat):
- """
- Given a string of date and a date format, return a date object
- @param date_str: Target date text
- @type date_str: C{string}
- @param dateformat: regular expression of the date format
- @type dateformat: C{string}
- """
- t = time.strptime(date_str,dateformat_regexps[dateformat][1])
- d = date(t[0], t[1], t[2])
- return d
-
- def print_content(self):
- print "\n*** Title *** \n\t", self.title
- print "\n*** URL *** \n\t", self.url
- print "\n*** Meta Description ***\n\t", self.description
- print "\n*** Body Text ***\n\t", self.text
- print "\n*** Publish Date ***\n\t", self.pubdate
-
- def remove_tag(self, soup, name, attr=None, value=None):
- if attr!=None:
- tags = soup.findAll(name, {attr:value})
- else:
- tags = soup.findAll(name)
- [tag.extract() for tag in tags]
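
The docstring above describes the link-density heuristic that the removed traverse() method implemented. Below is a minimal, self-contained sketch of that idea; the function names and the 0.5 threshold are illustrative, not taken from the repository (the real code reads the threshold from config['parser.link_density_ratio']).

import re

def word_count(text):
    # Count whitespace/punctuation-separated words, as getwords() above does.
    return len([w for w in re.split(r'\W+', text) if w])

def is_noisy_block(block_text, link_text, link_density_ratio=0.5):
    # A block whose hyperlinked words make up too large a share of its total
    # words is treated as boilerplate (menus, ads, related-story lists).
    total = word_count(block_text)
    if total == 0:
        return True  # empty blocks carry no story content
    return float(word_count(link_text)) / total >= link_density_ratio

# A navigation bar is nearly all links, so it is dropped; a story paragraph
# containing a single inline link is kept.
print(is_noisy_block("Home News Sports Opinion", "Home News Sports Opinion"))  # True
print(is_noisy_block("The robot, built at MIT, learns to walk on sand.", "MIT"))  # False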
63 AINewsParserExperiment.py
@@ -1,63 +0,0 @@
-
-from AINewsCorpus import AINewsCorpus
-from AINewsConfig import paths
-from AINewsTools import trunc
-import sys
-sys.path.append(paths['libraries.tools'])
-import justext
-import os
-import glob
-import re
-import ents
-from subprocess import *
-
-
-### modified from: http://www.korokithakis.net/posts/finding-the-levenshtein-distance-in-python/
-def levenshtein_distance(first, second):
- """Find the Levenshtein distance between two arrays of strings."""
- if len(first) > len(second):
- first, second = second, first
- if len(second) == 0:
- return len(first)
- first_length = len(first) + 1
- second_length = len(second) + 1
- distance_matrix = [[0] * second_length for x in range(first_length)]
- for i in range(first_length):
- distance_matrix[i][0] = i
- for j in range(second_length):
- distance_matrix[0][j]=j
- for i in xrange(1, first_length):
- for j in range(1, second_length):
- deletion = distance_matrix[i-1][j] + 1
- insertion = distance_matrix[i][j-1] + 1
- substitution = distance_matrix[i-1][j-1]
- if first[i-1] != second[j-1]:
- substitution += 1
- distance_matrix[i][j] = min(insertion, deletion, substitution)
- return distance_matrix[first_length-1][second_length-1]
-
-
-def evaluate():
- corpus = AINewsCorpus()
- print "urlid,length truewords,length justext,length goose,ld justtext,ld goose"
- for filename in sorted(glob.glob("../../experiments/justext/*.true")):
- truetext = ents.convert(file(filename).read())
- truetext = re.sub(r'[^\w\s]', ' ', trunc(truetext, max_pos=3000, ellipsis=False))
- truewords = re.split(r'\s+', truetext)
- urlid = filename[26:30]
- article = corpus.get_article(urlid)
- if article == None: continue
- articletext = re.sub(r'[^\w\s]', ' ', trunc((article['content_all']).encode('ascii'), max_pos=3000, ellipsis=False))
- articlewords = re.split(r'\s+', articletext)
- goosecmd = "cd /home/josh/aitopics/AINews/tools/goose; /opt/maven/bin/mvn exec:java -Dexec.mainClass=com.jimplush.goose.TalkToMeGoose -Dexec.args='%s' -q 2>>/home/josh/log.txt" % article['url']
- (stdout, _) = Popen(goosecmd, shell = True, stdout = PIPE).communicate()
- goosetext = ents.convert(stdout.encode('ascii'))
- goosetext = re.sub(r'[^\w\s]', ' ', trunc(goosetext, max_pos=3000, ellipsis=False))
- goosewords = re.split(r'\s+', goosetext)
- ld_1 = (levenshtein_distance(truewords, articlewords))/float(len(truewords))
- ld_2 = (levenshtein_distance(truewords, goosewords))/float(len(truewords))
- print "%s,%d,%d,%d,%.4f,%.4f" % \
- (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
-
-
-evaluate()
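
As a quick illustration of the evaluation metric in the removed experiment script: the word-level edit distance between the hand-checked ("true") text and an extractor's output, normalized by the length of the true text. This sketch assumes the levenshtein_distance function defined above is in scope; the example strings are made up.

true_words = "the robot learns to walk on sand".split()
extracted  = "the robot learned to walk on wet sand".split()

# One substitution (learns -> learned) plus one insertion (wet) gives distance 2.
d = levenshtein_distance(true_words, extracted)
print(d, float(d) / len(true_words))  # 2 and roughly 0.286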
300 AINewsPmwiki.php
@@ -1,300 +0,0 @@
-<?php
-/*****************************************************************************
- *
- * AINewsPmwiki.php is used to read in the AINewsRanker's output text and
- * save it in PmWiki format.
- * The PageStore class is extracted from pmwiki.php in the PmWiki directory.
- *
- ******************************************************************************/
-error_reporting(E_ALL ^ E_NOTICE);
-$paths = parse_ini_file("config/paths.ini", true);
-$PMWIKI_DIR = $paths['pmwiki']['dir'];
-$OUTPUT_DIR = $paths['ainews']['output'];
-
-chdir($PMWIKI_DIR);
-
-
-/******************************************************************************
-*
-* Following code is extracted from pmwiki.php from the Pmwiki directory
-*
-******************************************************************************/
-$FarmD = dirname(__FILE__);
-$WorkDir = 'wiki.d';
-$WikiDir = new PageStore('wiki.d/{$FullName}');
-$WikiLibDirs = array(&$WikiDir,new PageStore('$FarmD/wikilib.d/{$FullName}'));
-$Now=time();
-define('READPAGE_CURRENT', $Now+604800);
-$Version = 1.0;
-
-class PageStore {
- var $dirfmt;
- var $iswrite;
- var $attr;
- function PageStore($d='$WorkDir/$FullName', $w=0, $a=NULL) {
- $this->dirfmt = $d; $this->iswrite = $w; $this->attr = (array)$a;
- $GLOBALS['PageExistsCache'] = array();
- }
- function pagefile($pagename) {
- global $FarmD;
- $dfmt = $this->dirfmt;
- if ($pagename > '') {
- $pagename = str_replace('/', '.', $pagename);
- if ($dfmt == 'wiki.d/{$FullName}') # optimizations for
- return "wiki.d/$pagename"; # standard locations
- if ($dfmt == '$FarmD/wikilib.d/{$FullName}') #
- return "$FarmD/wikilib.d/$pagename"; #
- if ($dfmt == 'wiki.d/{$Group}/{$FullName}')
- return preg_replace('/([^.]+).*/', 'wiki.d/$1/$0', $pagename);
- }
- return FmtPageName($dfmt, $pagename);
- }
- function read($pagename, $since=0) {
- $newline = '';
- $urlencoded = false;
- $pagefile = $this->pagefile($pagename);
- if ($pagefile && ($fp=@fopen($pagefile, "r"))) {
- $page = $this->attr;
- while (!feof($fp)) {
- $line = fgets($fp, 4096);
- while (substr($line, -1, 1) != "\n" && !feof($fp))
- { $line .= fgets($fp, 4096); }
- $line = rtrim($line);
- if ($urlencoded) $line = urldecode(str_replace('+', '%2b', $line));
- @list($k,$v) = explode('=', $line, 2);
- if (!$k) continue;
- if ($k == 'version') {
- $ordered = (strpos($v, 'ordered=1') !== false);
- $urlencoded = (strpos($v, 'urlencoded=1') !== false);
- if (strpos($v, 'pmwiki-0.')!==false) $newline="\262";
- }
- if ($k == 'newline') { $newline = $v; continue; }
- if ($since > 0 && preg_match('/:(\\d+)/', $k, $m) && $m[1] < $since) {
- if ($ordered) break;
- continue;
- }
- if ($newline) $v = str_replace($newline, "\n", $v);
- $page[$k] = $v;
- }
- fclose($fp);
- }
- return @$page;
- }
- function write($pagename,$page) {
- global $Now, $Version;
- $page['name'] = $pagename;
- $page['time'] = $Now;
- $page['host'] = $_SERVER['REMOTE_ADDR'];
- $page['agent'] = @$_SERVER['HTTP_USER_AGENT'];
- $page['rev'] = @$page['rev']+1;
- unset($page['version']); unset($page['newline']);
- uksort($page, 'CmpPageAttr');
- $s = false;
- $pagefile = $this->pagefile($pagename);
- $dir = dirname($pagefile);
- //mkdirp($dir);
- if (!file_exists("$dir/.htaccess") && $fp = @fopen("$dir/.htaccess", "w"))
- { fwrite($fp, "Order Deny,Allow\nDeny from all\n"); fclose($fp); }
- if ($pagefile && ($fp=fopen("$pagefile,new","w"))) {
- $r0 = array('%', "\n", '<');
- $r1 = array('%25', '%0a', '%3c');
- $x = "version=$Version ordered=1 urlencoded=1\n";
- $s = true && fputs($fp, $x); $sz = strlen($x);
- foreach($page as $k=>$v)
- if ($k > '' && $k{0} != '=') {
- $x = str_replace($r0, $r1, "$k=$v") . "\n";
- $s = $s && fputs($fp, $x); $sz += strlen($x);
- }
- $s = fclose($fp) && $s;
- $s = $s && (filesize("$pagefile,new") > $sz * 0.95);
- if (file_exists($pagefile)) $s = $s && unlink($pagefile);
- $s = $s && rename("$pagefile,new", $pagefile);
- }
- $s && fixperms($pagefile);
- if (!$s)
- Abort("Cannot write page to $pagename ($pagefile)...changes not saved");
-
- }
- function exists($pagename) {
- if (!$pagename) return false;
- $pagefile = $this->pagefile($pagename);
- return ($pagefile && file_exists($pagefile));
- }
- function delete($pagename) {
- global $Now;
- $pagefile = $this->pagefile($pagename);
- @rename($pagefile,"$pagefile,del-$Now");
- }
- function ls($pats=NULL) {
- global $GroupPattern, $NamePattern;
- StopWatch("PageStore::ls begin {$this->dirfmt}");
- $pats=(array)$pats;
- array_push($pats, "/^$GroupPattern\.$NamePattern$/");
- $dir = $this->pagefile('$Group.$Name');
- $maxslash = substr_count($dir, '/');
- $dirlist = array(preg_replace('!/*[^/]*\\$.*$!','',$dir));
- $out = array();
- while (count($dirlist)>0) {
- $dir = array_shift($dirlist);
- $dfp = @opendir($dir); if (!$dfp) { continue; }
- $dirslash = substr_count($dir, '/') + 1;
- $o = array();
- while ( ($pagefile = readdir($dfp)) !== false) {
- if ($pagefile{0} == '.') continue;
- if ($dirslash < $maxslash && is_dir("$dir/$pagefile"))
- { array_push($dirlist,"$dir/$pagefile"); continue; }
- if ($dirslash == $maxslash) $o[] = $pagefile;
- }
- closedir($dfp);
- StopWatch("PageStore::ls merge {$this->dirfmt}");
- $out = array_merge($out, MatchPageNames($o, $pats));
- }
- StopWatch("PageStore::ls end {$this->dirfmt}");
- return $out;
- }
-}
-
-function ReadPage($pagename, $since=0) {
- # read a page from the appropriate directories given by $WikiReadDirsFmt.
- global $WikiLibDirs,$Now;
- foreach ($WikiLibDirs as $dir) {
- $page = $dir->read($pagename, $since);
- if ($page) break;
- }
- if (@!$page) $page['ctime'] = $Now;
- if (@!$page['time']) $page['time'] = $Now;
- return $page;
-}
-
-function WritePage($pagename,$page) {
- global $WikiLibDirs,$WikiDir,$LastModFile;
- $WikiDir->iswrite = 1;
- for($i=0; $i<count($WikiLibDirs); $i++) {
- $wd = &$WikiLibDirs[$i];
- if ($wd->iswrite && $wd->exists($pagename)) break;
- }
- if ($i >= count($WikiLibDirs)) $wd = &$WikiDir;
- $wd->write($pagename,$page);
- if ($LastModFile && !@touch($LastModFile))
- { unlink($LastModFile); touch($LastModFile); fixperms($LastModFile); }
-}
-
-## fixperms attempts to correct permissions on a file or directory
-## so that both PmWiki and the account (current dir) owner can manipulate it
-function fixperms($fname, $add = 0) {
- clearstatcache();
- if (!file_exists($fname)) Abort('?no such file');
- $bp = 0;
- if (fileowner($fname)!=@fileowner('.')) $bp = (is_dir($fname)) ? 007 : 006;
- if (filegroup($fname)==@filegroup('.')) $bp <<= 3;
- $bp |= $add;
- if ($bp && (fileperms($fname) & $bp) != $bp)
- @chmod($fname,fileperms($fname)|$bp);
-}
-
-## CmpPageAttr is used with uksort to order a page's elements with
-## the latest items first. This can make some operations more efficient.
-function CmpPageAttr($a, $b) {
- @list($x, $agmt) = explode(':', $a);
- @list($x, $bgmt) = explode(':', $b);
- if ($agmt != $bgmt)
- return ($agmt==0 || $bgmt==0) ? $agmt - $bgmt : $bgmt - $agmt;
- return strcmp($a, $b);
-}
-
-/******************************************************************************
- *
- * Following code is used to read the AINewsRanker's
- * output and save them into PmWiki format
- *
- ****************************************************************************/
-
-# Write Latest News
-$filename = $OUTPUT_DIR."pmwiki_output.txt";
-$handle = fopen($filename, "r");
-$output = fread($handle, filesize($filename));
-fclose($handle);
-
-$pagename_result = "AITopics.AINews";
-$page = ReadPage($pagename_result, READPAGE_CURRENT);
-$page['text'] = $output;
-WritePage($pagename_result, $page);
-
-# Write Today News
-$filename = $OUTPUT_DIR."pmwiki_output_norater.txt";
-$handle = fopen($filename, "r");
-$output = fread($handle, filesize($filename));
-fclose($handle);
-
-$today = date("Y-m-d");
-$pagename_result = "AINewsFinder.$today-News";
-$page = ReadPage($pagename_result, READPAGE_CURRENT);
-$page['text'] = $output;
-WritePage($pagename_result, $page);
-
-# Write All News
-$filename = $OUTPUT_DIR."pmwiki_all.txt";
-$handle = fopen($filename, "r");
-$output = fread($handle, filesize($filename));
-fclose($handle);
-
-$today = date("Y-m-d");
-$pagename_result = "AINewsFinder.$today-AllNews";
-$page = ReadPage($pagename_result, READPAGE_CURRENT);
-$page['text'] = $output;
-WritePage($pagename_result, $page);
-
-# Add all news to AINewsFinder.NewsArchive page
-$curr = date("Y-m-d G:i:s");
-$archivepage = "AINewsFinder.AllNews";
-$page = ReadPage($archivepage, READPAGE_CURRENT);
-if (preg_match("/$curr/", $page['text']) == 0) {
- $page['text'] = "[[".$pagename_result."|$curr AllNews]][[<<]]\n".$page['text'];
- WritePage($archivepage, $page);
-}
-
-
-# Add today news to AITopics.NewsArchive page
-$curr = date("M d");
-$year = date("Y");
-$pagename_result = "AINewsFinder.$today-News";
-$archivepage = "AITopics.NewsArchive";
-$page = ReadPage($archivepage, READPAGE_CURRENT);
-$i = preg_match("/\'\'\'$year\'\'\'/", $page['text']);
-
-if ($i == 0) {
- $pos = strpos($page['text'], "page.");
- $pretext = substr($page['text'], 0, $pos+7);
- $protext = substr($page['text'], $pos+7);
- $page['text'] = $pretext."'''$year'''\n*[[".$pagename_result."|$curr]][[<<]]\n".$protext;
- WritePage($archivepage, $page);
-}else{
- if(preg_match("/$curr/", $page['text']) == 0) {
- $pos = strpos($page['text'], "'''$year'''");
- $pretext = substr($page['text'], 0, $pos+11);
- $protext = substr($page['text'], $pos+11);
- $page['text'] = $pretext."*[[".$pagename_result."|$curr]][[<<]]\n".$protext;
- WritePage($archivepage, $page);
- }
-}
-
-
-# Add each news into AIArticles
-$year = date("Y");
-$file = $OUTPUT_DIR."urlids_output.txt";
-$lines = file($file);
-foreach($lines as $line_num => $id){
-
- $id = trim($id);
- $filename = $OUTPUT_DIR."aiarticles/".$id;
- $handle = fopen($filename, "r");
- $output = fread($handle, filesize($filename));
- fclose($handle);
- $pagename_result = "AIArticles.".$year."-".$id;
- $page = ReadPage($pagename_result, READPAGE_CURRENT);
- $page['text'] = $output;
- WritePage($pagename_result, $page);
-
-}
-
-?>
3  AINewsPublisher.py
@@ -18,11 +18,10 @@
from subprocess import *
from datetime import date, datetime, timedelta
from AINewsTools import savefile
-from AINewsConfig import config, paths, aitopic_urls, blacklist_urls
+from AINewsConfig import config, paths, blacklist_urls
from AINewsDB import AINewsDB
from AINewsCorpus import AINewsCorpus
from AINewsDuplicates import AINewsDuplicates
-from AINewsSVMClassifier import AINewsSVMClassifier
from AINewsTextProcessor import AINewsTextProcessor
from AINewsSummarizer import AINewsSummarizer
58 AINewsSVMAnalyzer.py
@@ -1,58 +0,0 @@
-# This file is part of NewsFinder.
-# https://github.com/joshuaeckroth/AINews
-#
-# Copyright (c) 2011 by the Association for the Advancement of
-# Artificial Intelligence. This program and parts of it may be used and
-# distributed without charge for non-commercial purposes as long as this
-# notice is included.
-
-import sys
-import re
-import operator
-from datetime import datetime
-from subprocess import *
-from AINewsConfig import paths
-from AINewsCorpus import AINewsCorpus
-
-class AINewsSVMAnalyzer:
- def __init__(self):
- self.corpus = AINewsCorpus()
- self.categories = self.corpus.categories
-
- def model_word_weights(self, category):
- f = open(paths['svm.svm_data']+category+'.model', 'r')
- lines = f.readlines()
- f.close()
- labels = re.match('label (-?1) (-?1)', lines[5]).group(1,2)
- if labels[0] == '1': pos_label = 0
- else: pos_label = 1
-
- cmd = './svm-weight -f %d %s%s.model' % \
- (len(self.corpus.wordids), paths['svm.svm_data'], category)
- (stdout, _) = Popen(cmd, shell = True, stdout = PIPE).communicate()
- weights = {}
- for (wordid,weight) in re.findall('(\d+):(\S+)', stdout):
- weight = float(weight)
- if pos_label == 1: weight = -weight
- weights[self.corpus.wordids[int(wordid)]] = weight
- return weights
-
- def analyze_all(self):
- for cat in self.categories:
- weights = analyzer.model_word_weights(cat)
- weights_sorted = sorted(weights.items(), key=operator.itemgetter(1))
- print "**%s**" % cat
- print "--Least significant:"
- for (word, weight) in weights_sorted[0:10]:
- print ("%s: %.3f, " % (word, weight)),
- print
- print "--Most significant:"
- for (word, weight) in weights_sorted[-10:]:
- print ("%s: %.3f, " % (word, weight)),
- print
- print
-
-if __name__ == "__main__":
- analyzer = AINewsSVMAnalyzer()
- analyzer.analyze_all()
-
260 AINewsSVMClassifier.py
@@ -1,260 +0,0 @@
-# This file is part of NewsFinder.
-# https://github.com/joshuaeckroth/AINews
-#
-# Copyright (c) 2011 by the Association for the Advancement of
-# Artificial Intelligence. This program and parts of it may be used and
-# distributed without charge for non-commercial purposes as long as this
-# notice is included.
-
-import sys
-import re
-from datetime import datetime
-from subprocess import *
-from svmutil import *
-from AINewsConfig import paths
-from AINewsCorpus import AINewsCorpus
-from AINewsTools import loadfile, savepickle
-
-class AINewsSVMClassifier:
- def __init__(self):
- self.corpus = AINewsCorpus()
-
- def predict(self, articles):
- urlids = sorted(articles.keys())
- for urlid in articles:
- articles[urlid]['categories'] = []
-
- # produce the test input file
- f = open(paths['svm.svm_data']+'predict', 'w')
- for urlid in urlids:
- for cat in self.corpus.categories:
- articles[urlid]['cat_probs'] = {}
- tfidf = self.corpus.get_tfidf(urlid, articles[urlid]['wordfreq'])
- f.write("+1 ")
- for wordid in sorted(tfidf.keys()):
- f.write("%s:%f " % (wordid, tfidf[wordid]))
- f.write("\n")
- f.close()
-
- # predict each category plus NotRelated
- for cat in self.corpus.categories:
- cmd = 'svm-scale -r "%s" "%s" > "%s"' % \
- (paths['svm.svm_data']+cat+'.range', \
- paths['svm.svm_data']+'predict', \
- paths['svm.svm_data']+'predict-'+cat+'.scaled')
- Popen(cmd, shell = True).wait()
- cmd = 'svm-predict -b 1 "%s" "%s" "%s" > /dev/null' % \
- (paths['svm.svm_data']+'predict-'+cat+'.scaled', \
- paths['svm.svm_data']+cat+'.model',
- paths['svm.svm_data']+'predict-'+cat+'.output')
- Popen(cmd, shell = True).wait()
- f = open(paths['svm.svm_data']+'predict-'+cat+'.output', 'r')
- lines = f.readlines()
- f.close()
- # first line of output file says "labels -1 1" or whatever;
- # the order could be different, so we have to check
- labels = re.match('labels (-?1) (-?1)', lines[0]).group(1,2)
- if labels[0] == '1': pos_label = 0
- else: pos_label = 1
- for i in range(1, len(lines)):
- (prediction, prob1, prob2) = \
- re.match('(-?1) (\d\.?\d*e?-?\d*) (\d\.?\d*e?-?\d*)', lines[i]).group(1,2,3)
- if pos_label == 0: prob_yes = prob1
- else: prob_yes = prob2
- articles[urlids[i-1]]['cat_probs'][cat] = prob_yes
- if prediction == '1':
- articles[urlids[i-1]]['categories'].append(cat)
-
- for urlid in urlids:
- articles[urlid]['categories'] = sorted(articles[urlid]['categories'])
-
- def train(self, ident):
- (train_corpus, _) = self.corpus.load_corpus(ident, 1.0, True)
- self.generate_libsvm_input(train_corpus, 'train')
- print "Done generating SVM input."
- self.libsvm_train(False)
-
- def evaluate(self, ident, pct):
- for i in range(1):
- results = {}
- (train_corpus, predict_corpus) = self.corpus.load_corpus(ident, float(pct), True, True)
- savepickle(paths['svm.svm_data_tmp']+'wordids.pkl', self.corpus.wordids)
- self.generate_libsvm_input(train_corpus, 'train')
- self.generate_libsvm_input(predict_corpus, 'predict')
- print "Done generating SVM input."
- results = self.libsvm_train(True)
- print "Iteration", i, ", pct", pct
- print results
-
- def generate_libsvm_input(self, corpus, suffix):
- train_labels = {}
- train_samples = {}
- for cat in self.corpus.categories:
- train_labels[cat] = []
- train_samples[cat] = []
- for c in corpus:
- cats = c[2].split(' ')
- for cat in self.corpus.categories:
- train_samples[cat].append(self.corpus.get_tfidf(c[0], c[1]))
- if cat in cats:
- train_labels[cat].append("+1")
- else:
- train_labels[cat].append("-1")
-
- for cat in self.corpus.categories:
- # do feature selection
- whole_fsc_dict,whole_imp_v = cal_feat_imp(train_labels[cat], train_samples[cat])
- # choose top 9000 features
- fv = whole_imp_v[:9000]
- tr_sel_samp = select(train_samples[cat], fv)
-
- model = open(paths['svm.svm_data_tmp']+cat+'-'+suffix, 'w')
- for i in range(len(train_samples[cat])):
- model.write("%s " % train_labels[cat][i])
- for wordid in sorted(tr_sel_samp[i].iterkeys()):
- model.write("%s:%f " % (wordid, tr_sel_samp[i][wordid]))
- model.write("\n")
- model.close()
-
- def libsvm_train(self, alsotest):
- results = {}
- # train each category plus NotRelated
- for cat in self.corpus.categories:
- if alsotest:
- sys.stdout.write("Training and testing " + cat + "... ")
- else:
- sys.stdout.write("Training " + cat + "... ")
- sys.stdout.flush()
- if alsotest:
- cmd = 'python svm-easy.py "%s" "%s"' % \
- (paths['svm.svm_data_tmp']+cat+'-train',
- paths['svm.svm_data_tmp']+cat+'-predict')
- else:
- cmd = 'python svm-easy.py "%s"' % (paths['svm.svm_data_tmp']+cat+'-train')
- (stdout, _) = Popen(cmd, shell = True, stdout=PIPE).communicate()
- if alsotest:
- m = re.match('.*Accuracy = (\d+).*', re.sub('\n', '', stdout))
- results[cat] = float(m.group(1))
- sys.stdout.write(str(results[cat]) + "\n")
- sys.stdout.flush()
- return results
-
-### from fselect.py
-### select features and return new data
-def select(sample, feat_v):
- new_samp = []
-
- feat_v.sort()
-
- #for each sample
- for s in sample:
- point={}
- #for each feature to select
- for f in feat_v:
- if f in s: point[f]=s[f]
-
- new_samp.append(point)
-
- return new_samp
-
-### from fselect.py
-### compare function used in list.sort(): sort by element[1]
-#def value_cmpf(x,y):
-# if x[1]>y[1]: return -1
-# if x[1]<y[1]: return 1
-# return 0
-def value_cmpf(x):
- return (-x[1]);
-
-### from fselect.py
-### cal importance of features
-### return fscore_dict and feat with desc order
-def cal_feat_imp(labels,samples):
- score_dict=cal_Fscore(labels,samples)
-
- score_tuples = list(score_dict.items())
- score_tuples.sort(key = value_cmpf)
-
- feat_v = score_tuples
- for i in range(len(feat_v)): feat_v[i]=score_tuples[i][0]
-
- return score_dict,feat_v
-
-### from fselect.py
-### return a dict containing F_j
-def cal_Fscore(labels,samples):
-
- data_num=float(len(samples))
- p_num = {} #key: label; value: data num
- sum_f = [] #index: feat_idx; value: sum
- sum_l_f = {} #dict of lists. key1: label; index2: feat_idx; value: sum
- sumq_l_f = {} #dict of lists. key1: label; index2: feat_idx; value: sum of square
- F={} #key: feat_idx; value: fscore
- max_idx = -1
-
- ### pass 1: check number of each class and max index of features
- for p in range(len(samples)): # for every data point
- label=labels[p]
- point=samples[p]
-
- if label in p_num: p_num[label] += 1
- else: p_num[label] = 1
-
- for f in point.keys(): # for every feature
- if f>max_idx: max_idx=f
- ### now p_num and max_idx are set
-
- ### initialize variables
- sum_f = [0 for i in range(max_idx)]
- for la in p_num.keys():
- sum_l_f[la] = [0 for i in range(max_idx)]
- sumq_l_f[la] = [0 for i in range(max_idx)]
-
- ### pass 2: calculate some stats of data
- for p in range(len(samples)): # for every data point
- point=samples[p]
- label=labels[p]
- for tuple in point.items(): # for every feature
- f = tuple[0]-1 # feat index
- v = tuple[1] # feat value
- sum_f[f] += v
- sum_l_f[label][f] += v
- sumq_l_f[label][f] += v**2
- ### now sum_f, sum_l_f, sumq_l_f are done
-
- ### for each feature, calculate f-score
- eps = 1e-12
- for f in range(max_idx):
- SB = 0
- for la in p_num.keys():
- SB += (p_num[la] * (sum_l_f[la][f]/p_num[la] - sum_f[f]/data_num)**2 )
-
- SW = eps
- for la in p_num.keys():
- SW += (sumq_l_f[la][f] - (sum_l_f[la][f]**2)/p_num[la])
-
- F[f+1] = SB / SW
-
- return F
-
-
-
-if __name__ == "__main__":
- start = datetime.now()
-
- svm = AINewsSVMClassifier()
- #urlids = []
- #for i in range(0, 2000):
- # if svm.corpus.get_article(i) != None:
- # urlids.append(i)
-
- if len(sys.argv) < 3:
- print("Wrong args.")
- sys.exit()
-
- if sys.argv[1] == "evaluate":
- svm.evaluate(sys.argv[2], sys.argv[3])
- elif sys.argv[1] == "predict":
- svm.predict(sys.argv[2].split(',')) #svm.predict(urlids)
-
- print datetime.now() - start
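
For reference, a toy illustration of the Fisher-score feature ranking that the removed cal_Fscore()/cal_feat_imp() pair performed ahead of the top-9000 feature cut in generate_libsvm_input(). It assumes cal_Fscore from the module above is in scope; the samples are made-up sparse dicts mapping 1-based feature ids to tf-idf values, with "+1"/"-1" labels.

labels  = ["+1", "+1", "-1", "-1"]
samples = [{1: 2.0, 2: 0.1},
           {1: 1.8, 2: 0.2},
           {1: 0.2, 2: 0.1},
           {1: 0.1, 2: 0.2}]

# Feature 1 separates the two classes cleanly, so its F-score dwarfs feature 2's;
# cal_feat_imp() would rank it first and the top-9000 cut would keep it.
scores = cal_Fscore(labels, samples)
print(scores)  # roughly {1: 122.5, 2: 0.0}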
1,914 AINewsSourceParser.py
@@ -1,1914 +0,0 @@
-# This file is part of NewsFinder.
-# https://github.com/joshuaeckroth/AINews
-#
-# Copyright (c) 2011 by the Association for the Advancement of
-# Artificial Intelligence. This program and parts of it may be used and
-# distributed without charge for non-commercial purposes as long as this
-# notice is included.
-
-"""
-AINewsSourceParser includes a set of parsers that inherit from AINewsParser.
-Each parser is designed specifically for one source/publisher website.
-It crawls the search page or RSS/Atom feeds to get a list of candidate news
-stories, then crawls each story, filters out unrelated ones, and stores the
-rest in the database.
-"""
-
-import re
-import time
-import sys
-import feedparser
-import ents
-from subprocess import *
-from datetime import date, datetime, timedelta
-from BeautifulSoup import BeautifulSoup, Comment, BeautifulStoneSoup, \
- NavigableString, Declaration, ProcessingInstruction
-
-from AINewsConfig import config, paths, dateformat_regexps
-from AINewsParser import AINewsParser
-from AINewsTools import strip_html, loadfile2, convert_to_printable, trunc
-
-def ParserFactory(publisher, type = None):
- """
- A factory method to return a specific parser for the specific news source.
- @param publisher: the source/publisher name
- @type publisher: C{string}
- @param type: either 'search' or 'rss'
- @type type: C{string}
- """
- if publisher == "UserSubmitted":
- parser = UserSubmittedParser()
- elif publisher == "Wall Street Journal" and type == 'search':
- parser = WSJParser()
- elif publisher == "Forbes" and type == 'search':
- parser = ForbesParser()
- elif publisher == "BBC" and type == 'search':
- parser = BBCParser()
- elif publisher == "CNet" and type == 'search':
- parser = CNetParser()
- elif publisher == "Technology Review" and type == 'search':
- parser = TechnologyReviewParser()
- elif publisher == "Scientific American" and type == 'search':
- parser = ScientificAmericanParser()
- elif publisher == "Discovery" and type == 'search':
- parser = DiscoveryParser()
- elif publisher == "Guardian" and type == 'search':
- parser = GuardianParser()
- elif publisher == "TheTimes" and type == 'search':
- parser = TheTimesParser()
- elif publisher == "ScientificAmerican" and type == 'search':
- parser = ScientificAmericanParser()
- elif publisher == "NPR" and type == 'search':
- parser = NPRParser()
- elif publisher == "Independent" and type == 'search':
- parser = IndependentParser()
- elif publisher == "MSNBC" and type == 'search':
- parser = MSNBCParser()
- elif publisher == "Nature" and type == 'search':
- parser = NatureParser()
- elif publisher == "Times" and type == 'search':
- parser = TimesParser()
- elif publisher == "PCWorld" and type == 'search':
- parser = PCWorldParser()
- elif publisher == "NY Times" and type == 'rss':
- parser = NYTRSSParser()
- elif publisher == "Wired" and type == 'rss':
- parser = WiredRSSParser()
- elif publisher == "Popular Science" and type == 'rss':
- parser = PopularScienceRSSParser()
- elif publisher == "CNN" and type == 'rss':
- parser = CNNRSSParser()
- elif publisher == "MITNews" and type == 'rss':
- parser = MITNewsRSSParser()
- elif publisher == "Wash Post" and type == 'rss':
- parser = WashPostRSSParser()
- elif publisher == "GoogleNews" and type == 'rss':
- parser = GoogleNewsRSSParser()
- elif publisher == "NewScientist" and type == 'rss':
- parser = NewScientistRSSParser()
- elif publisher == "ZDNet" and type == 'rss':
- parser = ZDNetRSSParser()
- elif publisher == "Kurzweilai" and type == 'rss':
- parser = KurzweilaiRSSParser()
- elif publisher == "USAToday" and type == 'rss':
- parser = USATodayRSSParser()
- elif publisher == "Engadget" and type == 'rss':
- parser = EngadgetRSSParser()
- elif publisher == "LATimes" and type == 'rss':
- parser = LATimesRSSParser()
- elif publisher == "RobotNet" and type == 'rss':
- parser = RobotNetRSSParser()
- elif publisher == "ScienceDaily" and type == 'rss':
- parser = ScienceDailyRSSParser()
- elif publisher == "IEEE Spectrum" and type == 'rss':
- parser = IEEESpectrumRSSParser()
- elif publisher == "Curata" and type == 'rss':
- parser = CurataRSSParser()
- elif publisher == "RSS":
- parser = RSSParser()
- else:
- parser = None
- return parser
-
-class UserSubmittedParser(AINewsParser):
- """
- Parser for user-submitted news.
- """
- def parse_sourcepage(self, url):
- xmlcontent = loadfile2(url)
- xmlcontent = unicode(xmlcontent, errors = 'ignore')
- try:
- xmlsoup = BeautifulSoup(xmlcontent, \
- convertEntities = BeautifulStoneSoup.HTML_ENTITIES)
- except Exception, error:
- return False
-
- souplist = xmlsoup.findAll('news')
- for soup in souplist:
- type = self.extract_genenraltext(soup.find('type'))
- if type != "NewArticle":
- return
-
- url = self.extract_genenraltext(soup.find('url'))
- date_str = self.extract_genenraltext(soup.find('date'))
- pub_date = self.extract_date(date_str)
-
- earliest_date = date.today() - timedelta(days = int(config['ainews.period']))
- if pub_date is None or pub_date < earliest_date:
- continue
-
- print "Checking if user-submitted URL exists:", url
- res = self.parse_url(url)
- if not res or self.url == None:
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- title = self.soup.find('title')
- if title != None:
- title = (title.string).encode('utf-8')
- title = re.sub(r'\s+', ' ', title)
- else:
- print "No <title> in", url
- continue
- self.candidates.append([url, title, pub_date])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None or self.db.isindexed(self.url):
- continue
- text = self.justext_extract(self.html)
- if len(text) == 0: continue
- self.candidates[i].append(text)
-
-class WSJParser(AINewsParser):
- """
- Parser for Wall Street Journal.
- e.g. http://topics.wsj.com/subject/a/Artificial-Intelligence/1830
- """
- def parse_sourcepage(self, url):
- """
- Parser for Wall Street Journal's search page.
- @param url: search page's url
- @type url: C{string}
- """
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- self.soup = self.soup.find('ul', {'class': "newsItem"})
-
- boxes = self.soup.findAll('div', {'class':'tipTargetBox'})
- [box.extract() for box in boxes]
- comments = self.soup.findAll('a', {'class':'icon comments'})
- [comment.extract() for comment in comments]
- videos = self.soup.findAll('a', {'class':'icon video'})
- [video.extract() for video in videos]
- pros = self.soup.findAll('a', {'class':'icon pro'})
- [pro.extract() for pro in pros]
-
- newsitems = self.soup.findAll('li')
- for item in newsitems:
- # Extract date
- item_small = item.find('small')
- if item_small == None: continue
- date_str = item.find('small').getText()
- pub_date = self.parse_date(date_str[:-11], "Month DD, YYYY")
- if pub_date < self.begindate: continue
- # Extract URL
- url = item.find('a',href=True)['href']
- if url[7:12]=='blogs': continue
- # Extract title
- title = ' '.join([t.getText() for t in item.findAll('a')])
- # Extract description
-
- self.candidates.append([url, title, pub_date])
-
- def parse_storypage(self):
- """
- Parse the story webpage and extract the story text. The story's
- info is stored in self.candidates.
- """
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- args = ("div", "id", "article_story_body")
- mysoups = self.soup.findAll(args[0], {args[1] : args[2]})
- text = ""
- for mysoup in mysoups:
- paragraphs = mysoup.findAll('p')
- for paragraph in paragraphs:
- text += paragraph.getText() + ' '
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
-class ForbesParser(AINewsParser):
- """
- Parser for Forbes.
- e.g. http://search.forbes.com/search/find?&start=1&tab=searchtabgeneraldark
- &MT=artificial+intelligence&sort=Date
- """
- def parse_sourcepage(self, url):
- """
- Parser for Forbes's search page.
- @param url: search page's url
- @type url: C{string}
- """
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- mysoups = self.soup.findAll('div', {'class': "head"})
- for mysoup in mysoups:
- item = mysoup.find('a',href=True)
- url = item['href']
- if url[7:12] == 'blogs': continue
- title = item.getText()
- s = re.search('20\d\d\/(0|1)\d\/(0|1|2|3)\d', url)
- if s == None: continue
- date_str = s.group(0)
- t = time.strptime(date_str,"%Y/%m/%d")
- d = date(t[0], t[1], t[2])
- if d > self.today or d < self.begindate: continue
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- descsoup = self.soup.find('meta', {'name': 'description'})
- desc = descsoup['content']
- mysoups = self.soup.findAll("div", {"id" : "storyBody"})
- text = ""
- for mysoup in mysoups:
- cbx = mysoup.find('div',{'id':'controlsbox'})
- if cbx != None: cbx.extract()
- paragraphs = mysoup.findAll('p')
- for paragraph in paragraphs:
- text += paragraph.getText() + ' '
- text = re.sub(r'&.*?;', ' ', text)
- if len(text) == 0: continue
- self.candidates[i].append(text)
-
-class BBCParser(AINewsParser):
- """
- Parser for BBC News.
- e.g. http://search.bbc.co.uk/search?go=toolbar&tab=ns&q=robots&order=date
- &scope=all
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
-
- newssoup = self.soup.find('li', {'class':"DateItem leadDate"})
- date_str = newssoup.find('div',{'class':"newsDateView"}).getText()
- if date_str == "": return
- t = time.strptime(date_str,"%d %B %Y")
- d = date(t[0], t[1], t[2])
- if d > self.today or d < self.begindate: return
- mysoups = newssoup.findAll('li',{'class': "thumbItem lead"})
- for mysoup in mysoups:
- url = mysoup.find('a', href=True)['href']
- item = mysoup.find('a')
- if item == None: continue
-
- #title = item.getText()
- title = self.extract_genenraltext(item)
- """
- desc = mysoup.find('p',{'class': "abstract"}).getText()
- """
-
- self.candidates.append([url, title, d])
-
- #http://www.bbc.co.uk/search/news/artificial_intelligence
- def parse_sourcepage2(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
-
- newssoup = self.soup.find('div', {'id': "newsBbc"})
- mysoups = newssoup.findAll('li')
- for mysoup in mysoups:
- date_str = ""
- for s in mysoup.findAll('span'):
- m = re.search('\d\d \w\w\w \d\d', s.getText())
- if m != None:
- date_str=m.group(0)
- break
- if date_str == "": continue
- t = time.strptime(date_str,"%d %b %y")
- d = date(t[0], t[1], t[2])
- if d > self.today or d < self.begindate: continue
-
- url = mysoup.find('a', href=True)['href']
- item = mysoup.find('a',{'class':'title'})
- if item == None: continue
- title = item.getText()
-
- """
- desc = self.extract_genenraltext( mysoup.find('p'))
- desc = re.sub(r'\n+', ' ', desc)
- """
-
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- mainsoup = self.soup.find("div", {"id" : "main-content"})
- if mainsoup == None:
- mainsoup = self.soup.find("table", {"class" : "storycontent"})
- if mainsoup == None: continue
- text = ""
- mysoups = mainsoup.findAll('p')
- for mysoup in mysoups:
- text += self.extract_genenraltext(mysoup) + ' '
- dummy="Please turn on JavaScript. Media requires JavaScript to play."
- if text[:61]== dummy:
- text = text[61:]
- text = re.sub(r'&.*?;', ' ', text)
- if len(text) == 0: continue
-
- self.candidates[i][0] = self.url
- self.candidates[i].append(text)
-
-class CNetParser(AINewsParser):
- """
- Parser for CNet.
- e.g. http://news.cnet.com/1770-5_3-0.html?tag=mncol%3Bsort&query=artificial
- +intelligence&searchtype=news&source=news&rpp=10&sort=updateDate+desc
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- newssoup = self.soup.find('div', {'id': "contentBody"})
- mysoups = newssoup.findAll('div',{'class':'resultInfo'})
- for mysoup in mysoups:
- str = mysoup.find('span',{'class':'resultDetails'}).getText()
- regexp = dateformat_regexps["Month DD, YYYY"][0]
- res = re.search(regexp, str, re.IGNORECASE)
- date_str = res.group(0)
- t = time.strptime(date_str,dateformat_regexps["Month DD, YYYY"][1])
- d = date(t[0], t[1], t[2])
- if d > self.today or d < self.begindate: continue
-
- url = mysoup.find('a', href=True)['href']
- if url[:4] != 'http':
- url = "http://news.cnet.com" + url
- title = mysoup.find('a',{'class':'resultName'}).getText()
- if len(title)>=5 and title[-5:] == 'blog)': continue
- if len(title)>=9 and title[-9:] == '(podcast)': continue
- if title[:18] == "This week in Crave":continue
- desc = mysoup.find('div',{'class':'resultSummary'}).getText()
- self.candidates.append([url, title, d, desc])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- mainsoup = self.soup.find("div", {"id" : "contentBody"})
- if mainsoup == None:
- mainsoup = self.soup.find("div", {"class" : "txtWrap"})
- if mainsoup == None:continue
- posts = mainsoup.findAll('div', {'class':'postTalkback'})
- [post.extract() for post in posts]
-
- text = ""
- mysoups = mainsoup.findAll('p')
- for mysoup in mysoups:
- text += mysoup.getText() + ' '
- text = re.sub(r'&.*?;', ' ', text)
-
- if len(text) == 0: continue
- self.candidates[i][0] = self.url
- self.candidates[i].append(text)
-
-class TechnologyReviewParser(AINewsParser):
- """
- Parser for Technology Review.
- e.g. http://www.technologyreview.com/search.aspx?s=artificial%20intelligence
- &limit=computing&sort=date
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- mysoups = self.soup.findAll('div',{'class':'SearchResult'})
- for mysoup in mysoups:
- res = mysoup.find('dd',{'class':'Author'}).getText()
- date_str = res.split('|')[1].strip()
- t = time.strptime(date_str,"%m/%d/%Y")
- d = date(t[0], t[1], t[2])
- if d > self.today or d < self.begindate: continue
-
- res = mysoup.find('dt',{'class':'Headline'})
- title = self.extract_genenraltext(res)
- url = res.find('a')['href']
- res = mysoup.find('dd',{'class':'SearchDek'})
- if res== None: continue
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- text = ""
- mainsoup = self.soup.find("div", {"id" : "articlebody"})
- if mainsoup != None:
- mysoups = mainsoup.findAll('p')
- for mysoup in mysoups:
- text += mysoup.getText().strip() + ' '
- else:
- mainsoups = self.soup.findAll("div", {"class" : "blogcontent"})
- if mainsoups == None: continue
- for mainsoup in mainsoups:
- mysoups = mainsoup.findAll('p')
- for mysoup in mysoups:
- text += self.extract_genenraltext(mysoup) + ' '
- text = re.sub(r'\s+', ' ', text)
-
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
-class ScientificAmericanParser(AINewsParser):
- """
- Parser for Scientific American.
- e.g. http://www.scientificamerican.com/search/index.cfm?i=1&q=artificial+
- intelligence&sort=publish_date&submit=submit&submit.x=0&submit.y=0&u1=q
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- newssoup = self.soup.find('div', {'id': "searchpage"})
- newssoup.find('div', {'id':'search_advertise'}).extract()
- items = newssoup.findAll('h5')
- dates = newssoup.findAll('span', {'class': "searchdates"})
- for i, item in enumerate(items):
- title = item.getText()
- title = re.sub(r'&.*?;', ' ', title)
- url = item.find('a')['href']
- if re.search('podcast|blog', url) != None: continue
- if i >= len(dates): break
- date_str = ' '.join(dates[i].getText().split(' ')[:3])
- d = self.parse_date(date_str, "Month DD, YYYY")
- if d > self.today or d < self.begindate: continue
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- mainsoup = self.soup.find("div", {"id" : "article"})
- if mainsoup == None: continue
- text = ""
- mysoups = mainsoup.findAll('p')
- for mysoup in mysoups:
- text += mysoup.getText().strip() + ' '
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
-class DiscoveryParser(AINewsParser):
- """
- Parser for Discovery News.
- e.g. http://news.discovery.com/robots/
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- mysoups = self.soup.findAll('dl',{'class':'asset-items clear clearfix'})
- for mysoup in mysoups:
- source = mysoup.find('p',{"class":"source"}).getText()
- m = re.search(dateformat_regexps['Mon DD, YYYY'][0], source)
- if m != None:
- date_str = m.group(0)
- d = self.parse_date(date_str,'Mon DD, YYYY')
- if d > self.today or d < self.begindate: continue
- else:
- d = self.today
-
- item = mysoup.find('h2',{"class":"title"})
- title = item.getText()
- if title[-7:] == '[VIDEO]': continue
- url = item.find('a')['href']
-
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- mainsoup = self.soup.find("div", {"id" : "article-body"})
- if mainsoup == None: continue
- text = ""
- mysoups = mainsoup.findAll('p')
- for mysoup in mysoups:
- text += mysoup.getText() + ' '
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
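The date handling in most of these parse_sourcepage() methods is the same three steps: match one of the formats from dateformat_regexps, parse the match, and skip anything outside the crawl window, falling back to today's date when no date is found. A standalone sketch of that filter, with a hard-coded pattern standing in for the dateformat_regexps entries defined in AINewsConfig and an assumed one-week window:

    import re
    import time
    from datetime import date, timedelta

    TODAY = date.today()
    BEGINDATE = TODAY - timedelta(days=7)   # assumed crawl window

    # stand-in for dateformat_regexps['Mon DD, YYYY'] from AINewsConfig
    MON_DD_YYYY = r'[A-Z][a-z]{2}\.? \d{1,2}, \d{4}'

    def candidate_date(source_text):
        # Returns the article date if it is inside the window, today's date if no
        # date is present, or None if the article is too old or in the future.
        m = re.search(MON_DD_YYYY, source_text)
        if m is None:
            return TODAY
        t = time.strptime(m.group(0).replace('.', ''), '%b %d, %Y')
        d = date(t[0], t[1], t[2])
        if d > TODAY or d < BEGINDATE:
            return None
        return d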
-class GuardianParser(AINewsParser):
- """
- Parser for guardian.co.uk News.
- e.g. http://browse.guardian.co.uk/search?search=%22artificial+intelligence \
- %22&sitesearch-radio=guardian&go-guardian=Search
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- self.soup = self.soup.find('div',{'class':'most-recent-results'})
- mysoups = self.soup.findAll('li',{'class':'l1'})
- for mysoup in mysoups:
- source = mysoup.find('p',{'class':'publication'}).getText()
- m = re.search(dateformat_regexps['DD Mon YYYY'][0], source)
- if m != None:
- date_str = m.group(0)
- d = self.parse_date(date_str,'DD Mon YYYY')
- if d > self.today or d < self.begindate: continue
- else:
- d = self.today
-
- item = mysoup.find('h3',{'class':'t2'})
- title = item.getText()
- url = item.find('a')['href']
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- mainsoup = self.soup.find("div", {"id" : "article-wrapper"})
- if mainsoup == None: continue
- text = ""
- mysoups = mainsoup.findAll('p')
- for mysoup in mysoups:
- #text += mysoup.getText() + ' '
- text += self.extract_genenraltext(mysoup)+' '
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
-class TheTimesParser(AINewsParser):
- """
- Parser for The Times.
- Disabled by default, since all articles on The Times require
- registration and a subscription fee.
- e.g. http://www.thetimes.co.uk/tto/public/sitesearch.do?querystring=
- artifical+intelligence&sectionId=342&p=tto&pf=all
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
-
- mainmysoup = self.soup.find('div', {'class': "content-box-margin"})
- mysoups = mainmysoup.findAll('div',{'class': 'search-result'})
- for mysoup in mysoups:
- item = mysoup.find('a',href=True)
- url = item['href']
- title = item.getText()
- info = mysoup.find('div', {'class':'search-result-info'})
- s = ""
- s = ' '.join([li.getText() for li in info.findAll('li')])
- m = re.search(dateformat_regexps['DD Month YYYY'][0], s)
- if m != None:
- date_str = m.group(0)
- d = self.parse_date(date_str,'DD Month YYYY')
- if d > self.today or d < self.begindate: continue
- else:
- d = self.today
-
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- pass
- """
- # Disabled because the articles require registration and payment.
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- """
-class ScientificAmericanParser(AINewsParser):
- """
- Parser for Scientific American (topic page).
- e.g. http://www.scientificamerican.com/topic.cfm?id=artificial-intelligence
- Date: Dec.21st, 2010
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
-
- mainmysoup = self.soup.find('div', {'id': "mainCol"})
- mysoups = mainmysoup.findAll('li',{'class': 'hasThumb message_box'})
- for mysoup in mysoups:
- titlesoup = mysoup.find('h3')
- item = titlesoup.find('a',href=True)
- url = item['href']
- title = self.extract_genenraltext(item)
- info = mysoup.find('span', {'class':'datestamp'})
- s = self.extract_genenraltext(info)
- m = re.search(dateformat_regexps['Mon DD, YYYY'][0], s)
- if m != None:
- date_str = m.group(0)
- d = self.parse_date(date_str,'Mon DD, YYYY')
- if d > self.today or d < self.begindate: continue
- else:
- d = self.today
-
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None \
- or self.db.isindexed(self.url):
- continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- mainsoup = self.soup.find("div", {"id" : "articleContent"})
- if mainsoup == None: continue
- text = ""
- imgs = mainsoup.findAll("p",{"class":"in-article-image"})
- [img.extract() for img in imgs]
- spans = mainsoup.findAll("span")
- [span.extract() for span in spans]
- text = self.extract_genenraltext(mainsoup)
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
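Note that this second ScientificAmericanParser (for the topic page) redefines the class of the same name declared earlier for the search page; Python binds a class name to its last definition, so only this version would actually take effect. The whitespace-and-entity cleanup it ends with is repeated inline in nearly every parse_storypage() in this file; a small shared helper could capture it, for example:

    import re

    def normalize_text(text):
        # The same two substitutions the parsers apply inline: drop anything that
        # looks like an HTML entity, then collapse runs of whitespace.
        text = re.sub(r'&.*?;', ' ', text)
        return re.sub(r'\s+', ' ', text).strip()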
-class NPRParser(AINewsParser):
- """
- Parser for NPR.
- e.g. http://www.npr.org/templates/search/index.php?searchinput=artificial+intelligence&tabId=all&sort=date
- Date: Dec.22nd, 2010
- """
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
-
- mainmysoup = self.soup.find('div', {'id': "searchresults"})
- mainmysoup = mainmysoup.find('ol',{'class':'result'})
- mysoups = mainmysoup.findAll('li',{'class': 'buildOut'})
- for mysoup in mysoups:
- titlesoup = mysoup.find('h3')
- item = titlesoup.find('a',href=True)
- if item == None: continue
- url = item['href']
- if url.find("movie")!=-1: continue
- title = self.extract_genenraltext(item)
- info = mysoup.find('span', {'class':'date'})
- s = self.extract_genenraltext(info)
- m = re.search(dateformat_regexps['Month DD, YYYY'][0], s)
- if m != None:
- date_str = m.group(0)
- d = self.parse_date(date_str,'Month DD, YYYY')
- if d > self.today or d < self.begindate: continue
- else:
- d = self.today
-
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None or self.db.isindexed(self.url): continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- titlesoup = self.soup.find('title')
- if titlesoup!=None:
- title = self.extract_genenraltext(titlesoup)
- self.candidates[i][1] = title
- mainsoup = self.soup.find("div", {"id" : "storytext"})
- if mainsoup == None: continue
-
- comments = mainsoup.findAll(text=lambda text:isinstance(text, Comment))
- [comment.extract() for comment in comments]
- #wraps = mainsoup.findAll("div",{"class":"captionwrap"})
- #[wrap.extract() for wrap in wraps]
- self.remove_tag(mainsoup, "div","class","captionwrap")
- self.remove_tag(mainsoup, "div","class","dateblock")
- self.remove_tag(mainsoup, "div","class","bucket")
- self.remove_tag(mainsoup, "div","id","res132205459")
- self.remove_tag(mainsoup, "div","class","container con1col btmbar")
- self.remove_tag(mainsoup, "div","class","captionwrap enlarge")
-
- text = self.extract_genenraltext(mainsoup)
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
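NPRParser's parse_storypage() strips HTML comments and a list of layout blocks (caption wrappers, date blocks, promo buckets) before extracting text, using the remove_tag() helper the parsers inherit from AINewsParser. A minimal standalone version of that cleanup, with an inline stand-in for remove_tag() since its definition is not part of this hunk:

    from BeautifulSoup import BeautifulSoup, Comment

    def remove_tag(soup, name, attr=None, value=None):
        # Stand-in for AINewsParser.remove_tag: delete every matching tag from the tree.
        attrs = {attr: value} if attr is not None else {}
        for node in soup.findAll(name, attrs):
            node.extract()

    def clean_story(html):
        soup = BeautifulSoup(html)
        # Drop HTML comments first, then layout blocks that would pollute the text.
        for comment in soup.findAll(text=lambda t: isinstance(t, Comment)):
            comment.extract()
        remove_tag(soup, 'div', 'class', 'captionwrap')
        remove_tag(soup, 'div', 'class', 'dateblock')
        return ' '.join(p.getText() for p in soup.findAll('p'))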
-class IndependentParser(AINewsParser):
- '''
- Parser for Independent UK
- e.g. http://search.independent.co.uk/topic/artificial-intelligence
- Date: Dec.23rd, 2010
- '''
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- mainmysoup = self.soup.find('ul', {'class': "ukn-results ukn-col-first"})
- mysoups = mainmysoup.findAll('li')
- for mysoup in mysoups:
- titlesoup = mysoup.find('h2')
- item = titlesoup.find('a',href=True)
- if item == None: continue
- url = item['href']
- title = item['title']
- info = mysoup.find('span', {'class':'ukn-result-meta-date'})
- s = self.extract_genenraltext(info)
- m = re.search(dateformat_regexps['DD Month YYYY'][0], s)
- if m != None:
- date_str = m.group(0)
- d = self.parse_date(date_str,'DD Month YYYY')
- if d > self.today or d < self.begindate: continue
- else:
- d = self.today
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None or self.db.isindexed(self.url): continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- mainsoup = self.soup.find("div", {"id" : "article"})
- if mainsoup == None: continue
- comments = mainsoup.findAll(text=lambda text:isinstance(text, Comment))
- [comment.extract() for comment in comments]
- mainsoup.prettify()
- self.remove_tag(mainsoup, 'script')
- self.remove_tag(mainsoup, 'p','class','title')
- self.remove_tag(mainsoup, 'p','class','author')
- self.remove_tag(mainsoup, 'p','class','info')
-
- mysoups = mainsoup.findAll('p')
- text = ""
- for mysoup in mysoups:
- text += self.extract_genenraltext(mysoup) + ' '
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
-class MSNBCParser(AINewsParser):
- '''
- Parser for MSNBC
- e.g. http://www.msnbc.msn.com/id/33732970/
- Date: Dec.23rd, 2010
- '''
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- mainmysoup = self.soup.find('div', {'id': "cover"})
- mysoups = mainmysoup.findAll('div',{'class':'text'})
- for mysoup in mysoups:
- item = mysoup.find('a',href=True)
- if item == None: continue
- url = item['href']
- if url[:10]=='javascript': continue
- title = self.extract_genenraltext(item)
-
- d = self.today
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None or self.db.isindexed(self.url): continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- datesoup = self.soup.find("div", {"class" : "txt timestamp"})
- if datesoup!=None:
- date_str = datesoup['content'][:10]
- d = self.parse_date(date_str,'YYYY-MM-DD')
- self.candidates[i][2] = d
- if d > self.today or d < self.begindate: continue
-
- mainsoup = self.soup.find("div", {"class" : "page i1 txt"})
- if mainsoup == None: continue
-
- self.remove_tag(mainsoup, 'span','class','copyright')
- self.remove_tag(mainsoup, 'ul','class','extshare hlist')
-
- mysoups = mainsoup.findAll('p')
- text = ""
- for mysoup in mysoups:
- text += self.extract_genenraltext(mysoup) + ' '
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
-class NatureParser(AINewsParser):
- '''
- Parser for Nature News
- e.g. http://www.nature.com/search/executeSearch?exclude-collections=
- journals_palgrave%2Clab_animal&sp-q-1=&include-collections=journals_nature%2Ccrawled_content&
- sp-a=sp1001702d&sp-x-1=ujournal&sp-sfvl-field=subject|ujournal&sp-q=robot&sp-p=all&
- sp-p-1=phrase&sp-s=date_descending&sp-c=5
- '''
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- mainmysoup = self.soup.find('ol', {'class': "results-list"})
- mysoups = mainmysoup.findAll('li')
- for mysoup in mysoups:
- titlesoup = mysoup.find('h2')
- item = titlesoup.find('a',href=True)
- # an "access" span element indicates "free"
- access = titlesoup.find('span', {'class': 'access'})
- if item == None or access == None: continue
- url = item['href']
- title = self.extract_genenraltext(item).strip()
- if title == 'News in brief': continue
- date_str = mysoup.find('span', {'class': 'date'}).getText()
- d = self.extract_date(date_str)
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None or self.db.isindexed(self.url): continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
- mainsoup = self.soup.find("div", {"class" : "content"})
- if mainsoup == None: continue
- self.remove_tag(mainsoup, 'div', 'class', 'article-tools')
- mysoups = mainsoup.findAll('p')
- text = ""
- for mysoup in mysoups:
- text += self.extract_genenraltext(mysoup) + ' '
-
- text = re.sub(r'\s+', ' ', text)
- text = re.sub(r'&.*?;', ' ', text)
- self.candidates[i].append(text)
-
-
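Several of these parsers call extract_genenraltext() (spelling as in the codebase) rather than getText(), presumably to control how nested markup is flattened; it is inherited from AINewsParser and its definition is not shown in this hunk. A plausible stand-in that walks the parse tree and joins the text nodes would look like:

    from BeautifulSoup import NavigableString

    def extract_text(node):
        # Plausible stand-in for AINewsParser.extract_genenraltext: concatenate the
        # text beneath `node`, separating pieces with spaces.
        if isinstance(node, NavigableString):
            return unicode(node)
        return ' '.join(extract_text(child) for child in node.contents)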
-class TimesParser(AINewsParser):
- '''
- Parser for TIME (time.com).
- e.g. http://search.time.com/results.html?cmd=tags&D=robot&
- sid=12D1588BC3C6&Ntt=robot&internalid=endeca_dimension
- &Ns=p_date_range|1&p=0&N=34&Nty=1&srchCat=Full+Archive
- Date: Dec.23rd, 2010
- '''
- def parse_sourcepage(self, url):
- self.parse_url(url)
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- if self.debug: print >> sys.stderr, "SOUP ERROR: %s" % error
- return False
- mainmysoup = self.soup.find('div', {'class': "resultsCol"})
- if mainmysoup == None: return
- mysoups = mainmysoup.findAll('div', {'class':'tout'})
- for mysoup in mysoups:
- titlesoup = mysoup.find('h3')
- item = titlesoup.find('a',href=True)
- if item == None: continue
- url = item['href']
- title = self.extract_genenraltext(titlesoup)
- info = mysoup.find('span', {'class':'date'})
- s = self.extract_genenraltext(info)
- m = re.search(dateformat_regexps['Mon DD, YYYY'][0], s)
- if m != None:
- date_str = m.group(0)
- d = self.parse_date(date_str,'Mon DD, YYYY')
- if d > self.today or d < self.begindate: continue
- else:
- d = self.today
- self.candidates.append([url, title, d])
-
- def parse_storypage(self):
- for i, candidate in enumerate(self.candidates):
- res = self.parse_url(candidate[0])
- if not res or self.url == None or self.db.isindexed(self.url): continue
- try:
- self.soup = BeautifulSoup(self.html)
- except Exception, error:
- print >> sys.stderr, "SOUP ERROR: %s" % error
- continue
-
- if candidate[0].find('newsfeed')!=-1:
- # is newsfeed
- mainsoup = self.soup.find("div", {"id" : "content"})
- if mainsoup == None: continue
- self.remove_tag(mainsoup, 'p','id','description')
- self.remove_tag(mainsoup, 'p','id','caption')
- else:
- # not newsfeed
- mainsoup = self.soup.find("div", {"class" : "artTxt"})
- if mainsoup == None: continue
- comments = mainsoup.findAll(text=lambda text:isinstance(text, Comment))
- [comment.extract() for comment in comments]
- mysoups = mainsoup.findAll('p')
- text = ""
- for mysoup in mysoups:
- text += self.extract_genenraltext(mysoup) + ' '
- text = re.sub(r'\s+', '