OTS summarizer, parser experimenter, better SVM experimenter, feature selection in SVM training, and faster corpus (re)loading.

1 parent 245ee65 · commit aae315e17d37c64d1ba972be869a204b925e9841 · joshuaeckroth committed Oct 6, 2011
Showing with 369 additions and 51 deletions.
  1. +30 −13 AINewsCorpus.py
  2. +4 −1 AINewsDupExperiment.py
  3. +63 −0 AINewsParserExperiment.py
  4. +58 −0 AINewsSVMAnalyzer.py
  5. +157 −32 AINewsSVMClassifier.py
  6. +49 −5 AINewsSummarizer.py
  7. +8 −0 config/paths.ini.sample
AINewsCorpus.py
@@ -15,7 +15,7 @@
from AINewsConfig import config, paths
from AINewsDB import AINewsDB
from AINewsTextProcessor import AINewsTextProcessor
-from AINewsTools import loadpickle
+from AINewsTools import loadpickle, trunc
class AINewsCorpus:
"""
@@ -63,6 +63,8 @@ def __init__(self):
rows = self.db.selectall("select parser, relevance from sources")
for row in rows:
self.sources[row[0].split('::')[0]] = int(row[1])
+
+ self.retained_db_docs = None
self.restore_corpus()
@@ -103,7 +105,7 @@ def get_tfidf(self, urlid, wordfreq):
distsq = 0.0
for wordid in wordid_freq_pairs:
tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
- (math.log(self.corpus_count, 2) - \
+ (math.log(self.corpus_count + 1, 2) - \
math.log(wordid_freq_pairs[wordid][1] + 1, 2))
data[wordid] = tfidf
distsq += tfidf * tfidf
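
One likely reason for the + 1 added to the IDF term above: a word's document frequency can equal corpus_count, in which case the old log2(corpus_count) - log2(df + 1) went negative, while the smoothed form stays nonnegative. A minimal sketch of the updated weight (hypothetical helper, not part of this commit):

    import math

    def tfidf_weight(freq, corpus_count, doc_freq):
        # mirrors get_tfidf above: tf = log2(freq + 1),
        # idf = log2(corpus_count + 1) - log2(doc_freq + 1)
        return math.log(freq + 1, 2) * \
            (math.log(corpus_count + 1, 2) - math.log(doc_freq + 1, 2))

    # a word that occurs in every one of 100 documents:
    print tfidf_weight(5, 100, 100)   # 0.0 with the smoothing; negative without it
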
@@ -133,31 +135,42 @@ def cos_sim(self, tfidf1, tfidf2):
return sim
def get_article(self, urlid, corpus = False):
+ row = None
if corpus:
table = 'cat_corpus'
cat_table = 'cat_corpus_cats'
+ row = self.db.selectone("""select u.url, u.title, u.content
+ from %s as u where u.urlid = %s""" % (table, urlid))
+
else:
table = 'urllist'
cat_table = 'categories'
-
- row = self.db.selectone("""select u.url, u.title, u.content, u.pubdate,
- u.crawldate, u.processed, u.published, u.publisher from %s as u where u.urlid = %s""" % \
- (table, urlid))
+ row = self.db.selectone("""select u.url, u.title, u.content, u.pubdate,
+ u.crawldate, u.processed, u.published, u.publisher from %s as u where u.urlid = %s""" % \
+ (table, urlid))
if row != None:
wordfreq = self.txtpro.simpletextprocess(urlid, row[2])
processed = False
- if row[5] == 1: processed = True
+ if not corpus and row[5] == 1: processed = True
published = False
- if row[6] == 1: published = True
+ if not corpus and row[6] == 1: published = True
+ pubdate = ""
+ if not corpus: pubdate = row[3]
+ crawldate = ""
+ if not corpus: crawldate = row[4]
+ publisher = ""
+ if not corpus: publisher = row[7]
categories = []
cat_rows = self.db.selectall("""select category from %s
where urlid = %s""" % (cat_table, urlid))
for cat_row in cat_rows:
categories.append(cat_row[0])
return {'urlid': urlid, 'url': row[0], 'title': row[1],
- 'content': row[2], 'pubdate': row[3], 'crawldate': row[4],
+ 'content': trunc(row[2], max_pos=3000),
+ 'content_all': row[2],
+ 'pubdate': pubdate, 'crawldate': crawldate,
'processed': processed, 'published': published,
- 'publisher': row[7],
+ 'publisher': publisher,
'categories': categories, 'duplicates': [],
'wordfreq': wordfreq, 'tfidf': self.get_tfidf(urlid, wordfreq)}
else:
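
The reworked get_article above issues a narrower query for cat_corpus rows, which apparently lack the pubdate/crawldate/publisher/processed/published columns of urllist, and it now returns content truncated to 3000 characters alongside the full text as content_all. A rough usage sketch (the urlid is a placeholder):

    from AINewsCorpus import AINewsCorpus

    corpus = AINewsCorpus()
    article = corpus.get_article(1234, corpus = True)   # 1234 is a hypothetical urlid
    if article is not None:
        print article['title']
        print len(article['content'])       # at most 3000 characters
        print len(article['content_all'])   # full stored text
        print article['categories']         # from cat_corpus_cats
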
@@ -222,15 +235,15 @@ def commit_freq_index(self, table):
self.dftext[word] = self.wordcounts[word]
self.wordcounts = {}
- def load_corpus(self, ident, pct, debug = False):
+ def load_corpus(self, ident, pct, debug = False, retain = False):
if debug:
print "Loading corpus..."
source = ident.split(':')[0]
name = ident.split(':')[1:]
if source == "file":
docs = self.load_file_corpus(name, debug)
elif source == "db":
- docs = self.load_db_corpus(name, debug)
+ docs = self.load_db_corpus(name, debug, retain)
print
random.shuffle(docs)
@@ -294,13 +307,15 @@ def load_file_corpus(self, name, debug = False):
sys.stdout.flush()
return docs
- def load_db_corpus(self, name, debug = False):
+ def load_db_corpus(self, name, debug = False, retain = False):
rows = self.db.selectall("""select c.urlid, c.content,
group_concat(cc.category separator ' ')
from %s as c, %s as cc
where c.urlid = cc.urlid
group by c.urlid order by c.urlid desc""" % (name[0], name[1]))
print "Processing %d articles..." % len(rows)
+ if retain and self.retained_db_docs != None:
+ return self.retained_db_docs
docs = []
for row in rows:
wordfreq = self.txtpro.simpletextprocess(row[0], row[1])
@@ -309,5 +324,7 @@ def load_db_corpus(self, name, debug = False):
if debug:
sys.stdout.write('.')
sys.stdout.flush()
+ if retain:
+ self.retained_db_docs = docs
return docs
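
The new retain flag caches the processed document list on the corpus object, so repeated experiment runs can skip re-running simpletextprocess over every row; note that in load_db_corpus the cache check sits after the selectall, so the SQL query still executes and only the per-article text processing is saved. A sketch of the intended call pattern (the ident string and pct value are assumptions):

    from AINewsCorpus import AINewsCorpus

    corpus = AINewsCorpus()
    ident = "db:cat_corpus:cat_corpus_cats"   # assumed table pair, matching get_article above
    corpus.load_corpus(ident, 0.9, debug = True, retain = True)   # processes and caches the docs
    corpus.load_corpus(ident, 0.9, retain = True)                 # reuses corpus.retained_db_docs
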
AINewsDupExperiment.py
@@ -126,6 +126,9 @@
for j in range(i+1, n):
checklist.add(tuple([int(sortedlist[i]),int(sortedlist[j])]))
+print len(checklist)
+print len(idset)
+
def recallprecision(articles):
"""
recall and precision for similarity method, make all pairs from urllist
@@ -179,7 +182,7 @@ def recallprecision(articles):
# print "false pos:", key, cutoff
falsepos[x] += 1
cutoff += 0.01
-
+ print "pos:",pos
best_cutoff = 0
best_f1 = 0
best_p = 0
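
For reference, the best_f1 tracked here is presumably the usual F1 score evaluated at each 0.01 similarity cutoff; a minimal sketch of that combination:

    def f1(precision, recall):
        # harmonic mean of precision and recall; 0.0 when both are zero
        if precision + recall == 0:
            return 0.0
        return 2 * precision * recall / (precision + recall)

    print f1(0.8, 0.5)   # about 0.615
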
AINewsParserExperiment.py
@@ -0,0 +1,63 @@
+
+from AINewsCorpus import AINewsCorpus
+from AINewsConfig import paths
+from AINewsTools import trunc
+import sys
+sys.path.append(paths['libraries.tools'])
+import justext
+import os
+import glob
+import re
+import ents
+from subprocess import *
+
+
+### modified from: http://www.korokithakis.net/posts/finding-the-levenshtein-distance-in-python/
+def levenshtein_distance(first, second):
+ """Find the Levenshtein distance between two arrays of strings."""
+ if len(first) > len(second):
+ first, second = second, first
+ if len(second) == 0:
+ return len(first)
+ first_length = len(first) + 1
+ second_length = len(second) + 1
+ distance_matrix = [[0] * second_length for x in range(first_length)]
+ for i in range(first_length):
+ distance_matrix[i][0] = i
+ for j in range(second_length):
+ distance_matrix[0][j]=j
+ for i in xrange(1, first_length):
+ for j in range(1, second_length):
+ deletion = distance_matrix[i-1][j] + 1
+ insertion = distance_matrix[i][j-1] + 1
+ substitution = distance_matrix[i-1][j-1]
+ if first[i-1] != second[j-1]:
+ substitution += 1
+ distance_matrix[i][j] = min(insertion, deletion, substitution)
+ return distance_matrix[first_length-1][second_length-1]
+
+
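
This variant of levenshtein_distance works on lists of words rather than characters, so the result counts word insertions, deletions, and substitutions; a quick check on two short token lists:

    # "quick" must be deleted and "jumps" inserted, so the distance is 2
    print levenshtein_distance(["the", "quick", "brown", "fox"],
                               ["the", "brown", "fox", "jumps"])   # 2
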
+def evaluate():
+ corpus = AINewsCorpus()
+ print "urlid,length truewords,length justext,length goose,ld justtext,ld goose"
+ for filename in sorted(glob.glob("../../experiments/justext/*.true")):
+ truetext = ents.convert(file(filename).read())
+ truetext = re.sub(r'[^\w\s]', ' ', trunc(truetext, max_pos=3000, ellipsis=False))
+ truewords = re.split(r'\s+', truetext)
+ urlid = filename[26:30]
+ article = corpus.get_article(urlid)
+ if article == None: continue
+ articletext = re.sub(r'[^\w\s]', ' ', trunc((article['content_all']).encode('ascii'), max_pos=3000, ellipsis=False))
+ articlewords = re.split(r'\s+', articletext)
+ goosecmd = "cd /home/josh/aitopics/AINews/tools/goose; /opt/maven/bin/mvn exec:java -Dexec.mainClass=com.jimplush.goose.TalkToMeGoose -Dexec.args='%s' -q 2>>/home/josh/log.txt" % article['url']
+ (stdout, _) = Popen(goosecmd, shell = True, stdout = PIPE).communicate()
+ goosetext = ents.convert(stdout.encode('ascii'))
+ goosetext = re.sub(r'[^\w\s]', ' ', trunc(goosetext, max_pos=3000, ellipsis=False))
+ goosewords = re.split(r'\s+', goosetext)
+ ld_1 = (levenshtein_distance(truewords, articlewords))/float(len(truewords))
+ ld_2 = (levenshtein_distance(truewords, goosewords))/float(len(truewords))
+ print "%s,%d,%d,%d,%.4f,%.4f" % \
+ (urlid, len(truewords), len(articlewords), len(goosewords), ld_1, ld_2)
+
+
+evaluate()
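
The script prints one CSV row per article, with the two distances normalized by the length of the hand-corrected ("true") text, so lower values mean an extractor's output is closer to the reference. A hypothetical post-processing step (the output filename is assumed, e.g. the script's stdout redirected to a file) that averages each column:

    import csv

    total_justext = total_goose = n = 0.0
    with open("parser_results.csv") as f:   # assumed: saved output of AINewsParserExperiment.py
        for row in csv.DictReader(f):
            total_justext += float(row["ld justtext"])   # header spelling as printed above
            total_goose += float(row["ld goose"])
            n += 1
    if n:
        print "mean ld justext: %.4f, mean ld goose: %.4f" % \
            (total_justext / n, total_goose / n)
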
AINewsSVMAnalyzer.py
@@ -0,0 +1,58 @@
+# This file is part of NewsFinder.
+# https://github.com/joshuaeckroth/AINews
+#
+# Copyright (c) 2011 by the Association for the Advancement of
+# Artificial Intelligence. This program and parts of it may be used and
+# distributed without charge for non-commercial purposes as long as this
+# notice is included.
+
+import sys
+import re
+import operator
+from datetime import datetime
+from subprocess import *
+from AINewsConfig import paths
+from AINewsCorpus import AINewsCorpus
+
+class AINewsSVMAnalyzer:
+ def __init__(self):
+ self.corpus = AINewsCorpus()
+ self.categories = self.corpus.categories
+
+ def model_word_weights(self, category):
+ f = open(paths['svm.svm_data']+category+'.model', 'r')
+ lines = f.readlines()
+ f.close()
+ labels = re.match('label (-?1) (-?1)', lines[5]).group(1,2)
+ if labels[0] == '1': pos_label = 0
+ else: pos_label = 1
+
+ cmd = './svm-weight -f %d %s%s.model' % \
+ (len(self.corpus.wordids), paths['svm.svm_data'], category)
+ (stdout, _) = Popen(cmd, shell = True, stdout = PIPE).communicate()
+ weights = {}
+ for (wordid,weight) in re.findall('(\d+):(\S+)', stdout):
+ weight = float(weight)
+ if pos_label == 1: weight = -weight
+ weights[self.corpus.wordids[int(wordid)]] = weight
+ return weights
+
+ def analyze_all(self):
+ for cat in self.categories:
+            weights = self.model_word_weights(cat)
+ weights_sorted = sorted(weights.items(), key=operator.itemgetter(1))
+ print "**%s**" % cat
+ print "--Least significant:"
+ for (word, weight) in weights_sorted[0:10]:
+ print ("%s: %.3f, " % (word, weight)),
+ print
+ print "--Most significant:"
+ for (word, weight) in weights_sorted[-10:]:
+ print ("%s: %.3f, " % (word, weight)),
+ print
+ print
+
+if __name__ == "__main__":
+ analyzer = AINewsSVMAnalyzer()
+ analyzer.analyze_all()
+
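
model_word_weights recovers per-word weights from a trained linear SVM model via the svm-weight tool, and it appears to flip the sign when the model's first label is -1 so that positive weights always argue for membership in the category. A hedged single-category usage sketch (the category name is an assumption, and the corresponding .model file must exist under paths['svm.svm_data']):

    import operator
    from AINewsSVMAnalyzer import AINewsSVMAnalyzer

    analyzer = AINewsSVMAnalyzer()
    weights = analyzer.model_word_weights("AIOverview")   # hypothetical category name
    top = sorted(weights.items(), key = operator.itemgetter(1), reverse = True)[:10]
    for (word, weight) in top:
        print "%s: %.3f" % (word, weight)
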