Commit 273056e
Some code documentation and reorganization for clarity.
joshuaeckroth committed Sep 9, 2011
1 parent 2af6122 commit 273056e
Showing 2 changed files with 94 additions and 63 deletions.
AINewsCorpus.py (62 additions, 33 deletions)
@@ -18,20 +18,46 @@
 from AINewsTools import loadpickle
 
 
 class AINewsCorpus:
+    """
+    A corpus is a set of news articles (each with a title, content,
+    and categories) that are used for training and comparison
+    purposes. For training, the corpus provides the training
+    examples. For comparison, the corpus provides the data for various
+    measures like word frequency. This is important in the prediction
+    process: we only want to predict a new article's categories based
+    on word frequencies, and other measures, from the corpus; we don't
+    want articles that have not been "vetted" (articles not part of
+    the corpus) to contribute to these measures.
+
+    A corpus can be "loaded" via C{load_corpus()} or "restored" via
+    C{restore_corpus()}. The difference is the following: when loading a
+    corpus, word frequencies are measured and stored in the database
+    table C{wordlist_eval}; when restoring a corpus, word frequencies
+    are simply retrieved from the database table C{wordlist}. In other
+    words, we load a corpus when we are training or evaluating our
+    training procedures, and we restore a corpus when we are
+    predicting.
+    """
     def __init__(self):
         self.txtpro = AINewsTextProcessor()
         self.cache_urls = {}
 
-        self.wordlist = {}
+        #: A dictionary of word=>word freq in corpus
+        self.dftext = {}
+
+        #: A dictionary of word=>wordid
+        self.idwords = {}
+
+        #: A dictionary of wordid=>word
         self.wordids = {}
 
         self.db = AINewsDB()
 
-        self.categories =["AIOverview","Agents", "Applications", \
-            "CognitiveScience","Education","Ethics", "Games", "History",\
-            "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
-            "Reasoning","Representation", "Robots","ScienceFiction",\
-            "Speech", "Systems","Vision"]
+        self.categories = ["AIOverview","Agents", "Applications", \
+            "CognitiveScience", "Education", "Ethics", "Games", "History", \
+            "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
+            "Reasoning", "Representation", "Robots", "ScienceFiction", \
+            "Speech", "Systems", "Vision"]
 
         self.sources = {}
         rows = self.db.selectall("select parser, relevance from sources")
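
The docstring added above draws the load/restore distinction; as a rough usage sketch (the ident and pct arguments here are hypothetical placeholders, and an AINews database must already be populated):

    from AINewsCorpus import AINewsCorpus

    corpus = AINewsCorpus()

    # training/evaluation: measure word frequencies over the labeled
    # corpus and store them in the wordlist_eval table
    corpus.load_corpus("eval_ident", 0.9, debug=True)  # hypothetical arguments

    # prediction: reuse the frequencies already stored in wordlist
    corpus.restore_corpus()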
@@ -68,17 +94,17 @@ def get_tfidf(self, urlid, wordfreq):
         """
         if urlid in self.cache_urls:
             return self.cache_urls[urlid]
-        wordids = {}
+        wordid_freq_pairs = {}
         for word in wordfreq:
             if word in self.dftext:
-                wordids[self.dftext[word][0]] = (wordfreq[word], self.dftext[word][1])
+                wordid_freq_pairs[self.idwords[word]] = (wordfreq[word], self.dftext[word])
 
         data = {}
         distsq = 0.0
-        for wordid in wordids:
-            tfidf = math.log(wordids[wordid][0] + 1, 2) * \
+        for wordid in wordid_freq_pairs:
+            tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
                 (math.log(self.corpus_count, 2) - \
-                math.log(wordids[wordid][1] + 1, 2))
+                math.log(wordid_freq_pairs[wordid][1] + 1, 2))
             data[wordid] = tfidf
             distsq += tfidf * tfidf
         dist = math.sqrt(distsq)
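
The loop above weights each word by log-scaled term frequency times inverse document frequency, both base 2. The same arithmetic stands alone as follows (all counts hypothetical):

    import math

    corpus_count = 5000  # rows in cat_corpus (hypothetical)
    tf = 3               # occurrences of the word in one article (hypothetical)
    df = 120             # dftext: articles containing the word (hypothetical)

    # as in get_tfidf(): log2(tf + 1) * (log2(N) - log2(df + 1))
    tfidf = math.log(tf + 1, 2) * \
        (math.log(corpus_count, 2) - math.log(df + 1, 2))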
@@ -89,14 +115,14 @@ def get_tfidf(self, urlid, wordfreq):
         return data
 
     def cos_sim(self, tfidf1, tfidf2):
-        '''
+        """
         A helper function to compute the cos simliarity between
         news story and centroid.
-        @param data: target news story tfidf vector.
-        @type data: C{dict}
-        @param centroid: centroid tfidf vector.
-        @type centroid: C{dict}
-        '''
+        @param tfidf1: target news story tfidf vector.
+        @type tfidf1: C{dict}
+        @param tfidf2: centroid tfidf vector.
+        @type tfidf2: C{dict}
+        """
         sim = 0.0
         for key in tfidf1:
             if key in tfidf2:
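
cos_sim() accumulates a dot product over the keys the two sparse vectors share (the a and b assignments sit in the elided lines); since get_tfidf() computes dist = math.sqrt(distsq), the cached vectors appear to be length-normalized, in which case the dot product is the cosine similarity. A minimal sketch with hypothetical wordid=>weight dicts:

    tfidf1 = {101: 0.62, 205: 0.35, 310: 0.70}  # hypothetical vectors,
    tfidf2 = {205: 0.48, 310: 0.81, 999: 0.12}  # assumed unit length

    # dot product over shared word ids, as cos_sim() computes
    sim = sum(tfidf1[k] * tfidf2[k] for k in tfidf1 if k in tfidf2)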
@@ -106,21 +132,6 @@ def cos_sim(self, tfidf1, tfidf2):
                 sim += a*b
         return sim
 
-    def add_freq_index(self, urlid, wordfreq, categories = []):
-        for word in wordfreq:
-            self.wordlist.setdefault(word, 0)
-            self.wordlist[word] += 1
-
-    def commit_freq_index(self, table):
-        self.dftext = {}
-        self.wordids = {}
-        for word in self.wordlist:
-            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
-                "values(%s, %s)", (word, self.wordlist[word]))
-            self.wordids[rowid] = word
-            self.dftext[word] = (rowid, self.wordlist[word])
-        self.wordlist = {}
-
     def get_article(self, urlid, corpus = False):
         if corpus:
             table = 'cat_corpus'
@@ -191,9 +202,26 @@ def restore_corpus(self):
         rows = self.db.selectall("select rowid, word, dftext from wordlist")
         for row in rows:
             self.wordids[row[0]] = row[1]
-            self.dftext[row[1]] = (row[0], row[2])
+            self.idwords[row[1]] = row[0]
+            self.dftext[row[1]] = row[2]
         self.corpus_count = self.db.selectone("select count(*) from cat_corpus")[0]
 
+    def add_freq_index(self, urlid, wordfreq, categories = []):
+        for word in wordfreq:
+            self.wordcounts.setdefault(word, 0)
+            self.wordcounts[word] += 1
+
+    def commit_freq_index(self, table):
+        self.dftext = {}
+        self.wordids = {}
+        for word in self.wordcounts:
+            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
+                "values(%s, %s)", (word, self.wordcounts[word]))
+            self.wordids[rowid] = word
+            self.idwords[word] = rowid
+            self.dftext[word] = self.wordcounts[word]
+        self.wordcounts = {}
+
     def load_corpus(self, ident, pct, debug = False):
         if debug:
             print "Loading corpus..."
@@ -222,6 +250,7 @@ def load_corpus(self, ident, pct, debug = False):
         self.db.execute("delete from wordlist_eval")
         self.db.execute("alter table wordlist_eval auto_increment = 0")
         self.wordids = {}
+        self.wordcounts = {}
         self.cache_urls = {}
         for c in train_corpus:
             self.add_freq_index(c[0], c[1], c[2].split())
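
A sketch of the indexing flow that load_corpus() drives, with hypothetical (urlid, wordfreq, categories) arguments:

    corpus = AINewsCorpus()
    corpus.wordcounts = {}  # load_corpus() resets this before indexing

    # each call bumps a word's document-frequency count once per article
    corpus.add_freq_index(1, {'robot': 3, 'vision': 1}, ['Robots'])
    corpus.add_freq_index(2, {'robot': 1, 'speech': 2}, ['Speech'])

    # write the counts to wordlist_eval and rebuild the id mappings
    corpus.commit_freq_index('wordlist_eval')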
CorpusCategories.py (32 additions, 30 deletions)
@@ -9,37 +9,39 @@
 import sys
 from AINewsDB import AINewsDB
 
-categories =["AIOverview","Agents", "Applications", \
-    "CognitiveScience","Education","Ethics", "Games", "History",\
-    "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
-    "Reasoning","Representation", "Robots","ScienceFiction",\
-    "Speech", "Systems","Vision"]
+if __name__ == "__main__":
+    categories =["AIOverview","Agents", "Applications", \
+        "CognitiveScience","Education","Ethics", "Games", "History",\
+        "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
+        "Reasoning","Representation", "Robots","ScienceFiction",\
+        "Speech", "Systems","Vision"]
 
-db = AINewsDB()
+    db = AINewsDB()
 
-url_counts = {}
+    url_counts = {}
 
-cat_counts = {}
-for cat in categories:
-    cat_counts[cat] = 0
+    cat_counts = {}
+    for cat in categories:
+        cat_counts[cat] = 0
 
-rows = db.selectall( \
-    "select c.urlid, c.content, group_concat(cc.category separator ' ') " +
-    "from cat_corpus as c, cat_corpus_cats as cc where c.urlid = cc.urlid " +
-    "group by c.urlid")
-for row in rows:
-    url_counts[row[0]] = len(row[2].split(' '))
-    for cat in row[2].split(' '):
-        cat_counts[cat] += 1
+    rows = db.selectall( \
+        "select c.urlid, c.content, group_concat(cc.category separator ' ') " +
+        "from cat_corpus as c, cat_corpus_cats as cc where c.urlid = cc.urlid " +
+        "group by c.urlid")
+    for row in rows:
+        url_counts[row[0]] = len(row[2].split(' '))
+        for cat in row[2].split(' '):
+            cat_counts[cat] += 1
 
-if sys.argv[1] == "bar":
-    print "Category,Count"
-    for cat in sorted(cat_counts.keys(),reverse=True):
-        print "%s,%d" % (cat, cat_counts[cat])
-elif sys.argv[1] == "hist":
-    print "URL,Count"
-    for urlid in url_counts:
-        print "%d,%d" % (urlid, url_counts[urlid])
+    if sys.argv[1] == "bar":
+        print "Category,Count"
+        for cat in sorted(cat_counts.keys(),reverse=True):
+            print "%s,%d" % (cat, cat_counts[cat])
+    elif sys.argv[1] == "hist":
+        print "URL,Count"
+        for urlid in url_counts:
+            print "%d,%d" % (urlid, url_counts[urlid])
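
Wrapping the script body in a __main__ guard leaves its command-line behavior unchanged while making the module importable without side effects. Expected usage, with hypothetical output rows:

    $ python CorpusCategories.py bar
    Category,Count
    Vision,41
    ...

    $ python CorpusCategories.py hist
    URL,Count
    12345,2
    ...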
