diff --git a/AINewsCorpus.py b/AINewsCorpus.py
index bb2a009..a7fa370 100644
--- a/AINewsCorpus.py
+++ b/AINewsCorpus.py
@@ -18,20 +18,46 @@ from AINewsTools import loadpickle
 
 class AINewsCorpus:
+    """
+    A corpus is a set of news articles (each with a title, content,
+    and categories) that are used for training and comparison
+    purposes. For training, the corpus provides the training
+    examples. For comparison, the corpus provides the data for
+    various measures like word frequency. This is important in the
+    prediction process: we only want to predict a new article's
+    categories based on word frequencies, and other measures, from
+    the corpus; we don't want articles that have not been "vetted"
+    (articles not part of the corpus) to contribute to these
+    measures.
+
+    A corpus can be "loaded" via C{load_corpus()} or "restored" via
+    C{restore_corpus()}. The difference is the following: when
+    loading a corpus, word frequencies are measured and stored in
+    the database table C{wordlist_eval}; when restoring a corpus,
+    word frequencies are simply retrieved from the database table
+    C{wordlist}. In other words, we load a corpus when we are
+    training or evaluating our training procedures, and we restore
+    a corpus when we are predicting.
+    """
     def __init__(self):
         self.txtpro = AINewsTextProcessor()
         self.cache_urls = {}
-        self.wordlist = {}
+        #: A dictionary of word=>document frequency in the corpus
+        self.dftext = {}
+
+        #: A dictionary of word=>wordid
+        self.idwords = {}
+
+        #: A dictionary of wordid=>word
         self.wordids = {}
         self.db = AINewsDB()
-        self.categories =["AIOverview","Agents", "Applications", \
-            "CognitiveScience","Education","Ethics", "Games", "History",\
-            "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
-            "Reasoning","Representation", "Robots","ScienceFiction",\
-            "Speech", "Systems","Vision"]
+        self.categories = ["AIOverview", "Agents", "Applications", \
+            "CognitiveScience", "Education", "Ethics", "Games", "History", \
+            "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
+            "Reasoning", "Representation", "Robots", "ScienceFiction", \
+            "Speech", "Systems", "Vision"]
 
         self.sources = {}
         rows = self.db.selectall("select parser, relevance from sources")
@@ -68,17 +94,17 @@ def get_tfidf(self, urlid, wordfreq):
         """
         if urlid in self.cache_urls:
             return self.cache_urls[urlid]
-        wordids = {}
+        wordid_freq_pairs = {}
         for word in wordfreq:
             if word in self.dftext:
-                wordids[self.dftext[word][0]] = (wordfreq[word], self.dftext[word][1])
+                wordid_freq_pairs[self.idwords[word]] = (wordfreq[word], self.dftext[word])
 
         data = {}
         distsq = 0.0
-        for wordid in wordids:
-            tfidf = math.log(wordids[wordid][0] + 1, 2) * \
+        for wordid in wordid_freq_pairs:
+            tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
                 (math.log(self.corpus_count, 2) - \
-                math.log(wordids[wordid][1] + 1, 2))
+                math.log(wordid_freq_pairs[wordid][1] + 1, 2))
             data[wordid] = tfidf
             distsq += tfidf * tfidf
         dist = math.sqrt(distsq)
@@ -89,14 +115,14 @@ def get_tfidf(self, urlid, wordfreq):
         return data
 
     def cos_sim(self, tfidf1, tfidf2):
-        '''
+        """
         A helper function to compute the cosine similarity between
         a news story and a centroid.
-        @param data: target news story tfidf vector.
-        @type data: C{dict}
-        @param centroid: centroid tfidf vector.
-        @type centroid: C{dict}
-        '''
+        @param tfidf1: target news story tfidf vector.
+        @type tfidf1: C{dict}
+        @param tfidf2: centroid tfidf vector.
+        @type tfidf2: C{dict}
+        """
         sim = 0.0
         for key in tfidf1:
             if key in tfidf2:
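Note on the two hunks above: get_tfidf() keeps the same scoring as before; the patch only renames the intermediate dictionary and reads the word id and document frequency from the new idwords/dftext split instead of the old packed tuple. As a standalone sketch of that scoring with the database, caching, and class stripped out (the function name tfidf_vector is illustrative, and the final unit-length normalization is an assumption, since the diff elides the lines that use dist):

    import math

    def tfidf_vector(wordfreq, idwords, dftext, corpus_count):
        # wordfreq: word => term frequency in one article
        # idwords:  word => wordid; dftext: word => document frequency
        data = {}
        distsq = 0.0
        for word in wordfreq:
            if word not in dftext:
                continue
            # log-scaled term frequency times log-scaled inverse
            # document frequency, as in get_tfidf() above
            tfidf = math.log(wordfreq[word] + 1, 2) * \
                (math.log(corpus_count, 2) - math.log(dftext[word] + 1, 2))
            data[idwords[word]] = tfidf
            distsq += tfidf * tfidf
        dist = math.sqrt(distsq)
        if dist > 0.0:
            for wordid in data:
                data[wordid] /= dist   # assumed: normalize to unit length
        return data

If the vectors are unit length, cos_sim() reduces to a dot product over the shared wordids, which is what its sim += a*b loop accumulates in the hunk below.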
@@ -106,21 +132,6 @@ def cos_sim(self, tfidf1, tfidf2):
                 sim += a*b
         return sim
 
-    def add_freq_index(self, urlid, wordfreq, categories = []):
-        for word in wordfreq:
-            self.wordlist.setdefault(word, 0)
-            self.wordlist[word] += 1
-
-    def commit_freq_index(self, table):
-        self.dftext = {}
-        self.wordids = {}
-        for word in self.wordlist:
-            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
-                "values(%s, %s)", (word, self.wordlist[word]))
-            self.wordids[rowid] = word
-            self.dftext[word] = (rowid, self.wordlist[word])
-        self.wordlist = {}
-
     def get_article(self, urlid, corpus = False):
         if corpus:
             table = 'cat_corpus'
@@ -191,9 +202,26 @@ def restore_corpus(self):
         rows = self.db.selectall("select rowid, word, dftext from wordlist")
         for row in rows:
             self.wordids[row[0]] = row[1]
-            self.dftext[row[1]] = (row[0], row[2])
+            self.idwords[row[1]] = row[0]
+            self.dftext[row[1]] = row[2]
         self.corpus_count = self.db.selectone("select count(*) from cat_corpus")[0]
 
+    def add_freq_index(self, urlid, wordfreq, categories = []):
+        for word in wordfreq:
+            self.wordcounts.setdefault(word, 0)
+            self.wordcounts[word] += 1
+
+    def commit_freq_index(self, table):
+        self.dftext = {}
+        self.wordids = {}
+        for word in self.wordcounts:
+            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
+                "values(%s, %s)", (word, self.wordcounts[word]))
+            self.wordids[rowid] = word
+            self.idwords[word] = rowid
+            self.dftext[word] = self.wordcounts[word]
+        self.wordcounts = {}
+
     def load_corpus(self, ident, pct, debug = False):
         if debug: print "Loading corpus..."
@@ -222,6 +250,7 @@ def load_corpus(self, ident, pct, debug = False):
         self.db.execute("delete from wordlist_eval")
         self.db.execute("alter table wordlist_eval auto_increment = 0")
         self.wordids = {}
+        self.wordcounts = {}
         self.cache_urls = {}
         for c in train_corpus:
             self.add_freq_index(c[0], c[1], c[2].split())
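A point worth noting about the relocated add_freq_index()/commit_freq_index() pair: load_corpus() calls add_freq_index() once per training article with that article's word-frequency dictionary, and each word is incremented once per article regardless of how often it occurs within the article. So self.wordcounts (and hence the dftext column) holds document frequencies, not total occurrence counts. A minimal illustration of that accumulation, independent of the class and database (the article contents are made up):

    wordcounts = {}
    articles = [
        {"robot": 4, "vision": 1},   # per-article word frequencies
        {"robot": 1, "speech": 2},
    ]
    for wordfreq in articles:
        for word in wordfreq:
            # each article contributes at most 1 per word
            wordcounts.setdefault(word, 0)
            wordcounts[word] += 1
    # wordcounts == {"robot": 2, "vision": 1, "speech": 1}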
diff --git a/CorpusCategories.py b/CorpusCategories.py
index 8c29b20..5d4d821 100644
--- a/CorpusCategories.py
+++ b/CorpusCategories.py
@@ -9,37 +9,39 @@ import sys
 
 from AINewsDB import AINewsDB
 
-categories =["AIOverview","Agents", "Applications", \
-    "CognitiveScience","Education","Ethics", "Games", "History",\
-    "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
-    "Reasoning","Representation", "Robots","ScienceFiction",\
-    "Speech", "Systems","Vision"]
+if __name__ == "__main__":
 
-db = AINewsDB()
-
-url_counts = {}
-
-cat_counts = {}
-for cat in categories:
-    cat_counts[cat] = 0
-
-rows = db.selectall( \
-    "select c.urlid, c.content, group_concat(cc.category separator ' ') " +
-    "from cat_corpus as c, cat_corpus_cats as cc where c.urlid = cc.urlid " +
-    "group by c.urlid")
-for row in rows:
-    url_counts[row[0]] = len(row[2].split(' '))
-    for cat in row[2].split(' '):
-        cat_counts[cat] += 1
-
-if sys.argv[1] == "bar":
-    print "Category,Count"
-    for cat in sorted(cat_counts.keys(),reverse=True):
-        print "%s,%d" % (cat, cat_counts[cat])
-elif sys.argv[1] == "hist":
-    print "URL,Count"
-    for urlid in url_counts:
-        print "%d,%d" % (urlid, url_counts[urlid])
+    categories = ["AIOverview", "Agents", "Applications", \
+        "CognitiveScience", "Education", "Ethics", "Games", "History", \
+        "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
+        "Reasoning", "Representation", "Robots", "ScienceFiction", \
+        "Speech", "Systems", "Vision"]
+
+    db = AINewsDB()
+
+    url_counts = {}
+
+    cat_counts = {}
+    for cat in categories:
+        cat_counts[cat] = 0
+
+    rows = db.selectall( \
+        "select c.urlid, c.content, group_concat(cc.category separator ' ') " +
+        "from cat_corpus as c, cat_corpus_cats as cc where c.urlid = cc.urlid " +
+        "group by c.urlid")
+    for row in rows:
+        url_counts[row[0]] = len(row[2].split(' '))
+        for cat in row[2].split(' '):
+            cat_counts[cat] += 1
+
+    if sys.argv[1] == "bar":
+        print "Category,Count"
+        for cat in sorted(cat_counts.keys(), reverse=True):
+            print "%s,%d" % (cat, cat_counts[cat])
+    elif sys.argv[1] == "hist":
+        print "URL,Count"
+        for urlid in url_counts:
+            print "%d,%d" % (urlid, url_counts[urlid])
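The CorpusCategories.py change is behavior-preserving when the script is run directly; moving the body under if __name__ == "__main__" only stops the database queries from running as an import side effect. The script still reads its mode from sys.argv[1] (and will raise IndexError if run with no argument), so typical invocations, assuming a populated cat_corpus database, would be:

    python CorpusCategories.py bar     # CSV of article counts per category
    python CorpusCategories.py hist    # CSV of category counts per article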