Commit 273056e
Some code documentation and reorganization for clarity.
joshuaeckroth committed Sep 9, 2011
1 parent 2af6122 commit 273056e
Showing 2 changed files with 94 additions and 63 deletions.
AINewsCorpus.py (62 additions, 33 deletions)
@@ -18,20 +18,46 @@
 from AINewsTools import loadpickle
 
 
 class AINewsCorpus:
+    """
+    A corpus is a set of news articles (each with a title, content,
+    and categories) that are used for training and comparison
+    purposes. For training, the corpus provides the training
+    examples. For comparison, the corpus provides the data for various
+    measures like word frequency. This is important in the prediction
+    process: we only want to predict a new article's categories based
+    on word frequencies, and other measures, from the corpus; we don't
+    want articles that have not been "vetted" (articles not part of
+    the corpus) to contribute to these measures.
+
+    A corpus can be "loaded" via C{load_corpus()} or "restored" via
+    C{restore_corpus()}. The difference is the following: when loading a
+    corpus, word frequencies are measured and stored in the database
+    table C{wordlist_eval}; when restoring a corpus, word frequencies
+    are simply retrieved from the database table C{wordlist}. In other
+    words, we load a corpus when we are training or evaluating our
+    training procedures, and we restore a corpus when we are
+    predicting.
+    """
     def __init__(self):
         self.txtpro = AINewsTextProcessor()
         self.cache_urls = {}
 
-        self.wordlist = {}
+        #: A dictionary of word=>word freq in corpus
+        self.dftext = {}
+
+        #: A dictionary of word=>wordid
+        self.idwords = {}
+
+        #: A dictionary of wordid=>word
         self.wordids = {}
 
         self.db = AINewsDB()
 
-        self.categories =["AIOverview","Agents", "Applications", \
-            "CognitiveScience","Education","Ethics", "Games", "History",\
-            "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
-            "Reasoning","Representation", "Robots","ScienceFiction",\
-            "Speech", "Systems","Vision"]
+        self.categories = ["AIOverview","Agents", "Applications", \
+            "CognitiveScience", "Education", "Ethics", "Games", "History", \
+            "Interfaces", "MachineLearning", "NaturalLanguage", "Philosophy", \
+            "Reasoning", "Representation", "Robots", "ScienceFiction", \
+            "Speech", "Systems", "Vision"]
 
         self.sources = {}
         rows = self.db.selectall("select parser, relevance from sources")
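
The docstring added above draws the load/restore distinction; as a rough usage sketch (the ident and pct arguments here are hypothetical placeholders, and an AINews database must already be populated):

    from AINewsCorpus import AINewsCorpus

    corpus = AINewsCorpus()

    # training/evaluation: measure word frequencies over the labeled
    # corpus and store them in the wordlist_eval table
    corpus.load_corpus("eval_ident", 0.9, debug=True)  # hypothetical arguments

    # prediction: reuse the frequencies already stored in wordlist
    corpus.restore_corpus()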
@@ -68,17 +94,17 @@ def get_tfidf(self, urlid, wordfreq):
         """
         if urlid in self.cache_urls:
             return self.cache_urls[urlid]
-        wordids = {}
+        wordid_freq_pairs = {}
         for word in wordfreq:
             if word in self.dftext:
-                wordids[self.dftext[word][0]] = (wordfreq[word], self.dftext[word][1])
+                wordid_freq_pairs[self.idwords[word]] = (wordfreq[word], self.dftext[word])
 
         data = {}
         distsq = 0.0
-        for wordid in wordids:
-            tfidf = math.log(wordids[wordid][0] + 1, 2) * \
+        for wordid in wordid_freq_pairs:
+            tfidf = math.log(wordid_freq_pairs[wordid][0] + 1, 2) * \
                 (math.log(self.corpus_count, 2) - \
-                math.log(wordids[wordid][1] + 1, 2))
+                math.log(wordid_freq_pairs[wordid][1] + 1, 2))
             data[wordid] = tfidf
             distsq += tfidf * tfidf
         dist = math.sqrt(distsq)
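
The loop above weights each word by log-scaled term frequency times inverse document frequency, both base 2. The same arithmetic stands alone as follows (all counts hypothetical):

    import math

    corpus_count = 5000  # rows in cat_corpus (hypothetical)
    tf = 3               # occurrences of the word in one article (hypothetical)
    df = 120             # dftext: articles containing the word (hypothetical)

    # as in get_tfidf(): log2(tf + 1) * (log2(N) - log2(df + 1))
    tfidf = math.log(tf + 1, 2) * \
        (math.log(corpus_count, 2) - math.log(df + 1, 2))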
@@ -89,14 +115,14 @@ def get_tfidf(self, urlid, wordfreq):
         return data
 
     def cos_sim(self, tfidf1, tfidf2):
-        '''
+        """
         A helper function to compute the cos simliarity between
         news story and centroid.
-        @param data: target news story tfidf vector.
-        @type data: C{dict}
-        @param centroid: centroid tfidf vector.
-        @type centroid: C{dict}
-        '''
+        @param tfidf1: target news story tfidf vector.
+        @type tfidf1: C{dict}
+        @param tfidf2: centroid tfidf vector.
+        @type tfidf2: C{dict}
+        """
         sim = 0.0
         for key in tfidf1:
             if key in tfidf2:
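
cos_sim() accumulates a dot product over the keys the two sparse vectors share (the a and b assignments sit in the elided lines); since get_tfidf() computes dist = math.sqrt(distsq), the cached vectors appear to be length-normalized, in which case the dot product is the cosine similarity. A minimal sketch with hypothetical wordid=>weight dicts:

    tfidf1 = {101: 0.62, 205: 0.35, 310: 0.70}  # hypothetical vectors,
    tfidf2 = {205: 0.48, 310: 0.81, 999: 0.12}  # assumed unit length

    # dot product over shared word ids, as cos_sim() computes
    sim = sum(tfidf1[k] * tfidf2[k] for k in tfidf1 if k in tfidf2)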
@@ -106,21 +132,6 @@ def cos_sim(self, tfidf1, tfidf2):
                 sim += a*b
         return sim
 
-    def add_freq_index(self, urlid, wordfreq, categories = []):
-        for word in wordfreq:
-            self.wordlist.setdefault(word, 0)
-            self.wordlist[word] += 1
-
-    def commit_freq_index(self, table):
-        self.dftext = {}
-        self.wordids = {}
-        for word in self.wordlist:
-            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
-                "values(%s, %s)", (word, self.wordlist[word]))
-            self.wordids[rowid] = word
-            self.dftext[word] = (rowid, self.wordlist[word])
-        self.wordlist = {}
-
     def get_article(self, urlid, corpus = False):
         if corpus:
             table = 'cat_corpus'
@@ -191,9 +202,26 @@ def restore_corpus(self):
         rows = self.db.selectall("select rowid, word, dftext from wordlist")
         for row in rows:
             self.wordids[row[0]] = row[1]
-            self.dftext[row[1]] = (row[0], row[2])
+            self.idwords[row[1]] = row[0]
+            self.dftext[row[1]] = row[2]
         self.corpus_count = self.db.selectone("select count(*) from cat_corpus")[0]
 
+    def add_freq_index(self, urlid, wordfreq, categories = []):
+        for word in wordfreq:
+            self.wordcounts.setdefault(word, 0)
+            self.wordcounts[word] += 1
+
+    def commit_freq_index(self, table):
+        self.dftext = {}
+        self.wordids = {}
+        for word in self.wordcounts:
+            rowid = self.db.execute("insert into "+table+" (word, dftext) " + \
+                "values(%s, %s)", (word, self.wordcounts[word]))
+            self.wordids[rowid] = word
+            self.idwords[word] = rowid
+            self.dftext[word] = self.wordcounts[word]
+        self.wordcounts = {}
+
     def load_corpus(self, ident, pct, debug = False):
         if debug:
             print "Loading corpus..."
@@ -222,6 +250,7 @@ def load_corpus(self, ident, pct, debug = False):
         self.db.execute("delete from wordlist_eval")
         self.db.execute("alter table wordlist_eval auto_increment = 0")
         self.wordids = {}
+        self.wordcounts = {}
         self.cache_urls = {}
         for c in train_corpus:
             self.add_freq_index(c[0], c[1], c[2].split())
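
A sketch of the indexing flow that load_corpus() drives, with hypothetical (urlid, wordfreq, categories) arguments:

    corpus = AINewsCorpus()
    corpus.wordcounts = {}  # load_corpus() resets this before indexing

    # each call bumps a word's document-frequency count once per article
    corpus.add_freq_index(1, {'robot': 3, 'vision': 1}, ['Robots'])
    corpus.add_freq_index(2, {'robot': 1, 'speech': 2}, ['Speech'])

    # write the counts to wordlist_eval and rebuild the id mappings
    corpus.commit_freq_index('wordlist_eval')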
CorpusCategories.py (32 additions, 30 deletions)
@@ -9,37 +9,39 @@
 import sys
 from AINewsDB import AINewsDB
 
-categories =["AIOverview","Agents", "Applications", \
-    "CognitiveScience","Education","Ethics", "Games", "History",\
-    "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
-    "Reasoning","Representation", "Robots","ScienceFiction",\
-    "Speech", "Systems","Vision"]
+if __name__ == "__main__":
+    categories =["AIOverview","Agents", "Applications", \
+        "CognitiveScience","Education","Ethics", "Games", "History",\
+        "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
+        "Reasoning","Representation", "Robots","ScienceFiction",\
+        "Speech", "Systems","Vision"]
 
-db = AINewsDB()
+    db = AINewsDB()
 
-url_counts = {}
+    url_counts = {}
 
-cat_counts = {}
-for cat in categories:
-    cat_counts[cat] = 0
+    cat_counts = {}
+    for cat in categories:
+        cat_counts[cat] = 0
 
-rows = db.selectall( \
-    "select c.urlid, c.content, group_concat(cc.category separator ' ') " +
-    "from cat_corpus as c, cat_corpus_cats as cc where c.urlid = cc.urlid " +
-    "group by c.urlid")
-for row in rows:
-    url_counts[row[0]] = len(row[2].split(' '))
-    for cat in row[2].split(' '):
-        cat_counts[cat] += 1
+    rows = db.selectall( \
+        "select c.urlid, c.content, group_concat(cc.category separator ' ') " +
+        "from cat_corpus as c, cat_corpus_cats as cc where c.urlid = cc.urlid " +
+        "group by c.urlid")
+    for row in rows:
+        url_counts[row[0]] = len(row[2].split(' '))
+        for cat in row[2].split(' '):
+            cat_counts[cat] += 1
 
-if sys.argv[1] == "bar":
-    print "Category,Count"
-    for cat in sorted(cat_counts.keys(),reverse=True):
-        print "%s,%d" % (cat, cat_counts[cat])
-elif sys.argv[1] == "hist":
-    print "URL,Count"
-    for urlid in url_counts:
-        print "%d,%d" % (urlid, url_counts[urlid])
+    if sys.argv[1] == "bar":
+        print "Category,Count"
+        for cat in sorted(cat_counts.keys(),reverse=True):
+            print "%s,%d" % (cat, cat_counts[cat])
+    elif sys.argv[1] == "hist":
+        print "URL,Count"
+        for urlid in url_counts:
+            print "%d,%d" % (urlid, url_counts[urlid])
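
Wrapping the script body in a __main__ guard leaves its command-line behavior unchanged while making the module importable without side effects. Expected usage, with hypothetical output rows:

    $ python CorpusCategories.py bar
    Category,Count
    Vision,41
    ...

    $ python CorpusCategories.py hist
    URL,Count
    12345,2
    ...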
