From ad5322eb1ee6d77780167445dbe50619b78ac1b0 Mon Sep 17 00:00:00 2001
From: Joshua Eckroth <eckroth@cse.ohio-state.edu>
Date: Tue, 14 Jun 2011 18:35:06 -0400
Subject: [PATCH] Path fixes, SVM bug fixed (w.r.t. labels), other cleanups.

---
 AINewsPublisher.py     |   6 +-
 AINewsSVM.py           |  26 ++--
 AINewsSVMClassifier.py | 284 -----------------------------------------
 svm-easy.py            |  20 ++-
 svm-grid.py            |   5 +-
 5 files changed, 29 insertions(+), 312 deletions(-)
 delete mode 100644 AINewsSVMClassifier.py

diff --git a/AINewsPublisher.py b/AINewsPublisher.py
index 95d21c6..3b23b64 100644
--- a/AINewsPublisher.py
+++ b/AINewsPublisher.py
@@ -120,8 +120,8 @@ def publish_email(self):
         """
         Call AINewsEmail.php to send email through PHP Mail Server
         """
-        cmd = 'php AINewsEmail.php'
-        Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate()
+        #cmd = 'php AINewsEmail.php'
+        #Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate()
         self.publish_email_semiauto()
         
     def publish_email_semiauto(self):
@@ -153,7 +153,7 @@ def publish_pmwiki(self):
         Call AINewsPmwiki.php to publish latest news to AAAI Pmwiki website.
         """
         cmd = 'php AINewsPmwiki.php'
-        Popen(cmd, shell = True, stdout = PIPE).communicate()
+        Popen(cmd, shell = True).wait()
         
     def update_rss(self):
         rssitems = []
diff --git a/AINewsSVM.py b/AINewsSVM.py
index 1afd7e0..c8fc1a4 100644
--- a/AINewsSVM.py
+++ b/AINewsSVM.py
@@ -53,7 +53,6 @@ def collect_feedback(self):
         rater_count_cutoff = int(config['feedback.rater_count_cutoff'])
         stdev_cutoff  = float(config['feedback.stdev_cutoff'])
         output = ""
-        #admins = ("Bgbuchanan's", "Rgsmith's", "Ldong's")
         admins = [name + '\'s' for name in config['svm.admins'].split(':')]
         for infile in glob.glob( path.join(feedback_path, '*.rating') ):
             urlid = int(infile.split('/')[-1][:-7])
@@ -73,7 +72,7 @@ def collect_feedback(self):
             self.db.execute(sql)
             
             if False:
-                # Deprecated. Dr.Buchanan wants only his and Dr.Reids' rating
+                # Deprecated. Dr. Buchanan wants only admin ratings to
                 # be used in re-training
                 if n <= rater_count_cutoff: continue
                 rates = []
@@ -84,7 +83,7 @@ def collect_feedback(self):
                 if sd > stdev_cutoff: continue
                 newsscore = mean
             else:
-                # Only use Dr.Buchanan and Dr.Reids' rating for re-training
+                # Only use admin ratings for re-training
                 admincount = 0
                 adminsum = 0
                 for line in lines:
@@ -186,9 +185,9 @@ def train(self, filename, pos_range):
         @type pos_range: C{tuple}
         """
         # Generate the specific input format file
-        self.__generate_libsvm_input(pos_range,  filename)
+        self.__generate_libsvm_input(pos_range,  paths['svm.svm_data'] + filename)
         # Using the input file to train SVM
-        self.__libsvm_train(filename)
+        self.__libsvm_train(paths['svm.svm_data'] + filename)
         
     def train_all(self):
         """
@@ -223,7 +222,7 @@ def __generate_libsvm_input(self, pos, filename):
             for wordid in sorted(self.allnews[urlid].keys()):
                 line += ' '+str(wordid)+':'+str(self.allnews[urlid][wordid])
             content += line + '\n'
-        savefile('svm/'+filename, content)
+        savefile(filename, content)
     
     def __libsvm_train(self,filename):
         """
@@ -234,7 +233,7 @@ def __libsvm_train(self,filename):
         @type filename: C{string}
         """
         cmd = 'python svm-easy.py "%s"' % filename
-        Popen(cmd, shell = True, stdout = PIPE).communicate()
+        Popen(cmd, shell = True).wait()
             
       
     
@@ -290,14 +289,18 @@ def predict_probability(self, filename, urlids):
         """
         svm_path = paths['svm.svm_data']
         mysvm = svm_load_model(svm_path + filename + ".model")
+        # probabilities follow the model's label order, so find the index of label +1
+        labels = mysvm.get_labels()
+        if labels[0] == 1: positive_idx = 0
+        else: positive_idx = 1
         self.__load_range(svm_path + filename + ".range")
         results = []
         for urlid in urlids:
             data = self.__retrieve_url_tfidf(urlid)
             p = svm_predict([0], [data], mysvm, "-b 1")
             # p = ([1.0], _, [[0.62317989329642587 0.3768201067035743]])
-            # where the first prob is for -1, the second for 1
-            results.append(p[2][0][1])
+            # where the first prob is for labels[0] and the second is for labels[1]
+            results.append(p[2][0][positive_idx])
         return results
     
         
@@ -379,8 +382,9 @@ def train_isrelated(self):
         savefile(paths['svm.svm_data'] + 'IsRelated', content)
         
         # use libsvm command tool to train
-        cmd = 'python svm-easy.py IsRelated'
-        Popen(cmd, shell = True, stdout = PIPE).communicate()
+        print "Training 'is related'..."
+        cmd = 'python svm-easy.py "' + paths['svm.svm_data'] + 'IsRelated"'
+        Popen(cmd, shell = True).wait()
             
     def get_related(self, urlid):
         sql = "select topic from urllist where rowid = %d" % urlid
diff --git a/AINewsSVMClassifier.py b/AINewsSVMClassifier.py
deleted file mode 100644
index 613b63b..0000000
--- a/AINewsSVMClassifier.py
+++ /dev/null
@@ -1,284 +0,0 @@
-"""
-AINewsSVMClassifier aims to use SVM to train and predict 19 AI news categories.
-
-The classification task was performed by AINewsTopic.py, but it is a rather
-simply method.
-
-I crawled 1281 documents using LuceneCategoryCrawler.py from 19 categories from
-AITopic (http://www.aaai.org/AITopics/pmwiki/pmwiki.php/AITopics/HomePage)
-
-Date: Dec.19th, 2010
-Author: Liang Dong
-"""
-
-import os
-import math
-from svm import *
-from subprocess import *
-import time
-from datetime import datetime
-
-from AINewsDB import AINewsDB
-from AINewsTextProcessor import AINewsTextProcessor
-from AINewsTools import loadfile2, savefile, savepickle, loadpickle, loadfile
-from AINewsConfig import config
-
-
-class AINewsSVMClassifier:
-    def __init__(self):
-        self.txtpro = AINewsTextProcessor()
-        self.db = AINewsDB()
-        total_doc = self.db.get_totaldoc()
-        self.logN = math.log(total_doc+1,2)
-        self.upper_df = total_doc * float(config['svm.docfreq_upper_ratio'])
-        self.lower_df = total_doc * float(config['svm.docfreq_lower_ratio'])
-        #self.categories = loadpickle("category/all_categories.pkl")
-        self.categories =["AIOverview","Agents", "Applications", \
-                 "CognitiveScience","Education","Ethics", "Games", "History",\
-                 "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
-                 "Reasoning","Representation", "Robots","ScienceFiction",\
-                 "Speech", "Systems","Vision"]   
-    ##############################
-    #
-    #           Train 
-    #
-    ##############################
-    def train(self, src_dir, dest_dir):
-        
-        print "(1) Extracting words from training data"
-        allw, artw, artt, artcat, allcat = self.getarticlewords(src_dir)
-        #self.categories = sorted(list(allcat))
-        #savepickle("category/all_categories.pkl", self.categories)
-        
-        print "(2) Computing TFIDF and base libsvm format"
-        formats, wordmap = self.make_libsvm_input(allw, artw)
-        
-        print len(formats), len(artcat), len(allcat)
-        
-        print "(3) Building LibSVM training input format"
-        for category in self.categories:
-            content = ""
-            for (i,artcategory) in enumerate(artcat):
-                if category == artcategory:
-                    content += "+1   "+formats[i]+'\n'
-                else:
-                    content += "-1   "+formats[i]+'\n'
-            target_file = os.path.join(dest_dir, category+"_train")
-            savefile(target_file, content)
-        
-        
-        print "(4) Training 1-against-rest classifier for each category"
-        
-        for category in self.categories:
-            print "\tTraining ", category
-            start = datetime.now()
-            filename = os.path.join(paths['ainews.category_data'], dest_dir, category+"_train")
-            cmd = 'python svm-easy.py "%s" ' % filename
-            Popen(cmd, shell = True, stdout = PIPE).communicate()
-            end = datetime.now()
-            print "\tTime spent:", end - start
-        
-        print "(5) Done"
-        
-        
-    def train_data_generator(self, src_dir):
-        '''
-        Python Generator to browse all the training files under given directory.
-        '''
-        dirs = sorted([f for f in os.listdir(src_dir) 
-                if os.path.isdir(os.path.join(src_dir, f)) ])
-        for dir in dirs:
-            files = sorted([f for f in os.listdir(os.path.join(src_dir,dir))])
-            for file in files:
-                yield (dir, file)
-                
-    def getarticlewords(self, src_dir):
-        '''
-        Process all the words from the training corpus. Codes are referred from 
-        book 'collective intelligence' chapter 10.
-        '''
-        allwords={}
-        articlewords=[]
-        articletitles=[]
-        articlecategories = []
-        allcategories = set()
-        cnt = 0
-        
-        train_data = self.train_data_generator(src_dir)
-        for file_data in train_data:
-            file = os.path.join(src_dir, file_data[0], file_data[1])
-            title = file_data[1].encode('utf8')
-            
-            # Extract the words
-            content = loadfile2(file)
-            text = title + ' ' + content
-            wordfreq = self.txtpro.simpletextprocess(text)
-            articlewords.append({})
-            articletitles.append(title)
-            articlecategories.append(file_data[0])
-            allcategories.add(file_data[0])
-            
-            # Increase the counts for this word in allwords 
-            for word in wordfreq.keys():
-                allwords.setdefault(word,0)
-                allwords[word] += 1
-                articlewords[cnt][word] = wordfreq[word]
-            cnt += 1
-            
-        return allwords,articlewords,articletitles, articlecategories, allcategories
-    
-    def make_libsvm_input(self, allw, articlew):
-        '''
-        Build the base libsvm input format for all the articles.
-        '''
-        
-        wordmap = {}    # Mapping word->AINewsDB's wordlist (id, log(N/df))
-
-        # Only take words that are common but not too common
-        # From allwords
-        N = len(articlew)
-        upper = N * 0.6
-        lower = 3
-        for w,c in allw.items():
-            if c > lower and c < upper:
-                sql = "select rowid, dftext from wordlist where word = '%s'" % w
-                row = self.db.selectone(sql)
-                if row == None:
-                    # print '\'',w, "\' not found"
-                    continue
-                wordmap[w] = (row[0], (self.logN - math.log(row[1]+1, 2)))
-                
-                
-        # Create the libsvm input 
-        # TFIDF the value (Added by Liang Dong)
-        l1 = []
-        cnt = 0
-        for f in articlew:
-            l1.append({})
-            for word in f.keys():
-                if word in wordmap.keys():
-                    l1[cnt][wordmap[word][0]] = math.log(f[word]+1,2)*wordmap[word][1]
-            cnt += 1
-            
-        baseformats = []
-        for item in l1:
-            text = ""
-            for wordid in sorted(item.keys()):
-                text += str(wordid)+":"+str(item[wordid])+" "
-            baseformats.append(text)
-            
-        return baseformats, wordmap
-    
-    ##############################
-    #
-    #           Predict 
-    #
-    ##############################
-    def init_predict(self, model_dir):
-        self.allwords_idf = {}
-        self.build_allwords_idf()
-        self.models = []
-        
-        for category in self.categories:
-            file = os.path.join(model_dir, category+"_train.model")
-            print "Loading SVM model:", file
-            self.models.append(svm_model(file))
-        
-        self.range = {}
-        rangefile = os.path.join(model_dir, "AIOverview_train.range")
-        self.__load_range(rangefile)
-    
-    def build_allwords_idf(self):
-        """
-        Pre-calculate the idf value for all the words whose doc freq value
-        belongs to the certain range (lower_df, upper_df).
-        """
-        sql = '''select rowid, dftext from wordlist
-                 where dftext > %d and dftext < %d
-              ''' % (self.lower_df, self.upper_df)
-        rows = self.db.selectall(sql)
-        for row in rows:
-            idf = self.logN - math.log(row[1]+1, 2)
-            self.allwords_idf[row[0]] = idf
-            
-    def __load_range(self, filename):
-        """
-        Read in the range file generated by svm-train tool which list the min
-        and max value of each feature. Since the min value is always 0, only
-        the max value is read and stored in a dictionary
-        self.range[wordid] = max_value of the feature
-        @param filename: the libSVM formatted input file
-        @type filename: C{string}
-        """
-        lines = loadfile(filename)
-        for line in lines[2:]:
-            items = line[:-1].split(' ')
-            self.range[int(items[0])] = float(items[2]) 
-    
-    def __retrieve_url_tfidf(self, urlid):
-        """
-        Retrieve the tfidf of each word based on the urlid.
-        @param  urlid: target news story's urlid.
-        @type  urlid: C{int}
-        """
-        sql = '''select t.wordid,t.freq from textwordurl as t, wordlist as w
-                    where urlid = %d and t.wordid = w.rowid and dftext > %d
-                    and dftext < %d''' % (urlid, self.lower_df, self.upper_df)
-        rows = self.db.selectall(sql)
-        data = {}
-        for row in rows:
-            if row[0] not in self.range.keys():
-                continue
-            tfidf = (math.log(row[1]+1, 2)) * self.allwords_idf[row[0]]
-            data[row[0]] = tfidf / self.range[row[0]]
-        return data
-    
-    def predict(self, urlid):
-        data = self.__retrieve_url_tfidf(urlid)
-        max_prob = 0
-        max_i = 0
-        for (i, model) in enumerate(self.models):
-            prob = model.predict_probability(data)
-            print self.categories[i], prob
-            if prob[1][1] > max_prob:
-                max_i = i
-                max_prob = prob[1][1]
-        print urlid, self.categories[max_i], max_prob
-        
-if __name__ == "__main__":
-    start = datetime.now()
-    
-    cat = AINewsSVMClassifier()
-    
-    VIEW_ALL_FILE, TRAIN, PREDICT = range(0,3)
-    
-    type =  PREDICT
-    
-    if type == VIEW_ALL_FILE:
-        src_dir = "category/data"
-        dirs = sorted([f for f in os.listdir(src_dir) 
-                    if os.path.isdir(os.path.join(src_dir, f)) ])
-        cnt = 0
-        for dir in dirs:
-            files = sorted([f for f in os.listdir(os.path.join(src_dir,dir))])
-            for (i,file) in enumerate(files):
-                print cnt, i, dir, file
-                cnt += 1
-                
-    elif type == TRAIN:
-        src_dir = "category/newdata"
-        dest_dir = "category/newmodels"
-        cat.train(src_dir, dest_dir)
-        
-    elif type == PREDICT:
-        model_dir = "category/newmodels"
-        cat.init_predict(model_dir)
-        for urlid in range(650,675):
-            cat.predict(urlid)
-        
-        
-    print datetime.now() - start    
-        
-       
-        
-     
diff --git a/svm-easy.py b/svm-easy.py
index 8cd455a..d19f3dd 100644
--- a/svm-easy.py
+++ b/svm-easy.py
@@ -15,21 +15,19 @@
 
 train_pathname = sys.argv[1]
 assert os.path.exists(train_pathname),"training file not found"
-file_name = os.path.split(train_pathname)[1]
-scaled_file = file_name + ".scale"
-model_file = file_name + ".model"
-range_file = file_name + ".range"
+scaled_file = train_pathname + ".scale"
+model_file = train_pathname + ".model"
+range_file = train_pathname + ".range"
 
 if len(sys.argv) > 2:
 	test_pathname = sys.argv[2]
-	file_name = os.path.split(test_pathname)[1]
 	assert os.path.exists(test_pathname),"testing file not found"
-	scaled_test_file = file_name + ".scale"
-	predict_test_file = file_name + ".predict"
+	scaled_test_file = test_pathname + ".scale"
+	predict_test_file = test_pathname + ".predict"
 
 cmd = '%s -l 0 -u 1 -s "%s" "%s" > "%s"' % (svmscale_exe, range_file, train_pathname, scaled_file)
 print('Scaling training data...')
-Popen(cmd, shell = True, stdout = PIPE).communicate()	
+Popen(cmd, shell = True).wait()
 
 cmd = '%s -svmtrain "%s" "%s"' % (grid_py, svmtrain_exe, scaled_file)
 print('Cross validation...')
@@ -47,17 +45,17 @@
 cmd = '%s -b 1 -t 0 -c %s -g %s "%s" "%s"' % (svmtrain_exe,c,g,scaled_file,model_file)
 #cmd = '%s -b 1 -log2c -1,2,1 -log2g 1,1,1 -t 0  "%s" "%s"' % (svmtrain_exe,scaled_file,model_file)
 print('Training...')
-Popen(cmd, shell = True, stdout = PIPE).communicate()
+Popen(cmd, shell = True).wait()
 
 print('Output model: %s' % model_file)
 if len(sys.argv) > 2:
 	cmd = '%s -r "%s" "%s" > "%s"' % (svmscale_exe, range_file, test_pathname, scaled_test_file)
 	print('Scaling testing data...')
-	Popen(cmd, shell = True, stdout = PIPE).communicate()	
+	Popen(cmd, shell = True).wait()
 
 	cmd = '%s "%s" "%s" "%s"' % (svmpredict_exe, scaled_test_file, model_file, predict_test_file)
 	print('Testing...')
-	Popen(cmd, shell = True).communicate()	
+	Popen(cmd, shell = True).wait()
 
 	print('Output prediction: %s' % predict_test_file)
 
diff --git a/svm-grid.py b/svm-grid.py
index 1636161..9f726d2 100755
--- a/svm-grid.py
+++ b/svm-grid.py
@@ -47,9 +47,8 @@ def process_options(argv=sys.argv):
         sys.exit(1)
 
     dataset_pathname = argv[-1]
-    dataset_title = os.path.split(dataset_pathname)[1]
-    out_filename = '%s.out' % dataset_title
-    png_filename = '%s.png' % dataset_title
+    out_filename = '%s.out' % dataset_pathname
+    png_filename = '%s.png' % dataset_pathname
     pass_through_options = []
 
     i = 1