From ad5322eb1ee6d77780167445dbe50619b78ac1b0 Mon Sep 17 00:00:00 2001 From: Joshua Eckroth Date: Tue, 14 Jun 2011 18:35:06 -0400 Subject: [PATCH] Path fixes, SVM bug fixed (w.r.t. labels), other cleanups. --- AINewsPublisher.py | 6 +- AINewsSVM.py | 26 ++-- AINewsSVMClassifier.py | 284 ----------------------------------------- svm-easy.py | 20 ++- svm-grid.py | 5 +- 5 files changed, 29 insertions(+), 312 deletions(-) delete mode 100644 AINewsSVMClassifier.py diff --git a/AINewsPublisher.py b/AINewsPublisher.py index 95d21c6..3b23b64 100644 --- a/AINewsPublisher.py +++ b/AINewsPublisher.py @@ -120,8 +120,8 @@ def publish_email(self): """ Call AINewsEmail.php to send email through PHP Mail Server """ - cmd = 'php AINewsEmail.php' - Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate() + #cmd = 'php AINewsEmail.php' + #Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate() self.publish_email_semiauto() def publish_email_semiauto(self): @@ -153,7 +153,7 @@ def publish_pmwiki(self): Call AINewsPmwiki.php to publish latest news to AAAI Pmwiki website. """ cmd = 'php AINewsPmwiki.php' - Popen(cmd, shell = True, stdout = PIPE).communicate() + Popen(cmd, shell = True).wait() def update_rss(self): rssitems = [] diff --git a/AINewsSVM.py b/AINewsSVM.py index 1afd7e0..c8fc1a4 100644 --- a/AINewsSVM.py +++ b/AINewsSVM.py @@ -53,7 +53,6 @@ def collect_feedback(self): rater_count_cutoff = int(config['feedback.rater_count_cutoff']) stdev_cutoff = float(config['feedback.stdev_cutoff']) output = "" - #admins = ("Bgbuchanan's", "Rgsmith's", "Ldong's") admins = [name + '\'s' for name in config['svm.admins'].split(':')] for infile in glob.glob( path.join(feedback_path, '*.rating') ): urlid = int(infile.split('/')[-1][:-7]) @@ -73,7 +72,7 @@ def collect_feedback(self): self.db.execute(sql) if False: - # Deprecated. Dr.Buchanan wants only his and Dr.Reids' rating + # Deprecated. Dr.Buchanan wants only admin ratings # be used in re-training if n <= rater_count_cutoff: continue rates = [] @@ -84,7 +83,7 @@ def collect_feedback(self): if sd > stdev_cutoff: continue newsscore = mean else: - # Only use Dr.Buchanan and Dr.Reids' rating for re-training + # Only use admin ratings for re-training admincount = 0 adminsum = 0 for line in lines: @@ -186,9 +185,9 @@ def train(self, filename, pos_range): @type pos_range: C{tuple} """ # Generate the specific input format file - self.__generate_libsvm_input(pos_range, filename) + self.__generate_libsvm_input(pos_range, paths['svm.svm_data'] + filename) # Using the input file to train SVM - self.__libsvm_train(filename) + self.__libsvm_train(paths['svm.svm_data'] + filename) def train_all(self): """ @@ -223,7 +222,7 @@ def __generate_libsvm_input(self, pos, filename): for wordid in sorted(self.allnews[urlid].keys()): line += ' '+str(wordid)+':'+str(self.allnews[urlid][wordid]) content += line + '\n' - savefile('svm/'+filename, content) + savefile(filename, content) def __libsvm_train(self,filename): """ @@ -234,7 +233,7 @@ def __libsvm_train(self,filename): @type filename: C{string} """ cmd = 'python svm-easy.py "%s"' % filename - Popen(cmd, shell = True, stdout = PIPE).communicate() + Popen(cmd, shell = True).wait() @@ -290,14 +289,18 @@ def predict_probability(self, filename, urlids): """ svm_path = paths['svm.svm_data'] mysvm = svm_load_model(svm_path + filename + ".model") + # figure out which label is +1 + labels = mysvm.get_labels() + if labels[0] == 1: positive_idx = 0 + else: positive_idx = 1 self.__load_range(svm_path + filename + ".range") results = [] for urlid in urlids: data = self.__retrieve_url_tfidf(urlid) p = svm_predict([0], [data], mysvm, "-b 1") # p = ([1.0], _, [[0.62317989329642587 0.3768201067035743]]) - # where the first prob is for -1, the second for 1 - results.append(p[2][0][1]) + # where the first prob is labels[0] and second is labels[1] + results.append(p[2][0][positive_idx]) return results @@ -379,8 +382,9 @@ def train_isrelated(self): savefile(paths['svm.svm_data'] + 'IsRelated', content) # use libsvm command tool to train - cmd = 'python svm-easy.py IsRelated' - Popen(cmd, shell = True, stdout = PIPE).communicate() + print "Training 'is related'..." + cmd = 'python svm-easy.py "' + paths['svm.svm_data'] + 'IsRelated"' + Popen(cmd, shell = True).wait() def get_related(self, urlid): sql = "select topic from urllist where rowid = %d" % urlid diff --git a/AINewsSVMClassifier.py b/AINewsSVMClassifier.py deleted file mode 100644 index 613b63b..0000000 --- a/AINewsSVMClassifier.py +++ /dev/null @@ -1,284 +0,0 @@ -""" -AINewsSVMClassifier aims to use SVM to train and predict 19 AI news categories. - -The classification task was performed by AINewsTopic.py, but it is a rather -simply method. - -I crawled 1281 documents using LuceneCategoryCrawler.py from 19 categories from -AITopic (http://www.aaai.org/AITopics/pmwiki/pmwiki.php/AITopics/HomePage) - -Date: Dec.19th, 2010 -Author: Liang Dong -""" - -import os -import math -from svm import * -from subprocess import * -import time -from datetime import datetime - -from AINewsDB import AINewsDB -from AINewsTextProcessor import AINewsTextProcessor -from AINewsTools import loadfile2, savefile, savepickle, loadpickle, loadfile -from AINewsConfig import config - - -class AINewsSVMClassifier: - def __init__(self): - self.txtpro = AINewsTextProcessor() - self.db = AINewsDB() - total_doc = self.db.get_totaldoc() - self.logN = math.log(total_doc+1,2) - self.upper_df = total_doc * float(config['svm.docfreq_upper_ratio']) - self.lower_df = total_doc * float(config['svm.docfreq_lower_ratio']) - #self.categories = loadpickle("category/all_categories.pkl") - self.categories =["AIOverview","Agents", "Applications", \ - "CognitiveScience","Education","Ethics", "Games", "History",\ - "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\ - "Reasoning","Representation", "Robots","ScienceFiction",\ - "Speech", "Systems","Vision"] - ############################## - # - # Train - # - ############################## - def train(self, src_dir, dest_dir): - - print "(1) Extracting words from training data" - allw, artw, artt, artcat, allcat = self.getarticlewords(src_dir) - #self.categories = sorted(list(allcat)) - #savepickle("category/all_categories.pkl", self.categories) - - print "(2) Computing TFIDF and base libsvm format" - formats, wordmap = self.make_libsvm_input(allw, artw) - - print len(formats), len(artcat), len(allcat) - - print "(3) Building LibSVM training input format" - for category in self.categories: - content = "" - for (i,artcategory) in enumerate(artcat): - if category == artcategory: - content += "+1 "+formats[i]+'\n' - else: - content += "-1 "+formats[i]+'\n' - target_file = os.path.join(dest_dir, category+"_train") - savefile(target_file, content) - - - print "(4) Training 1-against-rest classifier for each category" - - for category in self.categories: - print "\tTraining ", category - start = datetime.now() - filename = os.path.join(paths['ainews.category_data'], dest_dir, category+"_train") - cmd = 'python svm-easy.py "%s" ' % filename - Popen(cmd, shell = True, stdout = PIPE).communicate() - end = datetime.now() - print "\tTime spent:", end - start - - print "(5) Done" - - - def train_data_generator(self, src_dir): - ''' - Python Generator to browse all the training files under given directory. - ''' - dirs = sorted([f for f in os.listdir(src_dir) - if os.path.isdir(os.path.join(src_dir, f)) ]) - for dir in dirs: - files = sorted([f for f in os.listdir(os.path.join(src_dir,dir))]) - for file in files: - yield (dir, file) - - def getarticlewords(self, src_dir): - ''' - Process all the words from the training corpus. Codes are referred from - book 'collective intelligence' chapter 10. - ''' - allwords={} - articlewords=[] - articletitles=[] - articlecategories = [] - allcategories = set() - cnt = 0 - - train_data = self.train_data_generator(src_dir) - for file_data in train_data: - file = os.path.join(src_dir, file_data[0], file_data[1]) - title = file_data[1].encode('utf8') - - # Extract the words - content = loadfile2(file) - text = title + ' ' + content - wordfreq = self.txtpro.simpletextprocess(text) - articlewords.append({}) - articletitles.append(title) - articlecategories.append(file_data[0]) - allcategories.add(file_data[0]) - - # Increase the counts for this word in allwords - for word in wordfreq.keys(): - allwords.setdefault(word,0) - allwords[word] += 1 - articlewords[cnt][word] = wordfreq[word] - cnt += 1 - - return allwords,articlewords,articletitles, articlecategories, allcategories - - def make_libsvm_input(self, allw, articlew): - ''' - Build the base libsvm input format for all the articles. - ''' - - wordmap = {} # Mapping word->AINewsDB's wordlist (id, log(N/df)) - - # Only take words that are common but not too common - # From allwords - N = len(articlew) - upper = N * 0.6 - lower = 3 - for w,c in allw.items(): - if c > lower and c < upper: - sql = "select rowid, dftext from wordlist where word = '%s'" % w - row = self.db.selectone(sql) - if row == None: - # print '\'',w, "\' not found" - continue - wordmap[w] = (row[0], (self.logN - math.log(row[1]+1, 2))) - - - # Create the libsvm input - # TFIDF the value (Added by Liang Dong) - l1 = [] - cnt = 0 - for f in articlew: - l1.append({}) - for word in f.keys(): - if word in wordmap.keys(): - l1[cnt][wordmap[word][0]] = math.log(f[word]+1,2)*wordmap[word][1] - cnt += 1 - - baseformats = [] - for item in l1: - text = "" - for wordid in sorted(item.keys()): - text += str(wordid)+":"+str(item[wordid])+" " - baseformats.append(text) - - return baseformats, wordmap - - ############################## - # - # Predict - # - ############################## - def init_predict(self, model_dir): - self.allwords_idf = {} - self.build_allwords_idf() - self.models = [] - - for category in self.categories: - file = os.path.join(model_dir, category+"_train.model") - print "Loading SVM model:", file - self.models.append(svm_model(file)) - - self.range = {} - rangefile = os.path.join(model_dir, "AIOverview_train.range") - self.__load_range(rangefile) - - def build_allwords_idf(self): - """ - Pre-calculate the idf value for all the words whose doc freq value - belongs to the certain range (lower_df, upper_df). - """ - sql = '''select rowid, dftext from wordlist - where dftext > %d and dftext < %d - ''' % (self.lower_df, self.upper_df) - rows = self.db.selectall(sql) - for row in rows: - idf = self.logN - math.log(row[1]+1, 2) - self.allwords_idf[row[0]] = idf - - def __load_range(self, filename): - """ - Read in the range file generated by svm-train tool which list the min - and max value of each feature. Since the min value is always 0, only - the max value is read and stored in a dictionary - self.range[wordid] = max_value of the feature - @param filename: the libSVM formatted input file - @type filename: C{string} - """ - lines = loadfile(filename) - for line in lines[2:]: - items = line[:-1].split(' ') - self.range[int(items[0])] = float(items[2]) - - def __retrieve_url_tfidf(self, urlid): - """ - Retrieve the tfidf of each word based on the urlid. - @param urlid: target news story's urlid. - @type urlid: C{int} - """ - sql = '''select t.wordid,t.freq from textwordurl as t, wordlist as w - where urlid = %d and t.wordid = w.rowid and dftext > %d - and dftext < %d''' % (urlid, self.lower_df, self.upper_df) - rows = self.db.selectall(sql) - data = {} - for row in rows: - if row[0] not in self.range.keys(): - continue - tfidf = (math.log(row[1]+1, 2)) * self.allwords_idf[row[0]] - data[row[0]] = tfidf / self.range[row[0]] - return data - - def predict(self, urlid): - data = self.__retrieve_url_tfidf(urlid) - max_prob = 0 - max_i = 0 - for (i, model) in enumerate(self.models): - prob = model.predict_probability(data) - print self.categories[i], prob - if prob[1][1] > max_prob: - max_i = i - max_prob = prob[1][1] - print urlid, self.categories[max_i], max_prob - -if __name__ == "__main__": - start = datetime.now() - - cat = AINewsSVMClassifier() - - VIEW_ALL_FILE, TRAIN, PREDICT = range(0,3) - - type = PREDICT - - if type == VIEW_ALL_FILE: - src_dir = "category/data" - dirs = sorted([f for f in os.listdir(src_dir) - if os.path.isdir(os.path.join(src_dir, f)) ]) - cnt = 0 - for dir in dirs: - files = sorted([f for f in os.listdir(os.path.join(src_dir,dir))]) - for (i,file) in enumerate(files): - print cnt, i, dir, file - cnt += 1 - - elif type == TRAIN: - src_dir = "category/newdata" - dest_dir = "category/newmodels" - cat.train(src_dir, dest_dir) - - elif type == PREDICT: - model_dir = "category/newmodels" - cat.init_predict(model_dir) - for urlid in range(650,675): - cat.predict(urlid) - - - print datetime.now() - start - - - - diff --git a/svm-easy.py b/svm-easy.py index 8cd455a..d19f3dd 100644 --- a/svm-easy.py +++ b/svm-easy.py @@ -15,21 +15,19 @@ train_pathname = sys.argv[1] assert os.path.exists(train_pathname),"training file not found" -file_name = os.path.split(train_pathname)[1] -scaled_file = file_name + ".scale" -model_file = file_name + ".model" -range_file = file_name + ".range" +scaled_file = train_pathname + ".scale" +model_file = train_pathname + ".model" +range_file = train_pathname + ".range" if len(sys.argv) > 2: test_pathname = sys.argv[2] - file_name = os.path.split(test_pathname)[1] assert os.path.exists(test_pathname),"testing file not found" - scaled_test_file = file_name + ".scale" - predict_test_file = file_name + ".predict" + scaled_test_file = test_pathname + ".scale" + predict_test_file = test_pathname + ".predict" cmd = '%s -l 0 -u 1 -s "%s" "%s" > "%s"' % (svmscale_exe, range_file, train_pathname, scaled_file) print('Scaling training data...') -Popen(cmd, shell = True, stdout = PIPE).communicate() +Popen(cmd, shell = True).wait() cmd = '%s -svmtrain "%s" "%s"' % (grid_py, svmtrain_exe, scaled_file) print('Cross validation...') @@ -47,17 +45,17 @@ cmd = '%s -b 1 -t 0 -c %s -g %s "%s" "%s"' % (svmtrain_exe,c,g,scaled_file,model_file) #cmd = '%s -b 1 -log2c -1,2,1 -log2g 1,1,1 -t 0 "%s" "%s"' % (svmtrain_exe,scaled_file,model_file) print('Training...') -Popen(cmd, shell = True, stdout = PIPE).communicate() +Popen(cmd, shell = True).wait() print('Output model: %s' % model_file) if len(sys.argv) > 2: cmd = '%s -r "%s" "%s" > "%s"' % (svmscale_exe, range_file, test_pathname, scaled_test_file) print('Scaling testing data...') - Popen(cmd, shell = True, stdout = PIPE).communicate() + Popen(cmd, shell = True).wait() cmd = '%s "%s" "%s" "%s"' % (svmpredict_exe, scaled_test_file, model_file, predict_test_file) print('Testing...') - Popen(cmd, shell = True).communicate() + Popen(cmd, shell = True).wait() print('Output prediction: %s' % predict_test_file) diff --git a/svm-grid.py b/svm-grid.py index 1636161..9f726d2 100755 --- a/svm-grid.py +++ b/svm-grid.py @@ -47,9 +47,8 @@ def process_options(argv=sys.argv): sys.exit(1) dataset_pathname = argv[-1] - dataset_title = os.path.split(dataset_pathname)[1] - out_filename = '%s.out' % dataset_title - png_filename = '%s.png' % dataset_title + out_filename = '%s.out' % dataset_pathname + png_filename = '%s.png' % dataset_pathname pass_through_options = [] i = 1