Path fixes, SVM bug fixed (w.r.t. labels), other cleanups.

1 parent 65abae8 · commit ad5322eb1ee6d77780167445dbe50619b78ac1b0 · @joshuaeckroth committed Jun 14, 2011
Showing with 29 additions and 312 deletions.
  1. +3 −3 AINewsPublisher.py
  2. +15 −11 AINewsSVM.py
  3. +0 −284 AINewsSVMClassifier.py
  4. +9 −11 svm-easy.py
  5. +2 −3 svm-grid.py
6 AINewsPublisher.py
@@ -120,8 +120,8 @@ def publish_email(self):
"""
Call AINewsEmail.php to send email through PHP Mail Server
"""
- cmd = 'php AINewsEmail.php'
- Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate()
+ #cmd = 'php AINewsEmail.php'
+ #Popen(cmd, shell = True, stdout = PIPE, stderr = STDOUT).communicate()
self.publish_email_semiauto()
def publish_email_semiauto(self):
@@ -153,7 +153,7 @@ def publish_pmwiki(self):
Call AINewsPmwiki.php to publish latest news to AAAI Pmwiki website.
"""
cmd = 'php AINewsPmwiki.php'
- Popen(cmd, shell = True, stdout = PIPE).communicate()
+ Popen(cmd, shell = True).wait()
def update_rss(self):
rssitems = []
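A note on the recurring Popen change in this commit: the old pattern, Popen(cmd, shell=True, stdout=PIPE).communicate(), captured the PHP script's output into a Python string and then threw it away, so the scripts ran silently. The new Popen(cmd, shell=True).wait() lets output reach the console and returns the child's exit status. A minimal sketch of the two patterns (the status check is an illustrative addition, not part of the commit):

    from subprocess import Popen, PIPE

    cmd = 'php AINewsPmwiki.php'

    # Old: stdout is captured and discarded; the script appears silent.
    out, _ = Popen(cmd, shell=True, stdout=PIPE).communicate()

    # New: output flows to the console; wait() returns the exit status.
    status = Popen(cmd, shell=True).wait()
    if status != 0:
        print('AINewsPmwiki.php exited with status %d' % status)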
26 AINewsSVM.py
@@ -53,7 +53,6 @@ def collect_feedback(self):
rater_count_cutoff = int(config['feedback.rater_count_cutoff'])
stdev_cutoff = float(config['feedback.stdev_cutoff'])
output = ""
- #admins = ("Bgbuchanan's", "Rgsmith's", "Ldong's")
admins = [name + '\'s' for name in config['svm.admins'].split(':')]
for infile in glob.glob( path.join(feedback_path, '*.rating') ):
urlid = int(infile.split('/')[-1][:-7])
@@ -73,7 +72,7 @@ def collect_feedback(self):
self.db.execute(sql)
if False:
- # Deprecated. Dr.Buchanan wants only his and Dr.Reids' rating
+ # Deprecated. Dr.Buchanan wants only admin ratings
# be used in re-training
if n <= rater_count_cutoff: continue
rates = []
@@ -84,7 +83,7 @@ def collect_feedback(self):
if sd > stdev_cutoff: continue
newsscore = mean
else:
- # Only use Dr.Buchanan and Dr.Reids' rating for re-training
+ # Only use admin ratings for re-training
admincount = 0
adminsum = 0
for line in lines:
@@ -186,9 +185,9 @@ def train(self, filename, pos_range):
@type pos_range: C{tuple}
"""
# Generate the specific input format file
- self.__generate_libsvm_input(pos_range, filename)
+ self.__generate_libsvm_input(pos_range, paths['svm.svm_data'] + filename)
# Using the input file to train SVM
- self.__libsvm_train(filename)
+ self.__libsvm_train(paths['svm.svm_data'] + filename)
def train_all(self):
"""
@@ -223,7 +222,7 @@ def __generate_libsvm_input(self, pos, filename):
for wordid in sorted(self.allnews[urlid].keys()):
line += ' '+str(wordid)+':'+str(self.allnews[urlid][wordid])
content += line + '\n'
- savefile('svm/'+filename, content)
+ savefile(filename, content)
def __libsvm_train(self,filename):
"""
@@ -234,7 +233,7 @@ def __libsvm_train(self,filename):
@type filename: C{string}
"""
cmd = 'python svm-easy.py "%s"' % filename
- Popen(cmd, shell = True, stdout = PIPE).communicate()
+ Popen(cmd, shell = True).wait()
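The paths['svm.svm_data'] + filename concatenation used in train() and __libsvm_train() assumes the configured directory string ends with a path separator. A more defensive variant (a hypothetical helper, not part of this commit) would join the pieces explicitly:

    import os

    def svm_data_path(paths, filename):
        # os.path.join works whether or not paths['svm.svm_data'] ends
        # with a separator; plain string concatenation does not.
        return os.path.join(paths['svm.svm_data'], filename)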
@@ -290,14 +289,18 @@ def predict_probability(self, filename, urlids):
"""
svm_path = paths['svm.svm_data']
mysvm = svm_load_model(svm_path + filename + ".model")
+ # figure out which label is +1
+ labels = mysvm.get_labels()
+ if labels[0] == 1: positive_idx = 0
+ else: positive_idx = 1
self.__load_range(svm_path + filename + ".range")
results = []
for urlid in urlids:
data = self.__retrieve_url_tfidf(urlid)
p = svm_predict([0], [data], mysvm, "-b 1")
# p = ([1.0], _, [[0.62317989329642587 0.3768201067035743]])
- # where the first prob is for -1, the second for 1
- results.append(p[2][0][1])
+ # where the first prob is labels[0] and second is labels[1]
+ results.append(p[2][0][positive_idx])
return results
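This hunk is the label bug named in the commit message: with '-b 1', libsvm orders the probability columns by model.get_labels(), which reflects the order in which labels first appeared in the training file, not a fixed [-1, +1] convention, so always reading column 1 could return the probability of the wrong class. A standalone sketch of the safe pattern (the model path and feature vector are hypothetical):

    from svmutil import svm_load_model, svm_predict  # libsvm Python bindings

    model = svm_load_model('svm_data/Related.model')  # hypothetical path
    labels = model.get_labels()                       # e.g. [1, -1] or [-1, 1]
    positive_idx = labels.index(1)                    # column for the +1 class

    features = {3: 0.12, 17: 0.87}                    # hypothetical tfidf vector
    p_labels, p_acc, p_vals = svm_predict([0], [features], model, '-b 1')
    prob_positive = p_vals[0][positive_idx]           # P(class == +1)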
@@ -379,8 +382,9 @@ def train_isrelated(self):
savefile(paths['svm.svm_data'] + 'IsRelated', content)
# use libsvm command tool to train
- cmd = 'python svm-easy.py IsRelated'
- Popen(cmd, shell = True, stdout = PIPE).communicate()
+ print "Training 'is related'..."
+ cmd = 'python svm-easy.py "' + paths['svm.svm_data'] + 'IsRelated"'
+ Popen(cmd, shell = True).wait()
def get_related(self, urlid):
sql = "select topic from urllist where rowid = %d" % urlid
284 AINewsSVMClassifier.py
@@ -1,284 +0,0 @@
-"""
-AINewsSVMClassifier aims to use SVM to train and predict 19 AI news categories.
-
-The classification task was performed by AINewsTopic.py, but it is a rather
-simply method.
-
-I crawled 1281 documents using LuceneCategoryCrawler.py from 19 categories from
-AITopic (http://www.aaai.org/AITopics/pmwiki/pmwiki.php/AITopics/HomePage)
-
-Date: Dec.19th, 2010
-Author: Liang Dong
-"""
-
-import os
-import math
-from svm import *
-from subprocess import *
-import time
-from datetime import datetime
-
-from AINewsDB import AINewsDB
-from AINewsTextProcessor import AINewsTextProcessor
-from AINewsTools import loadfile2, savefile, savepickle, loadpickle, loadfile
-from AINewsConfig import config
-
-
-class AINewsSVMClassifier:
- def __init__(self):
- self.txtpro = AINewsTextProcessor()
- self.db = AINewsDB()
- total_doc = self.db.get_totaldoc()
- self.logN = math.log(total_doc+1,2)
- self.upper_df = total_doc * float(config['svm.docfreq_upper_ratio'])
- self.lower_df = total_doc * float(config['svm.docfreq_lower_ratio'])
- #self.categories = loadpickle("category/all_categories.pkl")
- self.categories =["AIOverview","Agents", "Applications", \
- "CognitiveScience","Education","Ethics", "Games", "History",\
- "Interfaces","MachineLearning","NaturalLanguage","Philosophy",\
- "Reasoning","Representation", "Robots","ScienceFiction",\
- "Speech", "Systems","Vision"]
- ##############################
- #
- # Train
- #
- ##############################
- def train(self, src_dir, dest_dir):
-
- print "(1) Extracting words from training data"
- allw, artw, artt, artcat, allcat = self.getarticlewords(src_dir)
- #self.categories = sorted(list(allcat))
- #savepickle("category/all_categories.pkl", self.categories)
-
- print "(2) Computing TFIDF and base libsvm format"
- formats, wordmap = self.make_libsvm_input(allw, artw)
-
- print len(formats), len(artcat), len(allcat)
-
- print "(3) Building LibSVM training input format"
- for category in self.categories:
- content = ""
- for (i,artcategory) in enumerate(artcat):
- if category == artcategory:
- content += "+1 "+formats[i]+'\n'
- else:
- content += "-1 "+formats[i]+'\n'
- target_file = os.path.join(dest_dir, category+"_train")
- savefile(target_file, content)
-
-
- print "(4) Training 1-against-rest classifier for each category"
-
- for category in self.categories:
- print "\tTraining ", category
- start = datetime.now()
- filename = os.path.join(paths['ainews.category_data'], dest_dir, category+"_train")
- cmd = 'python svm-easy.py "%s" ' % filename
- Popen(cmd, shell = True, stdout = PIPE).communicate()
- end = datetime.now()
- print "\tTime spent:", end - start
-
- print "(5) Done"
-
-
- def train_data_generator(self, src_dir):
- '''
- Python Generator to browse all the training files under given directory.
- '''
- dirs = sorted([f for f in os.listdir(src_dir)
- if os.path.isdir(os.path.join(src_dir, f)) ])
- for dir in dirs:
- files = sorted([f for f in os.listdir(os.path.join(src_dir,dir))])
- for file in files:
- yield (dir, file)
-
- def getarticlewords(self, src_dir):
- '''
- Process all the words from the training corpus. Codes are referred from
- book 'collective intelligence' chapter 10.
- '''
- allwords={}
- articlewords=[]
- articletitles=[]
- articlecategories = []
- allcategories = set()
- cnt = 0
-
- train_data = self.train_data_generator(src_dir)
- for file_data in train_data:
- file = os.path.join(src_dir, file_data[0], file_data[1])
- title = file_data[1].encode('utf8')
-
- # Extract the words
- content = loadfile2(file)
- text = title + ' ' + content
- wordfreq = self.txtpro.simpletextprocess(text)
- articlewords.append({})
- articletitles.append(title)
- articlecategories.append(file_data[0])
- allcategories.add(file_data[0])
-
- # Increase the counts for this word in allwords
- for word in wordfreq.keys():
- allwords.setdefault(word,0)
- allwords[word] += 1
- articlewords[cnt][word] = wordfreq[word]
- cnt += 1
-
- return allwords,articlewords,articletitles, articlecategories, allcategories
-
- def make_libsvm_input(self, allw, articlew):
- '''
- Build the base libsvm input format for all the articles.
- '''
-
- wordmap = {} # Mapping word->AINewsDB's wordlist (id, log(N/df))
-
- # Only take words that are common but not too common
- # From allwords
- N = len(articlew)
- upper = N * 0.6
- lower = 3
- for w,c in allw.items():
- if c > lower and c < upper:
- sql = "select rowid, dftext from wordlist where word = '%s'" % w
- row = self.db.selectone(sql)
- if row == None:
- # print '\'',w, "\' not found"
- continue
- wordmap[w] = (row[0], (self.logN - math.log(row[1]+1, 2)))
-
-
- # Create the libsvm input
- # TFIDF the value (Added by Liang Dong)
- l1 = []
- cnt = 0
- for f in articlew:
- l1.append({})
- for word in f.keys():
- if word in wordmap.keys():
- l1[cnt][wordmap[word][0]] = math.log(f[word]+1,2)*wordmap[word][1]
- cnt += 1
-
- baseformats = []
- for item in l1:
- text = ""
- for wordid in sorted(item.keys()):
- text += str(wordid)+":"+str(item[wordid])+" "
- baseformats.append(text)
-
- return baseformats, wordmap
-
- ##############################
- #
- # Predict
- #
- ##############################
- def init_predict(self, model_dir):
- self.allwords_idf = {}
- self.build_allwords_idf()
- self.models = []
-
- for category in self.categories:
- file = os.path.join(model_dir, category+"_train.model")
- print "Loading SVM model:", file
- self.models.append(svm_model(file))
-
- self.range = {}
- rangefile = os.path.join(model_dir, "AIOverview_train.range")
- self.__load_range(rangefile)
-
- def build_allwords_idf(self):
- """
- Pre-calculate the idf value for all the words whose doc freq value
- belongs to the certain range (lower_df, upper_df).
- """
- sql = '''select rowid, dftext from wordlist
- where dftext > %d and dftext < %d
- ''' % (self.lower_df, self.upper_df)
- rows = self.db.selectall(sql)
- for row in rows:
- idf = self.logN - math.log(row[1]+1, 2)
- self.allwords_idf[row[0]] = idf
-
- def __load_range(self, filename):
- """
- Read in the range file generated by svm-train tool which list the min
- and max value of each feature. Since the min value is always 0, only
- the max value is read and stored in a dictionary
- self.range[wordid] = max_value of the feature
- @param filename: the libSVM formatted input file
- @type filename: C{string}
- """
- lines = loadfile(filename)
- for line in lines[2:]:
- items = line[:-1].split(' ')
- self.range[int(items[0])] = float(items[2])
-
- def __retrieve_url_tfidf(self, urlid):
- """
- Retrieve the tfidf of each word based on the urlid.
- @param urlid: target news story's urlid.
- @type urlid: C{int}
- """
- sql = '''select t.wordid,t.freq from textwordurl as t, wordlist as w
- where urlid = %d and t.wordid = w.rowid and dftext > %d
- and dftext < %d''' % (urlid, self.lower_df, self.upper_df)
- rows = self.db.selectall(sql)
- data = {}
- for row in rows:
- if row[0] not in self.range.keys():
- continue
- tfidf = (math.log(row[1]+1, 2)) * self.allwords_idf[row[0]]
- data[row[0]] = tfidf / self.range[row[0]]
- return data
-
- def predict(self, urlid):
- data = self.__retrieve_url_tfidf(urlid)
- max_prob = 0
- max_i = 0
- for (i, model) in enumerate(self.models):
- prob = model.predict_probability(data)
- print self.categories[i], prob
- if prob[1][1] > max_prob:
- max_i = i
- max_prob = prob[1][1]
- print urlid, self.categories[max_i], max_prob
-
-if __name__ == "__main__":
- start = datetime.now()
-
- cat = AINewsSVMClassifier()
-
- VIEW_ALL_FILE, TRAIN, PREDICT = range(0,3)
-
- type = PREDICT
-
- if type == VIEW_ALL_FILE:
- src_dir = "category/data"
- dirs = sorted([f for f in os.listdir(src_dir)
- if os.path.isdir(os.path.join(src_dir, f)) ])
- cnt = 0
- for dir in dirs:
- files = sorted([f for f in os.listdir(os.path.join(src_dir,dir))])
- for (i,file) in enumerate(files):
- print cnt, i, dir, file
- cnt += 1
-
- elif type == TRAIN:
- src_dir = "category/newdata"
- dest_dir = "category/newmodels"
- cat.train(src_dir, dest_dir)
-
- elif type == PREDICT:
- model_dir = "category/newmodels"
- cat.init_predict(model_dir)
- for urlid in range(650,675):
- cat.predict(urlid)
-
-
- print datetime.now() - start
-
-
-
-
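For context on the deleted file: AINewsSVMClassifier trained one one-vs-rest SVM per category, writing a libsvm training file that labels an article +1 when it belongs to the category and -1 otherwise (step 3 above). A condensed sketch of that labeling scheme:

    # Condensed sketch of the deleted one-vs-rest labeling step: one libsvm
    # training file per category, +1 for members, -1 for everything else.
    def build_one_vs_rest(categories, formats, article_cats):
        files = {}
        for category in categories:
            lines = ['%s %s' % ('+1' if cat == category else '-1', fmt)
                     for fmt, cat in zip(formats, article_cats)]
            files[category] = '\n'.join(lines) + '\n'
        return files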
20 svm-easy.py
@@ -15,21 +15,19 @@
train_pathname = sys.argv[1]
assert os.path.exists(train_pathname),"training file not found"
-file_name = os.path.split(train_pathname)[1]
-scaled_file = file_name + ".scale"
-model_file = file_name + ".model"
-range_file = file_name + ".range"
+scaled_file = train_pathname + ".scale"
+model_file = train_pathname + ".model"
+range_file = train_pathname + ".range"
if len(sys.argv) > 2:
test_pathname = sys.argv[2]
- file_name = os.path.split(test_pathname)[1]
assert os.path.exists(test_pathname),"testing file not found"
- scaled_test_file = file_name + ".scale"
- predict_test_file = file_name + ".predict"
+ scaled_test_file = test_pathname + ".scale"
+ predict_test_file = test_pathname + ".predict"
cmd = '%s -l 0 -u 1 -s "%s" "%s" > "%s"' % (svmscale_exe, range_file, train_pathname, scaled_file)
print('Scaling training data...')
-Popen(cmd, shell = True, stdout = PIPE).communicate()
+Popen(cmd, shell = True).wait()
cmd = '%s -svmtrain "%s" "%s"' % (grid_py, svmtrain_exe, scaled_file)
print('Cross validation...')
@@ -47,17 +45,17 @@
cmd = '%s -b 1 -t 0 -c %s -g %s "%s" "%s"' % (svmtrain_exe,c,g,scaled_file,model_file)
#cmd = '%s -b 1 -log2c -1,2,1 -log2g 1,1,1 -t 0 "%s" "%s"' % (svmtrain_exe,scaled_file,model_file)
print('Training...')
-Popen(cmd, shell = True, stdout = PIPE).communicate()
+Popen(cmd, shell = True).wait()
print('Output model: %s' % model_file)
if len(sys.argv) > 2:
cmd = '%s -r "%s" "%s" > "%s"' % (svmscale_exe, range_file, test_pathname, scaled_test_file)
print('Scaling testing data...')
- Popen(cmd, shell = True, stdout = PIPE).communicate()
+ Popen(cmd, shell = True).wait()
cmd = '%s "%s" "%s" "%s"' % (svmpredict_exe, scaled_test_file, model_file, predict_test_file)
print('Testing...')
- Popen(cmd, shell = True).communicate()
+ Popen(cmd, shell = True).wait()
print('Output prediction: %s' % predict_test_file)
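The net effect of the svm-easy.py change: os.path.split(train_pathname)[1] kept only the basename, so the derived .scale/.model/.range files landed in the current working directory; keeping the full pathname writes them next to the input. A small illustration (the example path is hypothetical):

    import os

    train_pathname = '/data/svm_data/IsRelated'  # hypothetical input path

    # Before: basename only, so outputs were written to the cwd.
    old_model = os.path.split(train_pathname)[1] + '.model'  # 'IsRelated.model'

    # After: the full path is kept, so outputs sit beside the input.
    new_model = train_pathname + '.model'  # '/data/svm_data/IsRelated.model'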
5 svm-grid.py
@@ -47,9 +47,8 @@ def process_options(argv=sys.argv):
sys.exit(1)
dataset_pathname = argv[-1]
- dataset_title = os.path.split(dataset_pathname)[1]
- out_filename = '%s.out' % dataset_title
- png_filename = '%s.png' % dataset_title
+ out_filename = '%s.out' % dataset_pathname
+ png_filename = '%s.png' % dataset_pathname
pass_through_options = []
i = 1
