# This file is part of NewsFinder.
# https://github.com/joshuaeckroth/AINews
#
# Copyright (c) 2011 by the Association for the Advancement of
# Artificial Intelligence. This program and parts of it may be used and
# distributed without charge for non-commercial purposes as long as this
# notice is included.
import re
import pickle
import arff
import csv
from nltk.probability import FreqDist
from os import listdir, remove
from subprocess import *
from AINewsCorpus import AINewsCorpus
from AINewsConfig import config, paths
from AINewsTextProcessor import AINewsTextProcessor
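
# AINewsWekaClassifier builds per-topic bag-of-words ARFF files, trains one
# Weka model per topic id (tid), and applies those models to new articles.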
class AINewsWekaClassifier:
    def __init__(self):
        self.txtpro = AINewsTextProcessor()
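
    # tid is the topic/category id of a training arff in
    # paths['weka.training_arff_dir']; fieldidx selects the arff field to
    # collect words from (0 = title, 1 = body).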
    def __save_bag_of_words(self, tid, fieldidx):
        # find all unique words in the selected arff field (title or body),
        # remove stop words, perform stemming, and collect their frequencies
        phrases = []
        f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            phrases.append(record[fieldidx])
        bag = self.txtpro.simpletextprocess(0, ' '.join(phrases))
        smallerbag = FreqDist()
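        # keep only the 1000 most frequent words; this relies on FreqDist
        # iterating over samples in decreasing-frequency order (true for the
        # NLTK releases this code was written against)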
        i = 0
        for word in bag:
            if i == 1000:
                break
            smallerbag[word] = bag[word]
            i += 1

        p = open("%sbag_of_words-%d.pickle" % (paths['weka.bag_of_words_dir'], fieldidx), 'w')
        pickle.dump(smallerbag, p)
        p.close()
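
    # Build a word-vector arff for the given tid: one NUMERIC attribute per
    # bag-of-words term (title-* and body-*) plus the nominal 'class'
    # attribute, then convert the result to sparse format with Weka's
    # NonSparseToSparse filter.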
    def __prepare_arff(self, tid):
        p = open("%sbag_of_words-0.pickle" % paths['weka.bag_of_words_dir'], 'r')
        bag_title = pickle.load(p)
        p.close()
        p = open("%sbag_of_words-1.pickle" % paths['weka.bag_of_words_dir'], 'r')
        bag_body = pickle.load(p)
        p.close()

        data = {'attributes': [], 'data': [], 'description': u'', 'relation': tid}
        for word in bag_title:
            data['attributes'].append(("title-%s" % word, 'NUMERIC'))
        for word in bag_body:
            data['attributes'].append(("body-%s" % word, 'NUMERIC'))
        data['attributes'].append(('class', ['yes', 'no']))

        f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
        for record in f['data']:
            record_bag_title = self.txtpro.simpletextprocess(0, record[0])
            record_bag_body = self.txtpro.simpletextprocess(0, record[1])
            record_data = []
            # iterate through original bag, figure out freq in this record's bag
            for word in bag_title:
                if word in record_bag_title:
                    record_data.append(record_bag_title[word])
                else:
                    record_data.append(0)
            for word in bag_body:
                if word in record_bag_body:
                    record_data.append(record_bag_body[word])
                else:
                    record_data.append(0)
            record_data.append(record[2])
            data['data'].append(record_data)

        fnew = open("%s%d-wordvec-nonsparse.arff" % \
                        (paths['weka.training_arff_dir'], tid), 'w')
        arff.dump(fnew, data)
        fnew.close()

        # convert to sparse format
        Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
               "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \
                  (paths['weka.weka_jar'],
                   paths['weka.training_arff_dir'], tid,
                   paths['weka.training_arff_dir'], tid),
              shell = True).communicate()
        remove("%s%d-wordvec-nonsparse.arff" % (paths['weka.training_arff_dir'], tid))

    # 1. load unprocessed arff files, from just one tid, from family_resemblance export
    # 2. gather all titles, parse into a bag of words
    # 3. save bag of words (list? need to keep the order) in a pickle file
    # 4. write new sparse arff files for each tid using this sorted bag of words
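
    # Topic ids are read from the <tid>.arff filenames in the training
    # directory; 0.arff is reserved for the testing data written by predict().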
    def __get_tids(self):
        tids = []
        files = listdir(paths['weka.training_arff_dir'])
        for f in files:
            m = re.match(r'^(\d+)\.arff$', f)
            if m:
                if m.group(1) == '0': continue
                tids.append(int(m.group(1)))
        return tids
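
    # Train one model per topic: build the shared bag of words from the first
    # tid's arff, write sparse word-vector arffs for every tid, balance the
    # classes with Weka's SpreadSubsample filter, then train the classifier
    # named by config['weka.classifier'].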
    def train(self):
        tids = self.__get_tids()

        # all tid arffs have same entries, so use the first to grab the bag of words
        print "Saving bag of words..."
        self.__save_bag_of_words(tids[0], 0)
        self.__save_bag_of_words(tids[0], 1)

        for tid in sorted(tids):
            print "Preparing tid %d" % tid
            self.__prepare_arff(tid)

        for tid in sorted(tids):
            print "Spread subsampling for tid %d" % tid
            Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
                   "-M 1.0 -X 0.0 -S 1 -c last " +
                   "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
                      (paths['weka.weka_jar'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True).communicate()

            print "Training random forests for tid %d" % tid
            (out, _) = Popen(("java -cp %s %s %s -v " +
                              "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                                 (paths['weka.weka_jar'],
                                  config['weka.classifier'],
                                  config['weka.classifier_params'],
                                  paths['weka.training_arff_dir'], tid,
                                  paths['weka.training_arff_dir'], tid),
                             shell = True, stdout = PIPE).communicate()
            print out
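
    # Variant of train() that compares several Weka classifiers per topic and
    # records the percentage of correctly classified instances (parsed from
    # Weka's -v summary output) in training_experiment.csv.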
    def train_experiment(self):
        model_scores = {}
        models = {'random-forest': ('weka.classifiers.trees.RandomForest', '-I 20 -K 0'),
                  'naive-bayes': ('weka.classifiers.bayes.NaiveBayes', ''),
                  'bayesnet': ('weka.classifiers.bayes.BayesNet', ''),
                  'j48': ('weka.classifiers.trees.J48', ''),
                  'knn': ('weka.classifiers.lazy.IBk', '-K 3')}

        tids = self.__get_tids()

        # all tid arffs have same entries, so use the first to grab the bag of words
        print "Saving bag of words..."
        self.__save_bag_of_words(tids[0], 0)
        self.__save_bag_of_words(tids[0], 1)

        for tid in sorted(tids):
            print "Preparing tid %d" % tid
            self.__prepare_arff(tid)

        for tid in sorted(tids):
            print "Spread subsampling for tid %d" % tid
            Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
                   "-M 1.0 -X 0.0 -S 1 -c last " +
                   "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
                      (paths['weka.weka_jar'],
                       paths['weka.training_arff_dir'], tid,
                       paths['weka.training_arff_dir'], tid),
                  shell = True).communicate()

        for tid in sorted(tids):
            model_scores[tid] = {}
            for model in models.keys():
                print "Training %s for tid %d" % (models[model][0], tid)
                (out, _) = Popen(("java -cp %s %s %s -v " +
                                  "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
                                     (paths['weka.weka_jar'],
                                      models[model][0], models[model][1],
                                      paths['weka.training_arff_dir'], tid,
                                      paths['weka.training_arff_dir'], tid),
                                 shell = True, stdout = PIPE).communicate()
                correct = 0.0
                for line in out.splitlines():
                    m = re.search(r'Correctly Classified Instances\s+\d+\s+(.*) %', line)
                    if m:
                        correct = float(m.group(1))
                        break
                model_scores[tid][model] = correct

        with open('training_experiment.csv', 'w') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['model', 'tid', 'correct'])
            for tid in model_scores.keys():
                for model in model_scores[tid].keys():
                    writer.writerow([model, tid, model_scores[tid][model]])
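
    # Run every per-topic model against the shared testing file (0.arff) and
    # collect an (answer, confidence) pair per article; predictions with
    # confidence below 0.75 are forced to 'no'.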
    def __predict_arff(self):
        tids = self.__get_tids()

        # the testing file should always be 0.arff
        self.__prepare_arff(0)

        predictions = {}
        for tid in sorted(tids):
            predictions[tid] = []
            print "Predicting tid %d" % tid
            (out, err) = Popen(("java -cp %s %s " +
                                "-T %s0-wordvec.arff -l %s%d.model -p last") % \
                                   (paths['weka.weka_jar'],
                                    config['weka.classifier'],
                                    paths['weka.training_arff_dir'],
                                    paths['weka.training_arff_dir'], tid),
                               shell = True, stdout = PIPE).communicate()
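            # Weka's "-p last" output prints one line per test instance
            # (inst#, actual, predicted, error, prediction confidence); the
            # actual column is always "2:no" because predict() writes every
            # test instance with class 'no'.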
            for line in out.splitlines():
                m = re.search(r'2:no\s+[12]:(no|yes)\s+\+?\s+(\d+\.?\d*)', line)
                if m:
                    answer = False
                    if m.group(1) == 'yes':
                        answer = True
                    conf = float(m.group(2))
                    if conf < 0.75:
                        answer = False
                    predictions[tid].append((answer, conf))
        return predictions
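
    # articles is a dict keyed by urlid with at least 'title' and 'summary'
    # entries; this method adds a 'categories' list to each article containing
    # the topic ids whose model predicted 'yes'.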
    def predict(self, articles):
        # modifies the provided articles dict
        data = {'attributes': [('title', 'STRING'),
                               ('body', 'STRING'),
                               ('class', ['yes', 'no'])],
                'data': [], 'description': u'', 'relation': '0'}
        for urlid in sorted(articles.keys()):
            title = re.sub(r'\W', ' ', articles[urlid]['title'])
            body = re.sub(r'\W', ' ', articles[urlid]['summary'])
            data['data'].append([title, body, 'no'])

        # make the testing file 0.arff
        fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w')
        arff.dump(fnew, data)
        fnew.close()

        predictions = self.__predict_arff()

        for urlid in sorted(articles.keys()):
            articles[urlid]['categories'] = []

        tids = self.__get_tids()
        for tid in sorted(tids):
            for (i, urlid) in enumerate(sorted(articles.keys())):
                if predictions[tid][i][0]:
                    articles[urlid]['categories'].append(str(tid))
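
# A minimal usage sketch (illustrative only; the urlid and article text are
# hypothetical, and trained <tid>.model files are assumed to exist in
# paths['weka.training_arff_dir']):
#
#   classifier = AINewsWekaClassifier()
#   articles = {1001: {'title': 'Robots learn to walk',
#                      'summary': 'Researchers describe a learning method...'}}
#   classifier.predict(articles)
#   print articles[1001]['categories']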