
Training and classifying with Weka seems to work.

1 parent a8fa28f commit 5c9434c2b86d2c1c93037ad6818d9d1e248e60db @joshuaeckroth committed Feb 10, 2013
Showing with 254 additions and 228 deletions.
  1. +12 −2 AINews.py
  2. +0 −1 AINewsCorpus.py
  3. +15 −1 AINewsCrawler.py
  4. +10 −88 AINewsPublisher.py
  5. +5 −3 AINewsTextProcessor.py
  6. +0 −132 AINewsTools.py
  7. +198 −0 AINewsWekaClassifier.py
  8. +6 −1 README.md
  9. +8 −0 config/paths.ini.sample
14 AINews.py
@@ -14,6 +14,7 @@
from AINewsConfig import config, paths
from AINewsCrawler import AINewsCrawler
from AINewsPublisher import AINewsPublisher
+from AINewsWekaClassifier import AINewsWekaClassifier
def usage():
"""
@@ -31,11 +32,13 @@ def usage():
(3) email:
Generate an email form for submitting the weekly alert.
+
+ (4) train:
+ Update the classifier models.
"""
print usage
-
def crawl(opts):
crawler = AINewsCrawler()
crawler.fetch_all_sources(opts)
@@ -50,11 +53,15 @@ def email():
publisher = AINewsPublisher()
publisher.publish_email_semiauto()
+def train():
+ weka = AINewsWekaClassifier()
+ weka.train()
+
def main():
# Set en_US, UTF8
locale.setlocale(locale.LC_ALL,'en_US.UTF-8')
- commands_list = ("crawl", "prepare", "email", "help")
+ commands_list = ("crawl", "prepare", "email", "train", "help")
try:
if len(sys.argv) < 2 or sys.argv[1] not in commands_list:
usage()
@@ -77,5 +84,8 @@ def main():
elif command == "email":
email()
+ elif command == "train":
+ train()
+
if __name__ == "__main__":
main()
1 AINewsCorpus.py
@@ -15,7 +15,6 @@
from AINewsConfig import config, paths
from AINewsDB import AINewsDB
from AINewsTextProcessor import AINewsTextProcessor
-from AINewsTools import loadpickle, trunc, convert_to_printable
class AINewsCorpus:
"""
16 AINewsCrawler.py
@@ -53,7 +53,13 @@ def get_sources(self, opts):
def fetch_all_sources(self, opts):
for source in self.get_sources(opts):
print "CRAWL: Crawling \"%s\"..." % source['title']
- f = feedparser.parse(source['link'])
+ try:
+ f = feedparser.parse(source['link'])
+ except Exception, e:
+ print "Exception while parsing feed: %s" % (source['link'],)
+ print e
+ continue
+
for entry in f.entries:
d = None
try:
@@ -74,6 +80,8 @@ def fetch_all_sources(self, opts):
if d > self.today or d < self.earliest_date: continue
if entry.title[-6:] == '(blog)' \
or entry.title[-15:] == '(press release)':
+ print "Blog or press release in title. (%s) (%s)" % \
+ (entry.link, entry.title)
continue
try:
url = urllib2.urlopen(entry.link).geturl()
@@ -84,12 +92,18 @@ def fetch_all_sources(self, opts):
# attempt to skip blogs
if re.match('^.*blog.*$', url):
+ print "'blog' in url (%s) (%s)" % \
+ (entry.link, entry.title)
continue
# attempt to skip job postings
if re.match('^.*job.*$', url):
+ print "'job' in url (%s) (%s)" % \
+ (entry.link, entry.title)
continue
# skip urls we have already crawled
if self.db.crawled(url):
+ print "Seen this url before (%s) (%s)" % \
+ (entry.link, entry.title)
continue
title = cgi.escape(convert_to_printable(entry.title)).strip()
98 AINewsPublisher.py
@@ -17,13 +17,14 @@
from random import shuffle
from subprocess import *
from datetime import date, datetime, timedelta
-from AINewsTools import savefile, convert_to_printable
-from AINewsConfig import config, paths, aitopic_urls, blacklist_urls
+from AINewsTools import savefile
+from AINewsConfig import config, paths, blacklist_urls
from AINewsDB import AINewsDB
from AINewsCorpus import AINewsCorpus
from AINewsDuplicates import AINewsDuplicates
from AINewsTextProcessor import AINewsTextProcessor
from AINewsSummarizer import AINewsSummarizer
+from AINewsWekaClassifier import AINewsWekaClassifier
sys.path.append(paths['templates.compiled'])
from FeedImport import FeedImport
@@ -37,19 +38,12 @@ def __init__(self):
self.db = AINewsDB()
self.corpus = AINewsCorpus()
self.duplicates = AINewsDuplicates()
- self.svm_classifier = AINewsSVMClassifier()
self.txtpro = AINewsTextProcessor()
+ self.weka = AINewsWekaClassifier()
self.articles = {}
self.semiauto_email_output = ""
- self.topicids = {"AIOverview":0, "Agents":1, "Applications":2,
- "CognitiveScience":3, "Education":4,"Ethics":5,
- "Games":6, "History":7, "Interfaces":8, "MachineLearning":9,
- "NaturalLanguage":10, "Philosophy":11, "Reasoning":12,
- "Representation":13, "Robots":14, "ScienceFiction":15,"Speech":16,
- "Systems":17, "Vision":18}
-
def filter_and_process(self):
self.articles = self.corpus.get_unprocessed()
@@ -102,20 +96,11 @@ def filter_and_process(self):
self.articles[urlid]['transcript'].append(
'Rejected due to only one or no whitelisted terms')
- # update categories based on SVM classifier predictions
+ # update categories based on classifier predictions
print "Classifying..."
- self.svm_classifier.predict(self.articles)
-
- # drop articles classified as 'NotRelated' unless the article
- # is user-submitted
- for urlid in self.articles:
- if 'NotRelated' in self.articles[urlid]['categories'] \
- and self.articles[urlid]['source'] != 'User Submitted':
- self.articles[urlid]['publish'] = False
- self.articles[urlid]['transcript'].append(
- 'Rejected due to NotRelated classification')
+        self.weka.predict(self.articles)
- # drop articles with no categories (even if user-submitted)
+ # drop articles with no categories
print "Dropping articles with no categories..."
for urlid in self.articles:
if len(self.articles[urlid]['categories']) == 0:
@@ -147,44 +132,6 @@ def filter_and_process(self):
print "Marking as processed."
self.corpus.mark_processed(self.articles.itervalues())
- # save sorted list of articles to be read by AINewsPublisher; sort by
- # duplicate count (more = better), then relevance of source,
- # then by number of categories (more = better)
- unpublished_articles = sorted(
- filter(lambda x: x['publish'], self.articles.values()),
- cmp=lambda x,y: self.corpus.compare_articles(x, y),
- reverse = True)
-
- max_cat_count = int(config['publisher.max_cat_count'])
- max_count = int(config['publisher.max_count'])
- cat_counts = {}
- for cat in self.corpus.categories:
- cat_counts[cat] = 0
- # choose stories such that no category has more than max_cat_count
- # members and no more than max_count stories have been selected
- # (independent of category); only one of the article's categories needs
- # to have "free space"
- self.publishable_articles = []
- for article in unpublished_articles:
- if len(self.publishable_articles) == max_count:
- break
- free_cat = False
- for cat in article['categories']:
- if cat_counts[cat] < max_cat_count:
- free_cat = True
- break
- # if there is a free category or this article has only the
- # Applications category, then it can be published
- if free_cat or (article['categories'] == ['Applications']):
- self.publishable_articles.append(article)
- self.articles[article['urlid']]['transcript'].append('Published')
- self.articles[article['urlid']]['published'] = True
- for cat in article['categories']:
- cat_counts[cat] += 1
-
- # record that these articles are publishable
- self.corpus.mark_publishable(self.publishable_articles)
-
def grab_convert_image(self, article):
if len(article['image_url']) == 0:
article['image_path'] = ''
@@ -196,7 +143,8 @@ def grab_convert_image(self, article):
img.close()
# produces [urlid].jpg
Popen("%s -format jpg -gravity Center -thumbnail 200x200 %s%s" % \
- (paths['imagemagick.mogrify'], paths['ainews.image_dir'], str(article['urlid'])),
+ (paths['imagemagick.mogrify'], paths['ainews.image_dir'],
+ str(article['urlid'])),
shell = True).communicate()
# remove [urlid] file (with no extension)
remove("%s%s" % (paths['ainews.image_dir'], str(article['urlid'])))
@@ -218,32 +166,6 @@ def generate_feed_import(self):
xml = FeedImport()
for article in self.articles.values():
article['source'] = re.sub(r'&', '&amp;', article['source'])
- cats_fixed = []
- for cat in article['categories']:
- if cat == "Agents": continue
- if cat == "AIOverview":
- cat = "AI Overview"
- if cat == "CognitiveScience":
- cat = "Cognitive Science"
- if cat == "Education": continue
- if cat == "Ethics":
- cat = "Ethics &amp; Social Issues"
- if cat == "Games":
- cat = "Games &amp; Puzzles"
- if cat == "MachineLearning":
- cat = "Machine Learning"
- if cat == "NaturalLanguage":
- cat = "Natural Language"
- if cat == "Reasoning":
- cat = "Representation &amp; Reasoning"
- if cat == "Representation":
- cat = "Representation &amp; Reasoning"
- if cat == "ScienceFiction":
- cat = "Science Fiction"
- if cat == "Systems":
- cat = "Systems &amp; Languages"
- cats_fixed.append(cat)
- article['categories_fixed'] = cats_fixed
xml.news = self.articles.values()
savefile(paths['ainews.output_xml'] + "news.xml", str(xml))
@@ -261,7 +183,7 @@ def generate_email_output(self):
'pubdate': date(int(published[0:4]),
int(published[5:7]),
int(published[8:10])),
- 'summary': re.sub(r'</p>(</blockquote>)?$', '', re.sub(r'^(<blockquote>)?<p>', '', convert_to_printable(node.findtext("Body")))),
+ 'summary': re.sub(r'</p>(</blockquote>)?$', '', re.sub(r'^(<blockquote>)?<p>', '', node.findtext("Body"))),
'url': node.findtext("Original_link"),
'link': re.sub(r'/news/', 'http://aitopics.org/news/', node.findtext("Link")),
'image': re.sub(r'<img', '<img align="left" style="margin: 8px 8px 8px 0; border: 1px solid #ccc; padding: 5px; background: white;" ',
8 AINewsTextProcessor.py
@@ -80,12 +80,14 @@ def simpletextprocess(self, urlid, raw):
@param raw: the raw text to be processed.
@type raw: C{string}
"""
- if urlid in self.cache:
+ if urlid > 0 and urlid in self.cache:
return self.cache[urlid]
unigrams = map(lambda w: self.stem(w), self.unigrams(raw))
- self.cache[urlid] = nltk.FreqDist(unigrams)
- return self.cache[urlid]
+ dist = nltk.FreqDist(unigrams)
+ if urlid > 0:
+ self.cache[urlid] = dist
+ return dist
def whiteprocess(self, urlid, raw):
"""
132 AINewsTools.py
@@ -13,64 +13,13 @@
import os
import sys
import re
-import csv
import string
import pickle
-import locale
import ConfigParser
from unidecode import unidecode
-# from http://www.jamesmurty.com/2011/12/30/python-code-utf8-to-latin1/
-def encode_utf8_to_iso88591(utf8_text):
- '''
- Encode and return the given UTF-8 text as ISO-8859-1 (latin1) with
- unsupported characters replaced by '?', except for common special
- characters like smart quotes and symbols that we handle as well as we can.
- For example, the copyright symbol => '(c)' etc.
-
- If the given value is not a string it is returned unchanged.
-
- References:
- en.wikipedia.org/wiki/Quotation_mark_glyphs#Quotation_marks_in_Unicode
- en.wikipedia.org/wiki/Copyright_symbol
- en.wikipedia.org/wiki/Registered_trademark_symbol
- en.wikipedia.org/wiki/Sound_recording_copyright_symbol
- en.wikipedia.org/wiki/Service_mark_symbol
- en.wikipedia.org/wiki/Trademark_symbol
- '''
- if not isinstance(utf8_text, basestring):
- return utf8_text
- # Replace "smart" and other single-quote like things
- utf8_text = re.sub(
- u'[\u02bc\u2018\u2019\u201a\u201b\u2039\u203a\u300c\u300d(\xe2\x80\x99)\xe2]',
- "'", utf8_text)
- # Replace "smart" and other double-quote like things
- utf8_text = re.sub(
- u'[\u00ab\u00bb\u201c\u201d\u201e\u201f\u300e\u300f]',
- '"', utf8_text)
- # Replace copyright symbol
- utf8_text = re.sub(u'[\u00a9\u24b8\u24d2]', '(c)', utf8_text)
- # Replace registered trademark symbol
- utf8_text = re.sub(u'[\u00ae\u24c7]', '(r)', utf8_text)
- # Replace sound recording copyright symbol
- utf8_text = re.sub(u'[\u2117\u24c5\u24df]', '(p)', utf8_text)
- # Replace service mark symbol
- utf8_text = re.sub(u'[\u2120]', '(sm)', utf8_text)
- # Replace trademark symbol
- utf8_text = re.sub(u'[\u2122]', '(tm)', utf8_text)
- # Replace mdash
- utf8_text = re.sub(u'[\xe2]', '---', utf8_text)
- # Replace umlaut e
- utf8_text = re.sub(u'[\xc3]', 'e', utf8_text)
-
- return utf8_text.encode('ISO-8859-1', 'replace')
-
def convert_to_printable(text):
return unidecode(text)
- #result = ""
- #for c in text:
- # if c in string.printable: result += str(c)
- #return result
def savefile(filename, content):
"""
@@ -121,46 +70,6 @@ def loadfile2(filename):
f.close()
return content
-def loadpickle(filename):
- """
- Helper function to load content by Python's Pickle module.
- @param filename: target file's name
- @type filename: C{string}
- """
- pkl_file = open(filename, 'rb')
- content = pickle.load(pkl_file)
- pkl_file.close()
- return content
-
-def savepickle(filename, content):
- """
- Helper function to save content into file.
- @param filename: save content into target file's name
- @type filename: C{string}
- @param content: the content to be saved
- @type content: C{string}
- """
- output = open(filename, 'wb')
- pickle.dump(content, output)
- output.close()
-
-def loadcsv(filename):
- """
- Read csv files and return rows
- @param filename: target file's name
- @type filename: C{string}
- """
- rows = []
- try:
- file = open(filename, 'r')
- except IOError , e:
- print "Fail to csv file because of %s" % e
- else:
- for row in csv.reader(file):
- rows.append(row)
- file.close()
- return rows
-
def unescape(url):
"""
The url retrieved from MySQL database has extra slash('\') for all the
@@ -190,47 +99,6 @@ def loadconfig(filename, config={}):
string.strip(cp.get(sec, opt))
return config
-def loadpmwiki(filename):
- """
- Deprecated.
- Load Pmwiki page from wiki.d directory and extract contents.
- """
- lines = loadfile(filename)
- page = {}
- for line in lines:
- pos = re.search("=", line)
- if pos != None:
- page[line[:pos.start()]] = line[pos.end():]
- return page
-
-def savepmwiki(filename, page):
- """
- Deprecated.
- Save Pmwiki page from wiki.d directory
- """
- content = ""
- for key in page:
- content += key + '=' + page[key]
- savefile(filename, content)
-
-def strip_html(html):
- """
- Helper function to quickly remove all the <> tags from the html code.
- @param html: target raw html code
- @type html: C{string}
- """
- res = ''
- start = 0
- for char in html:
- if char == '<':
- start = 1
- elif char == '>':
- start = 0
- res += ' '
- elif start == 0:
- res += char
- return res
-
def getwords(raw):
"""
Helper function to extract bag of words from the raw text.
198 AINewsWekaClassifier.py
@@ -0,0 +1,198 @@
+# This file is part of NewsFinder.
+# https://github.com/joshuaeckroth/AINews
+#
+# Copyright (c) 2011 by the Association for the Advancement of
+# Artificial Intelligence. This program and parts of it may be used and
+# distributed without charge for non-commercial purposes as long as this
+# notice is included.
+
+import re
+import pickle
+import arff
+from os import listdir, remove
+from subprocess import *
+from AINewsCorpus import AINewsCorpus
+from AINewsConfig import config, paths
+from AINewsTextProcessor import AINewsTextProcessor
+
+class AINewsWekaClassifier:
+ def __init__(self):
+ self.txtpro = AINewsTextProcessor()
+
+ def classify_many(self, articles):
+ # create arff file
+ arff = open("%snewsfinder.arff" % paths['weka.tmp_arff_dir'], 'w')
+ arff.write("@relation newsfinder\n")
+ arff.write("@attribute title string\n")
+ arff.write("@attribute class {1,0}\n")
+ arff.write("@data\n")
+
+ sorted_urlids = sorted(articles.keys())
+ for urlid in sorted_urlids:
+ title = re.sub(r'\'', '', articles[urlid]['title'])
+ arff.write("'%s',0\n" % title)
+
+ Popen("java -cp %s %s -i %snewsfinder.arff -o %snewsfinder-wordvec.arff" % \
+ (paths['weka.weka_jar'], config['weka.wordvec_params'],
+ paths['weka.tmp_arff_dir'], paths['weka.tmp_arff_dir']),
+ shell = True).communicate()
+
+ print "java -cp %s %s -i %snewsfinder-wordvec.arff -o %snewsfinder-reorder.arff" % \
+ (paths['weka.weka_jar'], config['weka.reorder_params'],
+ paths['weka.tmp_arff_dir'], paths['weka.tmp_arff_dir'])
+
+ Popen("java -cp %s %s -i %snewsfinder-wordvec.arff -o %snewsfinder-reorder.arff" % \
+ (paths['weka.weka_jar'], config['weka.reorder_params'],
+ paths['weka.tmp_arff_dir'], paths['weka.tmp_arff_dir']),
+ shell = True).communicate()
+
+ def __save_bag_of_words(self, tid):
+ # find all unique words in the arff 'title' field, remove stop
+ # words, perform stemming, collect their frequencies
+ titles = []
+ f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
+ for record in f['data']:
+ titles.append(record[0])
+ bag = self.txtpro.simpletextprocess(0, ' '.join(titles))
+ p = open(paths['weka.bag_of_words'], 'w')
+ pickle.dump(bag, p)
+ p.close()
+
+ def __prepare_arff(self, tid):
+ # read titles from the arff, create a new arff with word vectors
+ p = open(paths['weka.bag_of_words'], 'r')
+ bag = pickle.load(p)
+ p.close()
+
+ data = {'attributes': [], 'data': [], 'description': u'', 'relation': tid}
+ for word in bag:
+ data['attributes'].append(("title-%s" % word, 'NUMERIC'))
+ data['attributes'].append(('class', ['yes', 'no']))
+
+ f = arff.load(open("%s%d.arff" % (paths['weka.training_arff_dir'], tid), 'r'))
+ for record in f['data']:
+ record_bag = self.txtpro.simpletextprocess(0, record[0])
+ record_data = []
+ # iterate through original bag, figure out freq in this record's bag
+ for word in bag:
+ if word in record_bag:
+ record_data.append(record_bag[word])
+ else:
+ record_data.append(0)
+ record_data.append(record[1])
+ data['data'].append(record_data)
+
+ fnew = open("%s%d-wordvec-nonsparse.arff" % \
+ (paths['weka.training_arff_dir'], tid), 'w')
+ arff.dump(fnew, data)
+ fnew.close()
+
+ # convert to sparse format
+ Popen(("java -cp %s weka.filters.unsupervised.instance.NonSparseToSparse " +
+ "-i %s%d-wordvec-nonsparse.arff -o %s%d-wordvec.arff") % \
+ (paths['weka.weka_jar'],
+ paths['weka.training_arff_dir'], tid,
+ paths['weka.training_arff_dir'], tid),
+ shell = True).communicate()
+
+ remove("%s%d-wordvec-nonsparse.arff" % (paths['weka.training_arff_dir'], tid))
+
+ # 1. load unprocessed arff files, from just one tid, from family_resemblance export
+ # 2. gather all titles, parse into a bag of words
+ # 3. save bag of words (list? need to keep the order) in a pickle file
+ # 4. write new sparse arff files for each tid using this sorted bag of words
+
+ def __get_tids(self):
+ tids = []
+ files = listdir(paths['weka.training_arff_dir'])
+ for f in files:
+ m = re.match(r'^(\d+).arff$', f)
+ if m:
+ tids.append(int(m.group(1)))
+ return tids
+
+ def train(self):
+ tids = self.__get_tids()
+
+ # all tid arffs have same entries, so use the first to grab the bag of words
+ print "Saving bag of words..."
+ self.__save_bag_of_words(tids[0])
+
+ for tid in sorted(tids):
+ print "Preparing tid %d" % tid
+ self.__prepare_arff(tid)
+
+ for tid in sorted(tids):
+ print "Spread subsampling for tid %d" % tid
+ Popen(("java -cp %s weka.filters.supervised.instance.SpreadSubsample " +
+ "-M 1.0 -X 0.0 -S 1 -c last " +
+ "-i %s%d-wordvec.arff -o %s%d-wordvec-subsample.arff") % \
+ (paths['weka.weka_jar'],
+ paths['weka.training_arff_dir'], tid,
+ paths['weka.training_arff_dir'], tid),
+ shell = True).communicate()
+
+ print "Training random forests for tid %d" % tid
+ (out, err) = Popen(("java -cp %s weka.classifiers.trees.RandomForest " +
+ "-I 20 -K 0 -v " +
+ "-t %s%d-wordvec-subsample.arff -d %s%d.model") % \
+ (paths['weka.weka_jar'],
+ paths['weka.training_arff_dir'], tid,
+ paths['weka.training_arff_dir'], tid),
+ shell = True, stdout = PIPE).communicate()
+ print out
+
+ def __predict_arff(self):
+ tids = self.__get_tids()
+
+ # the testing file should always be 0.arff
+ self.__prepare_arff(0)
+
+ predictions = {}
+ for tid in sorted(tids):
+ predictions[tid] = []
+
+ print "Predicting tid %d" % tid
+ (out, err) = Popen(("java -cp %s weka.classifiers.trees.RandomForest " +
+ "-T %s0-wordvec.arff -l %s%d.model -p last") % \
+ (paths['weka.weka_jar'],
+ paths['weka.training_arff_dir'],
+ paths['weka.training_arff_dir'], tid),
+ shell = True, stdout = PIPE).communicate()
+ for line in out.splitlines():
+ m = re.search(r'2:no\s+[12]:(no|yes)\s+\+?\s+(\d+\.\d+)', line)
+ if m:
+ answer = False
+ if m.group(1) == 'yes':
+ answer = True
+ conf = float(m.group(2))
+ predictions[tid].append((answer, conf))
+ return predictions
+
+ def predict(self, articles):
+ # modifies the provided articles dict
+
+ data = {'attributes': [('title', 'STRING'), ('class', ['yes', 'no'])],
+ 'data': [], 'description': u'', 'relation': '0'}
+
+ for urlid in sorted(articles.keys()):
+ title = re.sub(r'\W', ' ', articles[urlid]['title'])
+ title = re.sub(r'\s+', ' ', title)
+ data['data'].append([title, 'no'])
+
+ # make the testing file 0.arff
+ fnew = open("%s0.arff" % paths['weka.training_arff_dir'], 'w')
+ arff.dump(fnew, data)
+ fnew.close()
+
+ predictions = self.__predict_arff()
+
+ for urlid in sorted(articles.keys()):
+ articles[urlid]['categories'] = []
+
+ tids = self.__get_tids()
+ for tid in sorted(tids):
+ for (i, urlid) in enumerate(sorted(articles.keys())):
+ if predictions[tid][i][0]:
+ articles[urlid]['categories'].append(tid)
+
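Taken together, the new class is driven from two places: the new `train` command in AINews.py (`python AINews.py train`) calls train(), and AINewsPublisher.filter_and_process() calls predict() on the unprocessed articles. A minimal end-to-end sketch, assuming the [weka] paths added to config/paths.ini.sample below are configured and the per-topic training arffs exist; the urlid and title here are invented:

<pre>
from AINewsWekaClassifier import AINewsWekaClassifier

weka = AINewsWekaClassifier()
weka.train()   # builds a <tid>.model for every <tid>.arff in weka.training_arff_dir

# predict() modifies the dict in place, filling each article's 'categories'
# with the topic ids whose model answered 'yes'.
articles = {1234: {'title': 'Machine learning helps robots grasp new objects'}}
weka.predict(articles)
print articles[1234]['categories']
</pre>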
7 README.md
@@ -41,13 +41,14 @@ NewsFinder is primarily coded in Python and requires the following libraries:
- [Beautiful Soup](http://www.crummy.com/software/BeautifulSoup/)
- [PyRSS2Gen](http://www.dalkescientific.com/Python/PyRSS2Gen.html)
- [feedparser](http://www.feedparser.org/)
+ - [Unidecode](http://pypi.python.org/pypi/Unidecode)
Packages for Ubuntu:
<pre>
sudo apt-get install python-mysqldb libsvm-tools python-libsvm \
python-cheetah python-nltk python-beautifulsoup \
- python-pyrss2gen python-feedparser
+ python-pyrss2gen python-feedparser python-unidecode
</pre>
"Installation" of NewsFinder should only involve downloading the code in
@@ -115,3 +116,7 @@ Copyright (c) 2011 by the Association for the Advancement of
Artificial Intelligence. This program and parts of it may be used and
distributed without charge for non-commercial purposes as long as this
notice is included.
+
+The file `arff.py` is pulled from the
+[liac-arff](https://github.com/renatopp/liac-arff) package, which is
+distributed under the MIT License.
8 config/paths.ini.sample
@@ -95,3 +95,11 @@ editor_score = /path/to/resource/editor.csv
; (leave trailing slash)
submit_news_data = /path/to/submit_news_data/
+[weka]
+
+weka_jar = /usr/share/java/weka.jar
+
+bag_of_words = /home/josh/git/artifice/NewsFinder/weka_training/bag_of_words.pickle
+
+; (leave trailing slash)
+training_arff_dir = /home/josh/git/artifice/NewsFinder/weka_training/
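
For reference, train() expects training_arff_dir to contain one <tid>.arff per topic (plus the bag_of_words pickle and, after training, the <tid>.model files). A minimal sketch of writing one such file — the topic id 4 and the example titles are made up; the attribute layout and the arff.dump(file, data) call order mirror AINewsWekaClassifier.predict() and the bundled arff.py:

<pre>
import arff  # the bundled liac-arff module (arff.py)
from AINewsConfig import paths

data = {'relation': '4', 'description': u'',
        'attributes': [('title', 'STRING'), ('class', ['yes', 'no'])],
        'data': [['Robots learn to fold laundry', 'yes'],
                 ['Quarterly earnings report released', 'no']]}

f = open("%s4.arff" % paths['weka.training_arff_dir'], 'w')
arff.dump(f, data)
f.close()
</pre>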
