Merge branch 'release/0.21.0'
dan-blanchard committed Nov 9, 2013
2 parents 8eb7d0d + b44b6e7 commit 998d3bf
Showing 32 changed files with 706 additions and 173 deletions.
8 changes: 8 additions & 0 deletions .gitignore
@@ -10,6 +10,14 @@ __pycache__
.DS_Store
.coverage

examples/train.csv
examples/test.csv
examples/*/dev
examples/*/test
examples/*/train
examples/*/train+dev
examples/*/output

tests/configs/test_cv_folds1.cfg
tests/configs/test_cv_folds2.cfg
tests/configs/test_predict.cfg
3 changes: 2 additions & 1 deletion .travis.yml
@@ -21,8 +21,9 @@ before_install:
- sudo chmod 777 /scratch/
- travis/miniconda.sh -b
- export PATH=/home/travis/anaconda/bin:$PATH
- conda update --yes conda
install:
- conda install --yes pip python=$TRAVIS_PYTHON_VERSION atlas numpy scipy
- conda install --yes pip python=$TRAVIS_PYTHON_VERSION atlas numpy scipy beautiful-soup six
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes scikit-learn; fi
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then pip install --use-mirrors configparser; fi
- pip install -r requirements.txt --use-mirrors
25 changes: 25 additions & 0 deletions README.rst
@@ -62,9 +62,34 @@ Requirements
- `futures <http://pypi.python.org/pypi/futures>`__ (only required for Python 2.7)
- `logutils <http://pypi.python.org/pypi/logutils>`__ (only required for Python 2.7)

Talks
~~~~~

You can view the slides for the talk Dan Blanchard gave at PyData NYC 2013
`here <https://www.dropbox.com/s/21nast3gxcpgd52/PyData%20NYC%202013%20slides.pdf>`__.

Changelog
~~~~~~~~~

- v0.21.0

+ Added support for ``ElasticNet``, ``Lasso``, and ``LinearRegression``
learners.
+ Reorganized examples, and created new example based on the Kaggle
`Titanic <http://www.kaggle.com/c/titanic-gettingStarted>`__ data set.
+ Added ability to easily create multiple files at once when using
``write_feature_file``.
+ Added support for the ``.ndj`` file extension for newline-delimited JSON
files. It's the same format as ``.jsonlines``, just with a different name.
+ Added support for comments and skipping blank lines in ``.jsonlines``
files.
+ Made some efficiency tweaks when creating logging messages.
+ Made labels in ``.results`` files a little clearer for objective function
scores.
+ Fixed some misleading error messages.
+ Fixed issue with backward-compatibility unit test in Python 2.7.
+ Fixed issue where predict mode required data to already be labelled.

- v0.20.0

+ Refactored ``experiments`` module to remove unnecessary child processes,
30 changes: 17 additions & 13 deletions doc/run_experiment.rst
@@ -17,7 +17,8 @@ things work, do the following from the command prompt:
    $ cd examples
    $ python make_example_iris_data.py # download a simple dataset
    $ run_experiment --local example.cfg # run an experiment
    $ cd iris
    $ run_experiment --local evaluate.cfg # run an experiment
Feature file formats
@@ -48,10 +48,11 @@ The following feature file formats are supported:
* All other columns contain feature values, and every feature value
must be specified (making this a poor choice for sparse data).

**jsonlines** *(Recommended)*
**jsonlines**/**ndj** *(Recommended)*
A twist on the `JSON <http://www.json.org/>`_ format where every line is
a JSON dictionary (the entire contents of a normal JSON file). Each
dictionary is expected to contain the following keys:
either a JSON dictionary (the entire contents of a normal JSON file), or
a comment line starting with ``//``. Each dictionary is expected to
contain the following keys:

* *y*: The class label.
* *x*: A dictionary of feature values.
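
For instance, a minimal ``.jsonlines``/``.ndj`` file might look like the
following (the feature names here are purely illustrative)::

    // comment lines starting with // are skipped
    {"y": "setosa", "x": {"petal_length": 1.4, "petal_width": 0.2}}
    {"y": "virginica", "x": {"petal_length": 5.9, "petal_width": 2.1}}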
@@ -102,13 +104,13 @@ settings for each section is provided below, but to summarize:
a training location, a test location, and set ``task`` to ``predict``.

* If you want to just **train a model**, specify a training location, and set
``task`` to ``train_only``.
``task`` to ``train``.

* A list of classifiers/regressors to try on your feature files is
required.

An example configuration file is available
`here <https://github.com/EducationalTestingService/skll/blob/master/examples/example.cfg>`_.
Example configuration files are available
`here <https://github.com/EducationalTestingService/skll/blob/master/examples/>`_.
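
As a quick sketch, a minimal ``evaluate`` configuration (the names and paths
are illustrative, modeled on the iris example later in this commit) might
look like::

    [General]
    experiment_name = My_Experiment
    task = evaluate

    [Input]
    train_location = train
    test_location = test
    featuresets = [["my_features"]]
    learners = ["LogisticRegression"]
    suffix = .jsonlines

    [Tuning]
    grid_search = true
    objective = f1_score_micro

    [Output]
    results = output
    log = output
    predictions = output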

General
^^^^^^^
@@ -176,9 +178,9 @@ Input
``ValueError`` will be raised if this is not the case.

**suffix** *(Optional)*
The file format the training/test files are in. Valid option are ".tsv",
".megam", and ".jsonlines" (one complete JSON dict per line in the
file).
The file format the training/test files are in. Valid options are
``.arff``, ``.csv``, ``.jsonlines``, ``.megam``, ``.ndj``, and
``.tsv``. For example, with ``suffix = .jsonlines``, a featureset entry
``example_iris_features`` resolves to ``example_iris_features.jsonlines``.

If you omit this field, it is assumed that the "prefixes" listed
in ``featuresets`` are actually complete filenames. This can be useful
@@ -209,6 +211,9 @@ Input

* *DecisionTreeRegressor*: `Decision Tree Regressor <http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor>`_
* *GradientBoostingRegressor (gb_regressor)*: `Gradient Boosting Regressor <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html#sklearn.ensemble.GradientBoostingRegressor>`_
* *ElasticNet*: `ElasticNet Regression <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html#sklearn.linear_model.ElasticNet>`_
* *Lasso*: `Lasso Regression <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html#sklearn.linear_model.Lasso>`_
* *LinearRegression*: `Linear Regression <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html#sklearn.linear_model.LinearRegression>`_
* *RandomForestRegressor*: `Random Forest Regressor <http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html#sklearn.ensemble.RandomForestRegressor>`_
* *Ridge (ridge)*: `Ridge Regression <http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge>`_
* *SVR (svr_linear)*: `Support Vector Regression <http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVR.html#sklearn.svm.SVR>`_
@@ -397,7 +402,7 @@ Tuning
    [{'max_depth': [1, 3, 5], 'n_estimators': [500]}]
*Ridge*
*ElasticNet*, *Lasso*, and *Ridge*

.. code-block:: python
@@ -426,8 +431,7 @@ Output

**results** *(Optional)*
Directory to store result files in. If omitted, the current working
directory is used, **and we're assumed to just want to generate
predictions if the test_location is specified.**
directory is used.

**log** *(Optional)*
Directory to store log files in. If omitted, the current working
4 changes: 2 additions & 2 deletions doc/skll.rst
@@ -15,7 +15,7 @@ Train a linear svm (assuming we have `train_examples`)::

    from skll import Learner

    learner = Learner(model_type='LinearSVC')
    learner = Learner('LinearSVC')
    learner.train(train_examples)


@@ -27,7 +27,7 @@ Evaluate a trained model::

Perform ten-fold cross-validation with a radial SVM::

    learner = Learner(model_type='SVC')
    learner = Learner('SVC')
    fold_result_list, grid_search_scores = learner.cross_validate(train_examples)

``fold_result_list`` in this case is a list of the results returned by
26 changes: 0 additions & 26 deletions examples/example.cfg

This file was deleted.

22 changes: 22 additions & 0 deletions examples/iris/cross_val.cfg
@@ -0,0 +1,22 @@
[General]
experiment_name = Example_CV
task = cross_validate

[Input]
# this could also be an absolute path instead (and must be if you're not running things in local mode)
train_location = iris/train
featuresets = [["example_iris_features"]]
# there is only one set of features to try here, with a single feature file in it.
featureset_names = ["example_iris"]
learners = ["RandomForestClassifier", "SVC", "LinearSVC", "LogisticRegression", "MultinomialNB"]
suffix = .jsonlines

[Tuning]
grid_search = true
objective = f1_score_micro

[Output]
# again, these can be absolute paths
results = output
log = output
predictions = output
23 changes: 23 additions & 0 deletions examples/iris/evaluate.cfg
@@ -0,0 +1,23 @@
[General]
experiment_name = Example_Evaluate
task = evaluate

[Input]
# this could also be an absolute path instead (and must be if you're not running things in local mode)
train_location = iris/train
test_location = iris/test
featuresets = [["example_iris_features"]]
# there is only one set of features to try here, with a single feature file in it.
featureset_names = ["example_iris"]
learners = ["RandomForestClassifier", "SVC", "LinearSVC", "LogisticRegression", "MultinomialNB"]
suffix = .jsonlines

[Tuning]
grid_search = true
objective = f1_score_micro

[Output]
# again, these can be absolute paths
results = output
log = output
predictions = output
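
Per the quickstart commands in ``doc/run_experiment.rst`` above, these iris
configurations are run locally from the ``examples`` directory::

    $ python make_example_iris_data.py
    $ cd iris
    $ run_experiment --local evaluate.cfg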
12 changes: 12 additions & 0 deletions examples/make_example_iris_data.py
@@ -12,12 +12,21 @@

import json
import os
import sys

import sklearn.datasets
from sklearn.cross_validation import train_test_split


def main():
    '''
    Download some example data and split it into training and test data.
    '''
    print('Retrieving iris data from servers...', end='')
    iris_data = sklearn.datasets.load_iris()
    print('done')
    sys.stdout.flush()


    X = iris_data['data']
    Y = [iris_data['target_names'][label] for label in iris_data['target']]
@@ -30,6 +39,7 @@ def main():
    examples_train, examples_test = train_test_split(examples, test_size=0.33,
                                                     random_state=42)

    print('Writing training and testing files...', end='')
    for examples, suffix in [(examples_train, 'train'), (examples_test,
                                                         'test')]:
        iris_dir = os.path.join('iris', suffix)
@@ -40,6 +50,8 @@ def main():
        with open(jsonlines_path, 'w') as f:
            for ex in examples:
                f.write('{}\n'.format(json.dumps(ex)))
    print('done')


if __name__ == '__main__':
    main()
98 changes: 98 additions & 0 deletions examples/make_titanic_example_data.py
@@ -0,0 +1,98 @@
#!/usr/bin/env python
'''
This is a simple script to split the train.csv and test.csv files from the
Kaggle "Titanic: Machine Learning from Disaster" competition into the format
titanic.cfg expects.
:author: Dan Blanchard (dblanchard@ets.org)
:organization: ETS
'''

from __future__ import division, print_function, unicode_literals

import logging
import os
import sys

from skll import load_examples, write_feature_file

def main():
    '''
    Create directories and split CSV files into subsets.
    '''
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'), level=logging.INFO)
    logger = logging.getLogger(__name__)
    if not (os.path.exists('train.csv') and os.path.exists('test.csv')):
        logger.error('This script requires the train.csv and test.csv files ' +
                     'from http://www.kaggle.com/c/titanic-gettingStarted/' +
                     'data to be in the current directory in order to work. ' +
                     'Please download them and try again.')
        sys.exit(1)

    # Create dictionary of subsets to use for creating split feature files
    subset_dict = {'vitals': ['Sex', 'Age'],
                   'socioeconomic': ['Pclass', 'Fare'],
                   'family': ['SibSp', 'Parch'],
                   'misc': ['Embarked']}
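    # Note: per the v0.21.0 changelog, passing this ``subsets`` dict to
    # write_feature_file below writes one feature file per key (family.csv,
    # misc.csv, socioeconomic.csv, vitals.csv), matching the featuresets
    # listed in titanic/cross_validate.cfg.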

    # Create directories to store files
    if not os.path.exists('titanic/train'):
        logger.info('Creating titanic/train directory')
        os.makedirs('titanic/train')
    if not os.path.exists('titanic/dev'):
        logger.info('Creating titanic/dev directory')
        os.makedirs('titanic/dev')
    if not os.path.exists('titanic/train+dev'):
        logger.info('Creating titanic/train+dev directory')
        os.makedirs('titanic/train+dev')
    if not os.path.exists('titanic/test'):
        logger.info('Creating titanic/test directory')
        os.makedirs('titanic/test')

    # Read and write training examples
    train_examples = load_examples('train.csv', label_col='Survived',
                                   quiet=False, sparse=False)
    num_train_dev = len(train_examples.classes)
    num_train = int((num_train_dev / 5) * 4)
    train_ids = list(range(1, num_train_dev + 1))
    write_feature_file('titanic/train/.csv',
                       train_ids[:num_train],
                       train_examples.classes[:num_train],
                       train_examples.features[:num_train, :],
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='train_example')

    # Write train+dev set for training model to use to generate predictions on test
    write_feature_file('titanic/train+dev/.csv',
                       train_ids,
                       train_examples.classes,
                       train_examples.features,
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='train_example')

    # Write dev examples
    write_feature_file('titanic/dev/.csv',
                       train_ids[num_train:],
                       train_examples.classes[num_train:],
                       train_examples.features[num_train:, :],
                       feat_vectorizer=train_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='dev_example')

    # Read and write test examples
    test_examples = load_examples('test.csv', label_col='Survived',
                                  quiet=False, sparse=False)
    num_test = len(test_examples.classes)
    test_ids = list(range(num_train_dev + 1, num_test + num_train_dev + 1))
    write_feature_file('titanic/test/.csv', test_ids,
                       test_examples.classes, test_examples.features,
                       feat_vectorizer=test_examples.feat_vectorizer,
                       subsets=subset_dict, label_col='Survived',
                       id_prefix='test_example')


if __name__ == '__main__':
    main()
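
Assuming ``train.csv`` and ``test.csv`` are in place, running this script
should leave a directory layout roughly like the following (one feature file
per subset, per split)::

    titanic/
        train/       family.csv, misc.csv, socioeconomic.csv, vitals.csv
        dev/         family.csv, misc.csv, socioeconomic.csv, vitals.csv
        train+dev/   family.csv, misc.csv, socioeconomic.csv, vitals.csv
        test/        family.csv, misc.csv, socioeconomic.csv, vitals.csv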
20 changes: 20 additions & 0 deletions examples/titanic/cross_validate.cfg
@@ -0,0 +1,20 @@
[General]
experiment_name = Titanic_CV
task = cross_validate

[Input]
# this could also be an absolute path instead (and must be if you're not running things in local mode)
train_location = train+dev
featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]]
learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"]
label_col = Survived

[Tuning]
grid_search = true
objective = accuracy

[Output]
# again, these can be absolute paths
log = output
results = output
predictions = output
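
By analogy with the iris example above (the exact working directory is an
assumption), this experiment would be run after generating the data::

    $ python make_titanic_example_data.py
    $ cd titanic
    $ run_experiment --local cross_validate.cfg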