diff --git a/.github/stale.yml b/.github/stale.yml
new file mode 100644
index 00000000..590605d4
--- /dev/null
+++ b/.github/stale.yml
@@ -0,0 +1,16 @@
+# Number of days of inactivity before an issue becomes stale
+daysUntilStale: 90
+# Number of days of inactivity before a stale issue is closed
+daysUntilClose: 7
+# Issues with these labels will never be considered stale
+exemptLabels:
+ - pinned
+# Label to use when marking an issue as stale
+staleLabel: stale
+# Comment to post when marking an issue as stale. Set to `false` to disable
+markComment: >
+ This issue has been automatically marked as stale because it has not had
+ recent activity. It will be closed in 7 days if no further activity occurs.
+ Thank you for your contributions.
+# Comment to post when closing a stale issue. Set to `false` to disable
+closeComment: false
diff --git a/.travis.yml b/.travis.yml
index ea7e107c..327937d8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,7 +25,7 @@ before_install:
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then export PATH=/home/travis/miniconda2/bin:$PATH; else export PATH=/home/travis/miniconda3/bin:$PATH; fi
- conda update --yes conda
install:
- - conda install --yes --channel defaults --channel conda-forge python=$TRAVIS_PYTHON_VERSION numpy scipy beautifulsoup4 six scikit-learn==0.19.1 joblib prettytable python-coveralls ruamel.yaml
+ - conda install --yes --channel defaults --channel conda-forge python=$TRAVIS_PYTHON_VERSION numpy scipy beautifulsoup4 six scikit-learn==0.20.1 joblib prettytable python-coveralls ruamel.yaml
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes --channel defaults configparser mock; fi
- if [ ${WITH_PANDAS_AND_SEABORN} == "true" ]; then conda install --yes --channel defaults pandas seaborn; fi
# Have to use pip for nose-cov because its entry points are not supported by conda yet
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index e07b86d6..e0199044 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -114,5 +114,10 @@ documentation without the example gallery. The resulting HTML files will
be placed in _build/html/ and are viewable in a web browser. See the
README file in the doc/ directory for more information.
-For building the documentation, you will need [sphinx](http://sphinx.pocoo.org/).
+For building the documentation, you will need [sphinx](http://sphinx.pocoo.org/) as well as the readthedocs sphinx theme. To install both, just run:
+
+ conda install sphinx sphinx_rtd_theme
+
+in your existing conda environment.
+
diff --git a/README.rst b/README.rst
index 9bde7c4b..1e77ad8d 100644
--- a/README.rst
+++ b/README.rst
@@ -124,17 +124,17 @@ Requirements
- Python 2.7+
- `scikit-learn `__
-- `six `__
-- `PrettyTable `__
+- `six `__
+- `PrettyTable `__
- `BeautifulSoup 4 `__
-- `Grid Map `__ (only required if you plan
+- `Grid Map `__ (only required if you plan
to run things in parallel on a DRMAA-compatible cluster)
-- `joblib `__
+- `joblib `__
- `ruamel.yaml `__
-- `configparser `__ (only required for
+- `configparser `__ (only required for
Python 2.7)
-- `logutils `__ (only required for Python 2.7)
-- `mock `__ (only required for Python 2.7)
+- `logutils `__ (only required for Python 2.7)
+- `mock `__ (only required for Python 2.7)
The following packages can be optionally installed for additional features
but are not required:
diff --git a/conda-recipe/README.md b/conda-recipe/README.md
index 9dae17fc..790f574a 100644
--- a/conda-recipe/README.md
+++ b/conda-recipe/README.md
@@ -1,7 +1,7 @@
How to create and test conda package.
1. To create the SKLL conda package run:
- `conda build -c defaults -c conda-forge --python=3.6 --numpy=1.13 skll`
+ `conda build -c defaults -c conda-forge --python=3.6 --numpy=1.14 skll`
2. Upload the package to anaconda.org using `anaconda upload `.
3. Test the package:
`conda create -n foobar -c defaults -c conda-forge -c desilinguist python=3.6 skll=1.5`
diff --git a/conda-recipe/skll/meta.yaml b/conda-recipe/skll/meta.yaml
index 88c09935..ab63a602 100644
--- a/conda-recipe/skll/meta.yaml
+++ b/conda-recipe/skll/meta.yaml
@@ -1,6 +1,6 @@
package:
name: skll
- version: 1.5
+ version: 1.5.3
source:
path: ../../../skll
@@ -42,7 +42,7 @@ build:
requirements:
build:
- python
- - scikit-learn ==0.19.1
+ - scikit-learn ==0.20.1
- joblib >=0.8
- setuptools
- six
@@ -57,7 +57,7 @@ requirements:
run:
- python
- - scikit-learn ==0.19.1
+ - scikit-learn ==0.20.1
- joblib >=0.8
- six
- prettytable
diff --git a/conda_requirements.txt b/conda_requirements.txt
index b225501a..7cd12fe9 100644
--- a/conda_requirements.txt
+++ b/conda_requirements.txt
@@ -1,4 +1,4 @@
-scikit-learn==0.19.1
+scikit-learn==0.20.1
six
PrettyTable
beautifulsoup4
diff --git a/doc/getting_started.rst b/doc/getting_started.rst
index 724ad40d..709bf7b9 100644
--- a/doc/getting_started.rst
+++ b/doc/getting_started.rst
@@ -11,7 +11,7 @@ or via ``conda`` (only for Python 3.6)::
conda install -c defaults -c conda-forge -c desilinguist python=3.6 skll
It can also be downloaded directly from
-`GitHub `_.
+`GitHub `_.
License
diff --git a/doc/index.rst b/doc/index.rst
index c3a05b40..6fcd13fd 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -30,6 +30,7 @@ Documentation
run_experiment
utilities
api
+ internal
Indices and tables
diff --git a/doc/internal.rst b/doc/internal.rst
new file mode 100644
index 00000000..f8827b23
--- /dev/null
+++ b/doc/internal.rst
@@ -0,0 +1,7 @@
+Internal Documentation
+======================
+
+.. toctree::
+ :maxdepth: 4
+
+ internal/release
\ No newline at end of file
diff --git a/doc/internal/release.rst b/doc/internal/release.rst
new file mode 100644
index 00000000..fc06613b
--- /dev/null
+++ b/doc/internal/release.rst
@@ -0,0 +1,42 @@
+Release Process
+===============
+
+This document is only meant for the project administrators, not users and developers.
+
+1. Create a release branch on GitHub.
+
+2. In the release branch:
+
+ a. update the version numbers in ``version.py``.
+
+ b. update the conda recipe.
+
+ c. update the documentation with any new features or details about changes.
+
+ d. run ``make linkcheck`` on the documentation and fix any redirected/broken links.
+
+ e. update the README.
+
+3. Build the new conda package locally on your mac using the following command (*Note*: you may have to replace the contents of the ``requirements()`` function in ``setup.py`` with a ``pass`` statement to get ``conda build`` to work)::
+
+ conda build -c defaults -c conda-forge --python=3.6 --numpy=1.14 skll
+
+4. Convert the package for both linux and windows::
+
+ conda convert -p win-64 -p linux-64 <mac package tarball>
+
+5. Upload each of the packages to anaconda.org using ``anaconda upload <package tarball>``.
+
+6. Upload source package to PyPI using ``python setup.py sdist upload``.
+
+7. Draft a release on GitHub.
+
+8. Make a pull request with the release branch to be merged into ``master`` and request code review.
+
+9. Once the build for the PR passes and the reviewers approve, merge the release branch into ``master``.
+
+10. Make sure that the RTFD build for ``master`` passes.
+
+11. Tag the latest commit in ``master`` with the appropriate release tag and publish the release on GitHub.
+
+12. Send an email around at ETS announcing the release and the changes.
diff --git a/doc/run_experiment.rst b/doc/run_experiment.rst
index b6b45604..27044171 100644
--- a/doc/run_experiment.rst
+++ b/doc/run_experiment.rst
@@ -31,7 +31,7 @@ The following feature file formats are supported:
arff
^^^^
-The same file format used by `Weka `__
+The same file format used by `Weka `__
with the following added restrictions:
* Only simple numeric, string, and nomimal values are supported.
@@ -81,8 +81,8 @@ libsvm
^^^^^^
While we can process the standard input file format supported by
-`LibSVM `__,
-`LibLinear `__,
+`LibSVM `__,
+`LibLinear `__,
and `SVMLight `__, we also support specifying
extra metadata usually missing from the format in comments at the of each line.
The comments are not mandatory, but without them, your labels and features will
@@ -105,7 +105,7 @@ megam
^^^^^
An expanded form of the input format for the
-`MegaM classification package `__ with
+`MegaM classification package `__ with
the ``-fvals`` switch.
The basic format is::
@@ -127,7 +127,7 @@ to/from this MegaM format and for adding/removing features from the files.
Creating configuration files
----------------------------
The experiment configuration files that run_experiment accepts are standard
-`Python configuration files `__
+`Python configuration files `__
that are similar in format to Windows INI files. [#]_
There are four expected sections in a configuration file: :ref:`General`,
:ref:`Input`, :ref:`Tuning`, and :ref:`Output`. A detailed description of each
@@ -138,7 +138,7 @@ possible settings for each section is provided below, but to summarize:
* If you want to do **cross-validation**, specify a path to training feature
files, and set :ref:`task` to ``cross_validate``. Please note that the
cross-validation currently uses
- `StratifiedKFold `__.
+ `StratifiedKFold `__.
You also can optionally use predetermined folds with the
:ref:`folds_file ` setting.
@@ -167,7 +167,7 @@ possible settings for each section is provided below, but to summarize:
.. _learning_curve:
-* If you want to **generate a learning curve** for your data, specify a training location and set :ref:`task` to ``learning_curve``. The learning curve is generated using essentially the same underlying process as in `scikit-learn `__ except that the SKLL feature pre-processing pipline is used while training the various models and computing the scores.
+* If you want to **generate a learning curve** for your data, specify a training location and set :ref:`task` to ``learning_curve``. The learning curve is generated using essentially the same underlying process as in `scikit-learn `__ except that the SKLL feature pre-processing pipeline is used while training the various models and computing the scores.
.. note::
@@ -178,7 +178,7 @@ possible settings for each section is provided below, but to summarize:
* A :ref:`list of classifiers/regressors ` to try on your feature
files is required.
-Example configuration files are available `here `__.
+Example configuration files are available `here `__.
.. _general:
@@ -227,43 +227,43 @@ below. Custom learners can also be specified. See
Classifiers:
- * **AdaBoostClassifier**: `AdaBoost Classification `__. Note that the default base estimator is a ``DecisionTreeClassifier``. A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``MultinomialNB``, ``SGDClassifier``, and ``SVC``. Note that the last two base require setting an additional ``algorithm`` fixed parameter with the value ``'SAMME'``.
- * **DummyClassifier**: `Simple rule-based Classification `__
- * **DecisionTreeClassifier**: `Decision Tree Classification `__
- * **GradientBoostingClassifier**: `Gradient Boosting Classification `__
- * **KNeighborsClassifier**: `K-Nearest Neighbors Classification `__
- * **LinearSVC**: `Support Vector Classification using LibLinear `__
- * **LogisticRegression**: `Logistic Regression Classification using LibLinear `__
- * **MLPClassifier**: `Multi-layer Perceptron Classification `__
- * **MultinomialNB**: `Multinomial Naive Bayes Classification `__
- * **RandomForestClassifier**: `Random Forest Classification `__
- * **RidgeClassifier**: `Classification using Ridge Regression `__
- * **SGDClassifier**: `Stochastic Gradient Descent Classification `__
- * **SVC**: `Support Vector Classification using LibSVM `__
+ * **AdaBoostClassifier**: `AdaBoost Classification `__. Note that the default base estimator is a ``DecisionTreeClassifier``. A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``MultinomialNB``, ``SGDClassifier``, and ``SVC``. Note that the last two base estimators require setting an additional ``algorithm`` fixed parameter with the value ``'SAMME'``.
+ * **DummyClassifier**: `Simple rule-based Classification `__
+ * **DecisionTreeClassifier**: `Decision Tree Classification `__
+ * **GradientBoostingClassifier**: `Gradient Boosting Classification `__
+ * **KNeighborsClassifier**: `K-Nearest Neighbors Classification `__
+ * **LinearSVC**: `Support Vector Classification using LibLinear `__
+ * **LogisticRegression**: `Logistic Regression Classification using LibLinear `__
+ * **MLPClassifier**: `Multi-layer Perceptron Classification `__
+ * **MultinomialNB**: `Multinomial Naive Bayes Classification `__
+ * **RandomForestClassifier**: `Random Forest Classification `__
+ * **RidgeClassifier**: `Classification using Ridge Regression `__
+ * **SGDClassifier**: `Stochastic Gradient Descent Classification `__
+ * **SVC**: `Support Vector Classification using LibSVM `__
.. _regressors:
Regressors:
- * **AdaBoostRegressor**: `AdaBoost Regression `__. Note that the default base estimator is a ``DecisionTreeRegressor``. A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``SGDRegressor``, and ``SVR``.
- * **BayesianRidge**: `Bayesian Ridge Regression `__
- * **DecisionTreeRegressor**: `Decision Tree Regressor `__
- * **DummyRegressor**: `Simple Rule-based Regression `__
- * **ElasticNet**: `ElasticNet Regression `__
- * **GradientBoostingRegressor**: `Gradient Boosting Regressor `__
- * **HuberRegressor**: `Huber Regression `__
- * **KNeighborsRegressor**: `K-Nearest Neighbors Regression `__
- * **Lars**: `Least Angle Regression `__
- * **Lasso**: `Lasso Regression `__
- * **LinearRegression**: `Linear Regression `__
- * **LinearSVR**: `Support Vector Regression using LibLinear `__
- * **MLPRegressor**: `Multi-layer Perceptron Regression `__
- * **RandomForestRegressor**: `Random Forest Regression `__
- * **RANSACRegressor**: `RANdom SAmple Consensus Regression `__. Note that the default base estimator is a ``LinearRegression``. A different base regressor can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list.
- * **Ridge**: `Ridge Regression `__
- * **SGDRegressor**: `Stochastic Gradient Descent Regression `__
- * **SVR**: `Support Vector Regression using LibSVM `__
- * **TheilSenRegressor**: `Theil-Sen Regression `__
+ * **AdaBoostRegressor**: `AdaBoost Regression `__. Note that the default base estimator is a ``DecisionTreeRegressor``. A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``SGDRegressor``, and ``SVR``.
+ * **BayesianRidge**: `Bayesian Ridge Regression `__
+ * **DecisionTreeRegressor**: `Decision Tree Regressor `__
+ * **DummyRegressor**: `Simple Rule-based Regression `__
+ * **ElasticNet**: `ElasticNet Regression `__
+ * **GradientBoostingRegressor**: `Gradient Boosting Regressor `__
+ * **HuberRegressor**: `Huber Regression `__
+ * **KNeighborsRegressor**: `K-Nearest Neighbors Regression `__
+ * **Lars**: `Least Angle Regression `__
+ * **Lasso**: `Lasso Regression `__
+ * **LinearRegression**: `Linear Regression `__
+ * **LinearSVR**: `Support Vector Regression using LibLinear `__
+ * **MLPRegressor**: `Multi-layer Perceptron Regression `__
+ * **RandomForestRegressor**: `Random Forest Regression `__
+ * **RANSACRegressor**: `RANdom SAmple Consensus Regression `__. Note that the default base estimator is a ``LinearRegression``. A different base regressor can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list.
+ * **Ridge**: `Ridge Regression `__
+ * **SGDRegressor**: `Stochastic Gradient Descent Regression `__
+ * **SVR**: `Support Vector Regression using LibSVM `__
+ * **TheilSenRegressor**: `Theil-Sen Regression `__
For all regressors you can also prepend ``Rescaled`` to the
beginning of the full name (e.g., ``RescaledSVR``) to get a version
@@ -496,9 +496,9 @@ imported dynamically. This is only required if a custom learner is specified
in the list of :ref:`learners`.
All Custom learners must implement the ``fit`` and
-``predict`` methods. Custom classifiers must either (a) inherit from an existing scikit-learn classifier, or (b) inherit from both `sklearn.base.BaseEstimator `__. *and* from `sklearn.base.ClassifierMixin `__.
+``predict`` methods. Custom classifiers must either (a) inherit from an existing scikit-learn classifier, or (b) inherit from both `sklearn.base.BaseEstimator `__. *and* from `sklearn.base.ClassifierMixin `__.
-Similarly, Custom regressors must either (a) inherit from an existing scikit-learn regressor, or (b) inherit from both `sklearn.base.BaseEstimator `__. *and* from `sklearn.base.RegressorMixin `__.
+Similarly, Custom regressors must either (a) inherit from an existing scikit-learn regressor, or (b) inherit from both `sklearn.base.BaseEstimator `__. *and* from `sklearn.base.RegressorMixin `__.
Learners that require dense matrices should implement a method ``requires_dense``
that returns ``True``.
@@ -511,11 +511,11 @@ sampler *(Optional)*
It performs a non-linear transformations of the input, which can serve
as a basis for linear classification or other algorithms. Valid options
are:
-`Nystroem `__,
-`RBFSampler `__,
-`SkewedChi2Sampler `__, and
-`AdditiveChi2Sampler `__. For additional information see
-`the scikit-learn documentation `__.
+`Nystroem `__,
+`RBFSampler `__,
+`SkewedChi2Sampler `__, and
+`AdditiveChi2Sampler `__. For additional information see
+`the scikit-learn documentation `__.
.. _sampler_parameters:
@@ -550,18 +550,18 @@ feature_hasher *(Optional)*
If "true", this enables a high-speed, low-memory vectorizer that uses
feature hashing for converting feature dictionaries into NumPy arrays
instead of using a
-`DictVectorizer `__. This flag will drastically
+`DictVectorizer `__. This flag will drastically
reduce memory consumption for data sets with a large number of
features. If enabled, the user should also specify the number of
features in the :ref:`hasher_features ` field. For additional
-information see `the scikit-learn documentation `__.
+information see `the scikit-learn documentation `__.
.. _hasher_features:
hasher_features *(Optional)*
""""""""""""""""""""""""""""
-The number of features used by the `FeatureHasher `__ if the
+The number of features used by the `FeatureHasher `__ if the
:ref:`feature_hasher ` flag is enabled.
.. note::
@@ -696,7 +696,7 @@ TheilSenRegressor
{'class_weight': {1: 10}}
- Additional examples and information can be seen `here `__.
+ Additional examples and information can be seen `here `__.
.. _feature_scaling:
@@ -785,9 +785,9 @@ The objective functions to use for tuning. This is a list of one or more objecti
Classification:
- * **accuracy**: Overall `accuracy `__
- * **precision**: `Precision `__
- * **recall**: `Recall `__
+ * **accuracy**: Overall `accuracy `__
+ * **precision**: `Precision `__
+ * **recall**: `Recall `__
* **f1**: The default scikit-learn |F1 link|_
(F\ :sub:`1` of the positive class for binary classification, or the weighted average F\ :sub:`1` for multiclass classification)
* **f1_score_micro**: Micro-averaged |F1 link|_
@@ -796,20 +796,20 @@ Classification:
* **f1_score_least_frequent**: F:\ :sub:`1` score of the least frequent
class. The least frequent class may vary from fold to fold for certain
data distributions.
- * **neg_log_loss**: The negative of the classification `log loss `__ . Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency. To use this as the objective, :ref:`probability ` must be set to ``True``.
- * **average_precision**: `Area under PR curve `__
+ * **neg_log_loss**: The negative of the classification `log loss `__ . Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency. To use this as the objective, :ref:`probability ` must be set to ``True``.
+ * **average_precision**: `Area under PR curve `__
(for binary classification)
- * **roc_auc**: `Area under ROC curve `__
+ * **roc_auc**: `Area under ROC curve `__
(for binary classification)
.. |F1 link| replace:: F\ :sub:`1` score
-.. _F1 link: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
+.. _F1 link: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
.. _int_label_classification_obj:
Regression or classification with integer labels:
- * **unweighted_kappa**: Unweighted `Cohen's kappa `__ (any floating point
+ * **unweighted_kappa**: Unweighted `Cohen's kappa `__ (any floating point
values are rounded to ints)
* **linear_weighted_kappa**: Linear weighted kappa (any floating
point values are rounded to ints)
@@ -827,16 +827,16 @@ Regression or classification with integer labels:
Regression or classification with binary labels:
- * **kendall_tau**: `Kendall's tau `__
- * **pearson**: `Pearson correlation `__
- * **spearman**: `Spearman rank-correlation `__
+ * **kendall_tau**: `Kendall's tau `__
+ * **pearson**: `Pearson correlation `__
+ * **spearman**: `Spearman rank-correlation `__
.. _regression_obj:
Regression:
- * **r2**: `R2 `__
- * **neg_mean_squared_error**: The negative of the `mean squared error `__ regression loss. Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency.
+ * **r2**: `R2 `__
+ * **neg_mean_squared_error**: The negative of the `mean squared error `__ regression loss. Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency.
Defaults to ``['f1_score_micro']``.
@@ -1121,7 +1121,7 @@ specified via command-line arguments instead of in the configuration file:
GridMap options
^^^^^^^^^^^^^^^
-If you have `GridMap `__ installed,
+If you have `GridMap `__ installed,
:program:`run_experiment` will automatically schedule jobs on your DRMAA-
compatible cluster. You can use the following options to customize this
behavior.
@@ -1132,7 +1132,7 @@ behavior.
.. option:: -q , --queue
- Use this queue for `GridMap `__.
+ Use this queue for `GridMap `__.
(default: ``all.q``)
.. option:: -m , --machines
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 9d459623..2c85d7a7 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -21,15 +21,15 @@ Titanic Example
---------------
Let's see how we can apply the basic workflow above to a simple example using
-the `Titantic: Machine Learning from Disaster `__
-data from `Kaggle `__.
+the `Titanic: Machine Learning from Disaster `__
+data from `Kaggle `__.
Get your data into the correct format
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The first step to getting the Titanic data is logging into Kaggle and
-downloading `train.csv `__
-and `test.csv `__.
+downloading `train.csv `__
+and `test.csv `__.
Once you have those files, you'll also want to grab the
`examples folder `__
on our GitHub page and put ``train.csv`` and ``test.csv`` in ``examples``.
@@ -48,7 +48,7 @@ For this tutorial, we will refer to an "experiment" as having a single data set
split into training and testing portions. As part of each
experiment, we can train and test several models, either simultaneously or
sequentially, depending whether we're using
-`GridMap `__ or not.
+`GridMap `__ or not.
This will be described in more detail later on, when we are ready to run our
experiment.
@@ -87,9 +87,9 @@ instances IDs for each example.
The :ref:`Tuning` section defines how we want our model to be tuned. Setting
:ref:`grid_search ` to ``True`` here employs scikit-learn's
-`GridSearchCV `_
+`GridSearchCV `_
class, which is an implementation of the
-`standard, brute-force approach to hyperparameter optimization `_.
+`standard, brute-force approach to hyperparameter optimization `_.
:ref:`objectives ` refers to the desired objective functions; here,
``accuracy`` will optimize for overall accuracy. You can see a list of all the
diff --git a/doc/utilities.rst b/doc/utilities.rst
index 41319140..e509e535 100644
--- a/doc/utilities.rst
+++ b/doc/utilities.rst
@@ -113,17 +113,27 @@ Positional Arguments
Model file to load and use for generating predictions.
-.. option:: input_file
+.. option:: input_file(s)
- A csv file, json file, or megam file (with or without the label column),
- with the appropriate suffix.
+ One or more csv file(s), jsonlines file(s), or megam file(s) (with or without the
+ label column), with the appropriate suffix.
Optional Arguments
^^^^^^^^^^^^^^^^^^
+.. option:: -a, --all_probabilities
+
+ Flag indicating whether to output the probabilities of all labels instead of just
+ the probability of the positive label.
+
+.. option:: -i , --id_col
+
+ Name of the column which contains the instance IDs in ARFF, CSV, or TSV files.
+ (default: ``id``)
+
.. option:: -l , --label_col
Name of the column which contains the labels in ARFF, CSV, or TSV files.
- For ARFF files, this must be the final column to count as the label.
+ For ARFF files, this must be the final column to count as the label.
(default: ``y``)
.. option:: -p , --positive_label
@@ -131,7 +141,8 @@ Optional Arguments
If the model is only being used to predict the probability of a particular
label, this specifies the index of the label we're predicting. 1 = second
label, which is default for binary classification. Keep in mind that labels
- are sorted lexicographically. (default: 1)
+ are sorted lexicographically.
+ (default: 1)
.. option:: -q, --quiet
diff --git a/requirements.txt b/requirements.txt
index b225501a..7cd12fe9 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-scikit-learn==0.19.1
+scikit-learn==0.20.1
six
PrettyTable
beautifulsoup4
diff --git a/requirements_rtd.txt b/requirements_rtd.txt
index 72b70cda..1b63aabb 100644
--- a/requirements_rtd.txt
+++ b/requirements_rtd.txt
@@ -1,7 +1,7 @@
configparser==3.5.0b2
logutils
mock
-scikit-learn==0.19.1
+scikit-learn==0.20.1
six
PrettyTable
beautifulsoup4
diff --git a/skll/config.py b/skll/config.py
index 973d65f4..60318b4c 100644
--- a/skll/config.py
+++ b/skll/config.py
@@ -481,10 +481,11 @@ def _parse_config_file(config_path, log_level=logging.INFO):
# next, get the log path before anything else since we need to
# save all logging messages to a log file in addition to displaying
# them on the console
- log_path = _locate_file(config.get("Output", "log"), config_dir)
- if log_path:
- log_path = join(config_dir, log_path)
- if not exists(log_path):
+ try:
+ log_path = _locate_file(config.get("Output", "log"), config_dir)
+ except IOError as e:
+ if e.errno == errno.ENOENT:
+ log_path = e.filename
os.makedirs(log_path)
# Create a top-level log file under the log path
@@ -731,24 +732,29 @@ def _parse_config_file(config_path, log_level=logging.INFO):
probability = config.getboolean("Output", "probability")
# do we want to keep the predictions?
- prediction_dir = _locate_file(config.get("Output", "predictions"),
- config_dir)
- if prediction_dir:
- if not exists(prediction_dir):
+ # make sure the predictions path exists and if not create it
+ try:
+ prediction_dir = _locate_file(config.get("Output", "predictions"),
+ config_dir)
+ except IOError as e:
+ if e.errno == errno.ENOENT:
+ prediction_dir = e.filename
os.makedirs(prediction_dir)
- # make sure model path exists
- model_path = _locate_file(config.get("Output", "models"), config_dir)
- if model_path:
- model_path = join(config_dir, model_path)
- if not exists(model_path):
+ # make sure model path exists and if not, create it
+ try:
+ model_path = _locate_file(config.get("Output", "models"), config_dir)
+ except IOError as e:
+ if e.errno == errno.ENOENT:
+ model_path = e.filename
os.makedirs(model_path)
# make sure results path exists
- results_path = _locate_file(config.get("Output", "results"), config_dir)
- if results_path:
- results_path = join(config_dir, results_path)
- if not exists(results_path):
+ try:
+ results_path = _locate_file(config.get("Output", "results"), config_dir)
+ except IOError as e:
+ if e.errno == errno.ENOENT:
+ results_path = e.filename
os.makedirs(results_path)
# what are the output metrics?
@@ -872,10 +878,10 @@ def _parse_config_file(config_path, log_level=logging.INFO):
logger.warning("Specifying \"folds_file\" overrides both "
"explicit and default \"grid_search_folds\".")
if task == 'cross_validate':
- logger.warning("Specifying \"folds_file\" overrides both "
- "explicit and default \"num_cv_folds\".")
cv_folds = specified_folds_mapping if specified_folds_mapping else specified_num_folds
if specified_folds_mapping:
+ logger.warning("Specifying \"folds_file\" overrides both "
+ "explicit and default \"num_cv_folds\".")
if use_folds_file_for_grid_search:
grid_search_folds = cv_folds
else:
diff --git a/skll/data/writers.py b/skll/data/writers.py
index d694ff78..88b9f0fd 100644
--- a/skll/data/writers.py
+++ b/skll/data/writers.py
@@ -237,24 +237,24 @@ class DelimitedFileWriter(Writer):
type. For example ``/foo/.csv``.
feature_set : skll.FeatureSet
The ``FeatureSet`` instance to dump to the output file.
- quiet : bool
+ quiet : bool, optional
Do not print "Writing..." status message to stderr.
Defaults to ``True``.
- label_col : str
+ label_col : str, optional
Name of the column which contains the class labels
for ARFF/CSV/TSV files. If no column with that name
exists, or ``None`` is specified, the data is
considered to be unlabelled.
Defaults to ``'y'``.
- id_col : str
+ id_col : str, optional
Name of the column which contains the instance IDs.
If no column with that name exists, or ``None`` is
specified, example IDs will be automatically generated.
Defaults to ``'id'``.
- dialect : str
- Name of the column which contains the class labels for
- CSV/TSV files.
- logger : logging.Logger
+ dialect : str, optional
+ The dialect to use for writing out the delimited file.
+ Defaults to ``'excel-tab'``.
+ logger : logging.Logger, optional
A logger instance to use to log messages instead of creating
a new one by default.
Defaults to ``None``.
@@ -586,9 +586,21 @@ def _write_line(self, id_, label_, feat_dict, output_file):
"""
example_dict = {}
# Don't try to add class column if this is label-less data
+ # Try to convert the label to a scalar assuming it's a numpy
+ # non-scalar type (e.g., int64) but if that doesn't work
+ # then use it as is
if self.feat_set.has_labels:
- example_dict['y'] = np.asscalar(label_)
- example_dict['id'] = np.asscalar(id_)
+ try:
+ example_dict['y'] = label_.item()
+ except AttributeError:
+ example_dict['y'] = label_
+ # Try to convert the ID to a scalar assuming it's a numpy
+ # non-scalar type (e.g., int64) but if that doesn't work
+ # then use it as is
+ try:
+ example_dict['id'] = id_.item()
+ except AttributeError:
+ example_dict['id'] = id_
example_dict["x"] = feat_dict
print(json.dumps(example_dict, sort_keys=True), file=output_file)
diff --git a/skll/experiments.py b/skll/experiments.py
index d6a039cc..22ff3bb1 100644
--- a/skll/experiments.py
+++ b/skll/experiments.py
@@ -77,7 +77,7 @@ class NumpyTypeEncoder(json.JSONEncoder):
be serialized by the json module, so we must convert them to int objects.
A related issue where this was adapted from:
- http://stackoverflow.com/questions/11561932/why-does-json-dumpslistnp-arange5-fail-while-json-dumpsnp-arange5-tolis
+ https://stackoverflow.com/questions/11561932/why-does-json-dumpslistnp-arange5-fail-while-json-dumpsnp-arange5-tolis
"""
def default(self, obj):
@@ -989,7 +989,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
write_summary : bool, optional
Write a TSV file with a summary of the results.
Defaults to ``True``.
- quite : bool, optional
+ quiet : bool, optional
Suppress printing of "Loading..." messages.
Defaults to ``False``.
ablation : int, optional
@@ -1374,13 +1374,13 @@ def _generate_learning_curve_plots(experiment_name,
# each of the featuresets
for fs_name, df_fs in df_melted.groupby('featureset_name'):
fig = plt.figure();
- fig.set_size_inches(2.5*num_learners, 2.5*num_metrics);
+ fig.set_size_inches(2.5 * num_learners, 2.5 * num_metrics);
# compute ylimits for this feature set for each objective
with sns.axes_style('whitegrid', {"grid.linestyle": ':',
"xtick.major.size": 3.0}):
g = sns.FacetGrid(df_fs, row="metric", col="learner_name",
- hue="variable", size=2.5, aspect=1,
+ hue="variable", height=2.5, aspect=1,
margin_titles=True, despine=True, sharex=False,
sharey=False, legend_out=False, palette="Set1")
colors = train_color, test_color = sns.color_palette("Set1")[:2]
diff --git a/skll/learner.py b/skll/learner.py
index ff196f78..f8bd102d 100644
--- a/skll/learner.py
+++ b/skll/learner.py
@@ -20,6 +20,7 @@
from collections import Counter, defaultdict
from functools import wraps
from importlib import import_module
+from itertools import combinations
from multiprocessing import cpu_count
import joblib
@@ -42,6 +43,7 @@
RandomForestClassifier,
RandomForestRegressor)
from sklearn.feature_extraction import FeatureHasher
+from sklearn.feature_extraction import DictVectorizer as OldDictVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.utils.multiclass import type_of_target
# AdditiveChi2Sampler is used indirectly, so ignore linting message
@@ -74,6 +76,7 @@
from sklearn.utils import shuffle as sk_shuffle
from skll.data import FeatureSet
+from skll.data.dict_vectorizer import DictVectorizer
from skll.metrics import _CORRELATION_METRICS, use_score_func
from skll.version import VERSION
@@ -196,8 +199,6 @@
'neg_log_loss'])
_REQUIRES_DENSE = (BayesianRidge,
- GradientBoostingClassifier,
- GradientBoostingRegressor,
Lars,
TheilSenRegressor)
@@ -856,6 +857,7 @@ def __init__(self, model_type, probability=False, feature_scaling='none',
if issubclass(self._model_type, SVC):
self._model_kwargs['cache_size'] = 1000
self._model_kwargs['probability'] = self.probability
+ self._model_kwargs['gamma'] = 'auto'
if self.probability:
self.logger.warning('Because LibSVM does an internal '
'cross-validation to produce probabilities, '
@@ -868,14 +870,22 @@ def __init__(self, model_type, probability=False, feature_scaling='none',
self._model_kwargs['n_estimators'] = 500
elif issubclass(self._model_type, SVR):
self._model_kwargs['cache_size'] = 1000
+ self._model_kwargs['gamma'] = 'auto'
elif issubclass(self._model_type, SGDClassifier):
self._model_kwargs['loss'] = 'log'
+ self._model_kwargs['max_iter'] = None
+ self._model_kwargs['tol'] = None
+ elif issubclass(self._model_type, SGDRegressor):
+ self._model_kwargs['max_iter'] = None
+ self._model_kwargs['tol'] = None
elif issubclass(self._model_type, RANSACRegressor):
self._model_kwargs['loss'] = 'squared_loss'
elif issubclass(self._model_type, (MLPClassifier, MLPRegressor)):
self._model_kwargs['learning_rate'] = 'invscaling'
self._model_kwargs['max_iter'] = 500
-
+ elif issubclass(self._model_type, LogisticRegression):
+ self._model_kwargs['solver'] = 'liblinear'
+ self._model_kwargs['multi_class'] = 'auto'
if issubclass(self._model_type,
(AdaBoostClassifier, AdaBoostRegressor,
@@ -913,15 +923,24 @@ def __init__(self, model_type, probability=False, feature_scaling='none',
AdaBoostClassifier,
RANSACRegressor)) and ('base_estimator' in model_kwargs):
base_estimator_name = model_kwargs['base_estimator']
- base_estimator_kwargs = {} if base_estimator_name in ['LinearRegression',
- 'MultinomialNB',
- 'SVR'] else {'random_state': 123456789}
+ if base_estimator_name in ['LinearRegression', 'MultinomialNB']:
+ base_estimator_kwargs = {}
+ elif base_estimator_name in ['SGDClassifier', 'SGDRegressor']:
+ base_estimator_kwargs = {'max_iter': None,
+ 'tol': None,
+ 'random_state': 123456789}
+ elif base_estimator_name == 'SVR':
+ base_estimator_kwargs = {'gamma': 'auto'}
+ elif base_estimator_name == 'SVC':
+ base_estimator_kwargs = {'gamma': 'auto', 'random_state': 123456789}
+ else:
+ base_estimator_kwargs = {'random_state': 123456789}
base_estimator = globals()[base_estimator_name](**base_estimator_kwargs)
model_kwargs['base_estimator'] = base_estimator
self._model_kwargs.update(model_kwargs)
@classmethod
- def from_file(cls, learner_path):
+ def from_file(cls, learner_path, logger=None):
"""
Load a saved ``Learner`` instance from a file path.
@@ -929,6 +948,9 @@ def from_file(cls, learner_path):
----------
learner_path : str
The path to a saved ``Learner`` instance file.
+ logger : logging object, optional
+ A logging object. If ``None`` is passed, get logger from ``__name__``.
+ Defaults to ``None``.
Returns
-------
@@ -944,6 +966,10 @@ def from_file(cls, learner_path):
"""
skll_version, learner = joblib.load(learner_path)
+ # set the learner's logger attribute to the logger that's passed in
+ # or, if nothing was passed in, to a logger obtained from `__name__`
+ learner.logger = logger if logger else logging.getLogger(__name__)
+
# For backward compatibility, convert string model types to labels.
if isinstance(learner._model_type, string_types):
learner._model_type = globals()[learner._model_type]
@@ -1073,6 +1099,30 @@ def model_params(self):
elif self.model.intercept_.any():
intercept = dict(zip(label_list, self.model.intercept_))
+ # for SVCs with linear kernels, we want to print out the primal
+ # weights - that is, the weights for each feature for each one-vs-one
+ # binary classifier. These are the weights contained in the `coef_`
+ # attribute of the underlying scikit-learn model. This is a matrix that
+ # has the shape [(n_classes)*(n_classes -1)/2, n_features] since there
+ # are C(n_classes, 2) = n_classes*(n_classes-1)/2 one-vs-one classifiers
+ # and each one has weights for each of the features. According to the
+ # scikit-learn user guide and the code for the function `_one_vs_one_coef()`
+ # in `svm/base.py`, the order of the rows is as follows: "0 vs 1",
+ # "0 vs 2", ... "0 vs n", "1 vs 2", "1 vs 3", ... "1 vs n", ... "n-1 vs n".
+ elif isinstance(self._model, SVC) and self._model.kernel == 'linear':
+ intercept = {}
+ for i, class_pair in enumerate(combinations(range(len(self.label_list)), 2)):
+ coef = self.model.coef_[i]
+ coef = coef.toarray()
+ coef = self.feat_selector.inverse_transform(coef)[0]
+ class1 = self.label_list[class_pair[0]]
+ class2 = self.label_list[class_pair[1]]
+ for feat, idx in iteritems(self.feat_vectorizer.vocabulary_):
+ if coef[idx]:
+ res['{}-vs-{}\t{}'.format(class1, class2, feat)] = coef[idx]
+
+ intercept['{}-vs-{}'.format(class1, class2)] = self.model.intercept_[i]
+
else:
# not supported
raise ValueError(("{} is not supported by" +
@@ -1113,7 +1163,8 @@ def __getstate__(self):
because we cannot pickle loggers.
"""
attribute_dict = dict(self.__dict__)
- del attribute_dict['logger']
+ if 'logger' in attribute_dict:
+ del attribute_dict['logger']
return attribute_dict
def save(self, learner_path):
@@ -1683,12 +1734,57 @@ def predict(self, examples, prediction_prefix=None, append=False,
# Need to do some transformations so the features are in the right
# columns for the test set. Obviously a bit hacky, but storing things
# in sparse matrices saves memory over our old list of dicts approach.
- if isinstance(self.feat_vectorizer, FeatureHasher):
- if (self.feat_vectorizer.n_features !=
- examples.vectorizer.n_features):
+
+ # We also need to think about the various combinations of the model
+ # vectorizer and the vectorizer for the set for which we want to make
+ # predictions:
+
+ # 1. Both vectorizers are DictVectorizers. If they use different sets
+ # of features, we raise a warning and transform the features of the
+ # prediction set from its space to the trained model space.
+
+ # 2. Both vectorizers are FeatureHashers. If they use a different number
+ # of feature bins, we should just raise an error since there's no
+ # inverse_transform() available for a FeatureHasher - the hash function
+ # is not reversible.
+
+ # 3. The model vectorizer is a FeatureHasher but the prediction feature
+ # set vectorizer is a DictVectorizer. We should be able to handle this
+ # case, since we can just call inverse_transform() on the DictVectorizer
+ # and then transform() on the FeatureHasher?
+
+ # 4. The model vectorizer is a DictVectorizer but the prediction feature
+ # set vectorizer is a FeatureHasher. Again, we should raise an error here
+ # since there's no inverse available for the hasher.
+ model_is_dict = isinstance(self.feat_vectorizer,
+ (DictVectorizer, OldDictVectorizer))
+ model_is_hasher = isinstance(self.feat_vectorizer, FeatureHasher)
+ data_is_dict = isinstance(examples.vectorizer,
+ (DictVectorizer, OldDictVectorizer))
+ data_is_hasher = isinstance(examples.vectorizer, FeatureHasher)
+
+ both_dicts = model_is_dict and data_is_dict
+ both_hashers = model_is_hasher and data_is_hasher
+ model_hasher_and_data_dict = model_is_hasher and data_is_dict
+ model_dict_and_data_hasher = model_is_dict and data_is_hasher
+
+ # 1. both are DictVectorizers
+ if both_dicts:
+ if (set(self.feat_vectorizer.feature_names_) !=
+ set(examples.vectorizer.feature_names_)):
self.logger.warning("There is mismatch between the training model "
- "features and the data passed to predict.")
+ "features and the data passed to predict. The "
+ "prediction features will be transformed to "
+ "the trained model space.")
+ if self.feat_vectorizer == examples.vectorizer:
+ xtest = examples.features
+ else:
+ xtest = self.feat_vectorizer.transform(
+ examples.vectorizer.inverse_transform(
+ examples.features))
+ # 2. both are FeatureHashers
+ elif both_hashers:
self_feat_vec_tuple = (self.feat_vectorizer.dtype,
self.feat_vectorizer.input_type,
self.feat_vectorizer.n_features,
@@ -1701,21 +1797,23 @@ def predict(self, examples, prediction_prefix=None, append=False,
if self_feat_vec_tuple == example_feat_vec_tuple:
xtest = examples.features
else:
- xtest = self.feat_vectorizer.transform(
- examples.vectorizer.inverse_transform(
- examples.features))
- else:
- if (set(self.feat_vectorizer.feature_names_) !=
- set(examples.vectorizer.feature_names_)):
- self.logger.warning("There is mismatch between the training model "
- "features and the data passed to predict.")
- if self.feat_vectorizer == examples.vectorizer:
- xtest = examples.features
- else:
-
- xtest = self.feat_vectorizer.transform(
- examples.vectorizer.inverse_transform(
- examples.features))
+ self.logger.error('There is mismatch between the FeatureHasher '
+ 'configuration for the training data and '
+ 'the configuration for the data passed to predict')
+ raise RuntimeError('Mismatched hasher configurations')
+
+ # 3. model is a FeatureHasher and test set is a DictVectorizer
+ elif model_hasher_and_data_dict:
+ xtest = self.feat_vectorizer.transform(
+ examples.vectorizer.inverse_transform(
+ examples.features))
+
+ # 4. model is a DictVectorizer and test set is a FeatureHasher
+ elif model_dict_and_data_hasher:
+ self.logger.error('Cannot predict with a model using a '
+ 'DictVectorizer on data that uses '
+ 'a FeatureHasher')
+ raise RuntimeError('Cannot use FeatureHasher for data')
# filter features based on those selected from training set
xtest = self.feat_selector.transform(xtest)
diff --git a/skll/utilities/generate_predictions.py b/skll/utilities/generate_predictions.py
index 97d9aefb..514ff680 100755
--- a/skll/utilities/generate_predictions.py
+++ b/skll/utilities/generate_predictions.py
@@ -14,6 +14,7 @@
import argparse
import logging
import os
+import sys
from skll.data.readers import EXT_TO_READER
from skll.learner import Learner
@@ -26,7 +27,8 @@ class Predictor(object):
predictions for feature strings.
"""
- def __init__(self, model_path, threshold=None, positive_label=1):
+ def __init__(self, model_path, threshold=None, positive_label=1,
+ all_labels=False, logger=None):
"""
Initialize the predictor.
@@ -46,10 +48,24 @@ def __init__(self, model_path, threshold=None, positive_label=1):
predicting. 1 = second class, which is default
for binary classification.
Defaults to 1.
+ all_labels: bool, optional
+ A flag indicating whether to return the probabilities for all
+ labels in each row instead of just returning the probability of
+ `positive_label`. Defaults to ``False``.
+ logger : logging object, optional
+ A logging object. If ``None`` is passed, get logger from ``__name__``.
+ Defaults to ``None``.
"""
+ # self.logger = logger if logger else logging.getLogger(__name__)
+ if threshold is not None and all_labels:
+ raise ValueError("`threshold` and `all_labels` are mutually "
+ "exclusive. They can not both be set to True.")
+
self._learner = Learner.from_file(model_path)
self._pos_index = positive_label
self.threshold = threshold
+ self.all_labels = all_labels
+ self.output_file_header = None
def predict(self, data):
"""
@@ -67,18 +83,29 @@ def predict(self, data):
# compute the predictions from the learner
preds = self._learner.predict(data)
preds = preds.tolist()
+ labels = self._learner.label_list
+ # Create file header list, and transform predictions as needed
+ # depending on the specified prediction arguments.
if self._learner.probability:
- if self.threshold is None:
- return [pred[self._pos_index] for pred in preds]
+ if self.all_labels:
+ self.output_file_header = ["id"] + [str(x) for x in labels]
+ elif self.threshold is None:
+ label = self._learner.label_dict[self._pos_index]
+ self.output_file_header = ["id",
+ "Probability of '{}'".format(label)]
+ preds = [pred[self._pos_index] for pred in preds]
else:
- return [int(pred[self._pos_index] >= self.threshold)
- for pred in preds]
+ self.output_file_header = ["id", "prediction"]
+ preds = [int(pred[self._pos_index] >= self.threshold)
+ for pred in preds]
elif self._learner.model._estimator_type == 'regressor':
- return preds
+ self.output_file_header = ["id", "prediction"]
else:
- return [self._learner.label_list[pred if isinstance(pred, int) else
- int(pred[0])] for pred in preds]
+ self.output_file_header = ["id", "prediction"]
+ preds = [labels[pred if isinstance(pred, int) else int(pred[0])]
+ for pred in preds]
+ return preds
def main(argv=None):
@@ -99,41 +126,53 @@ def main(argv=None):
formatter_class=argparse.ArgumentDefaultsHelpFormatter,
conflict_handler='resolve')
parser.add_argument('model_file',
- help='Model file to load and use for generating \
- predictions.')
- parser.add_argument('input_file',
- help='A csv file, json file, or megam file \
- (with or without the label column), \
- with the appropriate suffix.',
+ help='Model file to load and use for generating '
+ 'predictions.')
+ parser.add_argument('input_files',
+ help='A space-separated list of csv file, json file, '
+ 'or megam file (with or without the label '
+ 'column), with the appropriate suffix.',
nargs='+')
parser.add_argument('-i', '--id_col',
- help='Name of the column which contains the instance \
- IDs in ARFF, CSV, or TSV files.',
+ help='Name of the column which contains the instance '
+ 'IDs in ARFF, CSV, or TSV files.',
default='id')
parser.add_argument('-l', '--label_col',
- help='Name of the column which contains the labels\
- in ARFF, CSV, or TSV files. For ARFF files, this\
- must be the final column to count as the label.',
+ help='Name of the column which contains the labels '
+ 'in ARFF, CSV, or TSV files. For ARFF files, '
+ 'this must be the final column to count as the '
+ 'label.',
default='y')
parser.add_argument('-p', '--positive_label',
- help="If the model is only being used to predict the \
- probability of a particular label, this \
- specifies the index of the label we're \
- predicting. 1 = second label, which is default \
- for binary classification. Keep in mind that \
- labels are sorted lexicographically.",
+ help="If the model is only being used to predict the "
+ "probability of a particular label, this "
+ "specifies the index of the label we're "
+ "predicting. 1 = second label, which is default "
+ "for binary classification. Keep in mind that "
+ "labels are sorted lexicographically.",
default=1, type=int)
parser.add_argument('-q', '--quiet',
help='Suppress printing of "Loading..." messages.',
action='store_true')
- parser.add_argument('-t', '--threshold',
- help="If the model we're using is generating \
- probabilities of the positive label, return 1 \
- if it meets/exceeds the given threshold and 0 \
- otherwise.",
- type=float)
+ parser.add_argument('--output_file', '-o',
+ help="Path to output tsv file. If not specified, "
+ "predictions will be printed to stdout.")
parser.add_argument('--version', action='version',
version='%(prog)s {0}'.format(__version__))
+ probability_handling = parser.add_mutually_exclusive_group()
+ probability_handling.add_argument('-t', '--threshold',
+ help="If the model we're using is "
+ "generating probabilities of the "
+ "positive label, return 1 if it "
+ "meets/exceeds the given threshold "
+ "and 0 otherwise.", type=float)
+ probability_handling.add_argument('--all_probabilities', '-a',
+ action='store_true',
+ help="Flag indicating whether to output "
+ "the probabilities of all labels "
+ "instead of just the probability "
+ "of the positive label.")
+
args = parser.parse_args(argv)
# Make warnings from built-in warnings module get formatted more nicely
@@ -145,10 +184,12 @@ def main(argv=None):
# Create the classifier and load the model
predictor = Predictor(args.model_file,
positive_label=args.positive_label,
- threshold=args.threshold)
+ threshold=args.threshold,
+ all_labels=args.all_probabilities,
+ logger=logger)
# Iterate over all the specified input files
- for input_file in args.input_file:
+ for i, input_file in enumerate(args.input_files):
# make sure each file extension is one we can process
input_extension = os.path.splitext(input_file)[1].lower()
@@ -164,8 +205,34 @@ def main(argv=None):
label_col=args.label_col,
id_col=args.id_col)
feature_set = reader.read()
- for pred in predictor.predict(feature_set):
- print(pred)
+ preds = predictor.predict(feature_set)
+ header = predictor.output_file_header
+
+ if args.output_file is not None:
+ with open(args.output_file, 'a') as outputfh:
+ if i == 0: # Only write header once per set of input files
+ print("\t".join(header), file=outputfh)
+ if args.all_probabilities:
+ for j, probabilities in enumerate(preds):
+ id_ = feature_set.ids[j]
+ probs_str = "\t".join([str(p) for p in probabilities])
+ print("{}\t{}".format(id_, probs_str), file=outputfh)
+ else:
+ for j, pred in enumerate(preds):
+ id_ = feature_set.ids[j]
+ print("{}\t{}".format(id_, pred), file=outputfh)
+ else:
+ if i == 0: # Only write header once per set of input files
+ print("\t".join(header))
+ if args.all_probabilities:
+ for j, probabilities in enumerate(preds):
+ id_ = feature_set.ids[j]
+ probs_str = "\t".join([str(p) for p in probabilities])
+ print("{}\t{}".format(id_, probs_str))
+ else:
+ for j, pred in enumerate(preds):
+ id_ = feature_set.ids[j]
+ print("{}\t{}".format(id_, pred))
if __name__ == '__main__':
diff --git a/skll/version.py b/skll/version.py
index b67290bc..3bb8833e 100644
--- a/skll/version.py
+++ b/skll/version.py
@@ -7,5 +7,5 @@
:organization: ETS
"""
-__version__ = '1.5'
+__version__ = '1.5.3'
VERSION = tuple(int(x) for x in __version__.split('.'))
diff --git a/tests/configs/test_single_file_saved_subset.template.cfg b/tests/configs/test_single_file_saved_subset.template.cfg
new file mode 100644
index 00000000..de8e2b48
--- /dev/null
+++ b/tests/configs/test_single_file_saved_subset.template.cfg
@@ -0,0 +1,11 @@
+[General]
+experiment_name=train_test_single_file
+task=evaluate
+
+[Input]
+learners=["RandomForestClassifier"]
+
+[Tuning]
+
+[Output]
+probability=false
diff --git a/tests/other/examples_test.jsonlines b/tests/other/examples_test.jsonlines
new file mode 100644
index 00000000..6d344cc6
--- /dev/null
+++ b/tests/other/examples_test.jsonlines
@@ -0,0 +1,50 @@
+{"id": "EXAMPLE_73", "y": "versicolor", "x": {"f0": 6.1, "f1": 2.8, "f2": 4.7, "f3": 1.2}}
+{"id": "EXAMPLE_18", "y": "setosa", "x": {"f0": 5.7, "f1": 3.8, "f2": 1.7, "f3": 0.3}}
+{"id": "EXAMPLE_118", "y": "virginica", "x": {"f0": 7.7, "f1": 2.6, "f2": 6.9, "f3": 2.3}}
+{"id": "EXAMPLE_78", "y": "versicolor", "x": {"f0": 6.0, "f1": 2.9, "f2": 4.5, "f3": 1.5}}
+{"id": "EXAMPLE_76", "y": "versicolor", "x": {"f0": 6.8, "f1": 2.8, "f2": 4.8, "f3": 1.4}}
+{"id": "EXAMPLE_31", "y": "setosa", "x": {"f0": 5.4, "f1": 3.4, "f2": 1.5, "f3": 0.4}}
+{"id": "EXAMPLE_64", "y": "versicolor", "x": {"f0": 5.6, "f1": 2.9, "f2": 3.6, "f3": 1.3}}
+{"id": "EXAMPLE_141", "y": "virginica", "x": {"f0": 6.9, "f1": 3.1, "f2": 5.1, "f3": 2.3}}
+{"id": "EXAMPLE_68", "y": "versicolor", "x": {"f0": 6.2, "f1": 2.2, "f2": 4.5, "f3": 1.5}}
+{"id": "EXAMPLE_82", "y": "versicolor", "x": {"f0": 5.8, "f1": 2.7, "f2": 3.9, "f3": 1.2}}
+{"id": "EXAMPLE_110", "y": "virginica", "x": {"f0": 6.5, "f1": 3.2, "f2": 5.1, "f3": 2.0}}
+{"id": "EXAMPLE_12", "y": "setosa", "x": {"f0": 4.8, "f1": 3.0, "f2": 1.4, "f3": 0.1}}
+{"id": "EXAMPLE_36", "y": "setosa", "x": {"f0": 5.5, "f1": 3.5, "f2": 1.3, "f3": 0.2}}
+{"id": "EXAMPLE_9", "y": "setosa", "x": {"f0": 4.9, "f1": 3.1, "f2": 1.5, "f3": 0.1}}
+{"id": "EXAMPLE_19", "y": "setosa", "x": {"f0": 5.1, "f1": 3.8, "f2": 1.5, "f3": 0.3}}
+{"id": "EXAMPLE_56", "y": "versicolor", "x": {"f0": 6.3, "f1": 3.3, "f2": 4.7, "f3": 1.6}}
+{"id": "EXAMPLE_104", "y": "virginica", "x": {"f0": 6.5, "f1": 3.0, "f2": 5.8, "f3": 2.2}}
+{"id": "EXAMPLE_69", "y": "versicolor", "x": {"f0": 5.6, "f1": 2.5, "f2": 3.9, "f3": 1.1}}
+{"id": "EXAMPLE_55", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.8, "f2": 4.5, "f3": 1.3}}
+{"id": "EXAMPLE_132", "y": "virginica", "x": {"f0": 6.4, "f1": 2.8, "f2": 5.6, "f3": 2.2}}
+{"id": "EXAMPLE_29", "y": "setosa", "x": {"f0": 4.7, "f1": 3.2, "f2": 1.6, "f3": 0.2}}
+{"id": "EXAMPLE_127", "y": "virginica", "x": {"f0": 6.1, "f1": 3.0, "f2": 4.9, "f3": 1.8}}
+{"id": "EXAMPLE_26", "y": "setosa", "x": {"f0": 5.0, "f1": 3.4, "f2": 1.6, "f3": 0.4}}
+{"id": "EXAMPLE_128", "y": "virginica", "x": {"f0": 6.4, "f1": 2.8, "f2": 5.6, "f3": 2.1}}
+{"id": "EXAMPLE_131", "y": "virginica", "x": {"f0": 7.9, "f1": 3.8, "f2": 6.4, "f3": 2.0}}
+{"id": "EXAMPLE_145", "y": "virginica", "x": {"f0": 6.7, "f1": 3.0, "f2": 5.2, "f3": 2.3}}
+{"id": "EXAMPLE_108", "y": "virginica", "x": {"f0": 6.7, "f1": 2.5, "f2": 5.8, "f3": 1.8}}
+{"id": "EXAMPLE_143", "y": "virginica", "x": {"f0": 6.8, "f1": 3.2, "f2": 5.9, "f3": 2.3}}
+{"id": "EXAMPLE_45", "y": "setosa", "x": {"f0": 4.8, "f1": 3.0, "f2": 1.4, "f3": 0.3}}
+{"id": "EXAMPLE_30", "y": "setosa", "x": {"f0": 4.8, "f1": 3.1, "f2": 1.6, "f3": 0.2}}
+{"id": "EXAMPLE_22", "y": "setosa", "x": {"f0": 4.6, "f1": 3.6, "f2": 1.0, "f3": 0.2}}
+{"id": "EXAMPLE_15", "y": "setosa", "x": {"f0": 5.7, "f1": 4.4, "f2": 1.5, "f3": 0.4}}
+{"id": "EXAMPLE_65", "y": "versicolor", "x": {"f0": 6.7, "f1": 3.1, "f2": 4.4, "f3": 1.4}}
+{"id": "EXAMPLE_11", "y": "setosa", "x": {"f0": 4.8, "f1": 3.4, "f2": 1.6, "f3": 0.2}}
+{"id": "EXAMPLE_42", "y": "setosa", "x": {"f0": 4.4, "f1": 3.2, "f2": 1.3, "f3": 0.2}}
+{"id": "EXAMPLE_146", "y": "virginica", "x": {"f0": 6.3, "f1": 2.5, "f2": 5.0, "f3": 1.9}}
+{"id": "EXAMPLE_51", "y": "versicolor", "x": {"f0": 6.4, "f1": 3.2, "f2": 4.5, "f3": 1.5}}
+{"id": "EXAMPLE_27", "y": "setosa", "x": {"f0": 5.2, "f1": 3.5, "f2": 1.5, "f3": 0.2}}
+{"id": "EXAMPLE_4", "y": "setosa", "x": {"f0": 5.0, "f1": 3.6, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_32", "y": "setosa", "x": {"f0": 5.2, "f1": 4.1, "f2": 1.5, "f3": 0.1}}
+{"id": "EXAMPLE_142", "y": "virginica", "x": {"f0": 5.8, "f1": 2.7, "f2": 5.1, "f3": 1.9}}
+{"id": "EXAMPLE_85", "y": "versicolor", "x": {"f0": 6.0, "f1": 3.4, "f2": 4.5, "f3": 1.6}}
+{"id": "EXAMPLE_86", "y": "versicolor", "x": {"f0": 6.7, "f1": 3.1, "f2": 4.7, "f3": 1.5}}
+{"id": "EXAMPLE_16", "y": "setosa", "x": {"f0": 5.4, "f1": 3.9, "f2": 1.3, "f3": 0.4}}
+{"id": "EXAMPLE_10", "y": "setosa", "x": {"f0": 5.4, "f1": 3.7, "f2": 1.5, "f3": 0.2}}
+{"id": "EXAMPLE_81", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.4, "f2": 3.7, "f3": 1.0}}
+{"id": "EXAMPLE_133", "y": "virginica", "x": {"f0": 6.3, "f1": 2.8, "f2": 5.1, "f3": 1.5}}
+{"id": "EXAMPLE_137", "y": "virginica", "x": {"f0": 6.4, "f1": 3.1, "f2": 5.5, "f3": 1.8}}
+{"id": "EXAMPLE_75", "y": "versicolor", "x": {"f0": 6.6, "f1": 3.0, "f2": 4.4, "f3": 1.4}}
+{"id": "EXAMPLE_109", "y": "virginica", "x": {"f0": 7.2, "f1": 3.6, "f2": 6.1, "f3": 2.5}}
diff --git a/tests/other/examples_train.jsonlines b/tests/other/examples_train.jsonlines
new file mode 100644
index 00000000..2d403b02
--- /dev/null
+++ b/tests/other/examples_train.jsonlines
@@ -0,0 +1,100 @@
+{"id": "EXAMPLE_96", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.9, "f2": 4.2, "f3": 1.3}}
+{"id": "EXAMPLE_105", "y": "virginica", "x": {"f0": 7.6, "f1": 3.0, "f2": 6.6, "f3": 2.1}}
+{"id": "EXAMPLE_66", "y": "versicolor", "x": {"f0": 5.6, "f1": 3.0, "f2": 4.5, "f3": 1.5}}
+{"id": "EXAMPLE_0", "y": "setosa", "x": {"f0": 5.1, "f1": 3.5, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_122", "y": "virginica", "x": {"f0": 7.7, "f1": 2.8, "f2": 6.7, "f3": 2.0}}
+{"id": "EXAMPLE_67", "y": "versicolor", "x": {"f0": 5.8, "f1": 2.7, "f2": 4.1, "f3": 1.0}}
+{"id": "EXAMPLE_28", "y": "setosa", "x": {"f0": 5.2, "f1": 3.4, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_40", "y": "setosa", "x": {"f0": 5.0, "f1": 3.5, "f2": 1.3, "f3": 0.3}}
+{"id": "EXAMPLE_44", "y": "setosa", "x": {"f0": 5.1, "f1": 3.8, "f2": 1.9, "f3": 0.4}}
+{"id": "EXAMPLE_60", "y": "versicolor", "x": {"f0": 5.0, "f1": 2.0, "f2": 3.5, "f3": 1.0}}
+{"id": "EXAMPLE_123", "y": "virginica", "x": {"f0": 6.3, "f1": 2.7, "f2": 4.9, "f3": 1.8}}
+{"id": "EXAMPLE_24", "y": "setosa", "x": {"f0": 4.8, "f1": 3.4, "f2": 1.9, "f3": 0.2}}
+{"id": "EXAMPLE_25", "y": "setosa", "x": {"f0": 5.0, "f1": 3.0, "f2": 1.6, "f3": 0.2}}
+{"id": "EXAMPLE_23", "y": "setosa", "x": {"f0": 5.1, "f1": 3.3, "f2": 1.7, "f3": 0.5}}
+{"id": "EXAMPLE_94", "y": "versicolor", "x": {"f0": 5.6, "f1": 2.7, "f2": 4.2, "f3": 1.3}}
+{"id": "EXAMPLE_39", "y": "setosa", "x": {"f0": 5.1, "f1": 3.4, "f2": 1.5, "f3": 0.2}}
+{"id": "EXAMPLE_95", "y": "versicolor", "x": {"f0": 5.7, "f1": 3.0, "f2": 4.2, "f3": 1.2}}
+{"id": "EXAMPLE_117", "y": "virginica", "x": {"f0": 7.7, "f1": 3.8, "f2": 6.7, "f3": 2.2}}
+{"id": "EXAMPLE_47", "y": "setosa", "x": {"f0": 4.6, "f1": 3.2, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_97", "y": "versicolor", "x": {"f0": 6.2, "f1": 2.9, "f2": 4.3, "f3": 1.3}}
+{"id": "EXAMPLE_113", "y": "virginica", "x": {"f0": 5.7, "f1": 2.5, "f2": 5.0, "f3": 2.0}}
+{"id": "EXAMPLE_33", "y": "setosa", "x": {"f0": 5.5, "f1": 4.2, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_138", "y": "virginica", "x": {"f0": 6.0, "f1": 3.0, "f2": 4.8, "f3": 1.8}}
+{"id": "EXAMPLE_101", "y": "virginica", "x": {"f0": 5.8, "f1": 2.7, "f2": 5.1, "f3": 1.9}}
+{"id": "EXAMPLE_62", "y": "versicolor", "x": {"f0": 6.0, "f1": 2.2, "f2": 4.0, "f3": 1.0}}
+{"id": "EXAMPLE_84", "y": "versicolor", "x": {"f0": 5.4, "f1": 3.0, "f2": 4.5, "f3": 1.5}}
+{"id": "EXAMPLE_148", "y": "virginica", "x": {"f0": 6.2, "f1": 3.4, "f2": 5.4, "f3": 2.3}}
+{"id": "EXAMPLE_53", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.3, "f2": 4.0, "f3": 1.3}}
+{"id": "EXAMPLE_5", "y": "setosa", "x": {"f0": 5.4, "f1": 3.9, "f2": 1.7, "f3": 0.4}}
+{"id": "EXAMPLE_93", "y": "versicolor", "x": {"f0": 5.0, "f1": 2.3, "f2": 3.3, "f3": 1.0}}
+{"id": "EXAMPLE_111", "y": "virginica", "x": {"f0": 6.4, "f1": 2.7, "f2": 5.3, "f3": 1.9}}
+{"id": "EXAMPLE_49", "y": "setosa", "x": {"f0": 5.0, "f1": 3.3, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_35", "y": "setosa", "x": {"f0": 5.0, "f1": 3.2, "f2": 1.2, "f3": 0.2}}
+{"id": "EXAMPLE_80", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.4, "f2": 3.8, "f3": 1.1}}
+{"id": "EXAMPLE_77", "y": "versicolor", "x": {"f0": 6.7, "f1": 3.0, "f2": 5.0, "f3": 1.7}}
+{"id": "EXAMPLE_34", "y": "setosa", "x": {"f0": 4.9, "f1": 3.1, "f2": 1.5, "f3": 0.1}}
+{"id": "EXAMPLE_114", "y": "virginica", "x": {"f0": 5.8, "f1": 2.8, "f2": 5.1, "f3": 2.4}}
+{"id": "EXAMPLE_7", "y": "setosa", "x": {"f0": 5.0, "f1": 3.4, "f2": 1.5, "f3": 0.2}}
+{"id": "EXAMPLE_43", "y": "setosa", "x": {"f0": 5.0, "f1": 3.5, "f2": 1.6, "f3": 0.6}}
+{"id": "EXAMPLE_70", "y": "versicolor", "x": {"f0": 5.9, "f1": 3.2, "f2": 4.8, "f3": 1.8}}
+{"id": "EXAMPLE_98", "y": "versicolor", "x": {"f0": 5.1, "f1": 2.5, "f2": 3.0, "f3": 1.1}}
+{"id": "EXAMPLE_120", "y": "virginica", "x": {"f0": 6.9, "f1": 3.2, "f2": 5.7, "f3": 2.3}}
+{"id": "EXAMPLE_83", "y": "versicolor", "x": {"f0": 6.0, "f1": 2.7, "f2": 5.1, "f3": 1.6}}
+{"id": "EXAMPLE_134", "y": "virginica", "x": {"f0": 6.1, "f1": 2.6, "f2": 5.6, "f3": 1.4}}
+{"id": "EXAMPLE_135", "y": "virginica", "x": {"f0": 7.7, "f1": 3.0, "f2": 6.1, "f3": 2.3}}
+{"id": "EXAMPLE_89", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.5, "f2": 4.0, "f3": 1.3}}
+{"id": "EXAMPLE_8", "y": "setosa", "x": {"f0": 4.4, "f1": 2.9, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_13", "y": "setosa", "x": {"f0": 4.3, "f1": 3.0, "f2": 1.1, "f3": 0.1}}
+{"id": "EXAMPLE_119", "y": "virginica", "x": {"f0": 6.0, "f1": 2.2, "f2": 5.0, "f3": 1.5}}
+{"id": "EXAMPLE_125", "y": "virginica", "x": {"f0": 7.2, "f1": 3.2, "f2": 6.0, "f3": 1.8}}
+{"id": "EXAMPLE_3", "y": "setosa", "x": {"f0": 4.6, "f1": 3.1, "f2": 1.5, "f3": 0.2}}
+{"id": "EXAMPLE_17", "y": "setosa", "x": {"f0": 5.1, "f1": 3.5, "f2": 1.4, "f3": 0.3}}
+{"id": "EXAMPLE_38", "y": "setosa", "x": {"f0": 4.4, "f1": 3.0, "f2": 1.3, "f3": 0.2}}
+{"id": "EXAMPLE_72", "y": "versicolor", "x": {"f0": 6.3, "f1": 2.5, "f2": 4.9, "f3": 1.5}}
+{"id": "EXAMPLE_136", "y": "virginica", "x": {"f0": 6.3, "f1": 3.4, "f2": 5.6, "f3": 2.4}}
+{"id": "EXAMPLE_6", "y": "setosa", "x": {"f0": 4.6, "f1": 3.4, "f2": 1.4, "f3": 0.3}}
+{"id": "EXAMPLE_112", "y": "virginica", "x": {"f0": 6.8, "f1": 3.0, "f2": 5.5, "f3": 2.1}}
+{"id": "EXAMPLE_100", "y": "virginica", "x": {"f0": 6.3, "f1": 3.3, "f2": 6.0, "f3": 2.5}}
+{"id": "EXAMPLE_2", "y": "setosa", "x": {"f0": 4.7, "f1": 3.2, "f2": 1.3, "f3": 0.2}}
+{"id": "EXAMPLE_63", "y": "versicolor", "x": {"f0": 6.1, "f1": 2.9, "f2": 4.7, "f3": 1.4}}
+{"id": "EXAMPLE_54", "y": "versicolor", "x": {"f0": 6.5, "f1": 2.8, "f2": 4.6, "f3": 1.5}}
+{"id": "EXAMPLE_126", "y": "virginica", "x": {"f0": 6.2, "f1": 2.8, "f2": 4.8, "f3": 1.8}}
+{"id": "EXAMPLE_50", "y": "versicolor", "x": {"f0": 7.0, "f1": 3.2, "f2": 4.7, "f3": 1.4}}
+{"id": "EXAMPLE_115", "y": "virginica", "x": {"f0": 6.4, "f1": 3.2, "f2": 5.3, "f3": 2.3}}
+{"id": "EXAMPLE_46", "y": "setosa", "x": {"f0": 5.1, "f1": 3.8, "f2": 1.6, "f3": 0.2}}
+{"id": "EXAMPLE_139", "y": "virginica", "x": {"f0": 6.9, "f1": 3.1, "f2": 5.4, "f3": 2.1}}
+{"id": "EXAMPLE_61", "y": "versicolor", "x": {"f0": 5.9, "f1": 3.0, "f2": 4.2, "f3": 1.5}}
+{"id": "EXAMPLE_147", "y": "virginica", "x": {"f0": 6.5, "f1": 3.0, "f2": 5.2, "f3": 2.0}}
+{"id": "EXAMPLE_79", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.6, "f2": 3.5, "f3": 1.0}}
+{"id": "EXAMPLE_59", "y": "versicolor", "x": {"f0": 5.2, "f1": 2.7, "f2": 3.9, "f3": 1.4}}
+{"id": "EXAMPLE_91", "y": "versicolor", "x": {"f0": 6.1, "f1": 3.0, "f2": 4.6, "f3": 1.4}}
+{"id": "EXAMPLE_41", "y": "setosa", "x": {"f0": 4.5, "f1": 2.3, "f2": 1.3, "f3": 0.3}}
+{"id": "EXAMPLE_58", "y": "versicolor", "x": {"f0": 6.6, "f1": 2.9, "f2": 4.6, "f3": 1.3}}
+{"id": "EXAMPLE_90", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.6, "f2": 4.4, "f3": 1.2}}
+{"id": "EXAMPLE_48", "y": "setosa", "x": {"f0": 5.3, "f1": 3.7, "f2": 1.5, "f3": 0.2}}
+{"id": "EXAMPLE_88", "y": "versicolor", "x": {"f0": 5.6, "f1": 3.0, "f2": 4.1, "f3": 1.3}}
+{"id": "EXAMPLE_107", "y": "virginica", "x": {"f0": 7.3, "f1": 2.9, "f2": 6.3, "f3": 1.8}}
+{"id": "EXAMPLE_124", "y": "virginica", "x": {"f0": 6.7, "f1": 3.3, "f2": 5.7, "f3": 2.1}}
+{"id": "EXAMPLE_21", "y": "setosa", "x": {"f0": 5.1, "f1": 3.7, "f2": 1.5, "f3": 0.4}}
+{"id": "EXAMPLE_57", "y": "versicolor", "x": {"f0": 4.9, "f1": 2.4, "f2": 3.3, "f3": 1.0}}
+{"id": "EXAMPLE_144", "y": "virginica", "x": {"f0": 6.7, "f1": 3.3, "f2": 5.7, "f3": 2.5}}
+{"id": "EXAMPLE_129", "y": "virginica", "x": {"f0": 7.2, "f1": 3.0, "f2": 5.8, "f3": 1.6}}
+{"id": "EXAMPLE_37", "y": "setosa", "x": {"f0": 4.9, "f1": 3.1, "f2": 1.5, "f3": 0.1}}
+{"id": "EXAMPLE_140", "y": "virginica", "x": {"f0": 6.7, "f1": 3.1, "f2": 5.6, "f3": 2.4}}
+{"id": "EXAMPLE_1", "y": "setosa", "x": {"f0": 4.9, "f1": 3.0, "f2": 1.4, "f3": 0.2}}
+{"id": "EXAMPLE_52", "y": "versicolor", "x": {"f0": 6.9, "f1": 3.1, "f2": 4.9, "f3": 1.5}}
+{"id": "EXAMPLE_130", "y": "virginica", "x": {"f0": 7.4, "f1": 2.8, "f2": 6.1, "f3": 1.9}}
+{"id": "EXAMPLE_103", "y": "virginica", "x": {"f0": 6.3, "f1": 2.9, "f2": 5.6, "f3": 1.8}}
+{"id": "EXAMPLE_99", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.8, "f2": 4.1, "f3": 1.3}}
+{"id": "EXAMPLE_116", "y": "virginica", "x": {"f0": 6.5, "f1": 3.0, "f2": 5.5, "f3": 1.8}}
+{"id": "EXAMPLE_87", "y": "versicolor", "x": {"f0": 6.3, "f1": 2.3, "f2": 4.4, "f3": 1.3}}
+{"id": "EXAMPLE_74", "y": "versicolor", "x": {"f0": 6.4, "f1": 2.9, "f2": 4.3, "f3": 1.3}}
+{"id": "EXAMPLE_121", "y": "virginica", "x": {"f0": 5.6, "f1": 2.8, "f2": 4.9, "f3": 2.0}}
+{"id": "EXAMPLE_149", "y": "virginica", "x": {"f0": 5.9, "f1": 3.0, "f2": 5.1, "f3": 1.8}}
+{"id": "EXAMPLE_20", "y": "setosa", "x": {"f0": 5.4, "f1": 3.4, "f2": 1.7, "f3": 0.2}}
+{"id": "EXAMPLE_71", "y": "versicolor", "x": {"f0": 6.1, "f1": 2.8, "f2": 4.0, "f3": 1.3}}
+{"id": "EXAMPLE_106", "y": "virginica", "x": {"f0": 4.9, "f1": 2.5, "f2": 4.5, "f3": 1.7}}
+{"id": "EXAMPLE_14", "y": "setosa", "x": {"f0": 5.8, "f1": 4.0, "f2": 1.2, "f3": 0.2}}
+{"id": "EXAMPLE_92", "y": "versicolor", "x": {"f0": 5.8, "f1": 2.6, "f2": 4.0, "f3": 1.2}}
+{"id": "EXAMPLE_102", "y": "virginica", "x": {"f0": 7.1, "f1": 3.0, "f2": 5.9, "f3": 2.1}}
diff --git a/tests/test_classification.py b/tests/test_classification.py
index c026f154..dde6b237 100644
--- a/tests/test_classification.py
+++ b/tests/test_classification.py
@@ -26,9 +26,11 @@
from nose.tools import eq_, assert_almost_equal, raises
from sklearn.exceptions import ConvergenceWarning
+from sklearn.feature_extraction import FeatureHasher
from sklearn.metrics import accuracy_score
from skll.data import FeatureSet
+from skll.data.readers import NDJReader
from skll.data.writers import NDJWriter
from skll.config import _parse_config_file
from skll.experiments import run_configuration
@@ -76,9 +78,11 @@ def tearDown():
for output_file in glob.glob(join(output_dir, 'train_test_single_file_*')):
os.unlink(output_file)
- config_file = join(config_dir, 'test_single_file.cfg')
- if exists(config_file):
- os.unlink(config_file)
+ config_files = [join(config_dir, cfgname) for cfgname in ['test_single_file.cfg',
+ 'test_single_file_saved_subset.cfg']]  # generated configs end in .cfg
+ for config_file in config_files:
+ if exists(config_file):
+ os.unlink(config_file)
def check_predict(model, use_feature_hashing=False):
@@ -127,6 +131,70 @@ def test_predict():
yield check_predict, model, use_feature_hashing
+# test predictions when both the model and the data use DictVectorizers
+def test_predict_dict_dict():
+ train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
+ test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
+ train_fs = NDJReader.for_path(train_file).read()
+ test_fs = NDJReader.for_path(test_file).read()
+ learner = Learner('LogisticRegression')
+ learner.train(train_fs, grid_search=False)
+ predictions = learner.predict(test_fs)
+ eq_(len(predictions), test_fs.features.shape[0])
+
+
+# test predictions when both the model and the data use FeatureHashers
+# and the same number of bins
+def test_predict_hasher_hasher_same_bins():
+ train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
+ test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
+ train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read()
+ test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read()
+ learner = Learner('LogisticRegression')
+ learner.train(train_fs, grid_search=False)
+ predictions = learner.predict(test_fs)
+ eq_(len(predictions), test_fs.features.shape[0])
+
+
+# test predictions when both the model and the data use FeatureHashers
+# but different number of bins
+@raises(RuntimeError)
+def test_predict_hasher_hasher_different_bins():
+ train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
+ test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
+ train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read()
+ test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=2).read()
+ learner = Learner('LogisticRegression')
+ learner.train(train_fs, grid_search=False)
+ _ = learner.predict(test_fs)
+
+
+# test predictions when model uses a FeatureHasher and data
+# uses a DictVectorizer
+def test_predict_hasher_dict():
+ train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
+ test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
+ train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read()
+ test_fs = NDJReader.for_path(test_file).read()
+ learner = Learner('LogisticRegression')
+ learner.train(train_fs, grid_search=False)
+ predictions = learner.predict(test_fs)
+ eq_(len(predictions), test_fs.features.shape[0])
+
+
+# test predictions when model uses a DictVectorizer and data
+# uses a FeatureHasher
+@raises(RuntimeError)
+def test_predict_dict_hasher():
+ train_file = join(_my_dir, 'other', 'examples_train.jsonlines')
+ test_file = join(_my_dir, 'other', 'examples_test.jsonlines')
+ train_fs = NDJReader.for_path(train_file).read()
+ test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read()
+ learner = Learner('LogisticRegression')
+ learner.train(train_fs, grid_search=False)
+ _ = learner.predict(test_fs)
+
+
# the function to create data with rare labels for cross-validation
def make_rare_class_data():
"""
@@ -190,7 +258,7 @@ def test_sparse_predict():
[(0.45, 0.52), (0.52, 0.5),
(0.48, 0.5), (0.49, 0.5),
(0.43, 0), (0.53, 0.57),
- (0.49, 0.49), (0.48, 0.5)]):
+ (0.49, 0.49), (0.5, 0.49)]):
yield check_sparse_predict, learner_name, expected_scores[0], False
if learner_name != 'MultinomialNB':
yield check_sparse_predict, learner_name, expected_scores[1], True
@@ -207,7 +275,7 @@ def test_mlp_classification():
learner = Learner('MLPClassifier')
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=ConvergenceWarning)
- learner.train(train_fs, grid_search=True)
+ learner.train(train_fs, grid_search=False)
# now generate the predictions on the test set
predictions = learner.predict(test_fs)
@@ -217,7 +285,7 @@ def test_mlp_classification():
# using make_regression_data. To do this, we just
# make sure that they are correlated
accuracy = accuracy_score(predictions, test_fs.labels)
- assert_almost_equal(accuracy, 0.825)
+ assert_almost_equal(accuracy, 0.858, places=3)
def check_sparse_predict_sampler(use_feature_hashing=False):
@@ -301,6 +369,12 @@ def make_single_file_featureset_data():
writer = NDJWriter(test_path, test_fs)
writer.write()
+ # Also write another test feature set that has fewer features than the training set
+ test_fs.filter(features=['f01', 'f02'])
+ test_path = join(_my_dir, 'test', 'test_single_file_subset.jsonlines')
+ writer = NDJWriter(test_path, test_fs)
+ writer.write()
+
def test_train_file_test_file():
"""
@@ -340,6 +414,43 @@ def test_train_file_test_file():
assert_almost_equal(result_dict['score'], 0.9491525423728813)
+def test_predict_on_subset_with_existing_model():
+ """
+ Test generating predictions on subset with existing model
+ """
+ # Create data files
+ make_single_file_featureset_data()
+
+ # train and save a model on the training file
+ train_fs = NDJReader.for_path(join(_my_dir, 'train', 'train_single_file.jsonlines')).read()
+ learner = Learner('RandomForestClassifier')
+ learner.train(train_fs, grid_search=True, grid_objective="accuracy")
+ model_filename = join(_my_dir, 'output', ('train_test_single_file_train_train_'
+ 'single_file.jsonlines_test_test_single'
+ '_file_subset.jsonlines_RandomForestClassifier'
+ '.model'))
+
+ learner.save(model_filename)
+
+ # Run experiment
+ config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs",
+ "test_single_file_saved_subset"
+ ".template.cfg"),
+ join(_my_dir, 'train', 'train_single_file.jsonlines'),
+ join(_my_dir, 'test',
+ 'test_single_file_subset.'
+ 'jsonlines'))
+ run_configuration(config_path, quiet=True, overwrite=False)
+
+ # Check results
+ with open(join(_my_dir, 'output', ('train_test_single_file_train_train_'
+ 'single_file.jsonlines_test_test_single'
+ '_file_subset.jsonlines_RandomForestClassifier'
+ '.results.json'))) as f:
+ result_dict = json.load(f)[0]
+ assert_almost_equal(result_dict['score'], 0.7333333)
+
+
def test_train_file_test_file_ablation():
"""
Test that specifying ablation with train and test file is ignored
diff --git a/tests/test_featureset.py b/tests/test_featureset.py
index 3ae1f6ee..d5cb6f30 100644
--- a/tests/test_featureset.py
+++ b/tests/test_featureset.py
@@ -25,7 +25,7 @@
from sklearn.datasets.samples_generator import make_classification
import skll
-from skll.data import FeatureSet, Writer, Reader
+from skll.data import FeatureSet, Writer, Reader, NDJReader, NDJWriter
from skll.data.readers import DictListReader
from skll.experiments import _load_featureset
from skll.learner import _DEFAULT_PARAM_GRIDS
@@ -62,6 +62,11 @@ def tearDown():
if exists(filepath):
os.unlink(filepath)
+ filepaths = [join(_my_dir, 'other', '{}.jsonlines'.format(x)) for x in ['test_string_ids', 'test_string_ids_df', 'test_string_labels_df']]
+ for filepath in filepaths:
+ if exists(filepath):
+ os.unlink(filepath)
+
def _create_empty_file(filetype):
filepath = join(_my_dir, 'other', 'empty.{}'.format(filetype))
@@ -1026,3 +1031,79 @@ def test_featureset_creation_from_dataframe_without_labels_with_vectorizer():
rtol=1e-6) and
np.all(np.isnan(expected.labels)) and
np.all(np.isnan(current.labels)))
+
+
+def test_writing_ndj_featureset_with_string_ids():
+ test_dict_vectorizer = DictVectorizer()
+ test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
+ Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
+ fs_test = FeatureSet('test',
+ ids=['1', '2'],
+ labels=[1, 2],
+ features=Xtest,
+ vectorizer=test_dict_vectorizer)
+ output_path = join(_my_dir, "other", "test_string_ids.jsonlines")
+ test_writer = NDJWriter(output_path, fs_test)
+ test_writer.write()
+
+ # read in the written file into a featureset and confirm that the
+ # two featuresets are equal
+ fs_test2 = NDJReader.for_path(output_path).read()
+
+ assert fs_test == fs_test2
+
+
+@attr('have_pandas_and_seaborn')
+def test_featureset_creation_from_dataframe_with_string_ids():
+
+ import pandas
+
+ dftest = pandas.DataFrame({"id": ['1', '2'],
+ "score": [1, 2],
+ "text": ["a b", "b c"]})
+ dftest.set_index("id", inplace=True)
+ test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
+ test_dict_vectorizer = DictVectorizer()
+ Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
+ fs_test = FeatureSet('test',
+ ids=dftest.index.values,
+ labels=dftest['score'].values,
+ features=Xtest,
+ vectorizer=test_dict_vectorizer)
+ output_path = join(_my_dir, "other", "test_string_ids_df.jsonlines")
+ test_writer = NDJWriter(output_path, fs_test)
+ test_writer.write()
+
+ # read in the written file into a featureset and confirm that the
+ # two featuresets are equal
+ fs_test2 = NDJReader.for_path(output_path).read()
+
+ assert fs_test == fs_test2
+
+
+@attr('have_pandas_and_seaborn')
+def test_featureset_creation_from_dataframe_with_string_labels():
+
+ import pandas
+
+ dftest = pandas.DataFrame({"id": [1, 2],
+ "score": ['yes', 'no'],
+ "text": ["a b", "b c"]})
+ dftest.set_index("id", inplace=True)
+ test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}]
+ test_dict_vectorizer = DictVectorizer()
+ Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list)
+ fs_test = FeatureSet('test',
+ ids=dftest.index.values,
+ labels=dftest['score'].values,
+ features=Xtest,
+ vectorizer=test_dict_vectorizer)
+ output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines")
+ test_writer = NDJWriter(output_path, fs_test)
+ test_writer.write()
+
+ # read in the written file into a featureset and confirm that the
+ # two featuresets are equal
+ fs_test2 = NDJReader.for_path(output_path, ids_to_floats=True).read()
+
+ assert fs_test == fs_test2
diff --git a/tests/test_input.py b/tests/test_input.py
index a36b9fe5..158c2f27 100644
--- a/tests/test_input.py
+++ b/tests/test_input.py
@@ -62,6 +62,10 @@ def tearDown():
config_dir = join(_my_dir, 'configs')
for config_file in glob(join(config_dir, 'test_config_parsing_*.cfg')):
os.unlink(config_file)
+ for auto_dir in glob(join(_my_dir, 'auto*')):
+ for auto_dir_file in os.listdir(auto_dir):
+ os.unlink(join(auto_dir, auto_dir_file))
+ os.rmdir(auto_dir)
def check_safe_float_conversion(converted_val, expected_val):
@@ -1119,6 +1123,60 @@ def test_config_parsing_relative_input_paths():
learning_curve_train_sizes, output_metrics) = _parse_config_file(config_path)
+def test_config_parsing_automatic_output_directory_creation():
+
+ train_dir = '../train'
+ train_file = join(train_dir, 'f0.jsonlines')
+ test_file = join(train_dir, 'f1.jsonlines')
+ output_dir = '../output'
+
+ # make a simple config file that has new directories that should
+ # be automatically created
+ new_log_path = join(_my_dir, 'autolog')
+ new_results_path = join(_my_dir, 'autoresults')
+ new_models_path = join(_my_dir, 'automodels')
+ new_predictions_path = join(_my_dir, 'autopredictions')
+
+ ok_(not(exists(new_log_path)))
+ ok_(not(exists(new_results_path)))
+ ok_(not(exists(new_models_path)))
+ ok_(not(exists(new_predictions_path)))
+
+ values_to_fill_dict = {'experiment_name': 'auto_dir_creation',
+ 'task': 'evaluate',
+ 'train_file': train_file,
+ 'test_file': test_file,
+ 'learners': "['LogisticRegression']",
+ 'log': new_log_path,
+ 'results': new_results_path,
+ 'models': new_models_path,
+ 'predictions': new_predictions_path,
+ 'objective': 'f1_score_micro'}
+
+ config_template_path = join(_my_dir, 'configs',
+ 'test_relative_paths.template.cfg')
+ config_path = fill_in_config_options(config_template_path,
+ values_to_fill_dict,
+ 'auto_dir_creation')
+
+ (experiment_name, task, sampler, fixed_sampler_parameters,
+ feature_hasher, hasher_features, id_col, label_col, train_set_name,
+ test_set_name, suffix, featuresets, do_shuffle, model_path,
+ do_grid_search, grid_objective, probability, results_path,
+ pos_label_str, feature_scaling, min_feature_count, folds_file,
+ grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
+ use_folds_file_for_grid_search, do_stratified_folds,
+ fixed_parameter_list, param_grid_list, featureset_names, learners,
+ prediction_dir, log_path, train_path, test_path, ids_to_floats,
+ class_map, custom_learner_path, learning_curve_cv_folds_list,
+ learning_curve_train_sizes, output_metrics) = _parse_config_file(config_path)
+
+ ok_(exists(new_log_path))
+ ok_(exists(new_results_path))
+ ok_(exists(new_models_path))
+ ok_(exists(new_predictions_path))
+
+
def check_config_parsing_metrics_and_objectives_overlap(task,
metrics,
objectives):
diff --git a/tests/test_regression.py b/tests/test_regression.py
index 098ed8ac..c090082c 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -135,7 +135,7 @@ def check_rescaling(name, grid_search=False):
train_p_std = np.std(train_predictions)
rescaled_train_p_std = np.std(rescaled_train_predictions)
assert_less(abs(rescaled_train_p_std - train_y_std),
- abs(train_p_std - train_y_std))
+ abs(train_p_std - train_y_std))
def test_rescaling():
@@ -403,14 +403,14 @@ def check_ensemble_models(name,
else:
expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
else:
- expected_feature_importances = ([0.204,
- 0.172,
- 0.178,
- 0.212,
- 0.234] if use_feature_hashing else
- [0.262,
- 0.288,
- 0.45])
+ expected_feature_importances = ([0.471714,
+ 0.022797,
+ 0.283377,
+ 0.170823,
+ 0.051288] if use_feature_hashing else
+ [0.082621,
+ 0.166652,
+ 0.750726])
feature_importances = learner.model.feature_importances_
assert_allclose(feature_importances, expected_feature_importances,
@@ -611,7 +611,7 @@ def test_ransac_regression():
'SGDRegressor',
'DecisionTreeRegressor',
'SVR'],
- [0.95, 0.45, 0.75, 0.65]):
+ [0.95, 0.45, 0.75, 0.65]):
yield check_ransac_regression, base_estimator_name, pearson_value
@@ -627,7 +627,7 @@ def check_mlp_regression(use_rescaling=False):
# we don't want to see any convergence warnings during the grid search
with warnings.catch_warnings():
warnings.filterwarnings('ignore', category=ConvergenceWarning)
- learner.train(train_fs, grid_search=True, grid_objective='pearson')
+ learner.train(train_fs, grid_search=False)
# now generate the predictions on the test set
predictions = learner.predict(test_fs)
diff --git a/tests/test_utilities.py b/tests/test_utilities.py
index 2b26c57f..634623bc 100644
--- a/tests/test_utilities.py
+++ b/tests/test_utilities.py
@@ -11,6 +11,7 @@
import ast
import copy
+import csv
import itertools
import os
import sys
@@ -29,6 +30,7 @@
from nose.plugins.logcapture import LogCapture
from nose.tools import eq_, assert_almost_equal, raises
from numpy.testing import assert_allclose, assert_array_almost_equal
+from numpy import concatenate
import skll
import skll.utilities.compute_eval_from_predictions as cefp
@@ -284,19 +286,29 @@ def test_compute_eval_from_predictions_random_choice():
eq_(pred, 'C')
-def check_generate_predictions(use_feature_hashing=False, use_threshold=False):
-
- # create some simple classification data without feature hashing
- train_fs, test_fs = make_classification_data(
- num_examples=1000, num_features=5,
- use_feature_hashing=use_feature_hashing, feature_bins=4)
+def check_generate_predictions(use_feature_hashing=False,
+ use_threshold=False,
+ test_on_subset=False,
+ use_all_labels=False):
+ # create some simple classification feature sets for training and testing
+ train_fs, test_fs = make_classification_data(num_examples=1000,
+ num_features=5,
+ use_feature_hashing=use_feature_hashing,
+ feature_bins=4)
+ enable_probability = use_threshold or use_all_labels
# create a learner that uses an SGD classifier
- learner = Learner('SGDClassifier', probability=use_threshold)
+ learner = Learner('SGDClassifier', probability=enable_probability)
# train the learner with grid search
learner.train(train_fs, grid_search=True)
+ # if we are asked to use only a subset, then filter out
+ # one of the features if we are not using feature hashing;
+ # do nothing if we are using feature hashing
+ if test_on_subset and not use_feature_hashing:
+ test_fs.filter(features=['f01', 'f02', 'f03', 'f04'])
+
# get the predictions on the test featureset
predictions = learner.predict(test_fs)
@@ -316,24 +328,90 @@ def check_generate_predictions(use_feature_hashing=False, use_threshold=False):
# now use Predictor to generate the predictions and make
# sure that they are the same as before saving the model
- p = gp.Predictor(model_file, threshold=threshold)
+ p = gp.Predictor(model_file, threshold=threshold,
+ all_labels=use_all_labels)
+
+ assert(p._pos_index == 1)
+ assert(p.threshold == threshold)
+
predictions_after_saving = p.predict(test_fs)
eq_(predictions, predictions_after_saving)
-def test_generate_predictions():
+def check_generate_predictions_file_headers(use_threshold=False,
+ use_all_labels=False):
+ # create some simple classification feature sets for training and testing
+ train_fs, test_fs = make_classification_data(num_examples=1000,
+ num_features=5,
+ feature_bins=4)
+ enable_probability = use_threshold or use_all_labels
+ # create a learner that uses an SGD classifier
+ learner = Learner('SGDClassifier', probability=enable_probability)
+
+ # train the learner with grid search
+ learner.train(train_fs, grid_search=True)
+
+ # get the predictions on the test featureset
+ predictions = learner.predict(test_fs)
+
+ # if we asked for a threshold, then pick the value to pass
+ # to the Predictor; otherwise leave the threshold unset
+ if use_threshold:
+ threshold = 0.6
+ else:
+ threshold = None
+
+ # save the learner to a file
+ model_file = join(_my_dir, 'output',
+ 'test_generate_predictions.model')
+ learner.save(model_file)
+
+ # now use Predictor to generate the predictions and make
+ # sure that the output file header matches the chosen mode
+ p = gp.Predictor(model_file, threshold=threshold,
+ all_labels=use_all_labels)
+ predictions_after_saving = p.predict(test_fs)
+
+ if threshold:
+ assert (p.output_file_header == ['id', 'prediction'])
+ elif use_all_labels:
+ assert (p.output_file_header == ['id', '0', '1'])
+
+
+
+@raises(ValueError)
+def test_generate_predictions_conflicting_params():
"""
- Test generate predictions API with hashing and a threshold
+ Test that ValueError is raised when `generate_predictions.Predictor` is
+ initialized with both `threshold` and `all_labels` turned on.
"""
+ model_file = "not/real/model/file.model"
+ gp.Predictor(model_file, threshold=0.6, all_labels=True)
+
+
+def test_generate_predictions():
+ for (use_feature_hashing,
+ use_threshold,
+ test_on_subset,
+ all_probabilities) in product([True, False], [True, False],
+ [True, False], [True, False]):
+ if use_threshold and all_probabilities:
+ continue
+ yield (check_generate_predictions, use_feature_hashing,
+ use_threshold, test_on_subset, all_probabilities)
+
- yield check_generate_predictions, False, False
- yield check_generate_predictions, True, False
- yield check_generate_predictions, False, True
- yield check_generate_predictions, True, True
+def test_generate_predictions_file_header():
+ for (use_threshold, all_probabilities) in ([True, False], [False, True]):
+ if use_threshold and all_probabilities:
+ continue
+ yield (check_generate_predictions_file_headers,
+ use_threshold, all_probabilities)
-def check_generate_predictions_console(use_threshold=False):
+
+def check_generate_predictions_console(use_threshold=False, all_labels=False):
# create some simple classification data without feature hashing
train_fs, test_fs = make_classification_data(num_examples=1000,
@@ -345,8 +423,9 @@ def check_generate_predictions_console(use_threshold=False):
writer = NDJWriter(input_file, test_fs)
writer.write()
+ enable_probability = use_threshold or all_labels
# create a learner that uses an SGD classifier
- learner = Learner('SGDClassifier', probability=use_threshold)
+ learner = Learner('SGDClassifier', probability=enable_probability)
# train the learner with grid search
learner.train(train_fs, grid_search=True)
@@ -372,6 +451,9 @@ def check_generate_predictions_console(use_threshold=False):
generate_cmd = []
if use_threshold:
generate_cmd.append('-t {}'.format(threshold))
+ elif all_labels:
+ generate_cmd.append('-a')
+
generate_cmd.extend([model_file, input_file])
# we need to capture stdout since that's what main() writes to
@@ -384,21 +466,265 @@ def check_generate_predictions_console(use_threshold=False):
gp.main(generate_cmd)
out = mystdout.getvalue()
err = mystderr.getvalue()
- predictions_after_saving = [int(x) for x in out.strip().split('\n')]
- eq_(predictions, predictions_after_saving)
+ output_lines = out.strip().split('\n')[1:] # Skip headers
+ if all_labels:
+ # Ignore the id (first column) in output.
+ predictions_after_saving = [[float(p) for p in x.split('\t')[1:]]
+ for x in output_lines]
+ else:
+ # Ignore the id (first column) in output.
+ predictions_after_saving = [int(x.split('\t')[1])
+ for x in output_lines]
+ if all_labels:
+ assert_array_almost_equal(predictions, predictions_after_saving)
+ else:
+ eq_(predictions, predictions_after_saving)
+ finally:
+ sys.stdout = old_stdout
+ sys.stderr = old_stderr
+ print(err)
+
+def test_generate_predictions_console_bad_input_ext():
+ lc = LogCapture()
+ lc.begin()
+
+ # create some simple classification data without feature hashing
+ train_fs, test_fs = make_classification_data(num_examples=1000,
+ num_features=5)
+
+ # create a learner that uses an SGD classifier
+ learner = Learner('SGDClassifier')
+ # train the learner with grid search
+ learner.train(train_fs, grid_search=True)
+ # get the predictions on the test featureset
+ predictions = learner.predict(test_fs)
+ # save the learner to a file
+ model_file = join(_my_dir, 'output',
+ 'test_generate_predictions_console.model')
+ learner.save(model_file)
+
+ # now call main() from generate_predictions.py
+ generate_cmd = [model_file, "fake_input_file.txt"]
+
+ # we need to capture stdout since that's what main() writes to
+ err = ''
+ try:
+ old_stdout = sys.stdout
+ old_stderr = sys.stderr
+ sys.stdout = mystdout = StringIO()
+ sys.stderr = mystderr = StringIO()
+ gp.main(generate_cmd)
+ out = mystdout.getvalue()
+ err = mystderr.getvalue()
finally:
sys.stdout = old_stdout
sys.stderr = old_stderr
print(err)
+ expected_log_mssg = ("skll.utilities.generate_predictions: ERROR: Input "
+ "file must be in either .arff, .csv, .jsonlines, "
+ ".libsvm, .megam, .ndj, or .tsv format. Skipping "
+ "file fake_input_file.txt")
+
+ eq_(lc.handler.buffer[-1], expected_log_mssg)
+
def test_generate_predictions_console():
"""
Test generate_predictions as a console script with/without a threshold
"""
- yield check_generate_predictions_console, False
- yield check_generate_predictions_console, True
+ yield check_generate_predictions_console, False, False
+ yield check_generate_predictions_console, False, True
+ yield check_generate_predictions_console, True, False
+
+
+def check_generate_predictions_file_output_multi_infiles(use_threshold=False,
+ all_labels=False):
+ """
+ Make sure generate_predictions works with multiple input files.
+ """
+
+ # create some simple classification data without feature hashing
+ train_fs, test_fs = make_classification_data(num_examples=1000,
+ num_features=5)
+
+ # save the test feature set to an NDJ file
+ input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines')
+ writer = NDJWriter(input_file, test_fs)
+ writer.write()
+
+ enable_probability = use_threshold or all_labels
+ # create a learner that uses an SGD classifier
+ learner = Learner('SGDClassifier', probability=enable_probability)
+
+ # train the learner with grid search
+ learner.train(train_fs, grid_search=True)
+
+ # get the predictions on the test featureset
+ predictions = learner.predict(test_fs)
+ predictions = concatenate([predictions, predictions])
+
+ # if we asked for probabilities, then use the threshold
+ # to convert them into binary predictions
+ if use_threshold:
+ threshold = 0.6
+ predictions = [int(p[1] >= threshold) for p in predictions]
+ else:
+ predictions = predictions.tolist()
+ threshold = None
+
+ # save the learner to a file
+ model_file = join(_my_dir, 'output',
+ 'test_generate_predictions_console.model')
+ learner.save(model_file)
+
+ # now call main() from generate_predictions.py
+ generate_cmd = []
+ if use_threshold:
+ generate_cmd.append('-t {}'.format(threshold))
+ elif all_labels:
+ generate_cmd.append('-a')
+
+ output_file_path = join(_my_dir, 'output',
+ 'output_test_{}_{}_MULTI.tsv'
+ .format(use_threshold, all_labels))
+ generate_cmd.extend(["--output_file", output_file_path])
+
+ generate_cmd.extend([model_file, input_file, input_file])
+
+ gp.main(generate_cmd)
+
+ with open(output_file_path) as saved_predictions_file:
+ predictions_after_saving = []
+ reader = csv.reader(saved_predictions_file, delimiter=str("\t"))
+ next(reader)
+ if all_labels:
+ for row in reader:
+ predictions_after_saving.append([float(r) for r in row[1:]])
+ else:
+ for row in reader:
+ predictions_after_saving.append(float(row[1]))
+
+ assert_array_almost_equal(predictions, predictions_after_saving)
+
+
+def test_generate_predictions_file_output_multi_infiles():
+ """
+ Test generate_predictions file output with/without a threshold
+ """
+
+ yield check_generate_predictions_file_output_multi_infiles, False, False
+ yield check_generate_predictions_file_output_multi_infiles, False, True
+ yield check_generate_predictions_file_output_multi_infiles, True, False
+
+
+
+def check_generate_predictions_file_output(use_threshold=False,
+ all_labels=False):
+
+ # create some simple classification data without feature hashing
+ train_fs, test_fs = make_classification_data(num_examples=1000,
+ num_features=5)
+
+ # save the test feature set to an NDJ file
+ input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines')
+ writer = NDJWriter(input_file, test_fs)
+ writer.write()
+
+ enable_probability = use_threshold or all_labels
+ # create a learner that uses an SGD classifier
+ learner = Learner('SGDClassifier', probability=enable_probability)
+
+ # train the learner with grid search
+ learner.train(train_fs, grid_search=True)
+
+ # get the predictions on the test featureset
+ predictions = learner.predict(test_fs)
+
+ # if we asked for probabilities, then use the threshold
+ # to convert them into binary predictions
+ if use_threshold:
+ threshold = 0.6
+ predictions = [int(p[1] >= threshold) for p in predictions]
+ else:
+ predictions = predictions.tolist()
+ threshold = None
+
+ # save the learner to a file
+ model_file = join(_my_dir, 'output',
+ 'test_generate_predictions_console.model')
+ learner.save(model_file)
+
+ # now call main() from generate_predictions.py
+ generate_cmd = []
+ if use_threshold:
+ generate_cmd.append('-t {}'.format(threshold))
+ elif all_labels:
+ generate_cmd.append('-a')
+
+ output_file_path = join(_my_dir, 'output',
+ 'output_test_{}_{}.tsv'
+ .format(use_threshold, all_labels))
+ generate_cmd.extend(["--output_file", output_file_path])
+
+ generate_cmd.extend([model_file, input_file])
+ gp.main(generate_cmd)
+
+ with open(output_file_path) as saved_predictions_file:
+ predictions_after_saving = []
+ reader = csv.reader(saved_predictions_file, delimiter=str("\t"))
+ next(reader)
+ if all_labels:
+ for row in reader:
+ predictions_after_saving.append([float(r) for r in row[1:]])
+ else:
+ for row in reader:
+ predictions_after_saving.append(float(row[1]))
+
+ assert_array_almost_equal(predictions, predictions_after_saving)
+
+
+def test_generate_predictions_file_output():
+ """
+ Test generate_predictions file output with/without a threshold
+ """
+
+ yield check_generate_predictions_file_output, False, False
+ yield check_generate_predictions_file_output, False, True
+ yield check_generate_predictions_file_output, True, False
+
+
+
+
+@raises(SystemExit)
+def test_mutually_exclusive_generate_predictions_args():
+ # create some simple classification data without feature hashing
+ train_fs, test_fs = make_classification_data(num_examples=1000,
+ num_features=5)
+ threshold = 0.6
+
+ # save the test feature set to an NDJ file
+ input_file = join(_my_dir, 'test',
+ 'test_generate_predictions.jsonlines')
+ writer = NDJWriter(input_file, test_fs)
+ writer.write()
+
+ # create a learner that uses an SGD classifier
+ learner = Learner('SGDClassifier')
+
+ # train the learner with grid search
+ learner.train(train_fs, grid_search=True)
+
+ # save the learner to a file
+ model_file = join(_my_dir, 'output',
+ 'test_generate_predictions_console.model')
+ learner.save(model_file)
+
+ # now call main() from generate_predictions.py
+ generate_cmd = ['-t {}'.format(threshold), '-a']
+ generate_cmd.extend([model_file, input_file])
+ gp.main(generate_cmd)
def check_skll_convert(from_suffix, to_suffix):
@@ -522,7 +848,7 @@ def check_print_model_weights(task='classification'):
# create some simple classification or regression data
if task == 'classification' or task == 'classification_no_intercept':
train_fs, _ = make_classification_data(train_test_ratio=0.8)
- elif task == 'multiclass_classification':
+ elif task in ['multiclass_classification', 'multiclass_classification_svc']:
train_fs, _ = make_classification_data(train_test_ratio=0.8, num_labels=3)
else:
train_fs, _, _ = make_regression_data(num_features=4,
@@ -532,9 +858,14 @@ def check_print_model_weights(task='classification'):
if task == 'classification' or task == 'multiclass_classification':
learner = Learner('LogisticRegression')
learner.train(train_fs, grid_objective='f1_score_micro')
+ elif task == 'multiclass_classification_svc':
+ learner = Learner('SVC', model_kwargs={'kernel': 'linear'})
+ learner.train(train_fs, grid_objective='f1_score_micro')
elif task == 'classification_no_intercept':
learner = Learner('LogisticRegression')
- learner.train(train_fs, grid_objective='f1_score_micro', param_grid=[{'fit_intercept':[False]}])
+ learner.train(train_fs,
+ grid_objective='f1_score_micro',
+ param_grid=[{'fit_intercept': [False]}])
elif task == 'regression':
learner = Learner('LinearRegression')
learner.train(train_fs, grid_objective='pearson')
@@ -596,6 +927,49 @@ def check_print_model_weights(task='classification'):
assert_array_almost_equal(weights, feature_values[index])
assert_array_almost_equal(intercept, learner.model.intercept_)
+ elif task == 'multiclass_classification_svc':
+ # for multiple classes with the SVC with a linear kernel,
+ # we get an intercept for each class pair combination
+ # as well as a list of weights for each class pair
+ # combination
+
+ # save the computed intercept values in a dictionary
+ # with the class pair label as the key
+ lines_to_parse = [l for l in out.split('\n')[1:] if l]
+ parsed_intercepts_dict = {}
+ for intercept_string in lines_to_parse[0:3]:
+ fields = intercept_string.split('\t')
+ parsed_intercepts_dict[fields[1]] = safe_float(fields[0])
+
+ # save the computed feature weights in a dictionary
+ # with the class pair label as the key and the value
+ # being a list; each feature weight for this class pair
+ # is stored at the index of the feature name as given
+ # by the feature vectorizer vocabulary dictionary
+ parsed_weights_dict = {}
+ for ltp in lines_to_parse[3:]:
+ (weight, class_pair, feature) = ltp.split('\t')
+ if class_pair not in parsed_weights_dict:
+ parsed_weights_dict[class_pair] = [0] * 10
+ feature_index = learner.feat_vectorizer.vocabulary_[feature]
+ parsed_weights_dict['{}'.format(class_pair)][feature_index] = safe_float(weight)
+
+ # to validate that our coefficients are correct, we will
+ # get the coefficient array (for all features) from `coef_`
+ # for a particular class pair and then check that this array
+ # is equal to the list that we computed above. We will do
+ # the same for intercepts which are even easier to validate
+ # since they _only_ depend on the class pair
+ for idx, (class1, class2) in enumerate(itertools.combinations([0, 1, 2], 2)):
+ class_pair_label = '{}-vs-{}'.format(class1, class2)
+ computed_coefficients = parsed_weights_dict[class_pair_label]
+ expected_coefficients = learner.model.coef_[idx].toarray()[0]
+ assert_array_almost_equal(computed_coefficients, expected_coefficients)
+
+ computed_intercept = parsed_intercepts_dict[class_pair_label]
+ expected_intercept = learner.model.intercept_[idx]
+ assert_almost_equal(computed_intercept, expected_intercept)
+
elif task == 'classification_no_intercept':
lines_to_parse = [l for l in out.split('\n')[0:] if l]
intercept = safe_float(lines_to_parse[0].split('=')[1])
@@ -637,6 +1011,7 @@ def check_print_model_weights(task='classification'):
def test_print_model_weights():
yield check_print_model_weights, 'classification'
yield check_print_model_weights, 'multiclass_classification'
+ yield check_print_model_weights, 'multiclass_classification_svc'
yield check_print_model_weights, 'classification_no_intercept'
yield check_print_model_weights, 'regression'
yield check_print_model_weights, 'regression_linearSVR'