diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 00000000..590605d4 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,16 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 90 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned +# Label to use when marking an issue as stale +staleLabel: stale +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed in 7 days if no further activity occurs. + Thank you for your contributions. +# Comment to post when closing a stale issue. Set to `false` to disable +closeComment: false diff --git a/.travis.yml b/.travis.yml index ea7e107c..327937d8 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,7 +25,7 @@ before_install: - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then export PATH=/home/travis/miniconda2/bin:$PATH; else export PATH=/home/travis/miniconda3/bin:$PATH; fi - conda update --yes conda install: - - conda install --yes --channel defaults --channel conda-forge python=$TRAVIS_PYTHON_VERSION numpy scipy beautifulsoup4 six scikit-learn==0.19.1 joblib prettytable python-coveralls ruamel.yaml + - conda install --yes --channel defaults --channel conda-forge python=$TRAVIS_PYTHON_VERSION numpy scipy beautifulsoup4 six scikit-learn==0.20.1 joblib prettytable python-coveralls ruamel.yaml - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes --channel defaults configparser mock; fi - if [ ${WITH_PANDAS_AND_SEABORN} == "true" ]; then conda install --yes --channel defaults pandas seaborn; fi # Have to use pip for nose-cov because its entry points are not supported by conda yet diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index e07b86d6..e0199044 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -114,5 +114,10 @@ documentation without the example gallery. The resulting HTML files will be placed in _build/html/ and are viewable in a web browser. See the README file in the doc/ directory for more information. -For building the documentation, you will need [sphinx](http://sphinx.pocoo.org/). +For building the documentation, you will need [sphinx](http://sphinx.pocoo.org/) as well as the readthedocs sphinx theme. To install both, just run: + + conda install sphinx sphinx_rtd_theme + +in your existing conda environment. 
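For reference, wiring the readthedocs theme into a Sphinx ``conf.py`` typically amounts to a couple of lines. The following is an illustrative sketch only (SKLL's actual ``doc/conf.py`` is not part of this diff, so its real settings may differ):

    # illustrative conf.py excerpt -- assumes the sphinx_rtd_theme package
    # installed above; SKLL's real conf.py is not shown in this diff
    import sphinx_rtd_theme

    html_theme = 'sphinx_rtd_theme'
    html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]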
+ diff --git a/README.rst b/README.rst index 9bde7c4b..1e77ad8d 100644 --- a/README.rst +++ b/README.rst @@ -124,17 +124,17 @@ Requirements - Python 2.7+ - `scikit-learn `__ -- `six `__ -- `PrettyTable `__ +- `six `__ +- `PrettyTable `__ - `BeautifulSoup 4 `__ -- `Grid Map `__ (only required if you plan +- `Grid Map `__ (only required if you plan to run things in parallel on a DRMAA-compatible cluster) -- `joblib `__ +- `joblib `__ - `ruamel.yaml `__ -- `configparser `__ (only required for +- `configparser `__ (only required for Python 2.7) -- `logutils `__ (only required for Python 2.7) -- `mock `__ (only required for Python 2.7) +- `logutils `__ (only required for Python 2.7) +- `mock `__ (only required for Python 2.7) The following packages can be optionally installed for additional features but are not required: diff --git a/conda-recipe/README.md b/conda-recipe/README.md index 9dae17fc..790f574a 100644 --- a/conda-recipe/README.md +++ b/conda-recipe/README.md @@ -1,7 +1,7 @@ How to create and test conda package. 1. To create the SKLL conda package run: - `conda build -c defaults -c conda-forge --python=3.6 --numpy=1.13 skll` + `conda build -c defaults -c conda-forge --python=3.6 --numpy=1.14 skll` 2. Upload the package to anaconda.org using `anaconda upload `. 3. Test the package: `conda create -n foobar -c defaults -c conda-forge -c desilinguist python=3.6 skll=1.5` diff --git a/conda-recipe/skll/meta.yaml b/conda-recipe/skll/meta.yaml index 88c09935..ab63a602 100644 --- a/conda-recipe/skll/meta.yaml +++ b/conda-recipe/skll/meta.yaml @@ -1,6 +1,6 @@ package: name: skll - version: 1.5 + version: 1.5.3 source: path: ../../../skll @@ -42,7 +42,7 @@ build: requirements: build: - python - - scikit-learn ==0.19.1 + - scikit-learn ==0.20.1 - joblib >=0.8 - setuptools - six @@ -57,7 +57,7 @@ requirements: run: - python - - scikit-learn ==0.19.1 + - scikit-learn ==0.20.1 - joblib >=0.8 - six - prettytable diff --git a/conda_requirements.txt b/conda_requirements.txt index b225501a..7cd12fe9 100644 --- a/conda_requirements.txt +++ b/conda_requirements.txt @@ -1,4 +1,4 @@ -scikit-learn==0.19.1 +scikit-learn==0.20.1 six PrettyTable beautifulsoup4 diff --git a/doc/getting_started.rst b/doc/getting_started.rst index 724ad40d..709bf7b9 100644 --- a/doc/getting_started.rst +++ b/doc/getting_started.rst @@ -11,7 +11,7 @@ or via ``conda`` (only for Python 3.6):: conda install -c defaults -c conda-forge -c desilinguist python=3.6 skll It can also be downloaded directly from -`GitHub `_. +`GitHub `_. License diff --git a/doc/index.rst b/doc/index.rst index c3a05b40..6fcd13fd 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -30,6 +30,7 @@ Documentation run_experiment utilities api + internal Indices and tables diff --git a/doc/internal.rst b/doc/internal.rst new file mode 100644 index 00000000..f8827b23 --- /dev/null +++ b/doc/internal.rst @@ -0,0 +1,7 @@ +Internal Documentation +====================== + +.. toctree:: + :maxdepth: 4 + + internal/release \ No newline at end of file diff --git a/doc/internal/release.rst b/doc/internal/release.rst new file mode 100644 index 00000000..fc06613b --- /dev/null +++ b/doc/internal/release.rst @@ -0,0 +1,42 @@ +Release Process +=============== + +This document is only meant for the project administrators, not users and developers. + +1. Create a release branch on GitHub. + +2. In the release branch: + + a. update the version numbers in ``version.py``. + + b. update the conda recipe. + + c. update the documentation with any new features or details about changes. 
+ + d. run ``make linkcheck`` on the documentation and fix any redirected/broken links. + + e. update the README. + +3. Build the new conda package locally on your Mac using the following command (*Note*: you may have to replace the contents of the ``requirements()`` function in ``setup.py`` with a ``pass`` statement to get ``conda build`` to work):: + + conda build -c defaults -c conda-forge --python=3.6 --numpy=1.14 skll + +4. Convert the package for both linux and windows:: + + conda convert -p win-64 -p linux-64 + +5. Upload each of the packages to anaconda.org using ``anaconda upload ``. + +6. Upload the source package to PyPI using ``python setup.py sdist upload``. + +7. Draft a release on GitHub. + +8. Make a pull request with the release branch to be merged into ``master`` and request code review. + +9. Once the build for the PR passes and the reviewers approve, merge the release branch into ``master``. + +10. Make sure that the RTFD build for ``master`` passes. + +11. Tag the latest commit in ``master`` with the appropriate release tag and publish the release on GitHub. + +12. Send an email around at ETS announcing the release and the changes. diff --git a/doc/run_experiment.rst b/doc/run_experiment.rst index b6b45604..27044171 100644 --- a/doc/run_experiment.rst +++ b/doc/run_experiment.rst @@ -31,7 +31,7 @@ The following feature file formats are supported: arff ^^^^ -The same file format used by `Weka `__ +The same file format used by `Weka `__ with the following added restrictions: * Only simple numeric, string, and nominal values are supported. @@ -81,8 +81,8 @@ libsvm ^^^^^^ While we can process the standard input file format supported by -`LibSVM `__, -`LibLinear `__, +`LibSVM `__, +`LibLinear `__, and `SVMLight `__, we also support specifying extra metadata usually missing from the format in comments at the end of each line. The comments are not mandatory, but without them, your labels and features will @@ -105,7 +105,7 @@ megam ^^^^^ An expanded form of the input format for the -`MegaM classification package `__ with +`MegaM classification package `__ with the ``-fvals`` switch. The basic format is:: @@ -127,7 +127,7 @@ to/from this MegaM format and for adding/removing features from the files. Creating configuration files ---------------------------- The experiment configuration files that run_experiment accepts are standard -`Python configuration files `__ +`Python configuration files `__ that are similar in format to Windows INI files. [#]_ There are four expected sections in a configuration file: :ref:`General`, :ref:`Input`, :ref:`Tuning`, and :ref:`Output`. A detailed description of each possible settings for each section is provided below, but to summarize: * If you want to do **cross-validation**, specify a path to training feature files, and set :ref:`task` to ``cross_validate``. Please note that the cross-validation currently uses - `StratifiedKFold `__. + `StratifiedKFold `__. You also can optionally use predetermined folds with the :ref:`folds_file ` setting. @@ -167,7 +167,7 @@ possible settings for each section is provided below, but to summarize: .. _learning_curve: -* If you want to **generate a learning curve** for your data, specify a training location and set :ref:`task` to ``learning_curve``. The learning curve is generated using essentially the same underlying process as in `scikit-learn `__ except that the SKLL feature pre-processing pipline is used while training the various models and computing the scores. 
+* If you want to **generate a learning curve** for your data, specify a training location and set :ref:`task` to ``learning_curve``. The learning curve is generated using essentially the same underlying process as in `scikit-learn `__ except that the SKLL feature pre-processing pipeline is used while training the various models and computing the scores. .. note:: @@ -178,7 +178,7 @@ possible settings for each section is provided below, but to summarize: * A :ref:`list of classifiers/regressors ` to try on your feature files is required. -Example configuration files are available `here `__. +Example configuration files are available `here `__. .. _general: @@ -227,43 +227,43 @@ below. Custom learners can also be specified. See Classifiers: - * **AdaBoostClassifier**: `AdaBoost Classification `__. Note that the default base estimator is a ``DecisionTreeClassifier``. A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``MultinomialNB``, ``SGDClassifier``, and ``SVC``. Note that the last two base require setting an additional ``algorithm`` fixed parameter with the value ``'SAMME'``. - * **DummyClassifier**: `Simple rule-based Classification `__ - * **DecisionTreeClassifier**: `Decision Tree Classification `__ - * **GradientBoostingClassifier**: `Gradient Boosting Classification `__ - * **KNeighborsClassifier**: `K-Nearest Neighbors Classification `__ - * **LinearSVC**: `Support Vector Classification using LibLinear `__ - * **LogisticRegression**: `Logistic Regression Classification using LibLinear `__ - * **MLPClassifier**: `Multi-layer Perceptron Classification `__ - * **MultinomialNB**: `Multinomial Naive Bayes Classification `__ - * **RandomForestClassifier**: `Random Forest Classification `__ - * **RidgeClassifier**: `Classification using Ridge Regression `__ - * **SGDClassifier**: `Stochastic Gradient Descent Classification `__ - * **SVC**: `Support Vector Classification using LibSVM `__ + * **AdaBoostClassifier**: `AdaBoost Classification `__. Note that the default base estimator is a ``DecisionTreeClassifier``. A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``MultinomialNB``, ``SGDClassifier``, and ``SVC``. Note that the last two base estimators require setting an additional ``algorithm`` fixed parameter with the value ``'SAMME'``. + * **DummyClassifier**: `Simple rule-based Classification `__ + * **DecisionTreeClassifier**: `Decision Tree Classification `__ + * **GradientBoostingClassifier**: `Gradient Boosting Classification `__ + * **KNeighborsClassifier**: `K-Nearest Neighbors Classification `__ + * **LinearSVC**: `Support Vector Classification using LibLinear `__ + * **LogisticRegression**: `Logistic Regression Classification using LibLinear `__ + * **MLPClassifier**: `Multi-layer Perceptron Classification `__ + * **MultinomialNB**: `Multinomial Naive Bayes Classification `__ + * **RandomForestClassifier**: `Random Forest Classification `__ + * **RidgeClassifier**: `Classification using Ridge Regression `__ + * **SGDClassifier**: `Stochastic Gradient Descent Classification `__ + * **SVC**: `Support Vector Classification using LibSVM `__ .. _regressors: Regressors: - * **AdaBoostRegressor**: `AdaBoost Regression `__. Note that the default base estimator is a ``DecisionTreeRegressor``. 
A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``SGDRegressor``, and ``SVR``. - * **BayesianRidge**: `Bayesian Ridge Regression `__ - * **DecisionTreeRegressor**: `Decision Tree Regressor `__ - * **DummyRegressor**: `Simple Rule-based Regression `__ - * **ElasticNet**: `ElasticNet Regression `__ - * **GradientBoostingRegressor**: `Gradient Boosting Regressor `__ - * **HuberRegressor**: `Huber Regression `__ - * **KNeighborsRegressor**: `K-Nearest Neighbors Regression `__ - * **Lars**: `Least Angle Regression `__ - * **Lasso**: `Lasso Regression `__ - * **LinearRegression**: `Linear Regression `__ - * **LinearSVR**: `Support Vector Regression using LibLinear `__ - * **MLPRegressor**: `Multi-layer Perceptron Regression `__ - * **RandomForestRegressor**: `Random Forest Regression `__ - * **RANSACRegressor**: `RANdom SAmple Consensus Regression `__. Note that the default base estimator is a ``LinearRegression``. A different base regressor can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. - * **Ridge**: `Ridge Regression `__ - * **SGDRegressor**: `Stochastic Gradient Descent Regression `__ - * **SVR**: `Support Vector Regression using LibSVM `__ - * **TheilSenRegressor**: `Theil-Sen Regression `__ + * **AdaBoostRegressor**: `AdaBoost Regression `__. Note that the default base estimator is a ``DecisionTreeRegressor``. A different base estimator can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. The following additional base estimators are supported: ``SGDRegressor``, and ``SVR``. + * **BayesianRidge**: `Bayesian Ridge Regression `__ + * **DecisionTreeRegressor**: `Decision Tree Regressor `__ + * **DummyRegressor**: `Simple Rule-based Regression `__ + * **ElasticNet**: `ElasticNet Regression `__ + * **GradientBoostingRegressor**: `Gradient Boosting Regressor `__ + * **HuberRegressor**: `Huber Regression `__ + * **KNeighborsRegressor**: `K-Nearest Neighbors Regression `__ + * **Lars**: `Least Angle Regression `__ + * **Lasso**: `Lasso Regression `__ + * **LinearRegression**: `Linear Regression `__ + * **LinearSVR**: `Support Vector Regression using LibLinear `__ + * **MLPRegressor**: `Multi-layer Perceptron Regression `__ + * **RandomForestRegressor**: `Random Forest Regression `__ + * **RANSACRegressor**: `RANdom SAmple Consensus Regression `__. Note that the default base estimator is a ``LinearRegression``. A different base regressor can be used by specifying a ``base_estimator`` fixed parameter in the :ref:`fixed_parameters ` list. + * **Ridge**: `Ridge Regression `__ + * **SGDRegressor**: `Stochastic Gradient Descent Regression `__ + * **SVR**: `Support Vector Regression using LibSVM `__ + * **TheilSenRegressor**: `Theil-Sen Regression `__ For all regressors you can also prepend ``Rescaled`` to the beginning of the full name (e.g., ``RescaledSVR``) to get a version @@ -496,9 +496,9 @@ imported dynamically. This is only required if a custom learner is specified in the list of :ref:`learners`. All Custom learners must implement the ``fit`` and -``predict`` methods. Custom classifiers must either (a) inherit from an existing scikit-learn classifier, or (b) inherit from both `sklearn.base.BaseEstimator `__. *and* from `sklearn.base.ClassifierMixin `__. +``predict`` methods. 
Custom classifiers must either (a) inherit from an existing scikit-learn classifier, or (b) inherit from both `sklearn.base.BaseEstimator `__ *and* from `sklearn.base.ClassifierMixin `__. -Similarly, Custom regressors must either (a) inherit from an existing scikit-learn regressor, or (b) inherit from both `sklearn.base.BaseEstimator `__. *and* from `sklearn.base.RegressorMixin `__. +Similarly, custom regressors must either (a) inherit from an existing scikit-learn regressor, or (b) inherit from both `sklearn.base.BaseEstimator `__ *and* from `sklearn.base.RegressorMixin `__. Learners that require dense matrices should implement a method ``requires_dense`` that returns ``True``. @@ -511,11 +511,11 @@ sampler *(Optional)* It performs a non-linear transformation of the input, which can serve as a basis for linear classification or other algorithms. Valid options are: -`Nystroem `__, -`RBFSampler `__, -`SkewedChi2Sampler `__, and -`AdditiveChi2Sampler `__. For additional information see -`the scikit-learn documentation `__. +`Nystroem `__, +`RBFSampler `__, +`SkewedChi2Sampler `__, and +`AdditiveChi2Sampler `__. For additional information see +`the scikit-learn documentation `__. .. _sampler_parameters: @@ -550,18 +550,18 @@ feature_hasher *(Optional)* If "true", this enables a high-speed, low-memory vectorizer that uses feature hashing for converting feature dictionaries into NumPy arrays instead of using a -`DictVectorizer `__. This flag will drastically +`DictVectorizer `__. This flag will drastically reduce memory consumption for data sets with a large number of features. If enabled, the user should also specify the number of features in the :ref:`hasher_features ` field. For additional -information see `the scikit-learn documentation `__. +information see `the scikit-learn documentation `__. .. _hasher_features: hasher_features *(Optional)* """""""""""""""""""""""""""" -The number of features used by the `FeatureHasher `__ if the +The number of features used by the `FeatureHasher `__ if the :ref:`feature_hasher ` flag is enabled. .. note:: @@ -696,7 +696,7 @@ TheilSenRegressor {'class_weight': {1: 10}} - Additional examples and information can be seen `here `__. + Additional examples and information can be seen `here `__. .. _feature_scaling: @@ -785,9 +785,9 @@ The objective functions to use for tuning. This is a list of one or more objecti Classification: - * **accuracy**: Overall `accuracy `__ - * **precision**: `Precision `__ - * **recall**: `Recall `__ + * **accuracy**: Overall `accuracy `__ + * **precision**: `Precision `__ + * **recall**: `Recall `__ * **f1**: The default scikit-learn |F1 link|_ (F\ :sub:`1` of the positive class for binary classification, or the weighted average F\ :sub:`1` for multiclass classification) * **f1_score_micro**: Micro-averaged |F1 link|_ @@ -796,20 +796,20 @@ Classification: * **f1_score_least_frequent**: F:\ :sub:`1` score of the least frequent class. The least frequent class may vary from fold to fold for certain data distributions. - * **neg_log_loss**: The negative of the classification `log loss `__ . Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency. To use this as the objective, :ref:`probability ` must be set to ``True``. - * **average_precision**: `Area under PR curve `__ + * **neg_log_loss**: The negative of the classification `log loss `__. 
Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency. To use this as the objective, :ref:`probability ` must be set to ``True``. + * **average_precision**: `Area under PR curve `__ (for binary classification) - * **roc_auc**: `Area under ROC curve `__ + * **roc_auc**: `Area under ROC curve `__ (for binary classification) .. |F1 link| replace:: F\ :sub:`1` score -.. _F1 link: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html +.. _F1 link: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html .. _int_label_classification_obj: Regression or classification with integer labels: - * **unweighted_kappa**: Unweighted `Cohen's kappa `__ (any floating point + * **unweighted_kappa**: Unweighted `Cohen's kappa `__ (any floating point values are rounded to ints) * **linear_weighted_kappa**: Linear weighted kappa (any floating point values are rounded to ints) @@ -827,16 +827,16 @@ Regression or classification with integer labels: Regression or classification with binary labels: - * **kendall_tau**: `Kendall's tau `__ - * **pearson**: `Pearson correlation `__ - * **spearman**: `Spearman rank-correlation `__ + * **kendall_tau**: `Kendall's tau `__ + * **pearson**: `Pearson correlation `__ + * **spearman**: `Spearman rank-correlation `__ .. _regression_obj: Regression: - * **r2**: `R2 `__ - * **neg_mean_squared_error**: The negative of the `mean squared error `__ regression loss. Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency. + * **r2**: `R2 `__ + * **neg_mean_squared_error**: The negative of the `mean squared error `__ regression loss. Since scikit-learn `recommends `__ using negated loss functions as scorer functions, SKLL does the same for the sake of consistency. Defaults to ``['f1_score_micro']``. @@ -1121,7 +1121,7 @@ specified via command-line arguments instead of in the configuration file: GridMap options ^^^^^^^^^^^^^^^ -If you have `GridMap `__ installed, +If you have `GridMap `__ installed, :program:`run_experiment` will automatically schedule jobs on your DRMAA- compatible cluster. You can use the following options to customize this behavior. @@ -1132,7 +1132,7 @@ .. option:: -q , --queue - Use this queue for `GridMap `__. + Use this queue for `GridMap `__. (default: ``all.q``) .. option:: -m , --machines diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 9d459623..2c85d7a7 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -21,15 +21,15 @@ Titanic Example --------------- Let's see how we can apply the basic workflow above to a simple example using -the `Titantic: Machine Learning from Disaster `__ -data from `Kaggle `__. +the `Titanic: Machine Learning from Disaster `__ +data from `Kaggle `__. Get your data into the correct format ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The first step to getting the Titanic data is logging into Kaggle and -downloading `train.csv `__ -and `test.csv `__. +downloading `train.csv `__ +and `test.csv `__. Once you have those files, you'll also want to grab the `examples folder `__ on our GitHub page and put ``train.csv`` and ``test.csv`` in ``examples``. @@ -48,7 +48,7 @@ For this tutorial, we will refer to an "experiment" as having a single data set split into training and testing portions. 
As part of each experiment, we can train and test several models, either simultaneously or sequentially, depending whether we're using -`GridMap `__ or not. +`GridMap `__ or not. This will be described in more detail later on, when we are ready to run our experiment. @@ -87,9 +87,9 @@ instances IDs for each example. The :ref:`Tuning` section defines how we want our model to be tuned. Setting :ref:`grid_search ` to ``True`` here employs scikit-learn's -`GridSearchCV `_ +`GridSearchCV `_ class, which is an implementation of the -`standard, brute-force approach to hyperparameter optimization `_. +`standard, brute-force approach to hyperparameter optimization `_. :ref:`objectives ` refers to the desired objective functions; here, ``accuracy`` will optimize for overall accuracy. You can see a list of all the diff --git a/doc/utilities.rst b/doc/utilities.rst index 41319140..e509e535 100644 --- a/doc/utilities.rst +++ b/doc/utilities.rst @@ -113,17 +113,27 @@ Positional Arguments Model file to load and use for generating predictions. -.. option:: input_file +.. option:: input_file(s) - A csv file, json file, or megam file (with or without the label column), - with the appropriate suffix. + One or more csv file(s), jsonlines file(s), or megam file(s) (with or without the + label column), with the appropriate suffix. Optional Arguments ^^^^^^^^^^^^^^^^^^ +.. option:: -a, --all_probabilities + + Flag indicating whether to output the probabilities of all labels instead of just + the probability of the positive label. + +.. option:: -i , --id_col + + Name of the column which contains the instance IDs in ARFF, CSV, or TSV files. + (default: ``id``) + .. option:: -l , --label_col Name of the column which contains the labels in ARFF, CSV, or TSV files. - For ARFF files, this must be the final column to count as the label. + For ARFF files, this must be the final column to count as the label. (default: ``y``) .. option:: -p , --positive_label @@ -131,7 +141,8 @@ Optional Arguments If the model is only being used to predict the probability of a particular label, this specifies the index of the label we're predicting. 1 = second label, which is default for binary classification. Keep in mind that labels - are sorted lexicographically. (default: 1) + are sorted lexicographically. + (default: 1) .. 
option:: -q, --quiet diff --git a/requirements.txt b/requirements.txt index b225501a..7cd12fe9 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -scikit-learn==0.19.1 +scikit-learn==0.20.1 six PrettyTable beautifulsoup4 diff --git a/requirements_rtd.txt b/requirements_rtd.txt index 72b70cda..1b63aabb 100644 --- a/requirements_rtd.txt +++ b/requirements_rtd.txt @@ -1,7 +1,7 @@ configparser==3.5.0b2 logutils mock -scikit-learn==0.19.1 +scikit-learn==0.20.1 six PrettyTable beautifulsoup4 diff --git a/skll/config.py b/skll/config.py index 973d65f4..60318b4c 100644 --- a/skll/config.py +++ b/skll/config.py @@ -481,10 +481,11 @@ def _parse_config_file(config_path, log_level=logging.INFO): # next, get the log path before anything else since we need to # save all logging messages to a log file in addition to displaying # them on the console - log_path = _locate_file(config.get("Output", "log"), config_dir) - if log_path: - log_path = join(config_dir, log_path) - if not exists(log_path): + try: + log_path = _locate_file(config.get("Output", "log"), config_dir) + except IOError as e: + if e.errno == errno.ENOENT: + log_path = e.filename os.makedirs(log_path) # Create a top-level log file under the log path @@ -731,24 +732,29 @@ def _parse_config_file(config_path, log_level=logging.INFO): probability = config.getboolean("Output", "probability") # do we want to keep the predictions? - prediction_dir = _locate_file(config.get("Output", "predictions"), - config_dir) - if prediction_dir: - if not exists(prediction_dir): + # make sure the predictions path exists and if not create it + try: + prediction_dir = _locate_file(config.get("Output", "predictions"), + config_dir) + except IOError as e: + if e.errno == errno.ENOENT: + prediction_dir = e.filename os.makedirs(prediction_dir) - # make sure model path exists - model_path = _locate_file(config.get("Output", "models"), config_dir) - if model_path: - model_path = join(config_dir, model_path) - if not exists(model_path): + # make sure model path exists and if not, create it + try: + model_path = _locate_file(config.get("Output", "models"), config_dir) + except IOError as e: + if e.errno == errno.ENOENT: + model_path = e.filename os.makedirs(model_path) # make sure results path exists - results_path = _locate_file(config.get("Output", "results"), config_dir) - if results_path: - results_path = join(config_dir, results_path) - if not exists(results_path): + try: + results_path = _locate_file(config.get("Output", "results"), config_dir) + except IOError as e: + if e.errno == errno.ENOENT: + results_path = e.filename os.makedirs(results_path) # what are the output metrics? @@ -872,10 +878,10 @@ def _parse_config_file(config_path, log_level=logging.INFO): logger.warning("Specifying \"folds_file\" overrides both " "explicit and default \"grid_search_folds\".") if task == 'cross_validate': - logger.warning("Specifying \"folds_file\" overrides both " - "explicit and default \"num_cv_folds\".") cv_folds = specified_folds_mapping if specified_folds_mapping else specified_num_folds if specified_folds_mapping: + logger.warning("Specifying \"folds_file\" overrides both " + "explicit and default \"num_cv_folds\".") if use_folds_file_for_grid_search: grid_search_folds = cv_folds else: diff --git a/skll/data/writers.py b/skll/data/writers.py index d694ff78..88b9f0fd 100644 --- a/skll/data/writers.py +++ b/skll/data/writers.py @@ -237,24 +237,24 @@ class DelimitedFileWriter(Writer): type. For example ``/foo/.csv``. 
feature_set : skll.FeatureSet The ``FeatureSet`` instance to dump to the output file. - quiet : bool + quiet : bool, optional Do not print "Writing..." status message to stderr. Defaults to ``True``. - label_col : str + label_col : str, optional Name of the column which contains the class labels for ARFF/CSV/TSV files. If no column with that name exists, or ``None`` is specified, the data is considered to be unlabelled. Defaults to ``'y'``. - id_col : str + id_col : str, optional Name of the column which contains the instance IDs. If no column with that name exists, or ``None`` is specified, example IDs will be automatically generated. Defaults to ``'id'``. - dialect : str - Name of the column which contains the class labels for - CSV/TSV files. - logger : logging.Logger + dialect : str, optional + The dialect to use for writing out the delimited file. + Defaults to ``'excel-tab'``. + logger : logging.Logger, optional A logger instance to use to log messages instead of creating a new one by default. Defaults to ``None``. @@ -586,9 +586,21 @@ def _write_line(self, id_, label_, feat_dict, output_file): """ example_dict = {} # Don't try to add class column if this is label-less data + # Try to convert the label to a scalar assuming it's a numpy + # scalar type (e.g., int64) but if that doesn't work + # then use it as is if self.feat_set.has_labels: - example_dict['y'] = np.asscalar(label_) - example_dict['id'] = np.asscalar(id_) + try: + example_dict['y'] = label_.item() + except AttributeError: + example_dict['y'] = label_ + # Try to convert the ID to a scalar assuming it's a numpy + # scalar type (e.g., int64) but if that doesn't work + # then use it as is + try: + example_dict['id'] = id_.item() + except AttributeError: + example_dict['id'] = id_ example_dict["x"] = feat_dict print(json.dumps(example_dict, sort_keys=True), file=output_file) diff --git a/skll/experiments.py b/skll/experiments.py index d6a039cc..22ff3bb1 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -77,7 +77,7 @@ class NumpyTypeEncoder(json.JSONEncoder): be serialized by the json module, so we must convert them to int objects. A related issue where this was adapted from: - http://stackoverflow.com/questions/11561932/why-does-json-dumpslistnp-arange5-fail-while-json-dumpsnp-arange5-tolis + https://stackoverflow.com/questions/11561932/why-does-json-dumpslistnp-arange5-fail-while-json-dumpsnp-arange5-tolis """ def default(self, obj): @@ -989,7 +989,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q', write_summary : bool, optional Write a TSV file with a summary of the results. Defaults to ``True``. - quite : bool, optional + quiet : bool, optional Suppress printing of "Loading..." messages. Defaults to ``False``. 
ablation : int, optional @@ -1374,13 +1374,13 @@ def _generate_learning_curve_plots(experiment_name, # each of the featuresets for fs_name, df_fs in df_melted.groupby('featureset_name'): fig = plt.figure(); - fig.set_size_inches(2.5*num_learners, 2.5*num_metrics); + fig.set_size_inches(2.5 * num_learners, 2.5 * num_metrics); # compute ylimits for this feature set for each objective with sns.axes_style('whitegrid', {"grid.linestyle": ':', "xtick.major.size": 3.0}): g = sns.FacetGrid(df_fs, row="metric", col="learner_name", - hue="variable", size=2.5, aspect=1, + hue="variable", height=2.5, aspect=1, margin_titles=True, despine=True, sharex=False, sharey=False, legend_out=False, palette="Set1") colors = train_color, test_color = sns.color_palette("Set1")[:2] diff --git a/skll/learner.py b/skll/learner.py index ff196f78..f8bd102d 100644 --- a/skll/learner.py +++ b/skll/learner.py @@ -20,6 +20,7 @@ from collections import Counter, defaultdict from functools import wraps from importlib import import_module +from itertools import combinations from multiprocessing import cpu_count import joblib @@ -42,6 +43,7 @@ RandomForestClassifier, RandomForestRegressor) from sklearn.feature_extraction import FeatureHasher +from sklearn.feature_extraction import DictVectorizer as OldDictVectorizer from sklearn.feature_selection import SelectKBest from sklearn.utils.multiclass import type_of_target # AdditiveChi2Sampler is used indirectly, so ignore linting message @@ -74,6 +76,7 @@ from sklearn.utils import shuffle as sk_shuffle from skll.data import FeatureSet +from skll.data.dict_vectorizer import DictVectorizer from skll.metrics import _CORRELATION_METRICS, use_score_func from skll.version import VERSION @@ -196,8 +199,6 @@ 'neg_log_loss']) _REQUIRES_DENSE = (BayesianRidge, - GradientBoostingClassifier, - GradientBoostingRegressor, Lars, TheilSenRegressor) @@ -856,6 +857,7 @@ def __init__(self, model_type, probability=False, feature_scaling='none', if issubclass(self._model_type, SVC): self._model_kwargs['cache_size'] = 1000 self._model_kwargs['probability'] = self.probability + self._model_kwargs['gamma'] = 'auto' if self.probability: self.logger.warning('Because LibSVM does an internal ' 'cross-validation to produce probabilities, ' @@ -868,14 +870,22 @@ def __init__(self, model_type, probability=False, feature_scaling='none', self._model_kwargs['n_estimators'] = 500 elif issubclass(self._model_type, SVR): self._model_kwargs['cache_size'] = 1000 + self._model_kwargs['gamma'] = 'auto' elif issubclass(self._model_type, SGDClassifier): self._model_kwargs['loss'] = 'log' + self._model_kwargs['max_iter'] = None + self._model_kwargs['tol'] = None + elif issubclass(self._model_type, SGDRegressor): + self._model_kwargs['max_iter'] = None + self._model_kwargs['tol'] = None elif issubclass(self._model_type, RANSACRegressor): self._model_kwargs['loss'] = 'squared_loss' elif issubclass(self._model_type, (MLPClassifier, MLPRegressor)): self._model_kwargs['learning_rate'] = 'invscaling' self._model_kwargs['max_iter'] = 500 - + elif issubclass(self._model_type, LogisticRegression): + self._model_kwargs['solver'] = 'liblinear' + self._model_kwargs['multi_class'] = 'auto' if issubclass(self._model_type, (AdaBoostClassifier, AdaBoostRegressor, @@ -913,15 +923,24 @@ def __init__(self, model_type, probability=False, feature_scaling='none', AdaBoostClassifier, RANSACRegressor)) and ('base_estimator' in model_kwargs): base_estimator_name = model_kwargs['base_estimator'] - base_estimator_kwargs = {} if base_estimator_name in 
['LinearRegression', - 'MultinomialNB', - 'SVR'] else {'random_state': 123456789} + if base_estimator_name in ['LinearRegression', 'MultinomialNB']: + base_estimator_kwargs = {} + elif base_estimator_name in ['SGDClassifier', 'SGDRegressor']: + base_estimator_kwargs = {'max_iter': None, + 'tol': None, + 'random_state': 123456789} + elif base_estimator_name == 'SVR': + base_estimator_kwargs = {'gamma': 'auto'} + elif base_estimator_name == 'SVC': + base_estimator_kwargs = {'gamma': 'auto', 'random_state': 123456789} + else: + base_estimator_kwargs = {'random_state': 123456789} base_estimator = globals()[base_estimator_name](**base_estimator_kwargs) model_kwargs['base_estimator'] = base_estimator self._model_kwargs.update(model_kwargs) @classmethod - def from_file(cls, learner_path): + def from_file(cls, learner_path, logger=None): """ Load a saved ``Learner`` instance from a file path. @@ -929,6 +948,9 @@ ---------- learner_path : str The path to a saved ``Learner`` instance file. + logger : logging object, optional + A logging object. If ``None`` is passed, get logger from ``__name__``. + Defaults to ``None``. Returns ------- @@ -944,6 +966,10 @@ """ skll_version, learner = joblib.load(learner_path) + # set the learner's logger attribute to the logger that's passed in + # or, if nothing was passed in, to a newly created logger + learner.logger = logger if logger else logging.getLogger(__name__) + # For backward compatibility, convert string model types to labels. if isinstance(learner._model_type, string_types): learner._model_type = globals()[learner._model_type] @@ -1073,6 +1099,30 @@ elif self.model.intercept_.any(): intercept = dict(zip(label_list, self.model.intercept_)) + # for SVCs with linear kernels, we want to print out the primal + # weights - that is, the weights for each feature for each one-vs-one + # binary classifier. These are the weights contained in the `coef_` + # attribute of the underlying scikit-learn model. This is a matrix that + # has the shape [(n_classes)*(n_classes - 1)/2, n_features] since there + # are C(n_classes, 2) = n_classes*(n_classes-1)/2 one-vs-one classifiers + # and each one has weights for each of the features. According to the + # scikit-learn user guide and the code for the function `_one_vs_one_coef()` + # in `svm/base.py`, the order of the rows is as follows: "0 vs 1", + # "0 vs 2", ... "0 vs n", "1 vs 2", "1 vs 3", ... "1 vs n", ... "n-1 vs n". + elif isinstance(self._model, SVC) and self._model.kernel == 'linear': + intercept = {} + for i, class_pair in enumerate(combinations(range(len(self.label_list)), 2)): + coef = self.model.coef_[i] + coef = coef.toarray() + coef = self.feat_selector.inverse_transform(coef)[0] + class1 = self.label_list[class_pair[0]] + class2 = self.label_list[class_pair[1]] + for feat, idx in iteritems(self.feat_vectorizer.vocabulary_): + if coef[idx]: + res['{}-vs-{}\t{}'.format(class1, class2, feat)] = coef[idx] + + intercept['{}-vs-{}'.format(class1, class2)] = self.model.intercept_[i] + else: # not supported raise ValueError(("{} is not supported by" + @@ -1113,7 +1163,8 @@ def __getstate__(self): because we cannot pickle loggers. 
""" attribute_dict = dict(self.__dict__) - del attribute_dict['logger'] + if 'logger' in attribute_dict: + del attribute_dict['logger'] return attribute_dict def save(self, learner_path): @@ -1683,12 +1734,57 @@ def predict(self, examples, prediction_prefix=None, append=False, # Need to do some transformations so the features are in the right # columns for the test set. Obviously a bit hacky, but storing things # in sparse matrices saves memory over our old list of dicts approach. - if isinstance(self.feat_vectorizer, FeatureHasher): - if (self.feat_vectorizer.n_features != - examples.vectorizer.n_features): + + # We also need to think about the various combinations of the model + # vectorizer and the vectorizer for the set for which we want to make + # predictions: + + # 1. Both vectorizers are DictVectorizers. If they use different sets + # of features, we raise a warning and transform the features of the + # prediction set from its space to the trained model space. + + # 2. Both vectorizers are FeatureHashers. If they use different number + # of feature bins, we should just raise an error since there's no + # inverse_transform() available for a FeatureHasher - the hash function + # is not reversible. + + # 3. The model vectorizer is a FeatureHasher but the prediction feature + # set vectorizer is a DictVectorizer. We should be able to handle this + # case, since we can just call inverse_transform() on the DictVectorizer + # and then transform() on the FeatureHasher? + + # 4. The model vectorizer is a DictVectorizer but the prediction feature + # set vectorizer is a FeatureHasher. Again, we should raise an error here + # since there's no inverse available for the hasher. + model_is_dict = isinstance(self.feat_vectorizer, + (DictVectorizer, OldDictVectorizer)) + model_is_hasher = isinstance(self.feat_vectorizer, FeatureHasher) + data_is_dict = isinstance(examples.vectorizer, + (DictVectorizer, OldDictVectorizer)) + data_is_hasher = isinstance(examples.vectorizer, FeatureHasher) + + both_dicts = model_is_dict and data_is_dict + both_hashers = model_is_hasher and data_is_hasher + model_hasher_and_data_dict = model_is_hasher and data_is_dict + model_dict_and_data_hasher = model_is_dict and data_is_hasher + + # 1. both are DictVectorizers + if both_dicts: + if (set(self.feat_vectorizer.feature_names_) != + set(examples.vectorizer.feature_names_)): self.logger.warning("There is mismatch between the training model " - "features and the data passed to predict.") + "features and the data passed to predict. The " + "prediction features will be transformed to " + "the trained model space.") + if self.feat_vectorizer == examples.vectorizer: + xtest = examples.features + else: + xtest = self.feat_vectorizer.transform( + examples.vectorizer.inverse_transform( + examples.features)) + # 2. 
both are FeatureHashers elif both_hashers: self_feat_vec_tuple = (self.feat_vectorizer.dtype, self.feat_vectorizer.input_type, self.feat_vectorizer.n_features, @@ -1701,21 +1797,23 @@ def predict(self, examples, prediction_prefix=None, append=False, if self_feat_vec_tuple == example_feat_vec_tuple: xtest = examples.features else: - xtest = self.feat_vectorizer.transform( - examples.vectorizer.inverse_transform( - examples.features)) - else: - if (set(self.feat_vectorizer.feature_names_) != - set(examples.vectorizer.feature_names_)): - self.logger.warning("There is mismatch between the training model " - "features and the data passed to predict.") - if self.feat_vectorizer == examples.vectorizer: - xtest = examples.features - else: - - xtest = self.feat_vectorizer.transform( - examples.vectorizer.inverse_transform( - examples.features)) + self.logger.error('There is a mismatch between the FeatureHasher ' + 'configuration for the training data and ' + 'the configuration for the data passed to predict') + raise RuntimeError('Mismatched hasher configurations') + + # 3. model is a FeatureHasher and test set is a DictVectorizer + elif model_hasher_and_data_dict: + xtest = self.feat_vectorizer.transform( + examples.vectorizer.inverse_transform( + examples.features)) + + # 4. model is a DictVectorizer and test set is a FeatureHasher + elif model_dict_and_data_hasher: + self.logger.error('Cannot predict with a model using a ' + 'DictVectorizer on data that uses ' + 'a FeatureHasher') + raise RuntimeError('Cannot use FeatureHasher for data') # filter features based on those selected from training set xtest = self.feat_selector.transform(xtest) diff --git a/skll/utilities/generate_predictions.py b/skll/utilities/generate_predictions.py index 97d9aefb..514ff680 100755 --- a/skll/utilities/generate_predictions.py +++ b/skll/utilities/generate_predictions.py @@ -14,6 +14,7 @@ import argparse import logging import os +import sys from skll.data.readers import EXT_TO_READER from skll.learner import Learner @@ -26,7 +27,8 @@ class Predictor(object): predictions for feature strings. """ - def __init__(self, model_path, threshold=None, positive_label=1): + def __init__(self, model_path, threshold=None, positive_label=1, + all_labels=False, logger=None): """ Initialize the predictor. @@ -46,10 +48,24 @@ predicting. 1 = second class, which is default for binary classification. Defaults to 1. + all_labels : bool, optional + A flag indicating whether to return the probabilities for all + labels in each row instead of just returning the probability of + `positive_label`. Defaults to ``False``. + logger : logging object, optional + A logging object. If ``None`` is passed, get logger from ``__name__``. + Defaults to ``None``. """ + self.logger = logger if logger else logging.getLogger(__name__) + if threshold is not None and all_labels: + raise ValueError("`threshold` and `all_labels` are mutually " + "exclusive. They cannot both be set.") + self._learner = Learner.from_file(model_path) self._pos_index = positive_label self.threshold = threshold + self.all_labels = all_labels + self.output_file_header = None def predict(self, data): """ @@ -67,18 +83,29 @@ # compute the predictions from the learner preds = self._learner.predict(data) preds = preds.tolist() + labels = self._learner.label_list + # Create file header list, and transform predictions as needed + # depending on the specified prediction arguments. 
if self._learner.probability: - if self.threshold is None: - return [pred[self._pos_index] for pred in preds] + if self.all_labels: + self.output_file_header = ["id"] + [str(x) for x in labels] + elif self.threshold is None: + label = self._learner.label_dict[self._pos_index] + self.output_file_header = ["id", + "Probability of '{}'".format(label)] + preds = [pred[self._pos_index] for pred in preds] else: - return [int(pred[self._pos_index] >= self.threshold) - for pred in preds] + self.output_file_header = ["id", "prediction"] + preds = [int(pred[self._pos_index] >= self.threshold) + for pred in preds] elif self._learner.model._estimator_type == 'regressor': - return preds + self.output_file_header = ["id", "prediction"] else: - return [self._learner.label_list[pred if isinstance(pred, int) else - int(pred[0])] for pred in preds] + self.output_file_header = ["id", "prediction"] + preds = [labels[pred if isinstance(pred, int) else int(pred[0])] + for pred in preds] + return preds def main(argv=None): @@ -99,41 +126,53 @@ formatter_class=argparse.ArgumentDefaultsHelpFormatter, conflict_handler='resolve') parser.add_argument('model_file', - help='Model file to load and use for generating \ - predictions.') - parser.add_argument('input_file', - help='A csv file, json file, or megam file \ - (with or without the label column), \ - with the appropriate suffix.', + help='Model file to load and use for generating ' + 'predictions.') + parser.add_argument('input_files', + help='A space-separated list of csv, jsonlines, ' + 'or megam files (with or without the label ' + 'column), with the appropriate suffix.', nargs='+') parser.add_argument('-i', '--id_col', - help='Name of the column which contains the instance \ - IDs in ARFF, CSV, or TSV files.', + help='Name of the column which contains the instance ' + 'IDs in ARFF, CSV, or TSV files.', default='id') parser.add_argument('-l', '--label_col', - help='Name of the column which contains the labels\ - in ARFF, CSV, or TSV files. For ARFF files, this\ - must be the final column to count as the label.', + help='Name of the column which contains the labels ' + 'in ARFF, CSV, or TSV files. For ARFF files, ' + 'this must be the final column to count as the ' + 'label.', default='y') parser.add_argument('-p', '--positive_label', - help="If the model is only being used to predict the \ - probability of a particular label, this \ - specifies the index of the label we're \ - predicting. 1 = second label, which is default \ - for binary classification. Keep in mind that \ - labels are sorted lexicographically.", + help="If the model is only being used to predict the " + "probability of a particular label, this " + "specifies the index of the label we're " + "predicting. 1 = second label, which is default " + "for binary classification. Keep in mind that " + "labels are sorted lexicographically.", default=1, type=int) parser.add_argument('-q', '--quiet', help='Suppress printing of "Loading..." messages.', action='store_true') - parser.add_argument('-t', '--threshold', - help="If the model we're using is generating \ - probabilities of the positive label, return 1 \ - if it meets/exceeds the given threshold and 0 \ - otherwise.", - type=float) + parser.add_argument('--output_file', '-o', + help="Path to output tsv file. 
If not specified, " + "predictions will be printed to stdout.") parser.add_argument('--version', action='version', version='%(prog)s {0}'.format(__version__)) + probability_handling = parser.add_mutually_exclusive_group() + probability_handling.add_argument('-t', '--threshold', + help="If the model we're using is " + "generating probabilities of the " + "positive label, return 1 if it " + "meets/exceeds the given threshold " + "and 0 otherwise.", type=float) + probability_handling.add_argument('--all_probabilities', '-a', + action='store_true', + help="Flag indicating whether to output " + "the probabilities of all labels " + "instead of just the probability " + "of the positive label.") + args = parser.parse_args(argv) # Make warnings from built-in warnings module get formatted more nicely @@ -145,10 +184,12 @@ def main(argv=None): # Create the classifier and load the model predictor = Predictor(args.model_file, positive_label=args.positive_label, - threshold=args.threshold) + threshold=args.threshold, + all_labels=args.all_probabilities, + logger=logger) # Iterate over all the specified input files - for input_file in args.input_file: + for i, input_file in enumerate(args.input_files): # make sure each file extension is one we can process input_extension = os.path.splitext(input_file)[1].lower() @@ -164,8 +205,34 @@ def main(argv=None): label_col=args.label_col, id_col=args.id_col) feature_set = reader.read() - for pred in predictor.predict(feature_set): - print(pred) + preds = predictor.predict(feature_set) + header = predictor.output_file_header + + if args.output_file is not None: + with open(args.output_file, 'a') as outputfh: + if i == 0: # Only write header once per set of input files + print("\t".join(header), file=outputfh) + if args.all_probabilities: + for j, probabilities in enumerate(preds): + id_ = feature_set.ids[j] + probs_str = "\t".join([str(p) for p in probabilities]) + print("{}\t{}".format(id_, probs_str), file=outputfh) + else: + for j, pred in enumerate(preds): + id_ = feature_set.ids[j] + print("{}\t{}".format(id_, pred), file=outputfh) + else: + if i == 0: # Only write header once per set of input files + print("\t".join(header)) + if args.all_probabilities: + for j, probabilities in enumerate(preds): + id_ = feature_set.ids[j] + probs_str = "\t".join([str(p) for p in probabilities]) + print("{}\t{}".format(id_, probs_str)) + else: + for j, pred in enumerate(preds): + id_ = feature_set.ids[j] + print("{}\t{}".format(id_, pred)) if __name__ == '__main__': diff --git a/skll/version.py b/skll/version.py index b67290bc..3bb8833e 100644 --- a/skll/version.py +++ b/skll/version.py @@ -7,5 +7,5 @@ :organization: ETS """ -__version__ = '1.5' +__version__ = '1.5.3' VERSION = tuple(int(x) for x in __version__.split('.')) diff --git a/tests/configs/test_single_file_saved_subset.template.cfg b/tests/configs/test_single_file_saved_subset.template.cfg new file mode 100644 index 00000000..de8e2b48 --- /dev/null +++ b/tests/configs/test_single_file_saved_subset.template.cfg @@ -0,0 +1,11 @@ +[General] +experiment_name=train_test_single_file +task=evaluate + +[Input] +learners=["RandomForestClassifier"] + +[Tuning] + +[Output] +probability=false diff --git a/tests/other/examples_test.jsonlines b/tests/other/examples_test.jsonlines new file mode 100644 index 00000000..6d344cc6 --- /dev/null +++ b/tests/other/examples_test.jsonlines @@ -0,0 +1,50 @@ +{"id": "EXAMPLE_73", "y": "versicolor", "x": {"f0": 6.1, "f1": 2.8, "f2": 4.7, "f3": 1.2}} +{"id": "EXAMPLE_18", "y": "setosa", "x": 
{"f0": 5.7, "f1": 3.8, "f2": 1.7, "f3": 0.3}} +{"id": "EXAMPLE_118", "y": "virginica", "x": {"f0": 7.7, "f1": 2.6, "f2": 6.9, "f3": 2.3}} +{"id": "EXAMPLE_78", "y": "versicolor", "x": {"f0": 6.0, "f1": 2.9, "f2": 4.5, "f3": 1.5}} +{"id": "EXAMPLE_76", "y": "versicolor", "x": {"f0": 6.8, "f1": 2.8, "f2": 4.8, "f3": 1.4}} +{"id": "EXAMPLE_31", "y": "setosa", "x": {"f0": 5.4, "f1": 3.4, "f2": 1.5, "f3": 0.4}} +{"id": "EXAMPLE_64", "y": "versicolor", "x": {"f0": 5.6, "f1": 2.9, "f2": 3.6, "f3": 1.3}} +{"id": "EXAMPLE_141", "y": "virginica", "x": {"f0": 6.9, "f1": 3.1, "f2": 5.1, "f3": 2.3}} +{"id": "EXAMPLE_68", "y": "versicolor", "x": {"f0": 6.2, "f1": 2.2, "f2": 4.5, "f3": 1.5}} +{"id": "EXAMPLE_82", "y": "versicolor", "x": {"f0": 5.8, "f1": 2.7, "f2": 3.9, "f3": 1.2}} +{"id": "EXAMPLE_110", "y": "virginica", "x": {"f0": 6.5, "f1": 3.2, "f2": 5.1, "f3": 2.0}} +{"id": "EXAMPLE_12", "y": "setosa", "x": {"f0": 4.8, "f1": 3.0, "f2": 1.4, "f3": 0.1}} +{"id": "EXAMPLE_36", "y": "setosa", "x": {"f0": 5.5, "f1": 3.5, "f2": 1.3, "f3": 0.2}} +{"id": "EXAMPLE_9", "y": "setosa", "x": {"f0": 4.9, "f1": 3.1, "f2": 1.5, "f3": 0.1}} +{"id": "EXAMPLE_19", "y": "setosa", "x": {"f0": 5.1, "f1": 3.8, "f2": 1.5, "f3": 0.3}} +{"id": "EXAMPLE_56", "y": "versicolor", "x": {"f0": 6.3, "f1": 3.3, "f2": 4.7, "f3": 1.6}} +{"id": "EXAMPLE_104", "y": "virginica", "x": {"f0": 6.5, "f1": 3.0, "f2": 5.8, "f3": 2.2}} +{"id": "EXAMPLE_69", "y": "versicolor", "x": {"f0": 5.6, "f1": 2.5, "f2": 3.9, "f3": 1.1}} +{"id": "EXAMPLE_55", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.8, "f2": 4.5, "f3": 1.3}} +{"id": "EXAMPLE_132", "y": "virginica", "x": {"f0": 6.4, "f1": 2.8, "f2": 5.6, "f3": 2.2}} +{"id": "EXAMPLE_29", "y": "setosa", "x": {"f0": 4.7, "f1": 3.2, "f2": 1.6, "f3": 0.2}} +{"id": "EXAMPLE_127", "y": "virginica", "x": {"f0": 6.1, "f1": 3.0, "f2": 4.9, "f3": 1.8}} +{"id": "EXAMPLE_26", "y": "setosa", "x": {"f0": 5.0, "f1": 3.4, "f2": 1.6, "f3": 0.4}} +{"id": "EXAMPLE_128", "y": "virginica", "x": {"f0": 6.4, "f1": 2.8, "f2": 5.6, "f3": 2.1}} +{"id": "EXAMPLE_131", "y": "virginica", "x": {"f0": 7.9, "f1": 3.8, "f2": 6.4, "f3": 2.0}} +{"id": "EXAMPLE_145", "y": "virginica", "x": {"f0": 6.7, "f1": 3.0, "f2": 5.2, "f3": 2.3}} +{"id": "EXAMPLE_108", "y": "virginica", "x": {"f0": 6.7, "f1": 2.5, "f2": 5.8, "f3": 1.8}} +{"id": "EXAMPLE_143", "y": "virginica", "x": {"f0": 6.8, "f1": 3.2, "f2": 5.9, "f3": 2.3}} +{"id": "EXAMPLE_45", "y": "setosa", "x": {"f0": 4.8, "f1": 3.0, "f2": 1.4, "f3": 0.3}} +{"id": "EXAMPLE_30", "y": "setosa", "x": {"f0": 4.8, "f1": 3.1, "f2": 1.6, "f3": 0.2}} +{"id": "EXAMPLE_22", "y": "setosa", "x": {"f0": 4.6, "f1": 3.6, "f2": 1.0, "f3": 0.2}} +{"id": "EXAMPLE_15", "y": "setosa", "x": {"f0": 5.7, "f1": 4.4, "f2": 1.5, "f3": 0.4}} +{"id": "EXAMPLE_65", "y": "versicolor", "x": {"f0": 6.7, "f1": 3.1, "f2": 4.4, "f3": 1.4}} +{"id": "EXAMPLE_11", "y": "setosa", "x": {"f0": 4.8, "f1": 3.4, "f2": 1.6, "f3": 0.2}} +{"id": "EXAMPLE_42", "y": "setosa", "x": {"f0": 4.4, "f1": 3.2, "f2": 1.3, "f3": 0.2}} +{"id": "EXAMPLE_146", "y": "virginica", "x": {"f0": 6.3, "f1": 2.5, "f2": 5.0, "f3": 1.9}} +{"id": "EXAMPLE_51", "y": "versicolor", "x": {"f0": 6.4, "f1": 3.2, "f2": 4.5, "f3": 1.5}} +{"id": "EXAMPLE_27", "y": "setosa", "x": {"f0": 5.2, "f1": 3.5, "f2": 1.5, "f3": 0.2}} +{"id": "EXAMPLE_4", "y": "setosa", "x": {"f0": 5.0, "f1": 3.6, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_32", "y": "setosa", "x": {"f0": 5.2, "f1": 4.1, "f2": 1.5, "f3": 0.1}} +{"id": "EXAMPLE_142", "y": "virginica", "x": {"f0": 5.8, "f1": 2.7, "f2": 
5.1, "f3": 1.9}} +{"id": "EXAMPLE_85", "y": "versicolor", "x": {"f0": 6.0, "f1": 3.4, "f2": 4.5, "f3": 1.6}} +{"id": "EXAMPLE_86", "y": "versicolor", "x": {"f0": 6.7, "f1": 3.1, "f2": 4.7, "f3": 1.5}} +{"id": "EXAMPLE_16", "y": "setosa", "x": {"f0": 5.4, "f1": 3.9, "f2": 1.3, "f3": 0.4}} +{"id": "EXAMPLE_10", "y": "setosa", "x": {"f0": 5.4, "f1": 3.7, "f2": 1.5, "f3": 0.2}} +{"id": "EXAMPLE_81", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.4, "f2": 3.7, "f3": 1.0}} +{"id": "EXAMPLE_133", "y": "virginica", "x": {"f0": 6.3, "f1": 2.8, "f2": 5.1, "f3": 1.5}} +{"id": "EXAMPLE_137", "y": "virginica", "x": {"f0": 6.4, "f1": 3.1, "f2": 5.5, "f3": 1.8}} +{"id": "EXAMPLE_75", "y": "versicolor", "x": {"f0": 6.6, "f1": 3.0, "f2": 4.4, "f3": 1.4}} +{"id": "EXAMPLE_109", "y": "virginica", "x": {"f0": 7.2, "f1": 3.6, "f2": 6.1, "f3": 2.5}} diff --git a/tests/other/examples_train.jsonlines b/tests/other/examples_train.jsonlines new file mode 100644 index 00000000..2d403b02 --- /dev/null +++ b/tests/other/examples_train.jsonlines @@ -0,0 +1,100 @@ +{"id": "EXAMPLE_96", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.9, "f2": 4.2, "f3": 1.3}} +{"id": "EXAMPLE_105", "y": "virginica", "x": {"f0": 7.6, "f1": 3.0, "f2": 6.6, "f3": 2.1}} +{"id": "EXAMPLE_66", "y": "versicolor", "x": {"f0": 5.6, "f1": 3.0, "f2": 4.5, "f3": 1.5}} +{"id": "EXAMPLE_0", "y": "setosa", "x": {"f0": 5.1, "f1": 3.5, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_122", "y": "virginica", "x": {"f0": 7.7, "f1": 2.8, "f2": 6.7, "f3": 2.0}} +{"id": "EXAMPLE_67", "y": "versicolor", "x": {"f0": 5.8, "f1": 2.7, "f2": 4.1, "f3": 1.0}} +{"id": "EXAMPLE_28", "y": "setosa", "x": {"f0": 5.2, "f1": 3.4, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_40", "y": "setosa", "x": {"f0": 5.0, "f1": 3.5, "f2": 1.3, "f3": 0.3}} +{"id": "EXAMPLE_44", "y": "setosa", "x": {"f0": 5.1, "f1": 3.8, "f2": 1.9, "f3": 0.4}} +{"id": "EXAMPLE_60", "y": "versicolor", "x": {"f0": 5.0, "f1": 2.0, "f2": 3.5, "f3": 1.0}} +{"id": "EXAMPLE_123", "y": "virginica", "x": {"f0": 6.3, "f1": 2.7, "f2": 4.9, "f3": 1.8}} +{"id": "EXAMPLE_24", "y": "setosa", "x": {"f0": 4.8, "f1": 3.4, "f2": 1.9, "f3": 0.2}} +{"id": "EXAMPLE_25", "y": "setosa", "x": {"f0": 5.0, "f1": 3.0, "f2": 1.6, "f3": 0.2}} +{"id": "EXAMPLE_23", "y": "setosa", "x": {"f0": 5.1, "f1": 3.3, "f2": 1.7, "f3": 0.5}} +{"id": "EXAMPLE_94", "y": "versicolor", "x": {"f0": 5.6, "f1": 2.7, "f2": 4.2, "f3": 1.3}} +{"id": "EXAMPLE_39", "y": "setosa", "x": {"f0": 5.1, "f1": 3.4, "f2": 1.5, "f3": 0.2}} +{"id": "EXAMPLE_95", "y": "versicolor", "x": {"f0": 5.7, "f1": 3.0, "f2": 4.2, "f3": 1.2}} +{"id": "EXAMPLE_117", "y": "virginica", "x": {"f0": 7.7, "f1": 3.8, "f2": 6.7, "f3": 2.2}} +{"id": "EXAMPLE_47", "y": "setosa", "x": {"f0": 4.6, "f1": 3.2, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_97", "y": "versicolor", "x": {"f0": 6.2, "f1": 2.9, "f2": 4.3, "f3": 1.3}} +{"id": "EXAMPLE_113", "y": "virginica", "x": {"f0": 5.7, "f1": 2.5, "f2": 5.0, "f3": 2.0}} +{"id": "EXAMPLE_33", "y": "setosa", "x": {"f0": 5.5, "f1": 4.2, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_138", "y": "virginica", "x": {"f0": 6.0, "f1": 3.0, "f2": 4.8, "f3": 1.8}} +{"id": "EXAMPLE_101", "y": "virginica", "x": {"f0": 5.8, "f1": 2.7, "f2": 5.1, "f3": 1.9}} +{"id": "EXAMPLE_62", "y": "versicolor", "x": {"f0": 6.0, "f1": 2.2, "f2": 4.0, "f3": 1.0}} +{"id": "EXAMPLE_84", "y": "versicolor", "x": {"f0": 5.4, "f1": 3.0, "f2": 4.5, "f3": 1.5}} +{"id": "EXAMPLE_148", "y": "virginica", "x": {"f0": 6.2, "f1": 3.4, "f2": 5.4, "f3": 2.3}} +{"id": "EXAMPLE_53", "y": "versicolor", "x": {"f0": 5.5, "f1": 
2.3, "f2": 4.0, "f3": 1.3}} +{"id": "EXAMPLE_5", "y": "setosa", "x": {"f0": 5.4, "f1": 3.9, "f2": 1.7, "f3": 0.4}} +{"id": "EXAMPLE_93", "y": "versicolor", "x": {"f0": 5.0, "f1": 2.3, "f2": 3.3, "f3": 1.0}} +{"id": "EXAMPLE_111", "y": "virginica", "x": {"f0": 6.4, "f1": 2.7, "f2": 5.3, "f3": 1.9}} +{"id": "EXAMPLE_49", "y": "setosa", "x": {"f0": 5.0, "f1": 3.3, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_35", "y": "setosa", "x": {"f0": 5.0, "f1": 3.2, "f2": 1.2, "f3": 0.2}} +{"id": "EXAMPLE_80", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.4, "f2": 3.8, "f3": 1.1}} +{"id": "EXAMPLE_77", "y": "versicolor", "x": {"f0": 6.7, "f1": 3.0, "f2": 5.0, "f3": 1.7}} +{"id": "EXAMPLE_34", "y": "setosa", "x": {"f0": 4.9, "f1": 3.1, "f2": 1.5, "f3": 0.1}} +{"id": "EXAMPLE_114", "y": "virginica", "x": {"f0": 5.8, "f1": 2.8, "f2": 5.1, "f3": 2.4}} +{"id": "EXAMPLE_7", "y": "setosa", "x": {"f0": 5.0, "f1": 3.4, "f2": 1.5, "f3": 0.2}} +{"id": "EXAMPLE_43", "y": "setosa", "x": {"f0": 5.0, "f1": 3.5, "f2": 1.6, "f3": 0.6}} +{"id": "EXAMPLE_70", "y": "versicolor", "x": {"f0": 5.9, "f1": 3.2, "f2": 4.8, "f3": 1.8}} +{"id": "EXAMPLE_98", "y": "versicolor", "x": {"f0": 5.1, "f1": 2.5, "f2": 3.0, "f3": 1.1}} +{"id": "EXAMPLE_120", "y": "virginica", "x": {"f0": 6.9, "f1": 3.2, "f2": 5.7, "f3": 2.3}} +{"id": "EXAMPLE_83", "y": "versicolor", "x": {"f0": 6.0, "f1": 2.7, "f2": 5.1, "f3": 1.6}} +{"id": "EXAMPLE_134", "y": "virginica", "x": {"f0": 6.1, "f1": 2.6, "f2": 5.6, "f3": 1.4}} +{"id": "EXAMPLE_135", "y": "virginica", "x": {"f0": 7.7, "f1": 3.0, "f2": 6.1, "f3": 2.3}} +{"id": "EXAMPLE_89", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.5, "f2": 4.0, "f3": 1.3}} +{"id": "EXAMPLE_8", "y": "setosa", "x": {"f0": 4.4, "f1": 2.9, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_13", "y": "setosa", "x": {"f0": 4.3, "f1": 3.0, "f2": 1.1, "f3": 0.1}} +{"id": "EXAMPLE_119", "y": "virginica", "x": {"f0": 6.0, "f1": 2.2, "f2": 5.0, "f3": 1.5}} +{"id": "EXAMPLE_125", "y": "virginica", "x": {"f0": 7.2, "f1": 3.2, "f2": 6.0, "f3": 1.8}} +{"id": "EXAMPLE_3", "y": "setosa", "x": {"f0": 4.6, "f1": 3.1, "f2": 1.5, "f3": 0.2}} +{"id": "EXAMPLE_17", "y": "setosa", "x": {"f0": 5.1, "f1": 3.5, "f2": 1.4, "f3": 0.3}} +{"id": "EXAMPLE_38", "y": "setosa", "x": {"f0": 4.4, "f1": 3.0, "f2": 1.3, "f3": 0.2}} +{"id": "EXAMPLE_72", "y": "versicolor", "x": {"f0": 6.3, "f1": 2.5, "f2": 4.9, "f3": 1.5}} +{"id": "EXAMPLE_136", "y": "virginica", "x": {"f0": 6.3, "f1": 3.4, "f2": 5.6, "f3": 2.4}} +{"id": "EXAMPLE_6", "y": "setosa", "x": {"f0": 4.6, "f1": 3.4, "f2": 1.4, "f3": 0.3}} +{"id": "EXAMPLE_112", "y": "virginica", "x": {"f0": 6.8, "f1": 3.0, "f2": 5.5, "f3": 2.1}} +{"id": "EXAMPLE_100", "y": "virginica", "x": {"f0": 6.3, "f1": 3.3, "f2": 6.0, "f3": 2.5}} +{"id": "EXAMPLE_2", "y": "setosa", "x": {"f0": 4.7, "f1": 3.2, "f2": 1.3, "f3": 0.2}} +{"id": "EXAMPLE_63", "y": "versicolor", "x": {"f0": 6.1, "f1": 2.9, "f2": 4.7, "f3": 1.4}} +{"id": "EXAMPLE_54", "y": "versicolor", "x": {"f0": 6.5, "f1": 2.8, "f2": 4.6, "f3": 1.5}} +{"id": "EXAMPLE_126", "y": "virginica", "x": {"f0": 6.2, "f1": 2.8, "f2": 4.8, "f3": 1.8}} +{"id": "EXAMPLE_50", "y": "versicolor", "x": {"f0": 7.0, "f1": 3.2, "f2": 4.7, "f3": 1.4}} +{"id": "EXAMPLE_115", "y": "virginica", "x": {"f0": 6.4, "f1": 3.2, "f2": 5.3, "f3": 2.3}} +{"id": "EXAMPLE_46", "y": "setosa", "x": {"f0": 5.1, "f1": 3.8, "f2": 1.6, "f3": 0.2}} +{"id": "EXAMPLE_139", "y": "virginica", "x": {"f0": 6.9, "f1": 3.1, "f2": 5.4, "f3": 2.1}} +{"id": "EXAMPLE_61", "y": "versicolor", "x": {"f0": 5.9, "f1": 3.0, "f2": 4.2, "f3": 1.5}} 
+{"id": "EXAMPLE_147", "y": "virginica", "x": {"f0": 6.5, "f1": 3.0, "f2": 5.2, "f3": 2.0}} +{"id": "EXAMPLE_79", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.6, "f2": 3.5, "f3": 1.0}} +{"id": "EXAMPLE_59", "y": "versicolor", "x": {"f0": 5.2, "f1": 2.7, "f2": 3.9, "f3": 1.4}} +{"id": "EXAMPLE_91", "y": "versicolor", "x": {"f0": 6.1, "f1": 3.0, "f2": 4.6, "f3": 1.4}} +{"id": "EXAMPLE_41", "y": "setosa", "x": {"f0": 4.5, "f1": 2.3, "f2": 1.3, "f3": 0.3}} +{"id": "EXAMPLE_58", "y": "versicolor", "x": {"f0": 6.6, "f1": 2.9, "f2": 4.6, "f3": 1.3}} +{"id": "EXAMPLE_90", "y": "versicolor", "x": {"f0": 5.5, "f1": 2.6, "f2": 4.4, "f3": 1.2}} +{"id": "EXAMPLE_48", "y": "setosa", "x": {"f0": 5.3, "f1": 3.7, "f2": 1.5, "f3": 0.2}} +{"id": "EXAMPLE_88", "y": "versicolor", "x": {"f0": 5.6, "f1": 3.0, "f2": 4.1, "f3": 1.3}} +{"id": "EXAMPLE_107", "y": "virginica", "x": {"f0": 7.3, "f1": 2.9, "f2": 6.3, "f3": 1.8}} +{"id": "EXAMPLE_124", "y": "virginica", "x": {"f0": 6.7, "f1": 3.3, "f2": 5.7, "f3": 2.1}} +{"id": "EXAMPLE_21", "y": "setosa", "x": {"f0": 5.1, "f1": 3.7, "f2": 1.5, "f3": 0.4}} +{"id": "EXAMPLE_57", "y": "versicolor", "x": {"f0": 4.9, "f1": 2.4, "f2": 3.3, "f3": 1.0}} +{"id": "EXAMPLE_144", "y": "virginica", "x": {"f0": 6.7, "f1": 3.3, "f2": 5.7, "f3": 2.5}} +{"id": "EXAMPLE_129", "y": "virginica", "x": {"f0": 7.2, "f1": 3.0, "f2": 5.8, "f3": 1.6}} +{"id": "EXAMPLE_37", "y": "setosa", "x": {"f0": 4.9, "f1": 3.1, "f2": 1.5, "f3": 0.1}} +{"id": "EXAMPLE_140", "y": "virginica", "x": {"f0": 6.7, "f1": 3.1, "f2": 5.6, "f3": 2.4}} +{"id": "EXAMPLE_1", "y": "setosa", "x": {"f0": 4.9, "f1": 3.0, "f2": 1.4, "f3": 0.2}} +{"id": "EXAMPLE_52", "y": "versicolor", "x": {"f0": 6.9, "f1": 3.1, "f2": 4.9, "f3": 1.5}} +{"id": "EXAMPLE_130", "y": "virginica", "x": {"f0": 7.4, "f1": 2.8, "f2": 6.1, "f3": 1.9}} +{"id": "EXAMPLE_103", "y": "virginica", "x": {"f0": 6.3, "f1": 2.9, "f2": 5.6, "f3": 1.8}} +{"id": "EXAMPLE_99", "y": "versicolor", "x": {"f0": 5.7, "f1": 2.8, "f2": 4.1, "f3": 1.3}} +{"id": "EXAMPLE_116", "y": "virginica", "x": {"f0": 6.5, "f1": 3.0, "f2": 5.5, "f3": 1.8}} +{"id": "EXAMPLE_87", "y": "versicolor", "x": {"f0": 6.3, "f1": 2.3, "f2": 4.4, "f3": 1.3}} +{"id": "EXAMPLE_74", "y": "versicolor", "x": {"f0": 6.4, "f1": 2.9, "f2": 4.3, "f3": 1.3}} +{"id": "EXAMPLE_121", "y": "virginica", "x": {"f0": 5.6, "f1": 2.8, "f2": 4.9, "f3": 2.0}} +{"id": "EXAMPLE_149", "y": "virginica", "x": {"f0": 5.9, "f1": 3.0, "f2": 5.1, "f3": 1.8}} +{"id": "EXAMPLE_20", "y": "setosa", "x": {"f0": 5.4, "f1": 3.4, "f2": 1.7, "f3": 0.2}} +{"id": "EXAMPLE_71", "y": "versicolor", "x": {"f0": 6.1, "f1": 2.8, "f2": 4.0, "f3": 1.3}} +{"id": "EXAMPLE_106", "y": "virginica", "x": {"f0": 4.9, "f1": 2.5, "f2": 4.5, "f3": 1.7}} +{"id": "EXAMPLE_14", "y": "setosa", "x": {"f0": 5.8, "f1": 4.0, "f2": 1.2, "f3": 0.2}} +{"id": "EXAMPLE_92", "y": "versicolor", "x": {"f0": 5.8, "f1": 2.6, "f2": 4.0, "f3": 1.2}} +{"id": "EXAMPLE_102", "y": "virginica", "x": {"f0": 7.1, "f1": 3.0, "f2": 5.9, "f3": 2.1}} diff --git a/tests/test_classification.py b/tests/test_classification.py index c026f154..dde6b237 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -26,9 +26,11 @@ from nose.tools import eq_, assert_almost_equal, raises from sklearn.exceptions import ConvergenceWarning +from sklearn.feature_extraction import FeatureHasher from sklearn.metrics import accuracy_score from skll.data import FeatureSet +from skll.data.readers import NDJReader from skll.data.writers import NDJWriter from skll.config import 
_parse_config_file from skll.experiments import run_configuration @@ -76,9 +78,11 @@ def tearDown(): for output_file in glob.glob(join(output_dir, 'train_test_single_file_*')): os.unlink(output_file) - config_file = join(config_dir, 'test_single_file.cfg') - if exists(config_file): - os.unlink(config_file) + config_files = [join(config_dir, cfgname) for cfgname in ['test_single_file.cfg', + 'test_single_file_saved_subset.cfg']] + for config_file in config_files: + if exists(config_file): + os.unlink(config_file) def check_predict(model, use_feature_hashing=False): @@ -127,6 +131,70 @@ def test_predict(): yield check_predict, model, use_feature_hashing +# test predictions when both the model and the data use DictVectorizers +def test_predict_dict_dict(): + train_file = join(_my_dir, 'other', 'examples_train.jsonlines') + test_file = join(_my_dir, 'other', 'examples_test.jsonlines') + train_fs = NDJReader.for_path(train_file).read() + test_fs = NDJReader.for_path(test_file).read() + learner = Learner('LogisticRegression') + learner.train(train_fs, grid_search=False) + predictions = learner.predict(test_fs) + eq_(len(predictions), test_fs.features.shape[0]) + + +# test predictions when both the model and the data use FeatureHashers +# and the same number of bins +def test_predict_hasher_hasher_same_bins(): + train_file = join(_my_dir, 'other', 'examples_train.jsonlines') + test_file = join(_my_dir, 'other', 'examples_test.jsonlines') + train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read() + test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read() + learner = Learner('LogisticRegression') + learner.train(train_fs, grid_search=False) + predictions = learner.predict(test_fs) + eq_(len(predictions), test_fs.features.shape[0]) + + +# test predictions when both the model and the data use FeatureHashers +# but a different number of bins +@raises(RuntimeError) +def test_predict_hasher_hasher_different_bins(): + train_file = join(_my_dir, 'other', 'examples_train.jsonlines') + test_file = join(_my_dir, 'other', 'examples_test.jsonlines') + train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read() + test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=2).read() + learner = Learner('LogisticRegression') + learner.train(train_fs, grid_search=False) + _ = learner.predict(test_fs) + + +# test predictions when the model uses a FeatureHasher and the data +# uses a DictVectorizer +def test_predict_hasher_dict(): + train_file = join(_my_dir, 'other', 'examples_train.jsonlines') + test_file = join(_my_dir, 'other', 'examples_test.jsonlines') + train_fs = NDJReader.for_path(train_file, feature_hasher=True, num_features=3).read() + test_fs = NDJReader.for_path(test_file).read() + learner = Learner('LogisticRegression') + learner.train(train_fs, grid_search=False) + predictions = learner.predict(test_fs) + eq_(len(predictions), test_fs.features.shape[0]) + + +# test predictions when the model uses a DictVectorizer and the data +# uses a FeatureHasher +@raises(RuntimeError) +def test_predict_dict_hasher(): + train_file = join(_my_dir, 'other', 'examples_train.jsonlines') + test_file = join(_my_dir, 'other', 'examples_test.jsonlines') + train_fs = NDJReader.for_path(train_file).read() + test_fs = NDJReader.for_path(test_file, feature_hasher=True, num_features=3).read() + learner = Learner('LogisticRegression') + learner.train(train_fs, grid_search=False) + _ = learner.predict(test_fs) + + # the function to create data with rare
labels for cross-validation def make_rare_class_data(): """ @@ -190,7 +258,7 @@ def test_sparse_predict(): [(0.45, 0.52), (0.52, 0.5), (0.48, 0.5), (0.49, 0.5), (0.43, 0), (0.53, 0.57), - (0.49, 0.49), (0.48, 0.5)]): + (0.49, 0.49), (0.5, 0.49)]): yield check_sparse_predict, learner_name, expected_scores[0], False if learner_name != 'MultinomialNB': yield check_sparse_predict, learner_name, expected_scores[1], True @@ -207,7 +275,7 @@ def test_mlp_classification(): learner = Learner('MLPClassifier') with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=ConvergenceWarning) - learner.train(train_fs, grid_search=True) + learner.train(train_fs, grid_search=False) # now generate the predictions on the test set predictions = learner.predict(test_fs) @@ -217,7 +285,7 @@ def test_mlp_classification(): # using make_regression_data. To do this, we just # make sure that they are correlated accuracy = accuracy_score(predictions, test_fs.labels) - assert_almost_equal(accuracy, 0.825) + assert_almost_equal(accuracy, 0.858, places=3) def check_sparse_predict_sampler(use_feature_hashing=False): @@ -301,6 +369,12 @@ def make_single_file_featureset_data(): writer = NDJWriter(test_path, test_fs) writer.write() + # Also write another test feature set that has fewer features than the training set + test_fs.filter(features=['f01', 'f02']) + test_path = join(_my_dir, 'test', 'test_single_file_subset.jsonlines') + writer = NDJWriter(test_path, test_fs) + writer.write() + def test_train_file_test_file(): """ @@ -340,6 +414,43 @@ def test_train_file_test_file(): assert_almost_equal(result_dict['score'], 0.9491525423728813) +def test_predict_on_subset_with_existing_model(): + """ + Test generating predictions on subset with existing model + """ + # Create data files + make_single_file_featureset_data() + + # train and save a model on the training file + train_fs = NDJReader.for_path(join(_my_dir, 'train', 'train_single_file.jsonlines')).read() + learner = Learner('RandomForestClassifier') + learner.train(train_fs, grid_search=True, grid_objective="accuracy") + model_filename = join(_my_dir, 'output', ('train_test_single_file_train_train_' + 'single_file.jsonlines_test_test_single' + '_file_subset.jsonlines_RandomForestClassifier' + '.model')) + + learner.save(model_filename) + + # Run experiment + config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs", + "test_single_file_saved_subset" + ".template.cfg"), + join(_my_dir, 'train', 'train_single_file.jsonlines'), + join(_my_dir, 'test', + 'test_single_file_subset.' 
+ 'jsonlines')) + run_configuration(config_path, quiet=True, overwrite=False) + + # Check results + with open(join(_my_dir, 'output', ('train_test_single_file_train_train_' + 'single_file.jsonlines_test_test_single' + '_file_subset.jsonlines_RandomForestClassifier' + '.results.json'))) as f: + result_dict = json.load(f)[0] + assert_almost_equal(result_dict['score'], 0.7333333) + + def test_train_file_test_file_ablation(): """ Test that specifying ablation with train and test file is ignored diff --git a/tests/test_featureset.py b/tests/test_featureset.py index 3ae1f6ee..d5cb6f30 100644 --- a/tests/test_featureset.py +++ b/tests/test_featureset.py @@ -25,7 +25,7 @@ from sklearn.datasets.samples_generator import make_classification import skll -from skll.data import FeatureSet, Writer, Reader +from skll.data import FeatureSet, Writer, Reader, NDJReader, NDJWriter from skll.data.readers import DictListReader from skll.experiments import _load_featureset from skll.learner import _DEFAULT_PARAM_GRIDS @@ -62,6 +62,11 @@ def tearDown(): if exists(filepath): os.unlink(filepath) + filepaths = [join(_my_dir, 'other', '{}.jsonlines'.format(x)) for x in ['test_string_ids', 'test_string_ids_df', 'test_string_labels_df']] + for filepath in filepaths: + if exists(filepath): + os.unlink(filepath) + def _create_empty_file(filetype): filepath = join(_my_dir, 'other', 'empty.{}'.format(filetype)) @@ -1026,3 +1031,79 @@ def test_featureset_creation_from_dataframe_without_labels_with_vectorizer(): rtol=1e-6) and np.all(np.isnan(expected.labels)) and np.all(np.isnan(current.labels))) + + +def test_writing_ndj_featureset_with_string_ids(): + test_dict_vectorizer = DictVectorizer() + test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}] + Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list) + fs_test = FeatureSet('test', + ids=['1', '2'], + labels=[1, 2], + features=Xtest, + vectorizer=test_dict_vectorizer) + output_path = join(_my_dir, "other", "test_string_ids.jsonlines") + test_writer = NDJWriter(output_path, fs_test) + test_writer.write() + + # read in the written file into a featureset and confirm that the + # two featuresets are equal + fs_test2 = NDJReader.for_path(output_path).read() + + assert fs_test == fs_test2 + + +@attr('have_pandas_and_seaborn') +def test_featureset_creation_from_dataframe_with_string_ids(): + + import pandas + + dftest = pandas.DataFrame({"id": ['1', '2'], + "score": [1, 2], + "text": ["a b", "b c"]}) + dftest.set_index("id", inplace=True) + test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}] + test_dict_vectorizer = DictVectorizer() + Xtest = test_dict_vectorizer.fit_transform(test_feat_dict_list) + fs_test = FeatureSet('test', + ids=dftest.index.values, + labels=dftest['score'].values, + features=Xtest, + vectorizer=test_dict_vectorizer) + output_path = join(_my_dir, "other", "test_string_ids_df.jsonlines") + test_writer = NDJWriter(output_path, fs_test) + test_writer.write() + + # read in the written file into a featureset and confirm that the + # two featuresets are equal + fs_test2 = NDJReader.for_path(output_path).read() + + assert fs_test == fs_test2 + + +@attr('have_pandas_and_seaborn') +def test_featureset_creation_from_dataframe_with_string_labels(): + + import pandas + + dftest = pandas.DataFrame({"id": [1, 2], + "score": ['yes', 'no'], + "text": ["a b", "b c"]}) + dftest.set_index("id", inplace=True) + test_feat_dict_list = [{'a': 1.0, 'b': 1.0}, {'b': 1.0, 'c': 1.0}] + test_dict_vectorizer = DictVectorizer() + Xtest = 
test_dict_vectorizer.fit_transform(test_feat_dict_list) + fs_test = FeatureSet('test', + ids=dftest.index.values, + labels=dftest['score'].values, + features=Xtest, + vectorizer=test_dict_vectorizer) + output_path = join(_my_dir, "other", "test_string_labels_df.jsonlines") + test_writer = NDJWriter(output_path, fs_test) + test_writer.write() + + # read in the written file into a featureset and confirm that the + # two featuresets are equal + fs_test2 = NDJReader.for_path(output_path, ids_to_floats=True).read() + + assert fs_test == fs_test2 diff --git a/tests/test_input.py b/tests/test_input.py index a36b9fe5..158c2f27 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -62,6 +62,10 @@ def tearDown(): config_dir = join(_my_dir, 'configs') for config_file in glob(join(config_dir, 'test_config_parsing_*.cfg')): os.unlink(config_file) + for auto_dir in glob(join(_my_dir, 'auto*')): + for auto_dir_file in os.listdir(auto_dir): + os.unlink(join(auto_dir, auto_dir_file)) + os.rmdir(auto_dir) def check_safe_float_conversion(converted_val, expected_val): @@ -1119,6 +1123,60 @@ def test_config_parsing_relative_input_paths(): learning_curve_train_sizes, output_metrics) = _parse_config_file(config_path) +def test_config_parsing_automatic_output_directory_creation(): + + train_dir = '../train' + train_file = join(train_dir, 'f0.jsonlines') + test_file = join(train_dir, 'f1.jsonlines') + output_dir = '../output' + + # make a simple config file that has new directories that should + # be automatically created + new_log_path = join(_my_dir, 'autolog') + new_results_path = join(_my_dir, 'autoresults') + new_models_path = join(_my_dir, 'automodels') + new_predictions_path = join(_my_dir, 'autopredictions') + + ok_(not(exists(new_log_path))) + ok_(not(exists(new_results_path))) + ok_(not(exists(new_models_path))) + ok_(not(exists(new_predictions_path))) + + values_to_fill_dict = {'experiment_name': 'auto_dir_creation', + 'task': 'evaluate', + 'train_file': train_file, + 'test_file': test_file, + 'learners': "['LogisticRegression']", + 'log': new_log_path, + 'results': new_results_path, + 'models': new_models_path, + 'predictions': new_predictions_path, + 'objective': 'f1_score_micro'} + + config_template_path = join(_my_dir, 'configs', + 'test_relative_paths.template.cfg') + config_path = fill_in_config_options(config_template_path, + values_to_fill_dict, + 'auto_dir_creation') + + (experiment_name, task, sampler, fixed_sampler_parameters, + feature_hasher, hasher_features, id_col, label_col, train_set_name, + test_set_name, suffix, featuresets, do_shuffle, model_path, + do_grid_search, grid_objective, probability, results_path, + pos_label_str, feature_scaling, min_feature_count, folds_file, + grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds, + use_folds_file_for_grid_search, do_stratified_folds, + fixed_parameter_list, param_grid_list, featureset_names, learners, + prediction_dir, log_path, train_path, test_path, ids_to_floats, + class_map, custom_learner_path, learning_curve_cv_folds_list, + learning_curve_train_sizes, output_metrics) = _parse_config_file(config_path) + + ok_(exists(new_log_path)) + ok_(exists(new_results_path)) + ok_(exists(new_models_path)) + ok_(exists(new_predictions_path)) + + def check_config_parsing_metrics_and_objectives_overlap(task, metrics, objectives): diff --git a/tests/test_regression.py b/tests/test_regression.py index 098ed8ac..c090082c 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -135,7 +135,7 @@ def check_rescaling(name, 
grid_search=False): train_p_std = np.std(train_predictions) rescaled_train_p_std = np.std(rescaled_train_predictions) assert_less(abs(rescaled_train_p_std - train_y_std), - abs(train_p_std - train_y_std)) + abs(train_p_std - train_y_std)) def test_rescaling(): @@ -403,14 +403,14 @@ def check_ensemble_models(name, else: expected_feature_importances = [0.10266744, 0.18681777, 0.71051479] else: - expected_feature_importances = ([0.204, - 0.172, - 0.178, - 0.212, - 0.234] if use_feature_hashing else - [0.262, - 0.288, - 0.45]) + expected_feature_importances = ([0.471714, + 0.022797, + 0.283377, + 0.170823, + 0.051288] if use_feature_hashing else + [0.082621, + 0.166652, + 0.750726]) feature_importances = learner.model.feature_importances_ assert_allclose(feature_importances, expected_feature_importances, @@ -611,7 +611,7 @@ def test_ransac_regression(): 'SGDRegressor', 'DecisionTreeRegressor', 'SVR'], - [0.95, 0.45, 0.75, 0.65]): + [0.95, 0.45, 0.75, 0.65]): yield check_ransac_regression, base_estimator_name, pearson_value @@ -627,7 +627,7 @@ def check_mlp_regression(use_rescaling=False): # we don't want to see any convergence warnings during the grid search with warnings.catch_warnings(): warnings.filterwarnings('ignore', category=ConvergenceWarning) - learner.train(train_fs, grid_search=True, grid_objective='pearson') + learner.train(train_fs, grid_search=False) # now generate the predictions on the test set predictions = learner.predict(test_fs) diff --git a/tests/test_utilities.py b/tests/test_utilities.py index 2b26c57f..634623bc 100644 --- a/tests/test_utilities.py +++ b/tests/test_utilities.py @@ -11,6 +11,7 @@ import ast import copy +import csv import itertools import os import sys @@ -29,6 +30,7 @@ from nose.plugins.logcapture import LogCapture from nose.tools import eq_, assert_almost_equal, raises from numpy.testing import assert_allclose, assert_array_almost_equal +from numpy import concatenate import skll import skll.utilities.compute_eval_from_predictions as cefp @@ -284,19 +286,29 @@ def test_compute_eval_from_predictions_random_choice(): eq_(pred, 'C') -def check_generate_predictions(use_feature_hashing=False, use_threshold=False): - - # create some simple classification data without feature hashing - train_fs, test_fs = make_classification_data( - num_examples=1000, num_features=5, - use_feature_hashing=use_feature_hashing, feature_bins=4) +def check_generate_predictions(use_feature_hashing=False, + use_threshold=False, + test_on_subset=False, + use_all_labels=False): + # create some simple classification feature sets for training and testing + train_fs, test_fs = make_classification_data(num_examples=1000, + num_features=5, + use_feature_hashing=use_feature_hashing, + feature_bins=4) + enable_probability = use_threshold or use_all_labels # create a learner that uses an SGD classifier - learner = Learner('SGDClassifier', probability=use_threshold) + learner = Learner('SGDClassifier', probability=enable_probability) # train the learner with grid search learner.train(train_fs, grid_search=True) + # if we are asked to use only a subset, then filter out + # one of the features if we are not using feature hashing, + # do nothing if we are using feature hashing + if test_on_subset and not use_feature_hashing: + test_fs.filter(features=['f01', 'f02', 'f03', 'f04']) + # get the predictions on the test featureset predictions = learner.predict(test_fs) @@ -316,24 +328,90 @@ def check_generate_predictions(use_feature_hashing=False, use_threshold=False): # now use Predictor to generate 
the predictions and make # sure that they are the same as before saving the model - p = gp.Predictor(model_file, threshold=threshold) + p = gp.Predictor(model_file, threshold=threshold, + all_labels=use_all_labels) + + assert(p._pos_index == 1) + assert(p.threshold == threshold) + predictions_after_saving = p.predict(test_fs) eq_(predictions, predictions_after_saving) -def test_generate_predictions(): +def check_generate_predictions_file_headers(use_threshold=False, + use_all_labels=False): + # create some simple classification feature sets for training and testing + train_fs, test_fs = make_classification_data(num_examples=1000, + num_features=5, + feature_bins=4) + enable_probability = use_threshold or use_all_labels + # create a learner that uses an SGD classifier + learner = Learner('SGDClassifier', probability=enable_probability) + + # train the learner with grid search + learner.train(train_fs, grid_search=True) + + # get the predictions on the test featureset + predictions = learner.predict(test_fs) + + # if we asked for probabilities, then use the threshold + # to convert them into binary predictions + if use_threshold: + threshold = 0.6 + else: + threshold = None + + # save the learner to a file + model_file = join(_my_dir, 'output', + 'test_generate_predictions.model') + learner.save(model_file) + + # now use Predictor to generate the predictions and make + # sure that they are the same as before saving the model + p = gp.Predictor(model_file, threshold=threshold, + all_labels=use_all_labels) + predictions_after_saving = p.predict(test_fs) + + if threshold: + assert (p.output_file_header == ['id', 'prediction']) + elif use_all_labels: + assert (p.output_file_header == ['id', '0', '1']) + + + +@raises(ValueError) +def test_generate_predictions_conflicting_params(): """ - Test generate predictions API with hashing and a threshold + Test that ValueError is raised when `generate_predictions.Predictor` is + initialized with both `threshold` and `all_labels` turned on. 
""" + model_file = "not/real/model/file.model" + gp.Predictor(model_file, threshold=0.6, all_labels=True) + + +def test_generate_predictions(): + for (use_feature_hashing, + use_threshold, + test_on_subset, + all_probabilities) in product([True, False], [True, False], + [True, False], [True, False]): + if use_threshold and all_probabilities: + continue + yield (check_generate_predictions, use_feature_hashing, + use_threshold, test_on_subset, all_probabilities) + - yield check_generate_predictions, False, False - yield check_generate_predictions, True, False - yield check_generate_predictions, False, True - yield check_generate_predictions, True, True +def test_generate_predictions_file_header(): + for (use_threshold, all_probabilities) in ([True, False], [False, True]): + if use_threshold and all_probabilities: + continue + yield (check_generate_predictions_file_headers, + use_threshold, all_probabilities) -def check_generate_predictions_console(use_threshold=False): + +def check_generate_predictions_console(use_threshold=False, all_labels=False): # create some simple classification data without feature hashing train_fs, test_fs = make_classification_data(num_examples=1000, @@ -345,8 +423,9 @@ def check_generate_predictions_console(use_threshold=False): writer = NDJWriter(input_file, test_fs) writer.write() + enable_probability = use_threshold or all_labels # create a learner that uses an SGD classifier - learner = Learner('SGDClassifier', probability=use_threshold) + learner = Learner('SGDClassifier', probability=enable_probability) # train the learner with grid search learner.train(train_fs, grid_search=True) @@ -372,6 +451,9 @@ def check_generate_predictions_console(use_threshold=False): generate_cmd = [] if use_threshold: generate_cmd.append('-t {}'.format(threshold)) + elif all_labels: + generate_cmd.append('-a') + generate_cmd.extend([model_file, input_file]) # we need to capture stdout since that's what main() writes to @@ -384,21 +466,265 @@ def check_generate_predictions_console(use_threshold=False): gp.main(generate_cmd) out = mystdout.getvalue() err = mystderr.getvalue() - predictions_after_saving = [int(x) for x in out.strip().split('\n')] - eq_(predictions, predictions_after_saving) + output_lines = out.strip().split('\n')[1:] # Skip headers + if all_labels: + # Ignore the id (first column) in output. + predictions_after_saving = [[float(p) for p in x.split('\t')[1:]] + for x in output_lines] + else: + # Ignore the id (first column) in output. 
+ predictions_after_saving = [int(x.split('\t')[1]) + for x in output_lines] + if all_labels: + assert_array_almost_equal(predictions, predictions_after_saving) + else: + eq_(predictions, predictions_after_saving) + finally: + sys.stdout = old_stdout + sys.stderr = old_stderr + print(err) + +def test_generate_predictions_console_bad_input_ext(): + lc = LogCapture() + lc.begin() + + # create some simple classification data without feature hashing + train_fs, test_fs = make_classification_data(num_examples=1000, + num_features=5) + + # create a learner that uses an SGD classifier + learner = Learner('SGDClassifier') + # train the learner with grid search + learner.train(train_fs, grid_search=True) + # get the predictions on the test featureset + predictions = learner.predict(test_fs) + # save the learner to a file + model_file = join(_my_dir, 'output', + 'test_generate_predictions_console.model') + learner.save(model_file) + + # now call main() from generate_predictions.py + generate_cmd = [model_file, "fake_input_file.txt"] + + # we need to capture stdout since that's what main() writes to + err = '' + try: + old_stdout = sys.stdout + old_stderr = sys.stderr + sys.stdout = mystdout = StringIO() + sys.stderr = mystderr = StringIO() + gp.main(generate_cmd) + out = mystdout.getvalue() + err = mystderr.getvalue() finally: sys.stdout = old_stdout sys.stderr = old_stderr print(err) + expected_log_mssg = ("skll.utilities.generate_predictions: ERROR: Input " + "file must be in either .arff, .csv, .jsonlines, " + ".libsvm, .megam, .ndj, or .tsv format. Skipping " + "file fake_input_file.txt") + + eq_(lc.handler.buffer[-1], expected_log_mssg) + def test_generate_predictions_console(): """ - Test generate_predictions as a console script with/without a threshold + Test generate_predictions as a console script, with/without a threshold + or all-labels output """ - yield check_generate_predictions_console, False - yield check_generate_predictions_console, True + yield check_generate_predictions_console, False, False + yield check_generate_predictions_console, False, True + yield check_generate_predictions_console, True, False + + +def check_generate_predictions_file_output_multi_infiles(use_threshold=False, + all_labels=False): + """ + Make sure generate_predictions works with multiple input files.
+ """ + + # create some simple classification data without feature hashing + train_fs, test_fs = make_classification_data(num_examples=1000, + num_features=5) + + # save the test feature set to an NDJ file + input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines') + writer = NDJWriter(input_file, test_fs) + writer.write() + + enable_probability = use_threshold or all_labels + # create a learner that uses an SGD classifier + learner = Learner('SGDClassifier', probability=enable_probability) + + # train the learner with grid search + learner.train(train_fs, grid_search=True) + + # get the predictions on the test featureset + predictions = learner.predict(test_fs) + predictions = concatenate([predictions, predictions]) + + # if we asked for probabilities, then use the threshold + # to convert them into binary predictions + if use_threshold: + threshold = 0.6 + predictions = [int(p[1] >= threshold) for p in predictions] + else: + predictions = predictions.tolist() + threshold = None + + # save the learner to a file + model_file = join(_my_dir, 'output', + 'test_generate_predictions_console.model') + learner.save(model_file) + + # now call main() from generate_predictions.py + generate_cmd = [] + if use_threshold: + generate_cmd.append('-t {}'.format(threshold)) + elif all_labels: + generate_cmd.append('-a') + + output_file_path = join(_my_dir, 'output', + 'output_test_{}_{}_MULTI.tsv' + .format(use_threshold, all_labels)) + generate_cmd.extend(["--output_file", output_file_path]) + + generate_cmd.extend([model_file, input_file, input_file]) + + gp.main(generate_cmd) + + with open(output_file_path) as saved_predictions_file: + predictions_after_saving = [] + reader = csv.reader(saved_predictions_file, delimiter=str("\t")) + next(reader) + if all_labels: + for row in reader: + predictions_after_saving.append([float(r) for r in row[1:]]) + else: + for row in reader: + predictions_after_saving.append(float(row[1])) + + assert_array_almost_equal(predictions, predictions_after_saving) + + +def test_generate_predictions_file_output_multi_infiles(): + """ + Test generate_predictions file output with/without a threshold + """ + + yield check_generate_predictions_file_output_multi_infiles, False, False + yield check_generate_predictions_file_output_multi_infiles, False, True + yield check_generate_predictions_file_output_multi_infiles, True, False + + + +def check_generate_predictions_file_output(use_threshold=False, + all_labels=False): + + # create some simple classification data without feature hashing + train_fs, test_fs = make_classification_data(num_examples=1000, + num_features=5) + + # save the test feature set to an NDJ file + input_file = join(_my_dir, 'test', 'test_generate_predictions.jsonlines') + writer = NDJWriter(input_file, test_fs) + writer.write() + + enable_probability = use_threshold or all_labels + # create a learner that uses an SGD classifier + learner = Learner('SGDClassifier', probability=enable_probability) + + # train the learner with grid search + learner.train(train_fs, grid_search=True) + + # get the predictions on the test featureset + predictions = learner.predict(test_fs) + + # if we asked for probabilities, then use the threshold + # to convert them into binary predictions + if use_threshold: + threshold = 0.6 + predictions = [int(p[1] >= threshold) for p in predictions] + else: + predictions = predictions.tolist() + threshold = None + + # save the learner to a file + model_file = join(_my_dir, 'output', + 'test_generate_predictions_console.model') + 
learner.save(model_file) + + # now call main() from generate_predictions.py + generate_cmd = [] + if use_threshold: + generate_cmd.append('-t {}'.format(threshold)) + elif all_labels: + generate_cmd.append('-a') + + output_file_path = join(_my_dir, 'output', + 'output_test_{}_{}.tsv' + .format(use_threshold, all_labels)) + generate_cmd.extend(["--output_file", output_file_path]) + + generate_cmd.extend([model_file, input_file]) + gp.main(generate_cmd) + + with open(output_file_path) as saved_predictions_file: + predictions_after_saving = [] + reader = csv.reader(saved_predictions_file, delimiter=str("\t")) + next(reader) + if all_labels: + for row in reader: + predictions_after_saving.append([float(r) for r in row[1:]]) + else: + for row in reader: + predictions_after_saving.append(float(row[1])) + + assert_array_almost_equal(predictions, predictions_after_saving) + + +def test_generate_predictions_file_output(): + """ + Test generate_predictions file output with/without a threshold + or all-labels output + """ + + yield check_generate_predictions_file_output, False, False + yield check_generate_predictions_file_output, False, True + yield check_generate_predictions_file_output, True, False + + + + +@raises(SystemExit) +def test_mutually_exclusive_generate_predictions_args(): + # create some simple classification data without feature hashing + train_fs, test_fs = make_classification_data(num_examples=1000, + num_features=5) + threshold = 0.6 + + # save the test feature set to an NDJ file + input_file = join(_my_dir, 'test', + 'test_generate_predictions.jsonlines') + writer = NDJWriter(input_file, test_fs) + writer.write() + + # create a learner that uses an SGD classifier + learner = Learner('SGDClassifier') + + # train the learner with grid search + learner.train(train_fs, grid_search=True) + + # save the learner to a file + model_file = join(_my_dir, 'output', + 'test_generate_predictions_console.model') + learner.save(model_file) + + # now call main() from generate_predictions.py + generate_cmd = ['-t {}'.format(threshold), '-a'] + generate_cmd.extend([model_file, input_file]) + gp.main(generate_cmd) def check_skll_convert(from_suffix, to_suffix): @@ -522,7 +848,7 @@ def check_print_model_weights(task='classification'): # create some simple classification or regression data if task == 'classification' or task == 'classification_no_intercept': train_fs, _ = make_classification_data(train_test_ratio=0.8) - elif task == 'multiclass_classification': + elif task in ['multiclass_classification', 'multiclass_classification_svc']: train_fs, _ = make_classification_data(train_test_ratio=0.8, num_labels=3) else: train_fs, _, _ = make_regression_data(num_features=4, @@ -532,9 +858,14 @@ def check_print_model_weights(task='classification'): if task == 'classification' or task == 'multiclass_classification': learner = Learner('LogisticRegression') learner.train(train_fs, grid_objective='f1_score_micro') + elif task == 'multiclass_classification_svc': + learner = Learner('SVC', model_kwargs={'kernel': 'linear'}) + learner.train(train_fs, grid_objective='f1_score_micro') elif task == 'classification_no_intercept': learner = Learner('LogisticRegression') - learner.train(train_fs, grid_objective='f1_score_micro', param_grid=[{'fit_intercept':[False]}]) + learner.train(train_fs, + grid_objective='f1_score_micro', + param_grid=[{'fit_intercept': [False]}]) elif task == 'regression': learner = Learner('LinearRegression') learner.train(train_fs, grid_objective='pearson') @@ -596,6 +927,49 @@ def
check_print_model_weights(task='classification'): assert_array_almost_equal(weights, feature_values[index]) assert_array_almost_equal(intercept, learner.model.intercept_) + elif task == 'multiclass_classification_svc': + # for multiple classes with an SVC with a linear kernel, + # we get an intercept for each class pair combination + # as well as a list of weights for each class pair + # combination + + # save the computed intercept values in a dictionary + # with the class pair label as the key + lines_to_parse = [l for l in out.split('\n')[1:] if l] + parsed_intercepts_dict = {} + for intercept_string in lines_to_parse[0:3]: + fields = intercept_string.split('\t') + parsed_intercepts_dict[fields[1]] = safe_float(fields[0]) + + # save the computed feature weights in a dictionary + # with the class pair label as the key and the value + # being a list; each feature weight for this class pair + # is stored at the index of the feature name as given + # by the feature vectorizer vocabulary dictionary + parsed_weights_dict = {} + for ltp in lines_to_parse[3:]: + (weight, class_pair, feature) = ltp.split('\t') + if class_pair not in parsed_weights_dict: + parsed_weights_dict[class_pair] = [0] * 10 + feature_index = learner.feat_vectorizer.vocabulary_[feature] + parsed_weights_dict[class_pair][feature_index] = safe_float(weight) + + # to validate that our coefficients are correct, we will + # get the coefficient array (for all features) from `coef_` + # for a particular class pair and then check that this array + # is equal to the list that we computed above. We will do + # the same for the intercepts, which are even easier to validate + # since they _only_ depend on the class pair + for idx, (class1, class2) in enumerate(itertools.combinations([0, 1, 2], 2)): + class_pair_label = '{}-vs-{}'.format(class1, class2) + computed_coefficients = parsed_weights_dict[class_pair_label] + expected_coefficients = learner.model.coef_[idx].toarray()[0] + assert_array_almost_equal(computed_coefficients, expected_coefficients) + + computed_intercept = parsed_intercepts_dict[class_pair_label] + expected_intercept = learner.model.intercept_[idx] + assert_almost_equal(computed_intercept, expected_intercept) + elif task == 'classification_no_intercept': lines_to_parse = [l for l in out.split('\n')[0:] if l] intercept = safe_float(lines_to_parse[0].split('=')[1]) @@ -637,6 +1011,7 @@ def check_print_model_weights(task='classification'): def test_print_model_weights(): yield check_print_model_weights, 'classification' yield check_print_model_weights, 'multiclass_classification' + yield check_print_model_weights, 'multiclass_classification_svc' yield check_print_model_weights, 'classification_no_intercept' yield check_print_model_weights, 'regression' yield check_print_model_weights, 'regression_linearSVR'
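
A note on the new `multiclass_classification_svc` check above: for a linear-kernel `SVC` trained on k classes, scikit-learn stores one row of `coef_` and one entry of `intercept_` per one-vs-one class pair, ordered as `itertools.combinations` over the sorted class labels, which is exactly the indexing the assertions above rely on. The following minimal sketch (not part of the diff; it trains directly on dense scikit-learn data rather than through a SKLL `Learner`, so `coef_` is dense and needs no `.toarray()` call) illustrates that correspondence:

    import itertools

    from sklearn.datasets import make_classification
    from sklearn.svm import SVC

    # a small three-class problem, mirroring the shape of the test above
    X, y = make_classification(n_samples=200, n_features=10, n_informative=5,
                               n_classes=3, random_state=42)
    clf = SVC(kernel='linear').fit(X, y)

    # one weight row and one intercept per class pair, ordered as
    # itertools.combinations(classes, 2): 0-vs-1, 0-vs-2, 1-vs-2
    for idx, (class1, class2) in enumerate(itertools.combinations(clf.classes_, 2)):
        print('{}-vs-{}'.format(class1, class2),
              clf.intercept_[idx], clf.coef_[idx][:3])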