diff --git a/doc/run_experiment.rst b/doc/run_experiment.rst index eaa28ee2..a0344dbb 100644 --- a/doc/run_experiment.rst +++ b/doc/run_experiment.rst @@ -36,10 +36,8 @@ with the following added restrictions: * Only simple numeric, string, and nominal values are supported. * Nominal values are converted to strings. -* There should be an attribute with the name specified by :ref:`id_col ` in - the :ref:`Input` section of the configuration file you create for your - experiment. This defaults to "id". If there is no such column, IDs will be - generated automatically. +* If the data has instance IDs, there should be an attribute with the name + specified by :ref:`id_col ` in the :ref:`Input` section of the configuration file you create for your experiment. This defaults to ``id``. If there is no such attribute, IDs will be generated automatically. * If the data is labelled, there must be an attribute with the name specified by :ref:`label_col ` in the :ref:`Input` section of the configuration file you create for your experiment. This defaults to ``y``. @@ -56,10 +54,8 @@ A simple comma or tab-delimited format with the following restrictions: specified by :ref:`label_col ` in the :ref:`Input` section of the configuration file you create for your experiment. This defaults to ``y``. -* There should be a column with the name specified by :ref:`id_col ` in - the :ref:`Input` section of the configuration file you create for your - experiment. This defaults to "id". If there is no such column, IDs will be - generated automatically. +* If the data has instance IDs, there should be a column with the name + specified by :ref:`id_col ` in the :ref:`Input` section of the configuration file you create for your experiment. This defaults to ``id``. If there is no such column, IDs will be generated automatically. * All other columns contain feature values, and every feature value must be specified (making this a poor choice for sparse data). @@ -144,7 +140,7 @@ possible settings for each section is provided below, but to summarize: cross-validation currently uses `StratifiedKFold `__. You also can optionally use predetermined folds with the - :ref:`cv_folds_location ` setting. + :ref:`cv_folds_file ` setting. .. _evaluate: @@ -162,6 +158,8 @@ possible settings for each section is provided below, but to summarize: * If you want to just **train a model**, specify a training location, and set :ref:`task` to ``train``. +.. _learners_required: + * A :ref:`list of classifiers/regressors ` to try on your feature files is required. @@ -199,7 +197,7 @@ Input The Input section has only one required field, :ref:`learners`, but also must contain either :ref:`train_file ` or -:ref:`train_location `. +:ref:`train_directory `. .. _learners: @@ -252,21 +250,21 @@ Regressors: .. _train_file: train_file *(Optional)* -""""""""""""""""""""""""""" +""""""""""""""""""""""" Path to a file containing the features to train on. Cannot be used in combination with :ref:`featuresets `, -:ref:`train_location `, or :ref:`test_location `. +:ref:`train_directory `, or :ref:`test_directory `. .. note:: If :ref:`train_file ` is not specified, - :ref:`train_location ` must be. + :ref:`train_directory ` must be. -.. _train_location: -train_location *(Optional)* -""""""""""""""""""""""""""" +.. _train_directory: +train_directory *(Optional)* +"""""""""""""""""""""""""""" Path to directory containing training data files. There must be a file for each featureset.
Cannot be used in combination with :ref:`train_file ` @@ -274,22 +272,22 @@ or :ref:`test_file `. .. note:: - If :ref:`train_location ` is not specified, + If :ref:`train_directory ` is not specified, :ref:`train_file ` must be. .. _test_file: test_file *(Optional)* -""""""""""""""""""""""""""" +"""""""""""""""""""""" Path to a file containing the features to test on. Cannot be used in combination with :ref:`featuresets `, -:ref:`train_location `, or :ref:`test_location ` +:ref:`train_directory `, or :ref:`test_directory ` -.. _test_location: +.. _test_directory: -test_location *(Optional)* -"""""""""""""""""""""""""" +test_directory *(Optional)* +""""""""""""""""""""""""""" Path to directory containing test data files. There must be a file for each featureset. Cannot be used in combination with @@ -307,8 +305,8 @@ if this is not the case. Cannot be used in combination with .. note:: - If specifying :ref:`train_location ` or - :ref:`test_location `, :ref:`featuresets ` + If specifying :ref:`train_directory ` or + :ref:`test_directory `, :ref:`featuresets ` is required. .. _suffix: @@ -332,8 +330,8 @@ would like to combine. id_col *(Optional)* """"""""""""""""""" If you're using :ref:`ARFF `, :ref:`CSV `, or :ref:`TSV ` -files, the IDs for each instance are assumed to be in a column with this -name. If no column with this name is found, the IDs are generated +files, the IDs for each instance are assumed to be in a column with this +name. If no column with this name is found, the IDs are generated automatically. Defaults to ``id``. .. _label_col: @@ -385,9 +383,9 @@ example, if you wanted to collapse the labels ``beagle`` and ``dachsund`` into a Any labels not included in the dictionary will be left untouched. -.. _cv_folds_location: +.. _cv_folds_file: -cv_folds_location *(Optional)* +cv_folds_file *(Optional)* """""""""""""""""""""""""""""" Path to a csv file (with a header that is ignored) specifying folds for cross- diff --git a/doc/tutorial.rst b/doc/tutorial.rst index b93a27ab..f0d5a485 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -4,10 +4,20 @@ Tutorial ======== -For this tutorial, we're going to make use of the examples provided in the +Workflow +-------- + +In general, there are three steps to using SKLL: + +1. Get some data in a :ref:`SKLL-compatible format `. +2. Create a small :ref:`configuration file ` describing the + machine learning experiment you would like to run. +3. Run that configuration file with :ref:`run_experiment `. + +For this tutorial, we're going to use some of the examples provided in the `examples `__ -directory in your copy of SKLL. Of course, the provided examples are already -perfect and ready to use. If this weren't the case, you would need to... +directory included with your copy of SKLL. Of course, the provided examples +are already perfect and ready to use. If this weren't the case, you would need to... Get your data into the correct format ------------------------------------- @@ -48,10 +58,9 @@ experiment, we can train and test several models, either simultaneously or sequentially, depending on the availability of a grid engine. This will be described in more detail later on, when we are ready to run our experiment. -You can consult -:ref:`the full list of learners currently available in SKLL ` to get -an idea for the things you can do. As part of this tutorial, we will use the -following learners: +You can consult :ref:`the full list of learners currently available in SKLL ` +to get an idea for the things you can do. 
As part of this tutorial, we will +use the following learners: * Random Forest (``RandomForestClassifier``), C-Support Vector Classification (``SVC``), Linear Support Vector Classification (``LinearSVC``), Logistic @@ -77,8 +86,8 @@ are only going to train a model and evaluate its performance, because in the :ref:`General` section, :ref:`task` is set to "evaluate". We will explore the other options for :ref:`task` later. -In the :ref:`Input` section, you may want to adjust :ref:`train_location` and -:ref:`test_location` to point to the directories containing the Iris training +In the :ref:`Input` section, you may want to adjust :ref:`train_directory` and +:ref:`test_directory` to point to the directories containing the Iris training and testing data (most likely ``skll/examples/iris/train`` and ``skll/examples/iris/test`` respectively, relative to your installation of SKLL). :ref:`featuresets ` indicates the name of both the diff --git a/examples/boston/cross_val.cfg b/examples/boston/cross_val.cfg index be0b9426..7fcad885 100755 --- a/examples/boston/cross_val.cfg +++ b/examples/boston/cross_val.cfg @@ -4,7 +4,7 @@ task = cross_validate [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = boston/train +train_directory = boston/train featuresets = [["example_boston_features"]] # there is only one set of features to try, with one feature file in it here. featureset_names = ["example_boston"] diff --git a/examples/boston/evaluate.cfg b/examples/boston/evaluate.cfg index ffe4e073..377eafc2 100755 --- a/examples/boston/evaluate.cfg +++ b/examples/boston/evaluate.cfg @@ -4,8 +4,8 @@ task = evaluate [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = boston/train -test_location = boston/test +train_directory = boston/train +test_directory = boston/test featuresets = [["example_boston_features"]] # there is only one set of features to try, with one feature file in it here. featureset_names = ["example_boston"] diff --git a/examples/iris/cross_val.cfg b/examples/iris/cross_val.cfg index 416aa91a..37b77d30 100644 --- a/examples/iris/cross_val.cfg +++ b/examples/iris/cross_val.cfg @@ -5,7 +5,7 @@ task = cross_validate [Input] # this could also be an absolute path instead (and must be if you're not # running things in local mode) -train_location = train +train_directory = train featuresets = [["example_iris_features"]] # there is only one set of features to try, with one feature file in it here. featureset_names = ["example_iris"] diff --git a/examples/iris/evaluate.cfg b/examples/iris/evaluate.cfg index f8823761..7aa8608f 100644 --- a/examples/iris/evaluate.cfg +++ b/examples/iris/evaluate.cfg @@ -5,8 +5,8 @@ task = evaluate [Input] # this could also be an absolute path instead (and must be if you're not # running things in local mode) -train_location = train -test_location = test +train_directory = train +test_directory = test featuresets = [["example_iris_features"]] # there is only one set of features to try, with one feature file in it here.
featureset_names = ["example_iris"] diff --git a/examples/titanic/cross_validate.cfg b/examples/titanic/cross_validate.cfg index 39a3f0dc..146df30d 100644 --- a/examples/titanic/cross_validate.cfg +++ b/examples/titanic/cross_validate.cfg @@ -4,7 +4,7 @@ task = cross_validate [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train+dev +train_directory = train+dev featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"] label_col = Survived diff --git a/examples/titanic/evaluate.cfg b/examples/titanic/evaluate.cfg index 4a74f7c6..be69a080 100644 --- a/examples/titanic/evaluate.cfg +++ b/examples/titanic/evaluate.cfg @@ -4,8 +4,8 @@ task = evaluate [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train -test_location = dev +train_directory = train +test_directory = dev featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"] label_col = Survived diff --git a/examples/titanic/evaluate_tuned.cfg b/examples/titanic/evaluate_tuned.cfg index f91e4c9e..bf23ea94 100644 --- a/examples/titanic/evaluate_tuned.cfg +++ b/examples/titanic/evaluate_tuned.cfg @@ -4,8 +4,8 @@ task = evaluate [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train -test_location = dev +train_directory = train +test_directory = dev featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"] label_col = Survived diff --git a/examples/titanic/predict_train+dev.cfg b/examples/titanic/predict_train+dev.cfg index d27624e1..5843bd00 100644 --- a/examples/titanic/predict_train+dev.cfg +++ b/examples/titanic/predict_train+dev.cfg @@ -4,8 +4,8 @@ task = predict [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train+dev -test_location = test +train_directory = train+dev +test_directory = test featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] # We know which learner is the best from previous experiments (using evaluate.cfg or cross_validate.cfg) learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"] diff --git a/examples/titanic/predict_train+dev_tuned.cfg b/examples/titanic/predict_train+dev_tuned.cfg index 0df3de68..1766afdb 100644 --- a/examples/titanic/predict_train+dev_tuned.cfg +++ b/examples/titanic/predict_train+dev_tuned.cfg @@ -4,8 +4,8 @@ task = predict [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train+dev -test_location = test +train_directory = train+dev +test_directory = test featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] # We know which learner is the best from previous experiments (using evaluate.cfg or cross_validate.cfg) learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"] diff --git a/examples/titanic/predict_train_only.cfg b/examples/titanic/predict_train_only.cfg index 46cde4ff..233e71df 100644 --- a/examples/titanic/predict_train_only.cfg +++ 
b/examples/titanic/predict_train_only.cfg @@ -4,8 +4,8 @@ task = predict [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train -test_location = test +train_directory = train +test_directory = test featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] # We know which learner is the best from previous experiments (using evaluate.cfg or cross_validate.cfg) learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"] diff --git a/examples/titanic/predict_train_only_tuned.cfg b/examples/titanic/predict_train_only_tuned.cfg index 0a8ec4f5..bfe62a5b 100644 --- a/examples/titanic/predict_train_only_tuned.cfg +++ b/examples/titanic/predict_train_only_tuned.cfg @@ -4,8 +4,8 @@ task = predict [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train -test_location = test +train_directory = train +test_directory = test featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] # We know which learner is the best from previous experiments (using evaluate.cfg or cross_validate.cfg) learners = ["RandomForestClassifier", "DecisionTreeClassifier", "SVC", "MultinomialNB"] diff --git a/examples/titanic/train.cfg b/examples/titanic/train.cfg index 3f799729..2e2b3745 100644 --- a/examples/titanic/train.cfg +++ b/examples/titanic/train.cfg @@ -4,7 +4,7 @@ task = train [Input] # this could also be an absolute path instead (and must be if you're not running things in local mode) -train_location = train +train_directory = train featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]] # We know which learner is the best from previous experiments (using evaluate.cfg or cross_validate.cfg) learners = ["RandomForestClassifier"] diff --git a/skll/__init__.py b/skll/__init__.py index 9e080c4d..ff6f6d86 100644 --- a/skll/__init__.py +++ b/skll/__init__.py @@ -13,7 +13,7 @@ from sklearn.metrics import f1_score, make_scorer, SCORERS -from .data import Reader, Writer +from .data import FeatureSet, Reader, Writer from .experiments import run_configuration from .learner import Learner from .metrics import (kappa, kendall_tau, spearman, pearson, @@ -21,9 +21,9 @@ from .version import __version__, VERSION -__all__ = ['Learner', 'Reader', 'kappa', 'kendall_tau', 'spearman', - 'pearson', 'f1_score_least_frequent', 'run_configuration', - 'Writer'] +__all__ = ['FeatureSet', 'Learner', 'Reader', 'kappa', 'kendall_tau', + 'spearman', 'pearson', 'f1_score_least_frequent', + 'run_configuration', 'Writer'] # Add our scorers to the sklearn dictionary here so that they will always be # available if you import anything from skll diff --git a/skll/data/featureset.py b/skll/data/featureset.py index 3c56cdc6..231bdd05 100644 --- a/skll/data/featureset.py +++ b/skll/data/featureset.py @@ -25,7 +25,7 @@ class FeatureSet(object): Encapsulation of all of the features, values, and metadata about a given set of data. - This replaces ExamplesTuple in older versions. + This replaces ``ExamplesTuple`` from older versions. :param name: The name of this feature set. :type name: str diff --git a/skll/data/readers.py b/skll/data/readers.py index a53c355c..88dfb43b 100644 --- a/skll/data/readers.py +++ b/skll/data/readers.py @@ -99,8 +99,8 @@ def for_path(cls, path_or_list, **kwargs): matrix. :type sparse: bool :param id_col: Name of the column which contains the instance IDs for - ARFF/CSV/TSV files. 
If no column with that name exists, or - `None` is specified, the IDs will be generated + ARFF/CSV/TSV files. If no column with that name exists, + or `None` is specified, the IDs will be generated automatically. :type id_col: str :param label_col: Name of the column which contains the class labels @@ -117,8 +117,8 @@ kept the same. :type class_map: dict from str to str - :returns: New instance of the ``Reader`` sub-class that is - appropriate for the given path, or ``DictListReader`` if + :returns: New instance of the :class:`Reader` sub-class that is + appropriate for the given path, or :class:`DictListReader` if given a list of dictionaries. """ if not isinstance(path_or_list, string_types): @@ -166,7 +166,8 @@ def read(self): Loads examples in the ``.arff``, ``.csv``, ``.jsonlines``, ``.libsvm``, ``.megam``, ``.ndj``, or ``.tsv`` formats. - :returns: FeatureSet representing the file we read in. + :returns: :class:`~skll.data.featureset.FeatureSet` representing the + file we read in. """ # Setup logger logger = logging.getLogger(__name__) @@ -236,10 +237,11 @@ def feat_dict_generator(): class DictListReader(Reader): """ - This class is to facilitate programmatic use of ``Learner.predict()`` - and other functions that take ``FeatureSet`` objects as input. - It iterates over examples in the same way as other ``Reader``s, but uses - a list of example dictionaries instead of a path to a file. + This class is to facilitate programmatic use of + :meth:`~skll.learner.Learner.predict` and other functions that take + :class:`~skll.data.featureset.FeatureSet` objects as input. It iterates + over examples in the same way as other :class:`Reader` classes, but uses a + list of example dictionaries instead of a path to a file. :param path_or_list: List of example dictionaries. :type path_or_list: Iterable of dict @@ -295,7 +297,8 @@ def read(self): class NDJReader(Reader): """ - Reader to create a FeatureSet out of a .jsonlines/.ndj file + Reader to create a :class:`~skll.data.featureset.FeatureSet` out of a + .jsonlines/.ndj file If you would like to include example/instance IDs in your files, they must be specified in the following ways as an "id" key in each JSON @@ -336,7 +339,8 @@ def _sub_read(self, f): class MegaMReader(Reader): """ - Reader to create a FeatureSet out ouf a MegaM -fvals file. + Reader to create a :class:`~skll.data.featureset.FeatureSet` out of a + MegaM -fvals file. If you would like to include example/instance IDs in your files, they must be specified as a comment line directly preceding the line with @@ -403,12 +407,13 @@ def _sub_read(self, f): class LibSVMReader(Reader): """ - Reader to create a FeatureSet out ouf a LibSVM/LibLinear/SVMLight file. + Reader to create a :class:`~skll.data.featureset.FeatureSet` out of a + LibSVM/LibLinear/SVMLight file. We use a specially formatted comment for storing example IDs, class names, and feature names, which are normally not supported by the format. The comment is not mandatory, but without it, your labels and features will - not have names. The comment is structured as follows: + not have names. The comment is structured as follows:: ExampleID | 1=FirstClass | 1=FirstFeature 2=SecondFeature """ @@ -489,17 +494,18 @@ def _sub_read(self, f): class DelimitedReader(Reader): """ - Reader for creating a FeatureSet out of a delimited (CSV/TSV) file. + Reader for creating a :class:`~skll.data.featureset.FeatureSet` out of a + delimited (CSV/TSV) file.
If you would like to include example/instance IDs in your files, they - must be specified as an "id" column. + must be specified as an ``id`` column. Also, for ARFF, CSV, and TSV files, there must be a column with the name specified by `label_col` if the data is labelled. For ARFF files, this column must also be the final one (as it is in Weka). :param dialect: The dialect to pass on to the underlying CSV reader. - Default: 'excel-tab' + Default: ``excel-tab`` :type dialect: str """ @@ -560,7 +566,8 @@ def _sub_read(self, f): class CSVReader(DelimitedReader): """ - Reader for creating a FeatureSet out of a CSV file. + Reader for creating a :class:`~skll.data.featureset.FeatureSet` out of a + CSV file. If you would like to include example/instance IDs in your files, they must be specified as an "id" column. @@ -577,7 +584,8 @@ def __init__(self, path_or_list, **kwargs): class ARFFReader(DelimitedReader): """ - Reader for creating a FeatureSet out of an ARFF file. + Reader for creating a :class:`~skll.data.featureset.FeatureSet` out of an + ARFF file. If you would like to include example/instance IDs in your files, they must be specified as an "id" column. @@ -660,7 +668,8 @@ def _sub_read(self, f): class TSVReader(DelimitedReader): """ - Reader for creating a FeatureSet out of a TSV file. + Reader for creating a :class:`~skll.data.featureset.FeatureSet` out of a + TSV file. If you would like to include example/instance IDs in your files, they must be specified as an "id" column. diff --git a/skll/experiments.py b/skll/experiments.py index 0ac63ceb..57b0dc72 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -205,8 +205,8 @@ def _setup_config_parser(config_path): function to simplify testing. """ # initialize config parser - config = configparser.ConfigParser({'test_location': '', - 'train_location': '', + config = configparser.ConfigParser({'test_directory': '', + 'train_directory': '', 'train_file': '', 'test_file': '', 'log': '', @@ -229,7 +229,7 @@ 'min_feature_count': '1', 'grid_search_jobs': '0', 'grid_search_folds': '3', - 'cv_folds_location': '', + 'cv_folds_file': '', 'suffix': '', 'label_col': 'y', 'id_col': 'id', @@ -364,17 +364,17 @@ def _parse_config_file(config_path): "{}".format(feature_scaling)) # get all the input paths and directories (without trailing slashes) - train_path = config.get("Input", "train_location").rstrip('/') - test_path = config.get("Input", "test_location").rstrip('/') + train_path = config.get("Input", "train_directory").rstrip('/') + test_path = config.get("Input", "test_directory").rstrip('/') suffix = config.get("Input", "suffix") label_col = config.get("Input", "label_col") id_col = config.get("Input", "id_col") ids_to_floats = config.getboolean("Input", "ids_to_floats") # get the cv folds file and make a dictionary from it - cv_folds_location = config.get("Input", "cv_folds_location") - if cv_folds_location: - cv_folds = _load_cv_folds(cv_folds_location, + cv_folds_file = config.get("Input", "cv_folds_file") + if cv_folds_file: + cv_folds = _load_cv_folds(cv_folds_file, ids_to_floats=ids_to_floats) else: cv_folds = 10 @@ -390,19 +390,19 @@ # The user must specify either train_file or train_path, not both.
if not train_file and not train_path: raise ValueError('Invalid [Input] parameters: either "train_file" or ' - '"train_location" must be specified in the ' + '"train_directory" must be specified in the ' 'configuration file.') # Either train_file or train_path must be specified. if train_file and train_path: raise ValueError('Invalid [Input] parameters: only either "train_file"' - ' or "train_location" can be specified in the ' + ' or "train_directory" can be specified in the ' 'configuration file, not both.') # Cannot specify both test_file and test_path if test_file and test_path: raise ValueError('Invalid [Input] parameters: only either "test_file" ' - 'or "test_location" can be specified in the ' + 'or "test_directory" can be specified in the ' 'configuration file, not both.') # if train_file is specified, then assign its value to train_path @@ -976,12 +976,12 @@ def _fix_json(json_string): return json_string -def _load_cv_folds(cv_folds_location, ids_to_floats=False): +def _load_cv_folds(cv_folds_file, ids_to_floats=False): """ Loads CV folds from a CSV file with columns for example ID and fold ID (and a header). """ - with open(cv_folds_location, 'r') as f: + with open(cv_folds_file, 'r') as f: reader = csv.reader(f) next(reader) # discard the header res = {} diff --git a/tests/test_classification.py b/tests/test_classification.py index 747e2260..c1bb5e4e 100644 --- a/tests/test_classification.py +++ b/tests/test_classification.py @@ -76,8 +76,8 @@ def tearDown(): def fill_in_config_paths_for_single_file(config_template_path, train_file, - test_file, train_location='', - test_location=''): + test_file, train_directory='', + test_directory=''): """ Add paths to train and test files, and output directories to a given config template file. @@ -95,11 +95,11 @@ def fill_in_config_paths_for_single_file(config_template_path, train_file, if task == 'predict' or task == 'evaluate': config.set("Input", "test_file", join(test_dir, test_file)) - if train_location: - config.set("Input", "train_location", join(train_dir, train_location)) + if train_directory: + config.set("Input", "train_directory", join(train_dir, train_directory)) - if test_location: - config.set("Input", "test_location", join(test_dir, test_location)) + if test_directory: + config.set("Input", "test_directory", join(test_dir, test_directory)) to_fill_in = ['log', 'predictions'] @@ -113,10 +113,10 @@ def fill_in_config_paths_for_single_file(config_template_path, train_file, config.set("Output", d, join(output_dir)) if task == 'cross_validate': - cv_folds_location = config.get("Input", "cv_folds_location") - if cv_folds_location: - config.set("Input", "cv_folds_location", - join(train_dir, cv_folds_location)) + cv_folds_file = config.get("Input", "cv_folds_file") + if cv_folds_file: + config.set("Input", "cv_folds_file", + join(train_dir, cv_folds_file)) config_prefix = re.search(r'^(.*)\.template\.cfg', config_template_path).groups()[0] @@ -370,9 +370,9 @@ def test_train_file_test_file(): @raises(ValueError) -def test_train_file_and_train_location(): +def test_train_file_and_train_directory(): """ - Test that train_file + train_location = ValueError + Test that train_file + train_directory = ValueError """ # Run experiment config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs", @@ -384,14 +384,14 @@ def test_train_file_and_train_location(): join(_my_dir, 'test', 'test_single_file.' 
'jsonlines'), - train_location='foo') + train_directory='foo') _parse_config_file(config_path) @raises(ValueError) -def test_test_file_and_test_location(): +def test_test_file_and_test_directory(): """ - Test that test_file + test_location = ValueError + Test that test_file + test_directory = ValueError """ # Run experiment config_path = fill_in_config_paths_for_single_file(join(_my_dir, "configs", @@ -403,5 +403,5 @@ def test_test_file_and_test_location(): join(_my_dir, 'test', 'test_single_file.' 'jsonlines'), - test_location='foo') + test_directory='foo') _parse_config_file(config_path) diff --git a/tests/test_input.py b/tests/test_input.py index 176a51c4..8ebc39b8 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -76,7 +76,7 @@ def fill_in_config_paths_for_parsing(config_template_path, values_to_fill_dict, config = _setup_config_parser(config_template_path) to_fill_in = {'General': ['experiment_name', 'task'], - 'Input': ['train_location', 'train_file', 'test_location', + 'Input': ['train_directory', 'train_file', 'test_directory', 'test_file', 'featuresets', 'featureset_names', 'feature_hasher', 'hasher_features', 'learners', 'sampler', 'shuffle', 'feature_scaling'], @@ -159,8 +159,8 @@ def test_config_parsing_no_name(): # make a simple config file that has a bad task # but everything else is correct - values_to_fill_dict = {'train_location': train_dir, - 'test_location': test_dir, + values_to_fill_dict = {'train_directory': train_dir, + 'test_directory': test_dir, 'task': 'evaluate', 'featuresets': "[['f1', 'f2', 'f3']]", 'learners': "['LogisticRegression']", @@ -186,8 +186,8 @@ def test_config_parsing_bad_task(): # make a simple config file that has a bad task # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'featuresets': "[['f1', 'f2', 'f3']]", 'learners': "['LogisticRegression']", 'log': output_dir, @@ -226,8 +226,8 @@ def test_config_parsing_bad_learner(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'featuresets': "[['f1', 'f2', 'f3']]", 'log': output_dir, 'results': output_dir} @@ -263,8 +263,8 @@ def test_config_parsing_bad_sampler(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'featuresets': "[['f1', 'f2', 'f3']]", 'learners': "['LogisticRegression']", 'log': output_dir, @@ -294,8 +294,8 @@ def test_config_parsing_bad_hashing(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'featuresets': "[['f1', 'f2', 'f3']]", 'learners': "['LogisticRegression']", 'log': output_dir, @@ -323,8 +323,8 @@ def test_config_parsing_bad_featuresets(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'learners': "['LogisticRegression']", 'log': output_dir, 'results': output_dir} @@ 
-356,8 +356,8 @@ def test_config_parsing_bad_featurenames(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'learners': "['LogisticRegression']", 'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', " "'f6']]"), @@ -392,8 +392,8 @@ def test_config_parsing_bad_scaling(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'learners': "['LogisticRegression']", 'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', " "'f6']]"), @@ -427,7 +427,7 @@ def test_config_parsing_bad_train(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'test_location': test_dir, + 'test_directory': test_dir, 'learners': "['LogisticRegression']", 'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', " "'f6']]"), @@ -446,10 +446,10 @@ def test_config_parsing_bad_train(): ('test_config_' 'parsing_'))) values_to_fill_dict['train_file'] = train_fh.name - values_to_fill_dict['train_location'] = train_dir + values_to_fill_dict['train_directory'] = train_dir elif sub_prefix == 'nonexistent_train_path': - values_to_fill_dict['train_location'] = join(train_dir, 'foo') + values_to_fill_dict['train_directory'] = join(train_dir, 'foo') elif sub_prefix == 'nonexistent_test_file': values_to_fill_dict['train_file'] = 'foo.jsonlines' @@ -478,7 +478,7 @@ def test_config_parsing_bad_test(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, + 'train_directory': train_dir, 'learners': "['LogisticRegression']", 'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', " "'f6']]"), @@ -496,10 +496,10 @@ def test_config_parsing_bad_test(): ('test_config_' 'parsing_'))) values_to_fill_dict['test_file'] = test_fh.name - values_to_fill_dict['test_location'] = test_dir + values_to_fill_dict['test_directory'] = test_dir elif sub_prefix == 'nonexistent_test_path': - values_to_fill_dict['test_location'] = join(test_dir, 'foo') + values_to_fill_dict['test_directory'] = join(test_dir, 'foo') elif sub_prefix == 'nonexistent_test_file': values_to_fill_dict['test_file'] = 'foo.jsonlines' @@ -530,8 +530,8 @@ def test_config_parsing_bad_objective(): # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'evaluate', - 'train_location': train_dir, - 'test_location': test_dir, + 'train_directory': train_dir, + 'test_directory': test_dir, 'featuresets': "[['f1', 'f2', 'f3']]", 'learners': "['LogisticRegression']", 'log': output_dir, @@ -558,7 +558,7 @@ def test_config_parsing_bad_task_paths(): # make a simple config file that has a bad task # but everything else is correct values_to_fill_dict = {'experiment_name': 'config_parsing', - 'train_location': train_dir, + 'train_directory': train_dir, 'learners': "['LogisticRegression']", 'featuresets': ("[['f1', 'f2', 'f3'], ['f4', 'f5', " "'f6']]"), @@ -582,12 +582,12 @@ def test_config_parsing_bad_task_paths(): elif sub_prefix == 'xv_with_test_path': values_to_fill_dict['task'] = 'cross_validate' values_to_fill_dict['results'] = output_dir - values_to_fill_dict['test_location'] = test_dir + values_to_fill_dict['test_directory'] = test_dir elif sub_prefix == 
'train_with_test_path': values_to_fill_dict['task'] = 'train' values_to_fill_dict['models'] = output_dir - values_to_fill_dict['test_location'] = test_dir + values_to_fill_dict['test_directory'] = test_dir elif sub_prefix == 'xv_with_test_file': values_to_fill_dict['task'] = 'cross_validate' @@ -617,7 +617,7 @@ def test_config_parsing_bad_task_paths(): elif sub_prefix == 'predict_with_results': values_to_fill_dict['task'] = 'predict' - values_to_fill_dict['test_location'] = test_dir + values_to_fill_dict['test_directory'] = test_dir values_to_fill_dict['predictions'] = output_dir values_to_fill_dict['results'] = output_dir diff --git a/tests/test_regression.py b/tests/test_regression.py index a859d19e..39bb5665 100644 --- a/tests/test_regression.py +++ b/tests/test_regression.py @@ -362,7 +362,7 @@ def fill_in_config_paths_for_fancy_output(config_template_path): config = _setup_config_parser(config_template_path) config.set("Input", "train_file", join(train_dir, "fancy_train.jsonlines")) - config.set("Input", "test_location", join(test_dir, + config.set("Input", "test_directory", join(test_dir, "fancy_test.jsonlines")) config.set("Output", "results", output_dir) config.set("Output", "log", output_dir) diff --git a/tests/utils.py b/tests/utils.py index 3f6ee9b5..395cdcbf 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -35,7 +35,7 @@ def fill_in_config_paths(config_template_path): task = config.get("General", "task") - config.set("Input", "train_location", train_dir) + config.set("Input", "train_directory", train_dir) to_fill_in = ['log', 'vocabs', 'predictions'] @@ -49,13 +49,13 @@ def fill_in_config_paths(config_template_path): config.set("Output", d, join(output_dir)) if task == 'cross_validate': - cv_folds_location = config.get("Input", "cv_folds_location") - if cv_folds_location: - config.set("Input", "cv_folds_location", - join(train_dir, cv_folds_location)) + cv_folds_file = config.get("Input", "cv_folds_file") + if cv_folds_file: + config.set("Input", "cv_folds_file", + join(train_dir, cv_folds_file)) if task == 'predict' or task == 'evaluate': - config.set("Input", "test_location", test_dir) + config.set("Input", "test_directory", test_dir) # set up custom learner path, if relevant custom_learner_path = config.get("Input", "custom_learner_path")
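
To tie the renamed options together, here is a minimal sketch of an ``evaluate`` configuration written in the new style, modeled on ``examples/titanic/evaluate.cfg`` from this changeset; the experiment name and the ``[Output]`` paths are illustrative placeholders rather than values taken from the repository::

    [General]
    # any descriptive name for the experiment (placeholder)
    experiment_name = Titanic_Evaluate
    task = evaluate

    [Input]
    # renamed from train_location/test_location; these could also be absolute paths
    train_directory = train
    test_directory = dev
    featuresets = [["family.csv", "misc.csv", "socioeconomic.csv", "vitals.csv"]]
    learners = ["RandomForestClassifier"]
    label_col = Survived

    [Output]
    # directories for logs and result summaries (placeholders)
    log = output
    results = output

Running this with ``run_experiment`` (or by passing its path to ``skll.run_configuration``) should behave exactly as the old ``train_location``/``test_location`` spelling did, since the changeset only renames these options without changing their behavior.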