From bfe75811ecabace05787b1dd583a15c457ec132a Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Mon, 29 Jun 2015 14:35:43 -0400 Subject: [PATCH 01/12] adding option to have non-stratified folds, and specify the number of folds --- skll/experiments.py | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/skll/experiments.py b/skll/experiments.py index 33c74efd..72055951 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -246,6 +246,8 @@ def _setup_config_parser(config_path): 'grid_search_jobs': '0', 'grid_search_folds': '3', 'cv_folds_file': '', + 'num_cv_folds': '10', + 'stratified_folds': 'True', 'suffix': '', 'label_col': 'y', 'id_col': 'id', @@ -387,13 +389,23 @@ def _parse_config_file(config_path): id_col = config.get("Input", "id_col") ids_to_floats = config.getboolean("Input", "ids_to_floats") - # get the cv folds file and make a dictionary from it + # get the cv folds file and make a dictionary from it, if it exists cv_folds_file = config.get("Input", "cv_folds_file") + num_cv_folds = config.get("Input", "num_cv_folds") if cv_folds_file: cv_folds = _load_cv_folds(cv_folds_file, ids_to_floats=ids_to_floats) else: - cv_folds = 10 + # set the number of folds for cross-validation + if num_cv_folds: + cv_folds = num_cv_folds + else: + # default number of cross-validation folds + cv_folds = 10 + + # whether or not to do stratified cross validation + stratified_folds = config.get("Input", "stratified_folds") + do_stratified_folds = True if stratified_folds else False train_file = config.get("Input", "train_file") test_file = config.get("Input", "test_file") @@ -541,7 +553,7 @@ def _parse_config_file(config_path): test_set_name, suffix, featuresets, do_shuffle, model_path, do_grid_search, grid_objective, probability, results_path, pos_label_str, feature_scaling, min_feature_count, - grid_search_jobs, grid_search_folds, cv_folds, + grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names, learners, prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map, custom_learner_path) @@ -650,6 +662,7 @@ def _classify_featureset(args): grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") + stratified_folds = args.pop("stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") @@ -666,8 +679,8 @@ def _classify_featureset(args): # logging print("Task:", task, file=log_file) if task == 'cross_validate': - print(("Cross-validating on {}, feature " + - "set {} ...").format(train_set_name, featureset), + print(("Cross-validating ({} folds) on {}, feature " + + "set {} ...").format(cv_folds, train_set_name, featureset), file=log_file) elif task == 'evaluate': print(("Training on {}, Test on {}, " + @@ -756,6 +769,7 @@ def _classify_featureset(args): 'grid_search_folds': grid_search_folds, 'min_feature_count': min_feature_count, 'cv_folds': cv_folds, + 'stratified_folds': stratified_folds, 'scikit_learn_version': SCIKIT_VERSION} # check if we're doing cross-validation, because we only load/save @@ -764,7 +778,7 @@ def _classify_featureset(args): if task == 'cross_validate': print('\tcross-validating', file=log_file) task_results, grid_scores = learner.cross_validate( - train_examples, shuffle=shuffle, + train_examples, shuffle=shuffle, stratified=stratified_folds, prediction_prefix=prediction_prefix, grid_search=grid_search, grid_search_folds=grid_search_folds, cv_folds=cv_folds, grid_objective=grid_objective, param_grid=param_grid, From f65e7ee36c29c0f6ff34d26d13d498a4c85f1807 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Mon, 29 Jun 2015 16:34:09 -0400 Subject: [PATCH 02/12] update documentation for cv_folds, random_folds, and grid_search_folds --- doc/run_experiment.rst | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/doc/run_experiment.rst b/doc/run_experiment.rst index 20e7e90f..5792e8d3 100644 --- a/doc/run_experiment.rst +++ b/doc/run_experiment.rst @@ -383,6 +383,20 @@ example, if you wanted to collapse the labels ``beagle`` and ``dachsund`` into a Any labels not included in the dictionary will be left untouched. +.. _cv_folds: + +cv_folds *(Optional)* +"""""""""""""""""""""" + +The number of folds to use for cross-validation. Defaults to 10. + +.. _random_folds: + +random_folds *(Optional)* +""""""""""""""""""""""""" + +Whether to use random folds for cross-validation. Defaults to ``False``. + .. _cv_folds_file: cv_folds_file *(Optional)* @@ -593,6 +607,13 @@ grid_search *(Optional)* Whether or not to perform grid search to find optimal parameters for classifier. Defaults to ``False``. +.. _grid_search_folds: + +grid_search_folds *(Optional)* +"""""""""""""""""""""""""""""" + +The number of folds to use for grid search. Defaults to 3. + .. _grid_search_jobs: grid_search_jobs *(Optional)* From c58fa9c93cb8192ddedf9a30ac092a691afe4336 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Mon, 29 Jun 2015 16:34:58 -0400 Subject: [PATCH 03/12] output information about folds/stratification in results file; fix bug with grid_search_folds --- skll/experiments.py | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/skll/experiments.py b/skll/experiments.py index 72055951..8aef8645 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -171,6 +171,11 @@ def _print_fancy_output(learner_result_dicts, output_file=sys.stdout): print('Feature Set: {}'.format(lrd['featureset']), file=output_file) print('Learner: {}'.format(lrd['learner_name']), file=output_file) print('Task: {}'.format(lrd['task']), file=output_file) + if lrd['task'] == 'cross_validate': + print('Number of Folds: {}'.format(lrd['cv_folds']), + file=output_file) + print('Stratified Folds: {}'.format(lrd['stratified_folds']), + file=output_file) print('Feature Scaling: {}'.format(lrd['feature_scaling']), file=output_file) print('Grid Search: {}'.format(lrd['grid_search']), file=output_file) @@ -247,7 +252,7 @@ def _setup_config_parser(config_path): 'grid_search_folds': '3', 'cv_folds_file': '', 'num_cv_folds': '10', - 'stratified_folds': 'True', + 'random_folds': 'False', 'suffix': '', 'label_col': 'y', 'id_col': 'id', @@ -398,14 +403,17 @@ def _parse_config_file(config_path): else: # set the number of folds for cross-validation if num_cv_folds: - cv_folds = num_cv_folds + cv_folds = int(num_cv_folds) else: # default number of cross-validation folds cv_folds = 10 # whether or not to do stratified cross validation - stratified_folds = config.get("Input", "stratified_folds") - do_stratified_folds = True if stratified_folds else False + random_folds = config.get("Input", "random_folds") + if random_folds == 'True': + do_stratified_folds = False + else: + do_stratified_folds = True train_file = config.get("Input", "train_file") test_file = config.get("Input", "test_file") @@ -662,7 +670,7 @@ def _classify_featureset(args): grid_search_jobs = args.pop("grid_search_jobs") grid_search_folds = args.pop("grid_search_folds") cv_folds = args.pop("cv_folds") - stratified_folds = args.pop("stratified_folds") + stratified_folds = args.pop("do_stratified_folds") label_col = args.pop("label_col") id_col = args.pop("id_col") ids_to_floats = args.pop("ids_to_floats") @@ -790,7 +798,6 @@ def _classify_featureset(args): '{} model').format(learner_name), file=log_file) - grid_search_folds = 3 if not isinstance(cv_folds, int): grid_search_folds = cv_folds @@ -1076,7 +1083,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q', hasher_features, id_col, label_col, train_set_name, test_set_name, suffix, featuresets, do_shuffle, model_path, do_grid_search, grid_objective, probability, results_path, pos_label_str, feature_scaling, - min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, + min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names, learners, prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map, custom_learner_path) = _parse_config_file(config_file) @@ -1216,6 +1223,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q', job_args["grid_search_jobs"] = grid_search_jobs job_args["grid_search_folds"] = grid_search_folds job_args["cv_folds"] = cv_folds + job_args["do_stratified_folds"] = do_stratified_folds job_args["label_col"] = label_col job_args["id_col"] = id_col job_args["ids_to_floats"] = ids_to_floats From 6468872b76123bc11c3a1be3eba7b84d6763c970 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Mon, 29 Jun 2015 17:35:38 -0400 Subject: [PATCH 04/12] fix version of sklearn in requirements_rtd.txt --- requirements_rtd.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements_rtd.txt b/requirements_rtd.txt index 4522183f..7c619457 100644 --- a/requirements_rtd.txt +++ b/requirements_rtd.txt @@ -1,7 +1,7 @@ configparser logutils mock -scikit-learn>=0.14 +scikit-learn==0.15.2 six PrettyTable beautifulsoup4 From bb7e6949b0448563f771cabf3ac44d1d427b7c03 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Mon, 29 Jun 2015 17:41:43 -0400 Subject: [PATCH 05/12] update unit test for output --- tests/test_output.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_output.py b/tests/test_output.py index a05c3098..3d795b19 100644 --- a/tests/test_output.py +++ b/tests/test_output.py @@ -134,10 +134,10 @@ def check_summary_score(use_feature_hashing=False): reader = csv.DictReader(f, dialect='excel-tab') for row in reader: - # the learner results dictionaries should have 27 rows, + # the learner results dictionaries should have 28 rows, # and all of these except results_table # should be printed (though some columns will be blank). - eq_(len(row), 27) + eq_(len(row), 28) assert row['model_params'] assert row['grid_score'] assert row['score'] From 3f028d3a97c6e5c6d818518dc3735413c5ab4133 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Mon, 29 Jun 2015 17:48:38 -0400 Subject: [PATCH 06/12] fixing scikit learn version for travis too --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 582780dc..ac8e5b18 100644 --- a/.travis.yml +++ b/.travis.yml @@ -24,7 +24,7 @@ before_install: - conda config --add channels https://conda.binstar.org/dan_blanchard - conda update --yes conda install: - - conda install --yes pip python=$TRAVIS_PYTHON_VERSION numpy scipy beautiful-soup six scikit-learn joblib prettytable python-coveralls pyyaml + - conda install --yes pip python=$TRAVIS_PYTHON_VERSION scikit-learn==0.15.2 numpy scipy beautiful-soup six joblib prettytable python-coveralls pyyaml - if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes configparser logutils mock; fi - if [ $GRIDMAP == "true" ]; then conda install --yes drmaa gridmap; fi # Have to use pip for nose-cov because its entry points are not supported by conda yet From 65291316133dbdb4c2ed4d5af2ecc9114e0136c2 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Tue, 30 Jun 2015 10:10:59 -0400 Subject: [PATCH 07/12] some testing --- skll/experiments.py | 6 +++++- tests/test_input.py | 31 +++++++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/skll/experiments.py b/skll/experiments.py index 8aef8645..dcbaf76c 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -403,7 +403,11 @@ def _parse_config_file(config_path): else: # set the number of folds for cross-validation if num_cv_folds: - cv_folds = int(num_cv_folds) + try: + cv_folds = int(num_cv_folds) + except: + raise ValueError("The value for cv_folds should be an integer. " + + "You specified {}".format(cv_folds)) else: # default number of cross-validation folds cv_folds = 10 diff --git a/tests/test_input.py b/tests/test_input.py index 8ebc39b8..9ee8f43b 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -647,3 +647,34 @@ def test_config_parsing_bad_task_paths(): elif sub_prefix == 'train_with_test_file': test_fh2.close() + +@raises(ValueError) +def test_config_parsing_bad_cv_folds(): + """ + Test to ensure config file parsing raises an error with an invalid cv_folds + """ + + train_dir = join(_my_dir, 'train') + test_dir = join(_my_dir, 'test') + output_dir = join(_my_dir, 'output') + + # make a simple config file that has a bad value for cv_folds + # but everything else is correct + values_to_fill_dict = {'experiment_name': 'config_parsing', + 'task': 'cross_validate', + 'train_directory': train_dir, + 'test_directory': test_dir, + 'cv_folds': 'random', + 'featuresets': "[['f1', 'f2', 'f3']]", + 'learners': "['LogisticRegression']", + 'log': output_dir, + 'results': output_dir, + 'objective': 'f1_macro'} + + config_template_path = join(_my_dir, 'configs', + 'test_config_parsing.template.cfg') + config_path = fill_in_config_paths_for_parsing(config_template_path, + values_to_fill_dict, + 'bad_cv_folds') + + _parse_config_file(config_path) \ No newline at end of file From f4bf9d324ec2b931618ae0ac89b1030ee50567fe Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Sat, 4 Jul 2015 14:46:47 -0400 Subject: [PATCH 08/12] adding more tests --- skll/experiments.py | 2 +- tests/test_input.py | 85 +++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 80 insertions(+), 7 deletions(-) diff --git a/skll/experiments.py b/skll/experiments.py index dcbaf76c..2a43e9a9 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -407,7 +407,7 @@ def _parse_config_file(config_path): cv_folds = int(num_cv_folds) except: raise ValueError("The value for cv_folds should be an integer. " + - "You specified {}".format(cv_folds)) + "You specified {}".format(num_cv_folds)) else: # default number of cross-validation folds cv_folds = 10 diff --git a/tests/test_input.py b/tests/test_input.py index 9ee8f43b..ccba8bdf 100644 --- a/tests/test_input.py +++ b/tests/test_input.py @@ -79,7 +79,7 @@ def fill_in_config_paths_for_parsing(config_template_path, values_to_fill_dict, 'Input': ['train_directory', 'train_file', 'test_directory', 'test_file', 'featuresets', 'featureset_names', 'feature_hasher', 'hasher_features', 'learners', - 'sampler', 'shuffle', 'feature_scaling'], + 'sampler', 'shuffle', 'feature_scaling', 'num_cv_folds'], 'Tuning': ['grid_search', 'objective'], 'Output': ['probability', 'results', 'log', 'models', 'predictions']} @@ -655,7 +655,6 @@ def test_config_parsing_bad_cv_folds(): """ train_dir = join(_my_dir, 'train') - test_dir = join(_my_dir, 'test') output_dir = join(_my_dir, 'output') # make a simple config file that has a bad value for cv_folds @@ -663,13 +662,12 @@ def test_config_parsing_bad_cv_folds(): values_to_fill_dict = {'experiment_name': 'config_parsing', 'task': 'cross_validate', 'train_directory': train_dir, - 'test_directory': test_dir, - 'cv_folds': 'random', + 'num_cv_folds': 'random', 'featuresets': "[['f1', 'f2', 'f3']]", 'learners': "['LogisticRegression']", 'log': output_dir, 'results': output_dir, - 'objective': 'f1_macro'} + 'objective': 'f1_score_macro'} config_template_path = join(_my_dir, 'configs', 'test_config_parsing.template.cfg') @@ -677,4 +675,79 @@ def test_config_parsing_bad_cv_folds(): values_to_fill_dict, 'bad_cv_folds') - _parse_config_file(config_path) \ No newline at end of file + _parse_config_file(config_path) + +def test_default_number_of_cv_folds(): + """ + Test to ensure that the cv folds value is set correctly + """ + + train_dir = join(_my_dir, 'train') + output_dir = join(_my_dir, 'output') + # make a simple config file that does not set cv_folds + + values_to_fill_dict = {'experiment_name': 'config_parsing', + 'task': 'cross_validate', + 'train_directory': train_dir, + 'featuresets': "[['f1', 'f2', 'f3']]", + 'learners': "['LogisticRegression']", + 'log': output_dir, + 'results': output_dir, + 'objective': 'f1_score_macro'} + + config_template_path = join(_my_dir, 'configs', + 'test_config_parsing.template.cfg') + config_path = fill_in_config_paths_for_parsing(config_template_path, + values_to_fill_dict, + 'default_cv_folds') + + + (experiment_name, task, sampler, fixed_sampler_parameters, + feature_hasher, hasher_features, id_col, label_col, train_set_name, + test_set_name, suffix, featuresets, do_shuffle, model_path, + do_grid_search, grid_objective, probability, results_path, + pos_label_str, feature_scaling, min_feature_count, + grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds, + fixed_parameter_list, param_grid_list, featureset_names, learners, + prediction_dir, log_path, train_path, test_path, ids_to_floats, + class_map, custom_learner_path) = _parse_config_file(config_path) + + eq_(cv_folds, 10) + +def test_setting_number_of_cv_folds(): + """ + Test to ensure that the cv folds value is set correctly + """ + + train_dir = join(_my_dir, 'train') + output_dir = join(_my_dir, 'output') + # make a simple config file that does not set cv_folds + + values_to_fill_dict = {'experiment_name': 'config_parsing', + 'task': 'cross_validate', + 'train_directory': train_dir, + 'featuresets': "[['f1', 'f2', 'f3']]", + 'learners': "['LogisticRegression']", + 'log': output_dir, + 'results': output_dir, + 'num_cv_folds': "5", + 'objective': 'f1_score_macro'} + + config_template_path = join(_my_dir, 'configs', + 'test_config_parsing.template.cfg') + config_path = fill_in_config_paths_for_parsing(config_template_path, + values_to_fill_dict, + 'default_cv_folds') + + + (experiment_name, task, sampler, fixed_sampler_parameters, + feature_hasher, hasher_features, id_col, label_col, train_set_name, + test_set_name, suffix, featuresets, do_shuffle, model_path, + do_grid_search, grid_objective, probability, results_path, + pos_label_str, feature_scaling, min_feature_count, + grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds, + fixed_parameter_list, param_grid_list, featureset_names, learners, + prediction_dir, log_path, train_path, test_path, ids_to_floats, + class_map, custom_learner_path) = _parse_config_file(config_path) + + eq_(cv_folds, 5) \ No newline at end of file From 3062046263c8ffe6b714170405213ce179b6e4a9 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Sun, 5 Jul 2015 09:58:26 -0400 Subject: [PATCH 09/12] adding a warning --- skll/experiments.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/skll/experiments.py b/skll/experiments.py index 2a43e9a9..dd8a3f84 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -415,6 +415,9 @@ def _parse_config_file(config_path): # whether or not to do stratified cross validation random_folds = config.get("Input", "random_folds") if random_folds == 'True': + if cv_folds_file: + logger.warning('Random folds will not override'+ + 'values in cv_folds_file') do_stratified_folds = False else: do_stratified_folds = True From 73d5ff5527ca752a7dc0e8072495a3b9c3c058f8 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Sat, 11 Jul 2015 16:58:27 -0400 Subject: [PATCH 10/12] update warning message --- skll/experiments.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/skll/experiments.py b/skll/experiments.py index dd8a3f84..fb13c36f 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -416,8 +416,8 @@ def _parse_config_file(config_path): random_folds = config.get("Input", "random_folds") if random_folds == 'True': if cv_folds_file: - logger.warning('Random folds will not override'+ - 'values in cv_folds_file') + logger.warning('Specifying cv_fold_file '+ + 'overrides random_folds') do_stratified_folds = False else: do_stratified_folds = True From 6c55959d9c939430afe7541e9a05ceb8e74e7434 Mon Sep 17 00:00:00 2001 From: Aoife Cahill Date: Sat, 11 Jul 2015 17:08:44 -0400 Subject: [PATCH 11/12] catch ValueError explicitly --- skll/experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skll/experiments.py b/skll/experiments.py index fb13c36f..383fe980 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -405,7 +405,7 @@ def _parse_config_file(config_path): if num_cv_folds: try: cv_folds = int(num_cv_folds) - except: + except ValueError: raise ValueError("The value for cv_folds should be an integer. " + "You specified {}".format(num_cv_folds)) else: From 186a015bbcc256ff2443d2faa9259bfc0147b7b6 Mon Sep 17 00:00:00 2001 From: Nitin Madnani Date: Sat, 11 Jul 2015 18:55:54 -0400 Subject: [PATCH 12/12] Fix typo in warning. --- skll/experiments.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skll/experiments.py b/skll/experiments.py index 383fe980..b9fa9bec 100644 --- a/skll/experiments.py +++ b/skll/experiments.py @@ -416,7 +416,7 @@ def _parse_config_file(config_path): random_folds = config.get("Input", "random_folds") if random_folds == 'True': if cv_folds_file: - logger.warning('Specifying cv_fold_file '+ + logger.warning('Specifying cv_folds_file '+ 'overrides random_folds') do_stratified_folds = False else: