Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Expose cv_folds and stratified #240

Merged
merged 12 commits into from
Jul 11, 2015
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ before_install:
- conda config --add channels https://conda.binstar.org/dan_blanchard
- conda update --yes conda
install:
- conda install --yes pip python=$TRAVIS_PYTHON_VERSION numpy scipy beautiful-soup six scikit-learn joblib prettytable python-coveralls pyyaml
- conda install --yes pip python=$TRAVIS_PYTHON_VERSION scikit-learn==0.15.2 numpy scipy beautiful-soup six joblib prettytable python-coveralls pyyaml
- if [ ${TRAVIS_PYTHON_VERSION:0:1} == "2" ]; then conda install --yes configparser logutils mock; fi
- if [ $GRIDMAP == "true" ]; then conda install --yes drmaa gridmap; fi
# Have to use pip for nose-cov because its entry points are not supported by conda yet
Expand Down
21 changes: 21 additions & 0 deletions doc/run_experiment.rst
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,20 @@ example, if you wanted to collapse the labels ``beagle`` and ``dachsund`` into a

Any labels not included in the dictionary will be left untouched.

.. _cv_folds:

cv_folds *(Optional)*
""""""""""""""""""""""

The number of folds to use for cross-validation. Defaults to 10.

.. _random_folds:

random_folds *(Optional)*
"""""""""""""""""""""""""

Whether to use random folds for cross-validation. Defaults to ``False``.

.. _cv_folds_file:

cv_folds_file *(Optional)*
Expand Down Expand Up @@ -593,6 +607,13 @@ grid_search *(Optional)*
Whether or not to perform grid search to find optimal parameters for
classifier. Defaults to ``False``.

.. _grid_search_folds:

grid_search_folds *(Optional)*
""""""""""""""""""""""""""""""

The number of folds to use for grid search. Defaults to 3.

.. _grid_search_jobs:

grid_search_jobs *(Optional)*
Expand Down
2 changes: 1 addition & 1 deletion requirements_rtd.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
configparser
logutils
mock
scikit-learn>=0.14
scikit-learn==0.15.2
six
PrettyTable
beautifulsoup4
Expand Down
45 changes: 37 additions & 8 deletions skll/experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ def _print_fancy_output(learner_result_dicts, output_file=sys.stdout):
print('Feature Set: {}'.format(lrd['featureset']), file=output_file)
print('Learner: {}'.format(lrd['learner_name']), file=output_file)
print('Task: {}'.format(lrd['task']), file=output_file)
if lrd['task'] == 'cross_validate':
print('Number of Folds: {}'.format(lrd['cv_folds']),
file=output_file)
print('Stratified Folds: {}'.format(lrd['stratified_folds']),
file=output_file)
print('Feature Scaling: {}'.format(lrd['feature_scaling']),
file=output_file)
print('Grid Search: {}'.format(lrd['grid_search']), file=output_file)
Expand Down Expand Up @@ -246,6 +251,8 @@ def _setup_config_parser(config_path):
'grid_search_jobs': '0',
'grid_search_folds': '3',
'cv_folds_file': '',
'num_cv_folds': '10',
'random_folds': 'False',
'suffix': '',
'label_col': 'y',
'id_col': 'id',
Expand Down Expand Up @@ -387,13 +394,33 @@ def _parse_config_file(config_path):
id_col = config.get("Input", "id_col")
ids_to_floats = config.getboolean("Input", "ids_to_floats")

# get the cv folds file and make a dictionary from it
# get the cv folds file and make a dictionary from it, if it exists
cv_folds_file = config.get("Input", "cv_folds_file")
num_cv_folds = config.get("Input", "num_cv_folds")
if cv_folds_file:
cv_folds = _load_cv_folds(cv_folds_file,
ids_to_floats=ids_to_floats)
else:
cv_folds = 10
# set the number of folds for cross-validation
if num_cv_folds:
try:
cv_folds = int(num_cv_folds)
except:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's probably better to specify a ValueError explicitly here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure what you mean? There is a ValueError raised?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, I meant that you want to catch a ValueError explicitly in the except clause.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't I do that?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here's the code I see:

        if num_cv_folds:
            try:
                cv_folds = int(num_cv_folds)
            except:
                raise ValueError("The value for cv_folds should be an integer. " +
                                 "You specified {}".format(num_cv_folds))

Here's what I am saying it should look like:

        if num_cv_folds:
            try:
                cv_folds = int(num_cv_folds)
            except ValueError:
                raise ValueError("The value for cv_folds should be an integer. " +
                                 "You specified {}".format(num_cv_folds))

Note the difference in the except statement.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh I see, will update.

raise ValueError("The value for cv_folds should be an integer. " +
"You specified {}".format(num_cv_folds))
else:
# default number of cross-validation folds
cv_folds = 10

# whether or not to do stratified cross validation
random_folds = config.get("Input", "random_folds")
if random_folds == 'True':
if cv_folds_file:
logger.warning('Random folds will not override'+
'values in cv_folds_file')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This message is not entirely clear. So, basically, random_folds is being ignored here, right? May be something like: "Specifying cv_folds_file overrides random_folds"?

do_stratified_folds = False
else:
do_stratified_folds = True

train_file = config.get("Input", "train_file")
test_file = config.get("Input", "test_file")
Expand Down Expand Up @@ -541,7 +568,7 @@ def _parse_config_file(config_path):
test_set_name, suffix, featuresets, do_shuffle, model_path,
do_grid_search, grid_objective, probability, results_path,
pos_label_str, feature_scaling, min_feature_count,
grid_search_jobs, grid_search_folds, cv_folds,
grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
fixed_parameter_list, param_grid_list, featureset_names, learners,
prediction_dir, log_path, train_path, test_path, ids_to_floats,
class_map, custom_learner_path)
Expand Down Expand Up @@ -650,6 +677,7 @@ def _classify_featureset(args):
grid_search_jobs = args.pop("grid_search_jobs")
grid_search_folds = args.pop("grid_search_folds")
cv_folds = args.pop("cv_folds")
stratified_folds = args.pop("do_stratified_folds")
label_col = args.pop("label_col")
id_col = args.pop("id_col")
ids_to_floats = args.pop("ids_to_floats")
Expand All @@ -666,8 +694,8 @@ def _classify_featureset(args):
# logging
print("Task:", task, file=log_file)
if task == 'cross_validate':
print(("Cross-validating on {}, feature " +
"set {} ...").format(train_set_name, featureset),
print(("Cross-validating ({} folds) on {}, feature " +
"set {} ...").format(cv_folds, train_set_name, featureset),
file=log_file)
elif task == 'evaluate':
print(("Training on {}, Test on {}, " +
Expand Down Expand Up @@ -756,6 +784,7 @@ def _classify_featureset(args):
'grid_search_folds': grid_search_folds,
'min_feature_count': min_feature_count,
'cv_folds': cv_folds,
'stratified_folds': stratified_folds,
'scikit_learn_version': SCIKIT_VERSION}

# check if we're doing cross-validation, because we only load/save
Expand All @@ -764,7 +793,7 @@ def _classify_featureset(args):
if task == 'cross_validate':
print('\tcross-validating', file=log_file)
task_results, grid_scores = learner.cross_validate(
train_examples, shuffle=shuffle,
train_examples, shuffle=shuffle, stratified=stratified_folds,
prediction_prefix=prediction_prefix, grid_search=grid_search,
grid_search_folds=grid_search_folds, cv_folds=cv_folds,
grid_objective=grid_objective, param_grid=param_grid,
Expand All @@ -776,7 +805,6 @@ def _classify_featureset(args):
'{} model').format(learner_name),
file=log_file)

grid_search_folds = 3
if not isinstance(cv_folds, int):
grid_search_folds = cv_folds

Expand Down Expand Up @@ -1062,7 +1090,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
probability, results_path, pos_label_str, feature_scaling,
min_feature_count, grid_search_jobs, grid_search_folds, cv_folds,
min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
fixed_parameter_list, param_grid_list, featureset_names, learners,
prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map,
custom_learner_path) = _parse_config_file(config_file)
Expand Down Expand Up @@ -1202,6 +1230,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
job_args["grid_search_jobs"] = grid_search_jobs
job_args["grid_search_folds"] = grid_search_folds
job_args["cv_folds"] = cv_folds
job_args["do_stratified_folds"] = do_stratified_folds
job_args["label_col"] = label_col
job_args["id_col"] = id_col
job_args["ids_to_floats"] = ids_to_floats
Expand Down
106 changes: 105 additions & 1 deletion tests/test_input.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def fill_in_config_paths_for_parsing(config_template_path, values_to_fill_dict,
'Input': ['train_directory', 'train_file', 'test_directory',
'test_file', 'featuresets', 'featureset_names',
'feature_hasher', 'hasher_features', 'learners',
'sampler', 'shuffle', 'feature_scaling'],
'sampler', 'shuffle', 'feature_scaling', 'num_cv_folds'],
'Tuning': ['grid_search', 'objective'],
'Output': ['probability', 'results', 'log', 'models',
'predictions']}
Expand Down Expand Up @@ -647,3 +647,107 @@ def test_config_parsing_bad_task_paths():

elif sub_prefix == 'train_with_test_file':
test_fh2.close()

@raises(ValueError)
def test_config_parsing_bad_cv_folds():
    """
    Test to ensure config file parsing raises an error with an invalid cv_folds
    """
    # Build a configuration that is valid in every respect except that
    # num_cv_folds is a non-integer string, which _parse_config_file
    # must reject with a ValueError.
    field_values = {'experiment_name': 'config_parsing',
                    'task': 'cross_validate',
                    'train_directory': join(_my_dir, 'train'),
                    'num_cv_folds': 'random',
                    'featuresets': "[['f1', 'f2', 'f3']]",
                    'learners': "['LogisticRegression']",
                    'log': join(_my_dir, 'output'),
                    'results': join(_my_dir, 'output'),
                    'objective': 'f1_score_macro'}

    template = join(_my_dir, 'configs',
                    'test_config_parsing.template.cfg')
    bad_config = fill_in_config_paths_for_parsing(template,
                                                  field_values,
                                                  'bad_cv_folds')

    # should raise ValueError because 'random' cannot be parsed as an int
    _parse_config_file(bad_config)

def test_default_number_of_cv_folds():
    """
    Test to ensure that the cv folds value is set correctly
    """
    # Build a simple config file that leaves num_cv_folds unset, so the
    # parser should fall back to the documented default of 10 folds.
    field_values = {'experiment_name': 'config_parsing',
                    'task': 'cross_validate',
                    'train_directory': join(_my_dir, 'train'),
                    'featuresets': "[['f1', 'f2', 'f3']]",
                    'learners': "['LogisticRegression']",
                    'log': join(_my_dir, 'output'),
                    'results': join(_my_dir, 'output'),
                    'objective': 'f1_score_macro'}

    template = join(_my_dir, 'configs',
                    'test_config_parsing.template.cfg')
    config_path = fill_in_config_paths_for_parsing(template,
                                                   field_values,
                                                   'default_cv_folds')

    # _parse_config_file returns a long tuple; cv_folds is the 24th
    # element (index 23), right after grid_search_folds.
    cv_folds = _parse_config_file(config_path)[23]

    eq_(cv_folds, 10)

def test_setting_number_of_cv_folds():
    """
    Test to ensure that an explicitly set cv folds value is respected
    """

    train_dir = join(_my_dir, 'train')
    output_dir = join(_my_dir, 'output')
    # make a simple config file that explicitly sets num_cv_folds to 5

    values_to_fill_dict = {'experiment_name': 'config_parsing',
                           'task': 'cross_validate',
                           'train_directory': train_dir,
                           'featuresets': "[['f1', 'f2', 'f3']]",
                           'learners': "['LogisticRegression']",
                           'log': output_dir,
                           'results': output_dir,
                           'num_cv_folds': "5",
                           'objective': 'f1_score_macro'}

    config_template_path = join(_my_dir, 'configs',
                                'test_config_parsing.template.cfg')
    # use a distinct label so this generated config file does not clobber
    # the one written by test_default_number_of_cv_folds
    config_path = fill_in_config_paths_for_parsing(config_template_path,
                                                   values_to_fill_dict,
                                                   'setting_cv_folds')

    (experiment_name, task, sampler, fixed_sampler_parameters,
     feature_hasher, hasher_features, id_col, label_col, train_set_name,
     test_set_name, suffix, featuresets, do_shuffle, model_path,
     do_grid_search, grid_objective, probability, results_path,
     pos_label_str, feature_scaling, min_feature_count,
     grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
     fixed_parameter_list, param_grid_list, featureset_names, learners,
     prediction_dir, log_path, train_path, test_path, ids_to_floats,
     class_map, custom_learner_path) = _parse_config_file(config_path)

    # the explicitly configured value should win over the default of 10
    eq_(cv_folds, 5)
4 changes: 2 additions & 2 deletions tests/test_output.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ def check_summary_score(use_feature_hashing=False):
reader = csv.DictReader(f, dialect='excel-tab')

for row in reader:
# the learner results dictionaries should have 27 rows,
# the learner results dictionaries should have 28 rows,
# and all of these except results_table
# should be printed (though some columns will be blank).
eq_(len(row), 27)
eq_(len(row), 28)
assert row['model_params']
assert row['grid_score']
assert row['score']
Expand Down