Merge pull request #458 from EducationalTestingService/remove-default-objective

Remove default value of objective function and `objective` config field.
desilinguist committed Feb 13, 2019
2 parents 0b13413 + 570d324 commit 1067873
Showing 30 changed files with 347 additions and 338 deletions.
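
Context for the change: the singular `objective` field and the implicit `f1_score_micro` default are removed, so tuning objectives must now be listed explicitly under `[Tuning]` via `objectives` whenever grid search is enabled. Below is a minimal sketch of reading such a section with the standard library, assuming only the field names visible in the diff (`grid_search`, `objectives`); SKLL's own parser adds its defaults and validation on top of this.

```python
from configparser import ConfigParser

# Hypothetical post-change configuration: the singular 'objective' field no
# longer exists and 'objectives' has no built-in default, so it must be
# spelled out whenever grid_search is True.
config_text = """
[Tuning]
grid_search = True
objectives = ['f1_score_micro']
"""

parser = ConfigParser()
parser.read_string(config_text)
# the value is stored as a string-encoded list and parsed later by SKLL
print(parser.get("Tuning", "objectives"))  # ['f1_score_micro']
```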
92 changes: 39 additions & 53 deletions skll/config.py
@@ -72,8 +72,7 @@ def __init__(self):
'models': '',
'num_cv_folds': '10',
'metrics': "[]",
'objectives': "['f1_score_micro']",
'objective': "f1_score_micro",
'objectives': "[]",
'param_grids': '[]',
'pos_label_str': '',
'predictions': '',
@@ -115,7 +114,6 @@ def __init__(self):
'models': 'Output',
'num_cv_folds': 'Input',
'objectives': 'Tuning',
'objective': 'Tuning',
'param_grids': 'Tuning',
'pos_label_str': 'Tuning',
'predictions': 'Output',
@@ -293,10 +291,6 @@ def _setup_config_parser(config_path, validate=True):
------
IOError
If the configuration file does not exist.
ValueError
If the configuration file specifies both objective and objectives.
TypeError
If any objective is not a string.
"""
# initialize config parser with the given defaults
config = SKLLConfigParser()
@@ -307,31 +301,6 @@ def _setup_config_parser(config_path, validate=True):
config_path)
config.read(config_path)

# normalize objective to objectives
objective_value = config.get('Tuning', 'objective')
objectives_value = config.get('Tuning', 'objectives')
objective_default = config._defaults['objective']
objectives_default = config._defaults['objectives']

# if both of them are non default, raise error
if (objectives_value != objectives_default and objective_value != objective_default):
raise ValueError("The configuration file can specify "
"either 'objective' or 'objectives', "
"not both")
else:
# if objective is default value, delete it
if objective_value == objective_default:
config.remove_option('Tuning', 'objective')
else:
# else convert objective into objectives and delete objective
objective_value = yaml.safe_load(_fix_json(objective_value), )
if isinstance(objective_value, string_types):
config.set(
'Tuning', 'objectives', "['{}']".format(objective_value))
config.remove_option('Tuning', 'objective')
else:
raise TypeError("objective should be a string")

if validate:
config.validate()

@@ -390,7 +359,7 @@ def _parse_config_file(config_path, log_level=logging.INFO):
do_grid_search : bool
Whether to perform grid search.
grid_objectives : list of str
A list of objects functions to use for tuning.
A list of scoring functions to use for tuning.
probability : bool
Whether to output probabilities for each class.
results_path : str
@@ -471,7 +440,9 @@ def _parse_config_file(config_path, log_level=logging.INFO):

# extract parameters from the various sections in the config file

# 1. General
######################
# 1. General section #
######################
if config.has_option("General", "experiment_name"):
experiment_name = config.get("General", "experiment_name")
else:
@@ -489,7 +460,7 @@ def _parse_config_file(config_path, log_level=logging.INFO):
os.makedirs(log_path)

# Create a top-level log file under the log path
main_log_file =join(log_path, '{}.log'.format(experiment_name))
main_log_file = join(log_path, '{}.log'.format(experiment_name))

# Now create a SKLL logger that will log to this file as well
# as to the console. Use the log level provided - note that
@@ -508,7 +479,9 @@ def _parse_config_file(config_path, log_level=logging.INFO):
raise ValueError('An invalid task was specified: {}. Valid tasks are:'
' {}'.format(task, ', '.join(_VALID_TASKS)))

# 2. Input
####################
# 2. Input section #
####################
sampler = config.get("Input", "sampler")
if sampler not in _VALID_SAMPLERS:
raise ValueError('An invalid sampler was specified: {}. Valid '
@@ -585,8 +558,8 @@ def _parse_config_file(config_path, log_level=logging.INFO):
learning_curve_cv_folds_list = [10] * len(learners)
else:
if (not isinstance(learning_curve_cv_folds_list, list) or
not all([isinstance(fold, int) for fold in learning_curve_cv_folds_list]) or
not len(learning_curve_cv_folds_list) == len(learners)):
not all([isinstance(fold, int) for fold in learning_curve_cv_folds_list]) or
not len(learning_curve_cv_folds_list) == len(learners)):
raise ValueError("The learning_curve_cv_folds parameter should "
"be a list of integers of the same length as "
"the number of learners. You specified: {}"
@@ -603,7 +576,7 @@ def _parse_config_file(config_path, log_level=logging.INFO):
else:
if (not isinstance(learning_curve_train_sizes, list) or
not all([isinstance(size, int) or isinstance(size, float) for size in
learning_curve_train_sizes])):
learning_curve_train_sizes])):
raise ValueError("The learning_curve_train_sizes parameter should "
"be a list of integers or floats. You specified: {}"
.format(learning_curve_train_sizes))
@@ -728,7 +701,9 @@ def _parse_config_file(config_path, log_level=logging.INFO):
else:
class_map = None

# 3. Output
#####################
# 3. Output section #
#####################
probability = config.getboolean("Output", "probability")

# do we want to keep the predictions?
@@ -760,15 +735,24 @@ def _parse_config_file(config_path, log_level=logging.INFO):
# what are the output metrics?
output_metrics = config.get("Output", "metrics")
output_metrics = _parse_and_validate_metrics(output_metrics,
'metrics',
logger=logger)
'metrics',
logger=logger)

#####################
# 4. Tuning section #
#####################

# 4. Tuning
# do we need to run a grid search for the hyperparameters or are we just
# using the defaults?
do_grid_search = config.getboolean("Tuning", "grid_search")

# Check if `param_grids` is specified, but `grid_search` is False
# parse any provided grid objective functions
grid_objectives = config.get("Tuning", "objectives")
grid_objectives = _parse_and_validate_metrics(grid_objectives,
'objectives',
logger=logger)

# Check if `param_grids` is specified, but `do_grid_search` is False
if param_grid_list and not do_grid_search:
logger.warning('Since "grid_search" is set to False, the specified'
' "param_grids" will be ignored.')
@@ -800,16 +784,17 @@ def _parse_config_file(config_path, log_level=logging.INFO):
# how many folds should we run in parallel for grid search
grid_search_folds = config.getint("Tuning", "grid_search_folds")

# what are the objective functions for the grid search?
grid_objectives = config.get("Tuning", "objectives")
grid_objectives = _parse_and_validate_metrics(grid_objectives,
'objectives',
logger=logger)

# check whether the right things are set for the given task
if (task == 'evaluate' or task == 'predict') and not test_path:
raise ValueError('The test set must be set when task is evaluate or '
'predict.')
if task in ['cross_validate', 'evaluate', 'train']:
if do_grid_search and len(grid_objectives) == 0:
raise ValueError('You must specify a list of objectives if doing grid search.')
if not do_grid_search and len(grid_objectives) > 0:
logger.warning('Since "grid_search" is set to False, any specified'
' "objectives" will be ignored.')
grid_objectives = []
if task in ['cross_validate', 'train', 'learning_curve'] and test_path:
raise ValueError('The test set should not be set when task is '
'{}.'.format(task))
Expand Down Expand Up @@ -846,7 +831,7 @@ def _parse_config_file(config_path, log_level=logging.INFO):
# some overlaps - we will assume that the user meant to
# use the metric for tuning _and_ evaluation, not just evaluation
if (len(grid_objectives) > 0 and
len(output_metrics) > 0):
len(output_metrics) > 0):
common_metrics_and_objectives = set(grid_objectives).intersection(output_metrics)
if common_metrics_and_objectives:
logger.warning('The following are specified both as '
@@ -990,10 +975,11 @@ def _parse_and_validate_metrics(metrics, option_name, logger=None):
if not logger:
logger = logging.getLogger(__name__)

# what are the objective functions for the grid search?
# make sure the given metrics data type is a list
# and parse it correctly
metrics = yaml.safe_load(_fix_json(metrics))
if not isinstance(metrics, list):
raise TypeError("{} should be a list".format(option_name))
raise TypeError("{} should be a list, not a {}.".format(option_name, type(metrics)))

# `mean_squared_error` should be replaced with `neg_mean_squared_error`
if 'mean_squared_error' in metrics:
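
The `_parse_and_validate_metrics` helper shown above is what both `metrics` and `objectives` now flow through. A rough sketch of that pattern follows, assuming only that the option value arrives as a string-encoded list; the real helper also runs SKLL's internal `_fix_json` on the raw string before `yaml.safe_load`, and the exact warning wording here is illustrative.

```python
import logging
import yaml  # PyYAML

def parse_list_option(raw_value, option_name, logger=None):
    """Parse a string-encoded list option, failing loudly on any other type."""
    logger = logger or logging.getLogger(__name__)
    parsed = yaml.safe_load(raw_value)
    if not isinstance(parsed, list):
        # mirrors the improved error message in the diff, which now reports
        # the offending type along with the option name
        raise TypeError("{} should be a list, not a {}.".format(option_name,
                                                                type(parsed)))
    # deprecated metric name rewritten, as noted in the helper's comment
    if 'mean_squared_error' in parsed:
        logger.warning("'mean_squared_error' is deprecated; "
                       "using 'neg_mean_squared_error' instead.")
        parsed = ['neg_mean_squared_error' if m == 'mean_squared_error' else m
                  for m in parsed]
    return parsed

# e.g. parse_list_option("['f1_score_micro']", "objectives") -> ['f1_score_micro']
```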
33 changes: 19 additions & 14 deletions skll/experiments.py
@@ -251,7 +251,7 @@ def _write_learning_curve_file(result_json_paths, output_file):
train_scores_stds_by_size = lrd['learning_curve_train_scores_stds']
test_scores_stds_by_size = lrd['learning_curve_test_scores_stds']

# rename `grid_objective` to `objective` since that can be confusing
# rename `grid_objective` to `metric` since the latter name can be confusing
lrd['metric'] = lrd['grid_objective']

for (size,
@@ -321,8 +321,8 @@ def _print_fancy_output(learner_result_dicts, output_file=sys.stdout):
print('Grid Objective Function: {}'.format(lrd['grid_objective']),
file=output_file)
if (lrd['task'] == 'cross_validate' and
lrd['grid_search'] and
lrd['cv_folds'].endswith('folds file')):
lrd['grid_search'] and
lrd['cv_folds'].endswith('folds file')):
print('Using Folds File for Grid Search: {}'.format(lrd['use_folds_file_for_grid_search']),
file=output_file)
if lrd['task'] in ['evaluate', 'cross_validate'] and lrd['additional_scores']:
@@ -358,7 +358,7 @@ def _print_fancy_output(learner_result_dicts, output_file=sys.stdout):
file=output_file)
print('Pearson = {: f}'.format(lrd['pearson']),
file=output_file)
print('Objective Function Score (Test) = {}'.format(lrd['score']),
print('Objective Function Score (Test) = {}'.format(lrd.get('score', '')),
file=output_file)

# now print the additional metrics, if there were any
@@ -580,8 +580,8 @@ def _classify_featureset(args):
# featureset already exists if so, load it and then use it on test data
modelfile = join(model_path, '{}.model'.format(job_name))
if (task in ['cross_validate', 'learning_curve'] or
not exists(modelfile) or
overwrite):
not exists(modelfile) or
overwrite):
train_examples = _load_featureset(train_path,
featureset,
suffix,
@@ -651,7 +651,6 @@ def _classify_featureset(args):
else:
grid_search_folds_to_print = str(grid_search_folds)


# create a list of dictionaries of the results information
learner_result_dict_base = {'experiment_name': experiment_name,
'train_set_name': train_set_name,
@@ -674,8 +673,7 @@ def _classify_featureset(args):
'grid_search_folds': grid_search_folds_to_print,
'min_feature_count': min_feature_count,
'cv_folds': cv_folds_to_print,
'using_folds_file': isinstance(cv_folds, dict) \
or isinstance(grid_search_folds, dict),
'using_folds_file': isinstance(cv_folds, dict) or isinstance(grid_search_folds, dict),
'save_cv_folds': save_cv_folds,
'use_folds_file_for_grid_search': use_folds_file_for_grid_search,
'stratified_folds': stratified_folds,
@@ -706,9 +704,9 @@ def _classify_featureset(args):
(curve_train_scores,
curve_test_scores,
computed_curve_train_sizes) = learner.learning_curve(train_examples,
grid_objective,
cv_folds=learning_curve_cv_folds,
train_sizes=learning_curve_train_sizes,
metric=grid_objective)
train_sizes=learning_curve_train_sizes)
else:
# if we do not have a saved model, we need to train one.
if not exists(modelfile) or overwrite:
@@ -1034,7 +1032,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
param_grid_list, featureset_names, learners, prediction_dir, log_path, train_path,
test_path, ids_to_floats, class_map, custom_learner_path, learning_curve_cv_folds_list,
learning_curve_train_sizes, output_metrics) = _parse_config_file(config_file,
log_level=log_level)
log_level=log_level)

# get the main experiment logger that will already have been
# created by the configuration parser so we don't need anything
@@ -1131,14 +1129,21 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
if task == 'learning_curve' and len(output_metrics) > 0:
grid_objectives = output_metrics

# if there were no grid objectives provided, just set it to
# a list containing a single None so as to allow the parallelization
# to proceeed and to pass the correct default value of grid_objective
# down to _classify_featureset().
if not grid_objectives:
grid_objectives = [None]

# Run each featureset-learner-objective combination
for featureset, featureset_name in zip(featuresets, featureset_names):
for learner_num, learner_name in enumerate(learners):
for grid_objective in grid_objectives:

# for the individual job name, we need to add the feature set name
# and the learner name
if len(grid_objectives) == 1:
if grid_objective is None or len(grid_objectives) == 1:
job_name_components = [experiment_name, featureset_name,
learner_name]
else:
@@ -1424,4 +1429,4 @@ def _generate_learning_curve_plots(experiment_name,
ncol=1,
frameon=True)
g.fig.tight_layout(w_pad=1)
plt.savefig(join(output_dir,'{}_{}.png'.format(experiment_name, fs_name)), dpi=300);
plt.savefig(join(output_dir, '{}_{}.png'.format(experiment_name, fs_name)), dpi=300);
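
Finally, a compressed, illustrative sketch of the dispatch behaviour that the `run_configuration` hunk above relies on (the function and variable names here are simplified stand-ins, and the real code builds jobs rather than name strings): an empty objectives list is replaced by a single `None` placeholder so the featureset-learner-objective loop still runs, and the objective only becomes part of the job name when more than one real objective is given.

```python
def make_job_names(experiment_name, featureset_names, learners, grid_objectives):
    """Illustrative only: mirrors the loop structure in run_configuration()."""
    # a single None placeholder keeps the triple loop alive when no
    # tuning objectives were specified in the configuration file
    if not grid_objectives:
        grid_objectives = [None]

    job_names = []
    for featureset_name in featureset_names:
        for learner_name in learners:
            for grid_objective in grid_objectives:
                parts = [experiment_name, featureset_name, learner_name]
                # only disambiguate by objective when there are several
                if grid_objective is not None and len(grid_objectives) > 1:
                    parts.append(grid_objective)
                job_names.append('_'.join(parts))
    return job_names

# make_job_names('exp', ['fs1'], ['LogisticRegression'], [])
# -> ['exp_fs1_LogisticRegression']
```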
