Merge branch 'master' into bugfix/print_model_weights_fix
Conflicts:
	.travis.yml
desilinguist committed Jan 4, 2016
2 parents 32d6677 + 3562ded commit 5f487dd
Showing 8 changed files with 200 additions and 39 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -19,8 +19,8 @@ before_install:
- sudo chmod 777 /scratch/
- wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
- chmod +x miniconda.sh
- ./miniconda.sh -b -p /home/travis/miniconda
- export PATH=/home/travis/miniconda/bin:$PATH
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda2/bin:$PATH
- conda config --add channels https://conda.binstar.org/dan_blanchard
- conda update --yes conda
install:
39 changes: 19 additions & 20 deletions skll/config.py
@@ -28,6 +28,8 @@
_VALID_TASKS = frozenset(['predict', 'train', 'evaluate', 'cross_validate'])
_VALID_SAMPLERS = frozenset(['Nystroem', 'RBFSampler', 'SkewedChi2Sampler',
'AdditiveChi2Sampler', ''])
_VALID_FEATURE_SCALING_OPTIONS = frozenset(['with_std', 'with_mean', 'both',
'none'])


class SKLLConfigParser(configparser.ConfigParser):
@@ -70,6 +72,7 @@ def __init__(self):
'results': '',
'sampler': '',
'sampler_parameters': '[]',
'save_cv_folds': 'False',
'shuffle': 'False',
'suffix': '',
'test_directory': '',
@@ -105,6 +108,7 @@ def __init__(self):
'results': 'Output',
'sampler': 'Input',
'sampler_parameters': 'Input',
'save_cv_folds': 'Output',
'shuffle': 'Input',
'suffix': 'Input',
'test_directory': 'Input',
@@ -246,23 +250,23 @@ def _parse_config_file(config_path):
experiment_name = config.get("General", "experiment_name")
else:
raise ValueError("Configuration file does not contain experiment_name "
"in the [Input] section.")
"in the [General] section.")

if config.has_option("General", "task"):
task = config.get("General", "task")
else:
raise ValueError("Configuration file does not contain task in the "
"[Input] section.")
"[General] section.")
if task not in _VALID_TASKS:
raise ValueError('An invalid task was specified: {}. Valid tasks are:'
' {}'.format(task, ', '.join(_VALID_TASKS)))

# 2. Input
sampler = config.get("Input", "sampler")
if sampler not in _VALID_SAMPLERS:
raise ValueError('An invalid sample was specified: {}. Valid samplers'
' are: {}'.format(sampler,
', '.join(_VALID_SAMPLERS)))
raise ValueError('An invalid sampler was specified: {}. Valid '
'samplers are: {}'.format(sampler,
', '.join(_VALID_SAMPLERS)))

# produce warnings if feature_hasher is set but hasher_features
# is less than or equal to zero.
@@ -336,7 +340,7 @@ def _parse_config_file(config_path):
# ensure that feature_scaling is specified only as one of the
# four available choices
feature_scaling = config.get("Input", "feature_scaling")
if feature_scaling not in ['with_std', 'with_mean', 'both', 'none']:
if feature_scaling not in _VALID_FEATURE_SCALING_OPTIONS:
raise ValueError("Invalid value for feature_scaling parameter: {}"
.format(feature_scaling))

@@ -347,22 +351,17 @@ def _parse_config_file(config_path):

# get the cv folds file and make a dictionary from it, if it exists
cv_folds_file = config.get("Input", "cv_folds_file")
num_cv_folds = config.get("Input", "num_cv_folds")
num_cv_folds = config.getint("Input", "num_cv_folds")
if cv_folds_file:
cv_folds_file = _locate_file(cv_folds_file, config_path)
cv_folds = _load_cv_folds(cv_folds_file,
ids_to_floats=ids_to_floats)
else:
# set the number of folds for cross-validation
if num_cv_folds:
try:
cv_folds = int(num_cv_folds)
except ValueError:
raise ValueError("The value for cv_folds should be an integer. "
"You specified {}".format(num_cv_folds))
else:
# default number of cross-validation folds
cv_folds = 10
cv_folds = num_cv_folds if num_cv_folds else 10

# whether or not to save the cv fold ids
save_cv_folds = config.get("Output", "save_cv_folds")

# whether or not to do stratified cross validation
random_folds = config.getboolean("Input", "random_folds")
@@ -528,10 +527,10 @@ def _parse_config_file(config_path):
test_set_name, suffix, featuresets, do_shuffle, model_path,
do_grid_search, grid_objective, probability, results_path,
pos_label_str, feature_scaling, min_feature_count,
grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
fixed_parameter_list, param_grid_list, featureset_names, learners,
prediction_dir, log_path, train_path, test_path, ids_to_floats,
class_map, custom_learner_path)
grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
do_stratified_folds, fixed_parameter_list, param_grid_list,
featureset_names, learners, prediction_dir, log_path, train_path,
test_path, ids_to_floats, class_map, custom_learner_path)


def _munge_featureset_name(featureset):
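
A note on the simplified fold-count handling above: num_cv_folds is now read with getint(), so ConfigParser itself rejects non-integer values and the hand-rolled try/except around int() can go away. A minimal, self-contained sketch of that behaviour (plain configparser with invented values, not SKLL code):

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read_string("[Input]\nnum_cv_folds = 7\n")

# getint() raises ValueError for a non-integer value, which is exactly what
# the removed try/except used to check by hand
num_cv_folds = cfg.getint("Input", "num_cv_folds")

# fall back to the default of 10 folds when no usable count is given
cv_folds = num_cv_folds if num_cv_folds else 10
print(cv_folds)  # -> 7
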
40 changes: 34 additions & 6 deletions skll/experiments.py
@@ -87,6 +87,23 @@ def _get_stat_float(label_result_dict, stat):
else:
return float('nan')

def _write_skll_folds(skll_fold_ids, skll_fold_ids_file):
"""
Function to take a dictionary of id:test-fold-number and
write it to a file
:param skll_fold_ids: the dictionary of id: test-fold-numbers
:param skll_fold_ids_file: the open file handle to write to
:return: None
"""

f = csv.writer(skll_fold_ids_file)
f.writerow(['id', 'cv_test_fold'])
for example_id in skll_fold_ids:
f.writerow([example_id, skll_fold_ids[example_id]])

skll_fold_ids_file.flush()


def _write_summary_file(result_json_paths, output_file, ablation=0):
"""
@@ -323,6 +340,7 @@ def _classify_featureset(args):
grid_search_jobs = args.pop("grid_search_jobs")
grid_search_folds = args.pop("grid_search_folds")
cv_folds = args.pop("cv_folds")
save_cv_folds = args.pop("save_cv_folds")
stratified_folds = args.pop("do_stratified_folds")
label_col = args.pop("label_col")
id_col = args.pop("id_col")
@@ -430,6 +448,7 @@ def _classify_featureset(args):
'grid_search_folds': grid_search_folds,
'min_feature_count': min_feature_count,
'cv_folds': cv_folds,
'save_cv_folds': save_cv_folds,
'stratified_folds': stratified_folds,
'scikit_learn_version': SCIKIT_VERSION}

@@ -438,12 +457,12 @@ def _classify_featureset(args):
task_results = None
if task == 'cross_validate':
print('\tcross-validating', file=log_file)
task_results, grid_scores = learner.cross_validate(
task_results, grid_scores, skll_fold_ids = learner.cross_validate(
train_examples, shuffle=shuffle, stratified=stratified_folds,
prediction_prefix=prediction_prefix, grid_search=grid_search,
grid_search_folds=grid_search_folds, cv_folds=cv_folds,
grid_objective=grid_objective, param_grid=param_grid,
grid_jobs=grid_search_jobs)
grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds)
else:
# if we do not have a saved model, we need to train one.
if not exists(modelfile) or overwrite:
@@ -523,6 +542,14 @@ def _classify_featureset(args):
else:
res = [learner_result_dict_base]

# write out the cv folds if required
if task == 'cross_validate' and save_cv_folds:
skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
with open(join(results_path,skll_fold_ids_file),
file_mode) as output_file:
_write_skll_folds(skll_fold_ids, output_file)

return res


@@ -691,10 +718,10 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
probability, results_path, pos_label_str, feature_scaling,
min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
fixed_parameter_list, param_grid_list, featureset_names, learners,
prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map,
custom_learner_path) = _parse_config_file(config_file)
min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names,
learners, prediction_dir, log_path, train_path, test_path, ids_to_floats,
class_map, custom_learner_path) = _parse_config_file(config_file)

# Check if we have gridmap
if not local and not _HAVE_GRIDMAP:
@@ -831,6 +858,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
job_args["grid_search_jobs"] = grid_search_jobs
job_args["grid_search_folds"] = grid_search_folds
job_args["cv_folds"] = cv_folds
job_args["save_cv_folds"] = save_cv_folds
job_args["do_stratified_folds"] = do_stratified_folds
job_args["label_col"] = label_col
job_args["id_col"] = id_col
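
For reference, a rough stand-alone illustration (not SKLL code; the id-to-fold dictionary is invented) of the CSV layout that _write_skll_folds() produces for the experiment_name + '_skll_fold_ids.csv' file written above:

import csv
import io

# toy mapping of example id -> test-fold number, as built in cross_validate()
skll_fold_ids = {'EXAMPLE_1': '0', 'EXAMPLE_2': '0', 'EXAMPLE_3': '1'}

buf = io.StringIO()
writer = csv.writer(buf, lineterminator='\n')
writer.writerow(['id', 'cv_test_fold'])  # header row matches the diff
for example_id, fold_num in skll_fold_ids.items():
    writer.writerow([example_id, fold_num])

print(buf.getvalue())
# id,cv_test_fold
# EXAMPLE_1,0
# EXAMPLE_2,0
# EXAMPLE_3,1
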
21 changes: 17 additions & 4 deletions skll/learner.py
@@ -1339,7 +1339,7 @@ def _compute_num_folds_from_example_counts(self, cv_folds, labels):
def cross_validate(self, examples, stratified=True, cv_folds=10,
grid_search=False, grid_search_folds=3, grid_jobs=None,
grid_objective='f1_score_micro', prediction_prefix=None,
param_grid=None, shuffle=False):
param_grid=None, shuffle=False, save_cv_folds=False):
"""
Cross-validates a given model on the training examples.
@@ -1376,11 +1376,15 @@ def cross_validate(self, examples, stratified=True, cv_folds=10,
:type prediction_prefix: str
:param shuffle: Shuffle examples before splitting into folds for CV.
:type shuffle: bool
:param save_cv_folds: Whether to save the cv fold ids or not
:type save_cv_folds: bool
:return: The confusion matrix, overall accuracy, per-label PRFs, and
model parameters for each fold in one list, and another list
with the grid search scores for each fold.
:rtype: (list of 4-tuples, list of float)
with the grid search scores for each fold. Also return a
dictionary containing the test-fold number for each id
if save_cv_folds is True, otherwise None.
:rtype: (list of 4-tuples, list of float, dict)
"""
# seed the random number generator so that randomized algorithms are
# replicable
@@ -1430,6 +1434,15 @@ def cross_validate(self, examples, stratified=True, cv_folds=10,
kfold = FilteredLeaveOneLabelOut(fold_labels, cv_folds, examples)
grid_search_folds = cv_folds

# Save the cross-validation fold information, if required
# The format is that the test-fold that each id appears in is stored
skll_fold_ids = None
if save_cv_folds:
skll_fold_ids = {}
for fold_num, (_, test_indices) in enumerate(kfold):
for index in test_indices:
skll_fold_ids[examples.ids[index]] = str(fold_num)

# handle each fold separately and accumulate the predictions and the
# numbers
results = []
@@ -1470,4 +1483,4 @@ def cross_validate(self, examples, stratified=True, cv_folds=10,
append_predictions = True

# return list of results for all folds
return results, grid_search_scores
return results, grid_search_scores, skll_fold_ids
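
To make the new skll_fold_ids bookkeeping concrete, here is a small self-contained sketch of the same loop using scikit-learn's modern KFold API instead of the FilteredLeaveOneLabelOut / fold iterators SKLL uses internally (the ids and fold count are invented):

import numpy as np
from sklearn.model_selection import KFold

ids = np.array(['EX1', 'EX2', 'EX3', 'EX4', 'EX5', 'EX6'])
kfold = KFold(n_splits=3, shuffle=True, random_state=123456789)

# record, for every example id, the number of the fold it was *tested* in,
# mirroring the loop added to cross_validate() above
skll_fold_ids = {}
for fold_num, (_, test_indices) in enumerate(kfold.split(ids)):
    for index in test_indices:
        skll_fold_ids[ids[index]] = str(fold_num)

print(skll_fold_ids)  # e.g. {'EX2': '0', 'EX5': '0', 'EX1': '1', ...}
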
15 changes: 15 additions & 0 deletions tests/configs/test_save_cv_folds.template.cfg
@@ -0,0 +1,15 @@
[General]
experiment_name=test_save_cv_folds
task=cross_validate

[Input]
featuresets=[["test_save_cv_folds"]]
learners=["LogisticRegression"]
suffix=.jsonlines

[Tuning]
grid_search=False

[Output]
save_cv_folds=True
results=output
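
Assuming the template above is filled in with the remaining required paths (train_directory and friends), running it through the Python API should leave the fold-ids file alongside the other results; the name follows the experiment_name + '_skll_fold_ids.csv' pattern from _classify_featureset(). A hedged sketch:

from os.path import join
from skll.experiments import run_configuration

# 'test_save_cv_folds.cfg' stands in for the completed configuration file
run_configuration('test_save_cv_folds.cfg', local=True)

# with results=output and experiment_name=test_save_cv_folds as in the template,
# the per-example fold assignments should end up here
fold_ids_csv = join('output', 'test_save_cv_folds_skll_fold_ids.csv')
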
