Merge branch 'master' into bugfix/print_model_weights_fix
Conflicts:
	.travis.yml
desilinguist committed Jan 4, 2016
2 parents 32d6677 + 3562ded commit 5f487dd
Showing 8 changed files with 200 additions and 39 deletions.
4 changes: 2 additions & 2 deletions .travis.yml
@@ -19,8 +19,8 @@ before_install:
- sudo chmod 777 /scratch/
- wget http://repo.continuum.io/miniconda/Miniconda-latest-Linux-x86_64.sh -O miniconda.sh
- chmod +x miniconda.sh
- ./miniconda.sh -b -p /home/travis/miniconda
- export PATH=/home/travis/miniconda/bin:$PATH
- ./miniconda.sh -b
- export PATH=/home/travis/miniconda2/bin:$PATH
- conda config --add channels https://conda.binstar.org/dan_blanchard
- conda update --yes conda
install:
39 changes: 19 additions & 20 deletions skll/config.py
@@ -28,6 +28,8 @@
_VALID_TASKS = frozenset(['predict', 'train', 'evaluate', 'cross_validate'])
_VALID_SAMPLERS = frozenset(['Nystroem', 'RBFSampler', 'SkewedChi2Sampler',
'AdditiveChi2Sampler', ''])
_VALID_FEATURE_SCALING_OPTIONS = frozenset(['with_std', 'with_mean', 'both',
'none'])


class SKLLConfigParser(configparser.ConfigParser):
@@ -70,6 +72,7 @@ def __init__(self):
'results': '',
'sampler': '',
'sampler_parameters': '[]',
'save_cv_folds': 'False',
'shuffle': 'False',
'suffix': '',
'test_directory': '',
@@ -105,6 +108,7 @@ def __init__(self):
'results': 'Output',
'sampler': 'Input',
'sampler_parameters': 'Input',
'save_cv_folds': 'Output',
'shuffle': 'Input',
'suffix': 'Input',
'test_directory': 'Input',
@@ -246,23 +250,23 @@ def _parse_config_file(config_path):
experiment_name = config.get("General", "experiment_name")
else:
raise ValueError("Configuration file does not contain experiment_name "
"in the [Input] section.")
"in the [General] section.")

if config.has_option("General", "task"):
task = config.get("General", "task")
else:
raise ValueError("Configuration file does not contain task in the "
"[Input] section.")
"[General] section.")
if task not in _VALID_TASKS:
raise ValueError('An invalid task was specified: {}. Valid tasks are:'
' {}'.format(task, ', '.join(_VALID_TASKS)))

# 2. Input
sampler = config.get("Input", "sampler")
if sampler not in _VALID_SAMPLERS:
raise ValueError('An invalid sample was specified: {}. Valid samplers'
' are: {}'.format(sampler,
', '.join(_VALID_SAMPLERS)))
raise ValueError('An invalid sampler was specified: {}. Valid '
'samplers are: {}'.format(sampler,
', '.join(_VALID_SAMPLERS)))

# produce warnings if feature_hasher is set but hasher_features
# is less than or equal to zero.
@@ -336,7 +340,7 @@ def _parse_config_file(config_path):
# ensure that feature_scaling is specified only as one of the
# four available choices
feature_scaling = config.get("Input", "feature_scaling")
if feature_scaling not in ['with_std', 'with_mean', 'both', 'none']:
if feature_scaling not in _VALID_FEATURE_SCALING_OPTIONS:
raise ValueError("Invalid value for feature_scaling parameter: {}"
.format(feature_scaling))

@@ -347,22 +351,17 @@ def _parse_config_file(config_path):

# get the cv folds file and make a dictionary from it, if it exists
cv_folds_file = config.get("Input", "cv_folds_file")
num_cv_folds = config.get("Input", "num_cv_folds")
num_cv_folds = config.getint("Input", "num_cv_folds")
if cv_folds_file:
cv_folds_file = _locate_file(cv_folds_file, config_path)
cv_folds = _load_cv_folds(cv_folds_file,
ids_to_floats=ids_to_floats)
else:
# set the number of folds for cross-validation
if num_cv_folds:
try:
cv_folds = int(num_cv_folds)
except ValueError:
raise ValueError("The value for cv_folds should be an integer. "
"You specified {}".format(num_cv_folds))
else:
# default number of cross-validation folds
cv_folds = 10
cv_folds = num_cv_folds if num_cv_folds else 10

# whether or not to save the cv fold ids
save_cv_folds = config.get("Output", "save_cv_folds")

# whether or not to do stratified cross validation
random_folds = config.getboolean("Input", "random_folds")
@@ -528,10 +527,10 @@ def _parse_config_file(config_path):
test_set_name, suffix, featuresets, do_shuffle, model_path,
do_grid_search, grid_objective, probability, results_path,
pos_label_str, feature_scaling, min_feature_count,
grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
fixed_parameter_list, param_grid_list, featureset_names, learners,
prediction_dir, log_path, train_path, test_path, ids_to_floats,
class_map, custom_learner_path)
grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
do_stratified_folds, fixed_parameter_list, param_grid_list,
featureset_names, learners, prediction_dir, log_path, train_path,
test_path, ids_to_floats, class_map, custom_learner_path)


def _munge_featureset_name(featureset):
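
A note on the simplified fold-count handling above: num_cv_folds is now read with getint(), so ConfigParser itself rejects non-integer values and the hand-rolled try/except around int() can go away. A minimal, self-contained sketch of that behaviour (plain configparser with invented values, not SKLL code):

from configparser import ConfigParser

cfg = ConfigParser()
cfg.read_string("[Input]\nnum_cv_folds = 7\n")

# getint() raises ValueError for a non-integer value, which is exactly what
# the removed try/except used to check by hand
num_cv_folds = cfg.getint("Input", "num_cv_folds")

# fall back to the default of 10 folds when no usable count is given
cv_folds = num_cv_folds if num_cv_folds else 10
print(cv_folds)  # -> 7
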
40 changes: 34 additions & 6 deletions skll/experiments.py
@@ -87,6 +87,23 @@ def _get_stat_float(label_result_dict, stat):
else:
return float('nan')

def _write_skll_folds(skll_fold_ids, skll_fold_ids_file):
"""
Function to take a dictionary of id:test-fold-number and
write it to a file
:param skll_fold_ids: the dictionary of id: test-fold-numbers
:param skll_fold_ids_file: the open file handle to write to
:return: None
"""

f = csv.writer(skll_fold_ids_file)
f.writerow(['id', 'cv_test_fold'])
for example_id in skll_fold_ids:
f.writerow([example_id, skll_fold_ids[example_id]])

skll_fold_ids_file.flush()


def _write_summary_file(result_json_paths, output_file, ablation=0):
"""
@@ -323,6 +340,7 @@ def _classify_featureset(args):
grid_search_jobs = args.pop("grid_search_jobs")
grid_search_folds = args.pop("grid_search_folds")
cv_folds = args.pop("cv_folds")
save_cv_folds = args.pop("save_cv_folds")
stratified_folds = args.pop("do_stratified_folds")
label_col = args.pop("label_col")
id_col = args.pop("id_col")
@@ -430,6 +448,7 @@ def _classify_featureset(args):
'grid_search_folds': grid_search_folds,
'min_feature_count': min_feature_count,
'cv_folds': cv_folds,
'save_cv_folds': save_cv_folds,
'stratified_folds': stratified_folds,
'scikit_learn_version': SCIKIT_VERSION}

@@ -438,12 +457,12 @@ def _classify_featureset(args):
task_results = None
if task == 'cross_validate':
print('\tcross-validating', file=log_file)
task_results, grid_scores = learner.cross_validate(
task_results, grid_scores, skll_fold_ids = learner.cross_validate(
train_examples, shuffle=shuffle, stratified=stratified_folds,
prediction_prefix=prediction_prefix, grid_search=grid_search,
grid_search_folds=grid_search_folds, cv_folds=cv_folds,
grid_objective=grid_objective, param_grid=param_grid,
grid_jobs=grid_search_jobs)
grid_jobs=grid_search_jobs, save_cv_folds=save_cv_folds)
else:
# if we do not have a saved model, we need to train one.
if not exists(modelfile) or overwrite:
@@ -523,6 +542,14 @@ def _classify_featureset(args):
else:
res = [learner_result_dict_base]

# write out the cv folds if required
if task == 'cross_validate' and save_cv_folds:
skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
with open(join(results_path,skll_fold_ids_file),
file_mode) as output_file:
_write_skll_folds(skll_fold_ids, output_file)

return res


@@ -691,10 +718,10 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
hasher_features, id_col, label_col, train_set_name, test_set_name, suffix,
featuresets, do_shuffle, model_path, do_grid_search, grid_objective,
probability, results_path, pos_label_str, feature_scaling,
min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, do_stratified_folds,
fixed_parameter_list, param_grid_list, featureset_names, learners,
prediction_dir, log_path, train_path, test_path, ids_to_floats, class_map,
custom_learner_path) = _parse_config_file(config_file)
min_feature_count, grid_search_jobs, grid_search_folds, cv_folds, save_cv_folds,
do_stratified_folds, fixed_parameter_list, param_grid_list, featureset_names,
learners, prediction_dir, log_path, train_path, test_path, ids_to_floats,
class_map, custom_learner_path) = _parse_config_file(config_file)

# Check if we have gridmap
if not local and not _HAVE_GRIDMAP:
@@ -831,6 +858,7 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
job_args["grid_search_jobs"] = grid_search_jobs
job_args["grid_search_folds"] = grid_search_folds
job_args["cv_folds"] = cv_folds
job_args["save_cv_folds"] = save_cv_folds
job_args["do_stratified_folds"] = do_stratified_folds
job_args["label_col"] = label_col
job_args["id_col"] = id_col
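
For reference, a rough stand-alone illustration (not SKLL code; the id-to-fold dictionary is invented) of the CSV layout that _write_skll_folds() produces for the experiment_name + '_skll_fold_ids.csv' file written above:

import csv
import io

# toy mapping of example id -> test-fold number, as built in cross_validate()
skll_fold_ids = {'EXAMPLE_1': '0', 'EXAMPLE_2': '0', 'EXAMPLE_3': '1'}

buf = io.StringIO()
writer = csv.writer(buf, lineterminator='\n')
writer.writerow(['id', 'cv_test_fold'])  # header row matches the diff
for example_id, fold_num in skll_fold_ids.items():
    writer.writerow([example_id, fold_num])

print(buf.getvalue())
# id,cv_test_fold
# EXAMPLE_1,0
# EXAMPLE_2,0
# EXAMPLE_3,1
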
21 changes: 17 additions & 4 deletions skll/learner.py
@@ -1339,7 +1339,7 @@ def _compute_num_folds_from_example_counts(self, cv_folds, labels):
def cross_validate(self, examples, stratified=True, cv_folds=10,
grid_search=False, grid_search_folds=3, grid_jobs=None,
grid_objective='f1_score_micro', prediction_prefix=None,
param_grid=None, shuffle=False):
param_grid=None, shuffle=False, save_cv_folds=False):
"""
Cross-validates a given model on the training examples.
@@ -1376,11 +1376,15 @@ def cross_validate(self, examples, stratified=True, cv_folds=10,
:type prediction_prefix: str
:param shuffle: Shuffle examples before splitting into folds for CV.
:type shuffle: bool
:param save_cv_folds: Whether to save the cv fold ids or not
:type save_cv_folds: bool
:return: The confusion matrix, overall accuracy, per-label PRFs, and
model parameters for each fold in one list, and another list
with the grid search scores for each fold.
:rtype: (list of 4-tuples, list of float)
with the grid search scores for each fold. Also return a
dictionary containing the test-fold number for each id
if save_cv_folds is True, otherwise None.
:rtype: (list of 4-tuples, list of float, dict)
"""
# seed the random number generator so that randomized algorithms are
# replicable
@@ -1430,6 +1434,15 @@ def cross_validate(self, examples, stratified=True, cv_folds=10,
kfold = FilteredLeaveOneLabelOut(fold_labels, cv_folds, examples)
grid_search_folds = cv_folds

# Save the cross-validation fold information, if required
# The format is that the test-fold that each id appears in is stored
skll_fold_ids = None
if save_cv_folds:
skll_fold_ids = {}
for fold_num, (_, test_indices) in enumerate(kfold):
for index in test_indices:
skll_fold_ids[examples.ids[index]] = str(fold_num)

# handle each fold separately and accumulate the predictions and the
# numbers
results = []
@@ -1470,4 +1483,4 @@ def cross_validate(self, examples, stratified=True, cv_folds=10,
append_predictions = True

# return list of results for all folds
return results, grid_search_scores
return results, grid_search_scores, skll_fold_ids
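
To make the new skll_fold_ids bookkeeping concrete, here is a small self-contained sketch of the same loop using scikit-learn's modern KFold API instead of the FilteredLeaveOneLabelOut / fold iterators SKLL uses internally (the ids and fold count are invented):

import numpy as np
from sklearn.model_selection import KFold

ids = np.array(['EX1', 'EX2', 'EX3', 'EX4', 'EX5', 'EX6'])
kfold = KFold(n_splits=3, shuffle=True, random_state=123456789)

# record, for every example id, the number of the fold it was *tested* in,
# mirroring the loop added to cross_validate() above
skll_fold_ids = {}
for fold_num, (_, test_indices) in enumerate(kfold.split(ids)):
    for index in test_indices:
        skll_fold_ids[ids[index]] = str(fold_num)

print(skll_fold_ids)  # e.g. {'EX2': '0', 'EX5': '0', 'EX1': '1', ...}
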
15 changes: 15 additions & 0 deletions tests/configs/test_save_cv_folds.template.cfg
@@ -0,0 +1,15 @@
[General]
experiment_name=test_save_cv_folds
task=cross_validate

[Input]
featuresets=[["test_save_cv_folds"]]
learners=["LogisticRegression"]
suffix=.jsonlines

[Tuning]
grid_search=False

[Output]
save_cv_folds=True
results=output
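
Assuming the template above is filled in with the remaining required paths (train_directory and friends), running it through the Python API should leave the fold-ids file alongside the other results; the name follows the experiment_name + '_skll_fold_ids.csv' pattern from _classify_featureset(). A hedged sketch:

from os.path import join
from skll.experiments import run_configuration

# 'test_save_cv_folds.cfg' stands in for the completed configuration file
run_configuration('test_save_cv_folds.cfg', local=True)

# with results=output and experiment_name=test_save_cv_folds as in the template,
# the per-example fold assignments should end up here
fold_ids_csv = join('output', 'test_save_cv_folds_skll_fold_ids.csv')
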
