Merge pull request #510 from EducationalTestingService/further-clean-up-python27-seaborn

More Python 2.7 and Seaborn cleanups
desilinguist committed Sep 18, 2019
2 parents 29b71ae + 890c997 commit d913c7b
Showing 17 changed files with 70 additions and 107 deletions.
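Most of the deletions in this commit remove two Python 2.7 compatibility idioms that are redundant on Python 3. Under Python 2, the builtin open() returned byte streams, so modules imported the text-mode, encoding-aware open() from io; on Python 3 the builtin open() already is io.open. Similarly, json.dump() produced bytes on Python 2 but text on Python 3, so output files had to be opened in a version-dependent mode. A minimal sketch of the old and new patterns (the path and data here are invented for illustration):

import json

# Old Python 2/3-compatible pattern, deleted throughout this commit:
#     from io import open
#     file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
#     with open(results_json_path, file_mode) as json_file:
#         json.dump(res, json_file)

# New Python 3-only pattern: the builtin open() in plain text mode.
results_json_path = 'example_results.json'  # invented path
with open(results_json_path, 'w') as json_file:
    json.dump({'accuracy': 0.9}, json_file)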
5 changes: 2 additions & 3 deletions skll/config.py
@@ -13,7 +13,6 @@
 import itertools
 import logging
 import os
-from io import open
 from os.path import (basename, dirname, exists,
                      isabs, join, normpath, realpath)

@@ -543,8 +542,8 @@ def _parse_config_file(config_path, log_level=logging.INFO):
 
     # ensure that featureset_names is a list of strings, if specified
     if featureset_names:
-        if (not isinstance(featureset_names, list)
-                or not all([isinstance(fs, str) for fs in
+        if (not isinstance(featureset_names, list) or
+                not all([isinstance(fs, str) for fs in
                             featureset_names])):
             raise ValueError("The featureset_names parameter should be a list "
                              "of strings. You specified: {}"
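The second hunk above is one instance of a purely stylistic reflow that recurs throughout this commit: multi-line expressions are rewrapped so that the binary operator ends a line instead of beginning the next one. A small sketch of the two wrapping styles, using a hypothetical helper modeled on the check above:

def is_string_list(featureset_names):
    # Old wrapping, with the operator opening the continuation line:
    #     return (isinstance(featureset_names, list)
    #             and all(isinstance(fs, str) for fs in featureset_names))
    # New wrapping used in this commit, with the operator closing the line:
    return (isinstance(featureset_names, list) and
            all(isinstance(fs, str) for fs in featureset_names))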
2 changes: 1 addition & 1 deletion skll/data/readers.py
@@ -38,7 +38,7 @@
 import re
 import sys
 from itertools import islice
-from io import open, StringIO
+from io import StringIO
 
 import numpy as np
 import pandas as pd
1 change: 0 additions & 1 deletion skll/data/writers.py
@@ -14,7 +14,6 @@
 import sys
 import pandas as pd
 from decimal import Decimal
-from io import open
 
 from sklearn.feature_extraction import FeatureHasher

90 changes: 37 additions & 53 deletions skll/experiments.py
@@ -15,12 +15,14 @@
 import math
 import sys
 
+import matplotlib
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import ruamel.yaml as yaml
+import seaborn as sns
 
 from collections import defaultdict
-from io import open
 from itertools import combinations
 from os.path import exists, isfile, join, getsize

@@ -42,16 +44,9 @@
 else:
     _HAVE_GRIDMAP = True
 
-# Check if seaborn (and matplotlib) are available
-try:
-    import matplotlib
-    import seaborn as sns
-except ImportError:
-    _HAVE_SEABORN = False
-else:
-    import matplotlib.pyplot as plt
-    plt.ioff()
-    _HAVE_SEABORN = True
+# Turn off interactive plotting for matplotlib
+plt.ioff()
 
 
 _VALID_TASKS = frozenset(['predict', 'train', 'evaluate', 'cross_validate'])
 _VALID_SAMPLERS = frozenset(['Nystroem', 'RBFSampler', 'SkewedChi2Sampler',
@@ -308,9 +303,9 @@ def _print_fancy_output(learner_result_dicts, output_file=sys.stdout):
               file=output_file)
         print('Grid Objective Function: {}'.format(lrd['grid_objective']),
               file=output_file)
-        if (lrd['task'] == 'cross_validate'
-                and lrd['grid_search']
-                and lrd['cv_folds'].endswith('folds file')):
+        if (lrd['task'] == 'cross_validate' and
+                lrd['grid_search'] and
+                lrd['cv_folds'].endswith('folds file')):
             print('Using Folds File for Grid Search: {}'.format(lrd['use_folds_file_for_grid_search']),
                   file=output_file)
         if lrd['task'] in ['evaluate', 'cross_validate'] and lrd['additional_scores']:
@@ -572,9 +567,9 @@ def _classify_featureset(args):
         # check whether a trained model on the same data with the same
         # featureset already exists if so, load it and then use it on test data
         modelfile = join(model_path, '{}.model'.format(job_name))
-        if (task in ['cross_validate', 'learning_curve']
-                or not exists(modelfile)
-                or overwrite):
+        if (task in ['cross_validate', 'learning_curve'] or
+                not exists(modelfile) or
+                overwrite):
             train_examples = _load_featureset(train_path,
                                               featureset,
                                               suffix,
@@ -766,8 +761,7 @@ def _classify_featureset(args):
                                           learner_result_dict_base)
 
             # write out the result dictionary to a json file
-            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-            with open(results_json_path, file_mode) as json_file:
+            with open(results_json_path, 'w') as json_file:
                 json.dump(res, json_file, cls=NumpyTypeEncoder)
 
             with open(join(results_path,
@@ -793,8 +787,7 @@ def _classify_featureset(args):
             res = [res]
 
             # write out the result dictionary to a json file
-            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-            with open(results_json_path, file_mode) as json_file:
+            with open(results_json_path, 'w') as json_file:
                 json.dump(res, json_file, cls=NumpyTypeEncoder)
 
         # For all other tasks, i.e. train or predict
@@ -810,17 +803,15 @@ def _classify_featureset(args):
                 grid_search_cv_results_dicts[0]
             grid_search_cv_results_dict.update(learner_result_dict_base)
             # write out the result dictionary to a json file
-            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-            with open(results_json_path, file_mode) as json_file:
+            with open(results_json_path, 'w') as json_file:
                 json.dump(grid_search_cv_results_dict, json_file, cls=NumpyTypeEncoder)
             res = [learner_result_dict_base]
 
     # write out the cv folds if required
     if task == 'cross_validate' and save_cv_folds:
         skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
-        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
         with open(join(results_path, skll_fold_ids_file),
-                  file_mode) as output_file:
+                  'w') as output_file:
             _write_skll_folds(skll_fold_ids, output_file)
 
     return res
@@ -917,8 +908,8 @@ def _create_learner_result_dicts(task_results,
             recall_sum_dict[actual_label] += float(label_recall)
         if not math.isnan(label_f):
             f_sum_dict[actual_label] += float(label_f)
-        result_row = ([actual_label] + conf_matrix[i]
-                      + [label_prec, label_recall, label_f])
+        result_row = ([actual_label] + conf_matrix[i] +
+                      [label_prec, label_recall, label_f])
         rows.append(result_row)
 
     result_table = tabulate(rows,
@@ -1099,19 +1090,19 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
         if ablation is None:
             for i in range(1, len(features)):
                 for excluded_features in combinations(features, i):
-                    expanded_fs.append(sorted(featureset
-                                              - set(excluded_features)))
-                    expanded_fs_names.append(featureset_name
-                                             + '_minus_'
-                                             + _munge_featureset_name(excluded_features))
+                    expanded_fs.append(sorted(featureset -
+                                              set(excluded_features)))
+                    expanded_fs_names.append(featureset_name +
+                                             '_minus_' +
+                                             _munge_featureset_name(excluded_features))
         # Otherwise, just expand removing the specified number at a time
         else:
             for excluded_features in combinations(features, ablation):
                 expanded_fs.append(sorted(featureset -
                                           set(excluded_features)))
-                expanded_fs_names.append(featureset_name
-                                         + '_minus_'
-                                         + _munge_featureset_name(excluded_features))
+                expanded_fs_names.append(featureset_name +
+                                         '_minus_' +
+                                         _munge_featureset_name(excluded_features))
         # Also add version with nothing removed as baseline
         expanded_fs.append(features)
         expanded_fs_names.append(featureset_name + '_all')
@@ -1269,27 +1260,20 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
     # write out the summary results file
     if (task == 'cross_validate' or task == 'evaluate') and write_summary:
         summary_file_name = experiment_name + '_summary.tsv'
-        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-        with open(join(results_path, summary_file_name), file_mode) as output_file:
+        with open(join(results_path, summary_file_name), 'w') as output_file:
             _write_summary_file(result_json_paths,
                                 output_file,
                                 ablation=ablation)
     elif task == 'learning_curve':
         output_file_name = experiment_name + '_summary.tsv'
-        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
         output_file_path = join(results_path, output_file_name)
-        with open(output_file_path, file_mode) as output_file:
+        with open(output_file_path, 'w') as output_file:
             _write_learning_curve_file(result_json_paths, output_file)
 
         # generate the actual plot if we have the requirements installed
-        if _HAVE_SEABORN:
-            _generate_learning_curve_plots(experiment_name,
-                                           results_path,
-                                           output_file_path)
-        else:
-            logger.warning("Raw data for the learning curve saved in "
-                           "{}. No plots were generated since pandas and "
-                           "seaborn are not installed. ".format(output_file_path))
+        _generate_learning_curve_plots(experiment_name,
+                                       results_path,
+                                       output_file_path)
 
     return result_json_paths

@@ -1421,12 +1405,12 @@ def _generate_learning_curve_plots(experiment_name,
         for j, col_name in enumerate(g.col_names):
             ax = g.axes[i][j]
             ax.set(ylim=ylimits[row_name])
-            df_ax_train = df_fs[(df_fs['learner_name'] == col_name)
-                                & (df_fs['metric'] == row_name)
-                                & (df_fs['variable'] == 'train_score_mean')]
-            df_ax_test = df_fs[(df_fs['learner_name'] == col_name)
-                               & (df_fs['metric'] == row_name)
-                               & (df_fs['variable'] == 'test_score_mean')]
+            df_ax_train = df_fs[(df_fs['learner_name'] == col_name) &
+                                (df_fs['metric'] == row_name) &
+                                (df_fs['variable'] == 'train_score_mean')]
+            df_ax_test = df_fs[(df_fs['learner_name'] == col_name) &
+                               (df_fs['metric'] == row_name) &
+                               (df_fs['variable'] == 'test_score_mean')]
             ax.fill_between(list(range(len(df_ax_train))),
                             df_ax_train['value'] - df_ax_train['train_score_std'],
                             df_ax_train['value'] + df_ax_train['train_score_std'],
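With matplotlib and seaborn now imported unconditionally, experiments.py calls plt.ioff() once at module level, and the _HAVE_SEABORN guard around _generate_learning_curve_plots() goes away. plt.ioff() disables matplotlib's interactive mode, so figures are never shown on screen and only exist until they are explicitly saved, which suits batch generation of learning-curve plots. A minimal sketch of that batch-plotting pattern (the data points and output path are invented):

import matplotlib.pyplot as plt

plt.ioff()  # no GUI windows; figures stay in memory until saved

fig, ax = plt.subplots()
ax.plot([10, 100, 1000], [0.62, 0.71, 0.78], marker='o', label='test score')
ax.set(xlabel='number of training examples', ylabel='accuracy')
ax.legend()
fig.savefig('learning_curve.png')  # invented output path
plt.close(fig)  # release the figure, since nothing will display it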
6 changes: 0 additions & 6 deletions skll/utilities/plot_learning_curves.py
@@ -22,7 +22,6 @@
 from os import makedirs
 from os.path import basename, exists
 
-from skll.experiments import _HAVE_SEABORN
 from skll.experiments import _generate_learning_curve_plots
 from skll.version import __version__

@@ -65,11 +64,6 @@ def main(argv=None):
     if not exists(args.output_dir):
         makedirs(args.output_dir)
 
-    # check that we have pandas and seaborn available
-    if not _HAVE_SEABORN:
-        logging.error("Error: need seaborn to generate learning curve plots.")
-        sys.exit(1)
-
     # get the experiment name from the learning curve TSV file
     # output_file_name = experiment_name + '_summary.tsv'
     experiment_name = basename(args.tsv_file).rstrip('_summary.tsv')
5 changes: 1 addition & 4 deletions skll/utilities/summarize_results.py
@@ -10,8 +10,6 @@
 
 import argparse
 import logging
-import sys
-from io import open
 
 from skll.experiments import _write_summary_file
 from skll.version import __version__
@@ -50,8 +48,7 @@ def main(argv=None):
     logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                 '%(message)s'))
 
-    file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-    with open(args.summary_file, file_mode) as output_file:
+    with open(args.summary_file, 'w') as output_file:
         _write_summary_file(args.json_file, output_file,
                             ablation=int(args.ablation))

Binary file removed tests/other/test_load_saved_model.2.model
File renamed without changes.
1 change: 0 additions & 1 deletion tests/test_ablation.py
@@ -13,7 +13,6 @@
 import os
 
 from glob import glob
-from io import open
 from os.path import abspath, dirname, exists, join
 
 from nose.tools import eq_
6 changes: 2 additions & 4 deletions tests/test_classification.py
@@ -14,10 +14,8 @@
 import json
 import os
 import re
-import sys
 import warnings
 
-from io import open
 from os.path import abspath, dirname, exists, join
 
 import numpy as np
@@ -346,7 +344,7 @@ def test_dummy_classifier_predict():
                          {"strategy": "constant", "constant": 1}],
                         [np.array([0, 0, 0, 1, 0, 1, 1, 0, 0, 0]),
                          np.zeros(10),
-                         np.ones(10)*1]):
+                         np.ones(10) * 1]):
         yield check_dummy_classifier_predict, model_args, train_labels, expected_output


@@ -755,7 +753,7 @@ def test_learner_api_load_into_existing_instance():
 
     # now use `load()` to replace the existing instance with a
    # different saved learner
-    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.{}.model'.format(sys.version_info[0]))
+    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.model')
     learner1.load(other_model_file)
 
     # now load the saved model into another instance using the class method
1 change: 0 additions & 1 deletion tests/test_custom_learner.py
@@ -12,7 +12,6 @@
 import csv
 import os
 from glob import glob
-from io import open
 from os.path import abspath, dirname, exists, join
 
 import numpy as np
10 changes: 5 additions & 5 deletions tests/test_cv.py
@@ -11,16 +11,16 @@
 
 import csv
 import itertools
-from io import open
-import os
-from os.path import abspath, dirname, join, exists
 import json
-from glob import glob
+import os
 import re
 
+from glob import glob
+from os.path import abspath, dirname, join, exists
+
 import numpy as np
-from nose.tools import eq_, raises
 
+from nose.tools import eq_, raises
 from sklearn.feature_extraction import FeatureHasher
 from sklearn.datasets.samples_generator import make_classification
 from sklearn.utils.testing import (assert_greater,
7 changes: 3 additions & 4 deletions tests/test_featureset.py
@@ -638,8 +638,8 @@ def test_feature_merging_order_invariance():
     merged_fs_shuf = train_fs1 + train_fs2_shuf
 
     # check that the two merged versions are the same
-    feature_names = (train_fs1.vectorizer.get_feature_names()
-                     + train_fs2.vectorizer.get_feature_names())
+    feature_names = (train_fs1.vectorizer.get_feature_names() +
+                     train_fs2.vectorizer.get_feature_names())
     assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names)
     assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                        feature_names)
@@ -659,8 +659,7 @@ def test_feature_merging_order_invariance():
     assert_array_equal(merged_fs.features.todense(),
                        merged_fs_shuf.features.todense())
 
-    assert not np.all(merged_fs.features[:, 0:2].todense()
-                      == merged_fs.features[:, 2:4].todense())
+    assert not np.all(merged_fs.features[:, 0:2].todense() == merged_fs.features[:, 2:4].todense())
 
 
 # Tests related to loading featuresets and merging them
