Merge pull request #510 from EducationalTestingService/further-clean-up-python27-seaborn

More Python 2.7 and Seaborn cleanups
desilinguist committed Sep 18, 2019
2 parents 29b71ae + 890c997 commit d913c7b
Showing 17 changed files with 70 additions and 107 deletions.
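Most of the deletions in this commit remove two Python 2.7 compatibility idioms that are redundant on Python 3. Under Python 2, the builtin open() returned byte streams, so modules imported the text-mode, encoding-aware open() from io; on Python 3 the builtin open() already is io.open. Similarly, json.dump() produced bytes on Python 2 but text on Python 3, so output files had to be opened in a version-dependent mode. A minimal sketch of the old and new patterns (the path and data here are invented for illustration):

import json

# Old Python 2/3-compatible pattern, deleted throughout this commit:
#     from io import open
#     file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
#     with open(results_json_path, file_mode) as json_file:
#         json.dump(res, json_file)

# New Python 3-only pattern: the builtin open() in plain text mode.
results_json_path = 'example_results.json'  # invented path
with open(results_json_path, 'w') as json_file:
    json.dump({'accuracy': 0.9}, json_file)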
5 changes: 2 additions & 3 deletions skll/config.py
@@ -13,7 +13,6 @@
 import itertools
 import logging
 import os
-from io import open
 from os.path import (basename, dirname, exists,
                      isabs, join, normpath, realpath)

@@ -543,8 +542,8 @@ def _parse_config_file(config_path, log_level=logging.INFO):
 
     # ensure that featureset_names is a list of strings, if specified
     if featureset_names:
-        if (not isinstance(featureset_names, list)
-                or not all([isinstance(fs, str) for fs in
+        if (not isinstance(featureset_names, list) or
+                not all([isinstance(fs, str) for fs in
                             featureset_names])):
             raise ValueError("The featureset_names parameter should be a list "
                              "of strings. You specified: {}"
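The second hunk above is one instance of a purely stylistic reflow that recurs throughout this commit: multi-line expressions are rewrapped so that the binary operator ends a line instead of beginning the next one. A small sketch of the two wrapping styles, using a hypothetical helper modeled on the check above:

def is_string_list(featureset_names):
    # Old wrapping, with the operator opening the continuation line:
    #     return (isinstance(featureset_names, list)
    #             and all(isinstance(fs, str) for fs in featureset_names))
    # New wrapping used in this commit, with the operator closing the line:
    return (isinstance(featureset_names, list) and
            all(isinstance(fs, str) for fs in featureset_names))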
2 changes: 1 addition & 1 deletion skll/data/readers.py
@@ -38,7 +38,7 @@
 import re
 import sys
 from itertools import islice
-from io import open, StringIO
+from io import StringIO
 
 import numpy as np
 import pandas as pd
1 change: 0 additions & 1 deletion skll/data/writers.py
@@ -14,7 +14,6 @@
 import sys
 import pandas as pd
 from decimal import Decimal
-from io import open
 
 from sklearn.feature_extraction import FeatureHasher

90 changes: 37 additions & 53 deletions skll/experiments.py
@@ -15,12 +15,14 @@
 import math
 import sys
 
+import matplotlib
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import ruamel.yaml as yaml
+import seaborn as sns
 
 from collections import defaultdict
-from io import open
 from itertools import combinations
 from os.path import exists, isfile, join, getsize

@@ -42,16 +44,9 @@
 else:
     _HAVE_GRIDMAP = True
 
-# Check if seaborn (and matplotlib) are available
-try:
-    import matplotlib
-    import seaborn as sns
-except ImportError:
-    _HAVE_SEABORN = False
-else:
-    import matplotlib.pyplot as plt
-    plt.ioff()
-    _HAVE_SEABORN = True
+# Turn off interactive plotting for matplotlib
+plt.ioff()
 
 
 _VALID_TASKS = frozenset(['predict', 'train', 'evaluate', 'cross_validate'])
 _VALID_SAMPLERS = frozenset(['Nystroem', 'RBFSampler', 'SkewedChi2Sampler',
@@ -308,9 +303,9 @@ def _print_fancy_output(learner_result_dicts, output_file=sys.stdout):
               file=output_file)
         print('Grid Objective Function: {}'.format(lrd['grid_objective']),
               file=output_file)
-        if (lrd['task'] == 'cross_validate'
-                and lrd['grid_search']
-                and lrd['cv_folds'].endswith('folds file')):
+        if (lrd['task'] == 'cross_validate' and
+                lrd['grid_search'] and
+                lrd['cv_folds'].endswith('folds file')):
             print('Using Folds File for Grid Search: {}'.format(lrd['use_folds_file_for_grid_search']),
                   file=output_file)
         if lrd['task'] in ['evaluate', 'cross_validate'] and lrd['additional_scores']:
@@ -572,9 +567,9 @@ def _classify_featureset(args):
         # check whether a trained model on the same data with the same
         # featureset already exists if so, load it and then use it on test data
         modelfile = join(model_path, '{}.model'.format(job_name))
-        if (task in ['cross_validate', 'learning_curve']
-                or not exists(modelfile)
-                or overwrite):
+        if (task in ['cross_validate', 'learning_curve'] or
+                not exists(modelfile) or
+                overwrite):
             train_examples = _load_featureset(train_path,
                                               featureset,
                                               suffix,
@@ -766,8 +761,7 @@ def _classify_featureset(args):
                                           learner_result_dict_base)
 
             # write out the result dictionary to a json file
-            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-            with open(results_json_path, file_mode) as json_file:
+            with open(results_json_path, 'w') as json_file:
                 json.dump(res, json_file, cls=NumpyTypeEncoder)
 
             with open(join(results_path,
@@ -793,8 +787,7 @@ def _classify_featureset(args):
             res = [res]
 
             # write out the result dictionary to a json file
-            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-            with open(results_json_path, file_mode) as json_file:
+            with open(results_json_path, 'w') as json_file:
                 json.dump(res, json_file, cls=NumpyTypeEncoder)
 
         # For all other tasks, i.e. train or predict
@@ -810,17 +803,15 @@ def _classify_featureset(args):
                 grid_search_cv_results_dicts[0]
             grid_search_cv_results_dict.update(learner_result_dict_base)
             # write out the result dictionary to a json file
-            file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-            with open(results_json_path, file_mode) as json_file:
+            with open(results_json_path, 'w') as json_file:
                 json.dump(grid_search_cv_results_dict, json_file, cls=NumpyTypeEncoder)
             res = [learner_result_dict_base]
 
     # write out the cv folds if required
     if task == 'cross_validate' and save_cv_folds:
         skll_fold_ids_file = experiment_name + '_skll_fold_ids.csv'
-        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
         with open(join(results_path, skll_fold_ids_file),
-                  file_mode) as output_file:
+                  'w') as output_file:
             _write_skll_folds(skll_fold_ids, output_file)
 
     return res
@@ -917,8 +908,8 @@ def _create_learner_result_dicts(task_results,
             recall_sum_dict[actual_label] += float(label_recall)
         if not math.isnan(label_f):
             f_sum_dict[actual_label] += float(label_f)
-        result_row = ([actual_label] + conf_matrix[i]
-                      + [label_prec, label_recall, label_f])
+        result_row = ([actual_label] + conf_matrix[i] +
+                      [label_prec, label_recall, label_f])
         rows.append(result_row)
 
     result_table = tabulate(rows,
@@ -1099,19 +1090,19 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
         if ablation is None:
             for i in range(1, len(features)):
                 for excluded_features in combinations(features, i):
-                    expanded_fs.append(sorted(featureset
-                                              - set(excluded_features)))
-                    expanded_fs_names.append(featureset_name
-                                             + '_minus_'
-                                             + _munge_featureset_name(excluded_features))
+                    expanded_fs.append(sorted(featureset -
+                                              set(excluded_features)))
+                    expanded_fs_names.append(featureset_name +
+                                             '_minus_' +
+                                             _munge_featureset_name(excluded_features))
         # Otherwise, just expand removing the specified number at a time
         else:
             for excluded_features in combinations(features, ablation):
                 expanded_fs.append(sorted(featureset -
                                           set(excluded_features)))
-                expanded_fs_names.append(featureset_name
-                                         + '_minus_'
-                                         + _munge_featureset_name(excluded_features))
+                expanded_fs_names.append(featureset_name +
+                                         '_minus_' +
+                                         _munge_featureset_name(excluded_features))
         # Also add version with nothing removed as baseline
         expanded_fs.append(features)
         expanded_fs_names.append(featureset_name + '_all')
@@ -1269,27 +1260,20 @@ def run_configuration(config_file, local=False, overwrite=True, queue='all.q',
     # write out the summary results file
     if (task == 'cross_validate' or task == 'evaluate') and write_summary:
         summary_file_name = experiment_name + '_summary.tsv'
-        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-        with open(join(results_path, summary_file_name), file_mode) as output_file:
+        with open(join(results_path, summary_file_name), 'w') as output_file:
             _write_summary_file(result_json_paths,
                                 output_file,
                                 ablation=ablation)
     elif task == 'learning_curve':
         output_file_name = experiment_name + '_summary.tsv'
-        file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
         output_file_path = join(results_path, output_file_name)
-        with open(output_file_path, file_mode) as output_file:
+        with open(output_file_path, 'w') as output_file:
             _write_learning_curve_file(result_json_paths, output_file)
 
         # generate the actual plot if we have the requirements installed
-        if _HAVE_SEABORN:
-            _generate_learning_curve_plots(experiment_name,
-                                           results_path,
-                                           output_file_path)
-        else:
-            logger.warning("Raw data for the learning curve saved in "
-                           "{}. No plots were generated since pandas and "
-                           "seaborn are not installed. ".format(output_file_path))
+        _generate_learning_curve_plots(experiment_name,
+                                       results_path,
+                                       output_file_path)
 
     return result_json_paths

@@ -1421,12 +1405,12 @@ def _generate_learning_curve_plots(experiment_name,
         for j, col_name in enumerate(g.col_names):
             ax = g.axes[i][j]
             ax.set(ylim=ylimits[row_name])
-            df_ax_train = df_fs[(df_fs['learner_name'] == col_name)
-                                & (df_fs['metric'] == row_name)
-                                & (df_fs['variable'] == 'train_score_mean')]
-            df_ax_test = df_fs[(df_fs['learner_name'] == col_name)
-                               & (df_fs['metric'] == row_name)
-                               & (df_fs['variable'] == 'test_score_mean')]
+            df_ax_train = df_fs[(df_fs['learner_name'] == col_name) &
+                                (df_fs['metric'] == row_name) &
+                                (df_fs['variable'] == 'train_score_mean')]
+            df_ax_test = df_fs[(df_fs['learner_name'] == col_name) &
+                               (df_fs['metric'] == row_name) &
+                               (df_fs['variable'] == 'test_score_mean')]
             ax.fill_between(list(range(len(df_ax_train))),
                             df_ax_train['value'] - df_ax_train['train_score_std'],
                             df_ax_train['value'] + df_ax_train['train_score_std'],
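With matplotlib and seaborn now imported unconditionally, experiments.py calls plt.ioff() once at module level, and the _HAVE_SEABORN guard around _generate_learning_curve_plots() goes away. plt.ioff() disables matplotlib's interactive mode, so figures are never shown on screen and only exist until they are explicitly saved, which suits batch generation of learning-curve plots. A minimal sketch of that batch-plotting pattern (the data points and output path are invented):

import matplotlib.pyplot as plt

plt.ioff()  # no GUI windows; figures stay in memory until saved

fig, ax = plt.subplots()
ax.plot([10, 100, 1000], [0.62, 0.71, 0.78], marker='o', label='test score')
ax.set(xlabel='number of training examples', ylabel='accuracy')
ax.legend()
fig.savefig('learning_curve.png')  # invented output path
plt.close(fig)  # release the figure, since nothing will display it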
6 changes: 0 additions & 6 deletions skll/utilities/plot_learning_curves.py
@@ -22,7 +22,6 @@
 from os import makedirs
 from os.path import basename, exists
 
-from skll.experiments import _HAVE_SEABORN
 from skll.experiments import _generate_learning_curve_plots
 from skll.version import __version__

@@ -65,11 +64,6 @@ def main(argv=None):
     if not exists(args.output_dir):
         makedirs(args.output_dir)
 
-    # check that we have pandas and seaborn available
-    if not _HAVE_SEABORN:
-        logging.error("Error: need seaborn to generate learning curve plots.")
-        sys.exit(1)
-
     # get the experiment name from the learning curve TSV file
     # output_file_name = experiment_name + '_summary.tsv'
     experiment_name = basename(args.tsv_file).rstrip('_summary.tsv')
5 changes: 1 addition & 4 deletions skll/utilities/summarize_results.py
@@ -10,8 +10,6 @@
 
 import argparse
 import logging
-import sys
-from io import open
 
 from skll.experiments import _write_summary_file
 from skll.version import __version__
@@ -50,8 +48,7 @@ def main(argv=None):
     logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                 '%(message)s'))
 
-    file_mode = 'w' if sys.version_info >= (3, 0) else 'wb'
-    with open(args.summary_file, file_mode) as output_file:
+    with open(args.summary_file, 'w') as output_file:
         _write_summary_file(args.json_file, output_file,
                             ablation=int(args.ablation))

Binary file removed tests/other/test_load_saved_model.2.model
File renamed without changes.
1 change: 0 additions & 1 deletion tests/test_ablation.py
@@ -13,7 +13,6 @@
 import os
 
 from glob import glob
-from io import open
 from os.path import abspath, dirname, exists, join
 
 from nose.tools import eq_
6 changes: 2 additions & 4 deletions tests/test_classification.py
@@ -14,10 +14,8 @@
 import json
 import os
 import re
-import sys
 import warnings
 
-from io import open
 from os.path import abspath, dirname, exists, join
 
 import numpy as np
@@ -346,7 +344,7 @@ def test_dummy_classifier_predict():
                          {"strategy": "constant", "constant": 1}],
                         [np.array([0, 0, 0, 1, 0, 1, 1, 0, 0, 0]),
                          np.zeros(10),
-                         np.ones(10)*1]):
+                         np.ones(10) * 1]):
         yield check_dummy_classifier_predict, model_args, train_labels, expected_output


@@ -755,7 +753,7 @@ def test_learner_api_load_into_existing_instance():
 
     # now use `load()` to replace the existing instance with a
    # different saved learner
-    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.{}.model'.format(sys.version_info[0]))
+    other_model_file = join(_my_dir, 'other', 'test_load_saved_model.model')
     learner1.load(other_model_file)
 
     # now load the saved model into another instance using the class method
1 change: 0 additions & 1 deletion tests/test_custom_learner.py
@@ -12,7 +12,6 @@
 import csv
 import os
 from glob import glob
-from io import open
 from os.path import abspath, dirname, exists, join
 
 import numpy as np
10 changes: 5 additions & 5 deletions tests/test_cv.py
@@ -11,16 +11,16 @@
 
 import csv
 import itertools
-from io import open
-import os
-from os.path import abspath, dirname, join, exists
 import json
-from glob import glob
+import os
 import re
 
+from glob import glob
+from os.path import abspath, dirname, join, exists
+
 import numpy as np
-from nose.tools import eq_, raises
 
+from nose.tools import eq_, raises
 from sklearn.feature_extraction import FeatureHasher
 from sklearn.datasets.samples_generator import make_classification
 from sklearn.utils.testing import (assert_greater,
7 changes: 3 additions & 4 deletions tests/test_featureset.py
@@ -638,8 +638,8 @@ def test_feature_merging_order_invariance():
     merged_fs_shuf = train_fs1 + train_fs2_shuf
 
     # check that the two merged versions are the same
-    feature_names = (train_fs1.vectorizer.get_feature_names()
-                     + train_fs2.vectorizer.get_feature_names())
+    feature_names = (train_fs1.vectorizer.get_feature_names() +
+                     train_fs2.vectorizer.get_feature_names())
     assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names)
     assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                        feature_names)
@@ -659,8 +659,7 @@ def test_feature_merging_order_invariance():
     assert_array_equal(merged_fs.features.todense(),
                        merged_fs_shuf.features.todense())
 
-    assert not np.all(merged_fs.features[:, 0:2].todense()
-                      == merged_fs.features[:, 2:4].todense())
+    assert not np.all(merged_fs.features[:, 0:2].todense() == merged_fs.features[:, 2:4].todense())
 
 
 # Tests related to loading featuresets and merging them
