Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve SKLL logging #380

Merged
merged 20 commits into from
Nov 8, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions doc/run_experiment.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1133,6 +1133,25 @@ the result, log, model, and prediction files will share the prefix
``EXPERIMENT_FEATURESET_LEARNER``. For backward-compatibility, the same
applies when a single objective is specified using ``objective=x``.

In addition to the above log files that are specific to each "job"
(a particular combination of featuresets, learners, and objectives specified
in the configuration file), SKLL also produces a single, top-level "experiment"
log file with only ``EXPERIMENT`` as the prefix. While the job-level log files
contain messages that pertain to the specific characteristics of the job, the
experiment-level log file will contain logging messages that pertain to the
overall experiment and configuration file. The messages in the log files are
in the following format:

.. code-block:: bash

TIMESTAMP - LEVEL - MSG

where ``TIMESTAMP`` refers to the exact time when the message was logged,
``LEVEL`` refers to the level of the logging message (e.g., ``INFO``, ``WARNING``,
etc.), and ``MSG`` is the actual content of the message. All of the messages
are also printed to the console in addition to being saved in the job-level log
files and the experiment-level log file.

For every experiment you run, there will also be a result summary file
generated that is a tab-delimited file summarizing the results for each
learner-featureset combination you have in your configuration file. It is named
Expand Down
4 changes: 2 additions & 2 deletions skll/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from __future__ import absolute_import, print_function, unicode_literals

from sklearn.metrics import f1_score, make_scorer, SCORERS

from .logutils import get_skll_logger
from .data import FeatureSet, Reader, Writer
from .experiments import run_configuration
from .learner import Learner
Expand All @@ -22,7 +22,7 @@

__all__ = ['FeatureSet', 'Learner', 'Reader', 'kappa', 'kendall_tau',
'spearman', 'pearson', 'f1_score_least_frequent',
'run_configuration', 'Writer']
'get_skll_logger', 'run_configuration', 'Writer']

# Add our scorers to the sklearn dictionary here so that they will always be
# available if you import anything from skll
Expand Down
34 changes: 23 additions & 11 deletions skll/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import ruamel.yaml as yaml

from six import string_types, iteritems # Python 2/3
from skll import get_skll_logger
from sklearn.metrics import SCORERS


Expand Down Expand Up @@ -269,14 +270,12 @@ def _setup_config_parser(config_path, validate=True):
return config


def _parse_config_file(config_path):
def _parse_config_file(config_path, log_level=logging.INFO):
"""
Parses a SKLL experiment configuration file with the given path.
Log messages with the given log level (default: INFO).
"""

# Initialize logger
logger = logging.getLogger(__name__)

# check that config_path is not empty
if config_path == "":
raise IOError("The name of the configuration file is empty")
Expand All @@ -297,6 +296,26 @@ def _parse_config_file(config_path):
raise ValueError("Configuration file does not contain experiment_name "
"in the [General] section.")

# next, get the log path before anything else since we need to
# save all logging messages to a log file in addition to displaying
# them on the console
log_path = _locate_file(config.get("Output", "log"), config_dir)
if log_path:
log_path = join(config_dir, log_path)
if not exists(log_path):
os.makedirs(log_path)

# Create a top-level log file under the log path
main_log_file =join(log_path, '{}.log'.format(experiment_name))

# Now create a SKLL logger that will log to this file as well
# as to the console. Use the log level provided - note that
# we only have to do this the first time we call `get_skll_logger()`
# with a given name.
logger = get_skll_logger('experiment',
filepath=main_log_file,
log_level=log_level)

if config.has_option("General", "task"):
task = config.get("General", "task")
else:
Expand Down Expand Up @@ -536,13 +555,6 @@ def _parse_config_file(config_path):
if not exists(prediction_dir):
os.makedirs(prediction_dir)

# make sure log path exists
log_path = _locate_file(config.get("Output", "log"), config_dir)
if log_path:
log_path = join(config_dir, log_path)
if not exists(log_path):
os.makedirs(log_path)

# make sure model path exists
model_path = _locate_file(config.get("Output", "models"), config_dir)
if model_path:
Expand Down
26 changes: 16 additions & 10 deletions skll/data/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,14 @@ class Reader(object):
of 2 greater than the actual number of features to
avoid collisions.
:type num_features: int
:param logger: A logger instance to use to log messages instead of creating
a new one by default.
:type logger: logging.Logger
"""

def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
label_col='y', id_col='id', class_map=None, sparse=True,
feature_hasher=False, num_features=None):
feature_hasher=False, num_features=None, logger=None):
super(Reader, self).__init__()
self.path_or_list = path_or_list
self.quiet = quiet
Expand All @@ -86,6 +89,7 @@ def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
self.vectorizer = FeatureHasher(n_features=num_features)
else:
self.vectorizer = DictVectorizer(sparse=sparse)
self.logger = logger if logger else logging.getLogger(__name__)

@classmethod
def for_path(cls, path_or_list, **kwargs):
Expand Down Expand Up @@ -169,10 +173,7 @@ def read(self):
:returns: :class:`~skll.data.featureset.FeatureSet` representing the
file we read in.
"""
# Setup logger
logger = logging.getLogger(__name__)

logger.debug('Path: %s', self.path_or_list)
self.logger.debug('Path: %s', self.path_or_list)

if not self.quiet:
self._progress_msg = "Loading {}...".format(self.path_or_list)
Expand Down Expand Up @@ -682,7 +683,7 @@ def __init__(self, path_or_list, **kwargs):
super(TSVReader, self).__init__(path_or_list, **kwargs)


def safe_float(text, replace_dict=None):
def safe_float(text, replace_dict=None, logger=None):
"""
Attempts to convert a string to an int, and then a float, but if neither is
possible, just returns the original string value.
Expand All @@ -695,19 +696,24 @@ def safe_float(text, replace_dict=None):
floats. Anything not in the mapping will be kept the
same.
:type replace_dict: dict from str to str
:param logger: The Logger instance to use to log messages. Used instead of
creating a new Logger instance by default.
:type logger: logging.Logger
"""

# convert to text to be "Safe"!
text = text_type(text)

# get a logger unless we are passed one
if not logger:
logger = logging.getLogger(__name__)

if replace_dict is not None:
if text in replace_dict:
text = replace_dict[text]
else:
logging.getLogger(__name__).warning('Encountered value that was '
'not in replacement '
'dictionary (e.g., class_map):'
' {}'.format(text))
logger.warning('Encountered value that was not in replacement '
'dictionary (e.g., class_map): {}'.format(text))
try:
return int(text)
except ValueError:
Expand Down
22 changes: 11 additions & 11 deletions skll/data/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ class Writer(object):
enumerate all of these boolean feature names in your
mapping.
:type subsets: dict (str to list of str)
:param logger: A logger instance to use to log messages instead of creating
a new one by default.
:type logger: logging.Logger
"""

def __init__(self, path, feature_set, **kwargs):
Expand All @@ -66,6 +69,9 @@ def __init__(self, path, feature_set, **kwargs):
self.path = path
self.feat_set = feature_set
self.subsets = kwargs.pop('subsets', None)
logger = kwargs.pop('logger', None)
self.logger = logger if logger else logging.getLogger(__name__)

# Get prefix & extension for checking file types & writing subset files
# TODO: Determine if we purposefully used this instead of os.path.split
self.root, self.ext = re.search(r'^(.*)(\.[^.]*)$', path).groups()
Expand Down Expand Up @@ -104,9 +110,6 @@ def write(self):
Writes out this Writer's FeatureSet to a file in its
format.
"""
# Setup logger
logger = logging.getLogger(__name__)

if isinstance(self.feat_set.vectorizer, FeatureHasher):
raise ValueError('Writer cannot write sets that use'
'FeatureHasher for vectorization.')
Expand All @@ -117,8 +120,8 @@ def write(self):
# Otherwise write one feature file per subset
else:
for subset_name, filter_features in iteritems(self.subsets):
logger.debug('Subset (%s) features: %s', subset_name,
filter_features)
self.logger.debug('Subset ({}) features: {}'.format(subset_name,
filter_features))
sub_path = os.path.join(self.root, '{}{}'.format(subset_name,
self.ext))
self._write_subset(sub_path, set(filter_features))
Expand All @@ -134,12 +137,9 @@ def _write_subset(self, sub_path, filter_features):
file.
:type filter_features: set of str
"""
# Setup logger
logger = logging.getLogger(__name__)

logger.debug('sub_path: %s', sub_path)
logger.debug('feature_set: %s', self.feat_set.name)
logger.debug('filter_features: %s', filter_features)
self.logger.debug('sub_path: %s', sub_path)
self.logger.debug('feature_set: %s', self.feat_set.name)
self.logger.debug('filter_features: %s', filter_features)

if not self.quiet:
self._progress_msg = "Writing {}...".format(sub_path)
Expand Down
Loading