Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve SKLL logging #380

Merged
merged 20 commits into from
Nov 8, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions doc/run_experiment.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1133,6 +1133,25 @@ the result, log, model, and prediction files will share the prefix
``EXPERIMENT_FEATURESET_LEARNER``. For backward-compatibility, the same
applies when a single objective is specified using ``objective=x``.

In addition to the above log files that are specific to each "job"
(a particular combination of featuresets, learners, and objectives specified
in the configuration file), SKLL also produces a single, top-level "experiment"
log file with only ``EXPERIMENT`` as the prefix. While the job-level log files
contain messages that pertain to the specific characteristics of the job, the
experiment-level log file will contain logging messages that pertain to the
overall experiment and configuration file. The messages in the log files are
in the following format:

.. code-block:: bash

TIMESTAMP - LEVEL - MSG

where ``TIMESTAMP`` refers to the exact time when the message was logged,
``LEVEL`` refers to the level of the logging message (e.g., ``INFO``, ``WARNING``,
etc.), and ``MSG`` is the actual content of the message. All of the messages
are also printed to the console in addition to being saved in the job-level log
files and the experiment-level log file.

For every experiment you run, there will also be a result summary file
generated that is a tab-delimited file summarizing the results for each
learner-featureset combination you have in your configuration file. It is named
Expand Down
4 changes: 2 additions & 2 deletions skll/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from __future__ import absolute_import, print_function, unicode_literals

from sklearn.metrics import f1_score, make_scorer, SCORERS

from .logutils import get_skll_logger
from .data import FeatureSet, Reader, Writer
from .experiments import run_configuration
from .learner import Learner
Expand All @@ -22,7 +22,7 @@

__all__ = ['FeatureSet', 'Learner', 'Reader', 'kappa', 'kendall_tau',
'spearman', 'pearson', 'f1_score_least_frequent',
'run_configuration', 'Writer']
'get_skll_logger', 'run_configuration', 'Writer']

# Add our scorers to the sklearn dictionary here so that they will always be
# available if you import anything from skll
Expand Down
34 changes: 23 additions & 11 deletions skll/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import ruamel.yaml as yaml

from six import string_types, iteritems # Python 2/3
from skll import get_skll_logger
from sklearn.metrics import SCORERS


Expand Down Expand Up @@ -269,14 +270,12 @@ def _setup_config_parser(config_path, validate=True):
return config


def _parse_config_file(config_path):
def _parse_config_file(config_path, log_level=logging.INFO):
"""
Parses a SKLL experiment configuration file with the given path.
Log messages with the given log level (default: INFO).
"""

# Initialize logger
logger = logging.getLogger(__name__)

# check that config_path is not empty
if config_path == "":
raise IOError("The name of the configuration file is empty")
Expand All @@ -297,6 +296,26 @@ def _parse_config_file(config_path):
raise ValueError("Configuration file does not contain experiment_name "
"in the [General] section.")

# next, get the log path before anything else since we need to
# save all logging messages to a log file in addition to displaying
# them on the console
log_path = _locate_file(config.get("Output", "log"), config_dir)
if log_path:
log_path = join(config_dir, log_path)
if not exists(log_path):
os.makedirs(log_path)

# Create a top-level log file under the log path
main_log_file =join(log_path, '{}.log'.format(experiment_name))

# Now create a SKLL logger that will log to this file as well
# as to the console. Use the log level provided - note that
# we only have to do this the first time we call `get_skll_logger()`
# with a given name.
logger = get_skll_logger('experiment',
filepath=main_log_file,
log_level=log_level)

if config.has_option("General", "task"):
task = config.get("General", "task")
else:
Expand Down Expand Up @@ -536,13 +555,6 @@ def _parse_config_file(config_path):
if not exists(prediction_dir):
os.makedirs(prediction_dir)

# make sure log path exists
log_path = _locate_file(config.get("Output", "log"), config_dir)
if log_path:
log_path = join(config_dir, log_path)
if not exists(log_path):
os.makedirs(log_path)

# make sure model path exists
model_path = _locate_file(config.get("Output", "models"), config_dir)
if model_path:
Expand Down
26 changes: 16 additions & 10 deletions skll/data/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,11 +69,14 @@ class Reader(object):
of 2 greater than the actual number of features to
avoid collisions.
:type num_features: int
:param logger: A logger instance to use to log messages instead of creating
a new one by default.
:type logger: logging.Logger
"""

def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
label_col='y', id_col='id', class_map=None, sparse=True,
feature_hasher=False, num_features=None):
feature_hasher=False, num_features=None, logger=None):
super(Reader, self).__init__()
self.path_or_list = path_or_list
self.quiet = quiet
Expand All @@ -86,6 +89,7 @@ def __init__(self, path_or_list, quiet=True, ids_to_floats=False,
self.vectorizer = FeatureHasher(n_features=num_features)
else:
self.vectorizer = DictVectorizer(sparse=sparse)
self.logger = logger if logger else logging.getLogger(__name__)

@classmethod
def for_path(cls, path_or_list, **kwargs):
Expand Down Expand Up @@ -169,10 +173,7 @@ def read(self):
:returns: :class:`~skll.data.featureset.FeatureSet` representing the
file we read in.
"""
# Setup logger
logger = logging.getLogger(__name__)

logger.debug('Path: %s', self.path_or_list)
self.logger.debug('Path: %s', self.path_or_list)

if not self.quiet:
self._progress_msg = "Loading {}...".format(self.path_or_list)
Expand Down Expand Up @@ -682,7 +683,7 @@ def __init__(self, path_or_list, **kwargs):
super(TSVReader, self).__init__(path_or_list, **kwargs)


def safe_float(text, replace_dict=None):
def safe_float(text, replace_dict=None, logger=None):
"""
Attempts to convert a string to an int, and then a float, but if neither is
possible, just returns the original string value.
Expand All @@ -695,19 +696,24 @@ def safe_float(text, replace_dict=None):
floats. Anything not in the mapping will be kept the
same.
:type replace_dict: dict from str to str
:param logger: The Logger instance to use to log messages. Used instead of
creating a new Logger instance by default.
:type logger: logging.Logger
"""

# convert to text to be "Safe"!
text = text_type(text)

# get a logger unless we are passed one
if not logger:
logger = logging.getLogger(__name__)

if replace_dict is not None:
if text in replace_dict:
text = replace_dict[text]
else:
logging.getLogger(__name__).warning('Encountered value that was '
'not in replacement '
'dictionary (e.g., class_map):'
' {}'.format(text))
logger.warning('Encountered value that was not in replacement '
'dictionary (e.g., class_map): {}'.format(text))
try:
return int(text)
except ValueError:
Expand Down
22 changes: 11 additions & 11 deletions skll/data/writers.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ class Writer(object):
enumerate all of these boolean feature names in your
mapping.
:type subsets: dict (str to list of str)
:param logger: A logger instance to use to log messages instead of creating
a new one by default.
:type logger: logging.Logger
"""

def __init__(self, path, feature_set, **kwargs):
Expand All @@ -66,6 +69,9 @@ def __init__(self, path, feature_set, **kwargs):
self.path = path
self.feat_set = feature_set
self.subsets = kwargs.pop('subsets', None)
logger = kwargs.pop('logger', None)
self.logger = logger if logger else logging.getLogger(__name__)

# Get prefix & extension for checking file types & writing subset files
# TODO: Determine if we purposefully used this instead of os.path.split
self.root, self.ext = re.search(r'^(.*)(\.[^.]*)$', path).groups()
Expand Down Expand Up @@ -104,9 +110,6 @@ def write(self):
Writes out this Writer's FeatureSet to a file in its
format.
"""
# Setup logger
logger = logging.getLogger(__name__)

if isinstance(self.feat_set.vectorizer, FeatureHasher):
raise ValueError('Writer cannot write sets that use'
'FeatureHasher for vectorization.')
Expand All @@ -117,8 +120,8 @@ def write(self):
# Otherwise write one feature file per subset
else:
for subset_name, filter_features in iteritems(self.subsets):
logger.debug('Subset (%s) features: %s', subset_name,
filter_features)
self.logger.debug('Subset ({}) features: {}'.format(subset_name,
filter_features))
sub_path = os.path.join(self.root, '{}{}'.format(subset_name,
self.ext))
self._write_subset(sub_path, set(filter_features))
Expand All @@ -134,12 +137,9 @@ def _write_subset(self, sub_path, filter_features):
file.
:type filter_features: set of str
"""
# Setup logger
logger = logging.getLogger(__name__)

logger.debug('sub_path: %s', sub_path)
logger.debug('feature_set: %s', self.feat_set.name)
logger.debug('filter_features: %s', filter_features)
self.logger.debug('sub_path: %s', sub_path)
self.logger.debug('feature_set: %s', self.feat_set.name)
self.logger.debug('filter_features: %s', filter_features)

if not self.quiet:
self._progress_msg = "Writing {}...".format(sub_path)
Expand Down
Loading