diff --git a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/conda.yaml b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/conda.yaml
index b9e0a36d2d..a3c760b8de 100644
--- a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/conda.yaml
+++ b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/conda.yaml
@@ -6,8 +6,6 @@ dependencies:
   - python=3.8.12
   - pip=21.2.2
   - pip:
-    - --extra-index-url https://pypi.org/simple
-    - git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
    - mldesigner==0.1.0b4
    - watchdog==0.10.3
    - torch==1.8.1
diff --git a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/__init__.py b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/__init__.py
index e1ec42a71c..f0758088ea 100644
--- a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/__init__.py
+++ b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/__init__.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from . import logger
 from . import dataloaders
 from . import training
 from . import utils
diff --git a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/logger.py b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/logger.py
deleted file mode 100644
index b7e845ab1d..0000000000
--- a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/logger.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION
-# Copyright (c) 2017- Facebook, Inc
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of the copyright holder nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-from collections import OrderedDict
-import dllogger
-import numpy as np
-
-
-def format_step(step):
-    if isinstance(step, str):
-        return step
-    s = ""
-    if len(step) > 0:
-        s += "Epoch: {} ".format(step[0])
-    if len(step) > 1:
-        s += "Iteration: {} ".format(step[1])
-    if len(step) > 2:
-        s += "Validation Iteration: {} ".format(step[2])
-    if len(step) == 0:
-        s = "Summary:"
-    return s
-
-
-PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
-LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
-ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
-LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
-
-LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
-LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
-LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
-
-
-class Meter(object):
-    def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
-        self.run_aggregator = run_aggregator
-        self.epoch_aggregator = epoch_aggregator
-        self.iteration_aggregator = iteration_aggregator
-
-    def record(self, val, n=1):
-        self.iteration_aggregator.record(val, n=n)
-
-    def get_iteration(self):
-        v, n = self.iteration_aggregator.get_val()
-        return v
-
-    def reset_iteration(self):
-        v, n = self.iteration_aggregator.get_data()
-        self.iteration_aggregator.reset()
-        if v is not None:
-            self.epoch_aggregator.record(v, n=n)
-
-    def get_epoch(self):
-        v, n = self.epoch_aggregator.get_val()
-        return v
-
-    def reset_epoch(self):
-        v, n = self.epoch_aggregator.get_data()
-        self.epoch_aggregator.reset()
-        if v is not None:
-            self.run_aggregator.record(v, n=n)
-
-    def get_run(self):
-        v, n = self.run_aggregator.get_val()
-        return v
-
-    def reset_run(self):
-        self.run_aggregator.reset()
-
-
-class QuantileMeter(object):
-    def __init__(self, q):
-        self.q = q
-        self.reset()
-
-    def reset(self):
-        self.vals = []
-        self.n = 0
-
-    def record(self, val, n=1):
-        if isinstance(val, list):
-            self.vals += val
-            self.n += len(val)
-        else:
-            self.vals += [val] * n
-            self.n += n
-
-    def get_val(self):
-        if not self.vals:
-            return None, self.n
-        return np.quantile(self.vals, self.q, interpolation="nearest"), self.n
-
-    def get_data(self):
-        return self.vals, self.n
-
-
-class MaxMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.max = None
-        self.n = 0
-
-    def record(self, val, n=1):
-        if self.max is None:
-            self.max = val
-        else:
-            self.max = max(self.max, val)
-        self.n = n
-
-    def get_val(self):
-        return self.max, self.n
-
-    def get_data(self):
-        return self.max, self.n
-
-
-class MinMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.min = None
-        self.n = 0
-
-    def record(self, val, n=1):
-        if self.min is None:
-            self.min = val
-        else:
-            self.min = max(self.min, val)
-        self.n = n
-
-    def get_val(self):
-        return self.min, self.n
-
-    def get_data(self):
-        return self.min, self.n
-
-
-class LastMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.last = None
-        self.n = 0
-
-    def record(self, val, n=1):
-        self.last = val
-        self.n = n
-
-    def get_val(self):
-        return self.last, self.n
-
-    def get_data(self):
-        return self.last, self.n
-
-
-class AverageMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.n = 0
-        self.val = 0
-
-    def record(self, val, n=1):
-        self.n += n
-        self.val += val * n
-
-    def get_val(self):
-        if self.n == 0:
-            return None, 0
-        return self.val / self.n, self.n
-
-    def get_data(self):
-        if self.n == 0:
-            return None, 0
-        return self.val / self.n, self.n
-
-
-class Logger(object):
-    def __init__(self, print_interval, backends, verbose=False, last_epoch=-1):
-        self.epoch = last_epoch
-        self.iteration = -1
-        self.val_iteration = -1
-        self.metrics = OrderedDict()
-        self.backends = backends
-        self.print_interval = print_interval
-        self.verbose = verbose
-        dllogger.init(backends)
-
-    def log_parameter(self, data, verbosity=0):
-        dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
-
-    def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
-        if self.verbose:
-            print("Registering metric: {}".format(metric_name))
-        self.metrics[metric_name] = {"meter": meter, "level": verbosity}
-        dllogger.metadata(metric_name, metadata)
-
-    def log_metric(self, metric_name, val, n=1):
-        self.metrics[metric_name]["meter"].record(val, n=n)
-
-    def start_iteration(self, val=False):
-        if val:
-            self.val_iteration += 1
-        else:
-            self.iteration += 1
-
-    def end_iteration(self, val=False):
-        it = self.val_iteration if val else self.iteration
-        if it % self.print_interval == 0:
-            metrics = {
-                n: m for n, m in self.metrics.items() if n.startswith("val") == val
-            }
-            step = (
-                (self.epoch, self.iteration)
-                if not val
-                else (self.epoch, self.iteration, self.val_iteration)
-            )
-
-            verbositys = {m["level"] for _, m in metrics.items()}
-            for ll in verbositys:
-                llm = {n: m for n, m in metrics.items() if m["level"] == ll}
-
-                dllogger.log(
-                    step=step,
-                    data={n: m["meter"].get_iteration() for n, m in llm.items()},
-                    verbosity=ll,
-                )
-
-            for n, m in metrics.items():
-                m["meter"].reset_iteration()
-
-            dllogger.flush()
-
-    def start_epoch(self):
-        self.epoch += 1
-        self.iteration = 0
-        self.val_iteration = 0
-
-        for n, m in self.metrics.items():
-            m["meter"].reset_epoch()
-
-    def end_epoch(self):
-        for n, m in self.metrics.items():
-            m["meter"].reset_iteration()
-
-        verbositys = {m["level"] for _, m in self.metrics.items()}
-        for ll in verbositys:
-            llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
-            dllogger.log(
-                step=(self.epoch,),
-                data={n: m["meter"].get_epoch() for n, m in llm.items()},
-            )
-
-    def end(self):
-        for n, m in self.metrics.items():
-            m["meter"].reset_epoch()
-
-        verbositys = {m["level"] for _, m in self.metrics.items()}
-        for ll in verbositys:
-            llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
-            dllogger.log(
-                step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
-            )
-
-        for n, m in self.metrics.items():
-            m["meter"].reset_epoch()
-
-        dllogger.flush()
-
-    def iteration_generator_wrapper(self, gen, val=False):
-        for g in gen:
-            self.start_iteration(val=val)
-            yield g
-            self.end_iteration(val=val)
-
-    def epoch_generator_wrapper(self, gen):
-        for g in gen:
-            self.start_epoch()
-            yield g
-            self.end_epoch()
diff --git a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/training.py b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/training.py
index 578fa440e4..5245fe7794 100644
--- a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/training.py
+++ b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/image_classification/training.py
@@ -33,10 +33,8 @@
 import torch
 import torch.nn as nn
 from torch.autograd import Variable
-from . import logger as log
 from . import resnet as models
 from . import utils
-import dllogger
 
 try:
     from apex.parallel import DistributedDataParallel as DDP
@@ -189,16 +187,9 @@ def get_optimizer(
 
 
 def lr_policy(lr_fn, logger=None):
-    if logger is not None:
-        logger.register_metric(
-            "lr", log.LR_METER(), verbosity=dllogger.Verbosity.VERBOSE
-        )
-
     def _alr(optimizer, iteration, epoch):
         lr = lr_fn(iteration, epoch)
-        if logger is not None:
-            logger.log_metric("lr", lr)
         for param_group in optimizer.param_groups:
             param_group["lr"] = lr
         return lr
 
@@ -321,37 +312,6 @@ def train(
 ):
     print(f"training...")
     print(f"register_metrics {register_metrics}, logger {logger}.")
-    if register_metrics and logger is not None:
-        logger.register_metric(
-            "train.loss",
-            log.LOSS_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=LOSS_METADATA,
-        )
-        logger.register_metric(
-            "train.compute_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "train.total_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "train.data_time",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "train.compute_time",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
 
     step = get_train_step(
         model_and_loss,
@@ -367,8 +327,7 @@ def train(
     optimizer.zero_grad()
     last_train_step = total_train_step
     data_iter = enumerate(train_loader)
-    if logger is not None:
-        data_iter = logger.iteration_generator_wrapper(data_iter)
+
    if prof > 0:
        data_iter = utils.first_n(prof, data_iter)
 
@@ -414,12 +373,6 @@ def train(
            )
        total_train_step += 1
 
-        if logger is not None:
-            logger.log_metric("train.loss", to_python_float(loss), bs)
-            logger.log_metric("train.compute_ips", calc_ips(bs, it_time - data_time))
-            logger.log_metric("train.total_ips", calc_ips(bs, it_time))
-            logger.log_metric("train.data_time", data_time)
-            logger.log_metric("train.compute_time", it_time - data_time)
 
        end = time.time()
 
@@ -472,79 +425,16 @@ def validate(
 ):
     print(f"validating...")
     print(f"register_metrics {register_metrics}, logger {logger}.")
-    if register_metrics and logger is not None:
-        logger.register_metric(
-            "val.top1",
-            log.ACC_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=ACC_METADATA,
-        )
-        logger.register_metric(
-            "val.top5",
-            log.ACC_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=ACC_METADATA,
-        )
-        logger.register_metric(
-            "val.loss",
-            log.LOSS_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=LOSS_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "val.total_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "val.data_time",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency_at100",
-            log.LAT_100(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency_at99",
-            log.LAT_99(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency_at95",
-            log.LAT_95(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
 
     step = get_val_step(model_and_loss)
 
-    top1 = log.AverageMeter()
     # switch to evaluate mode
     model_and_loss.eval()
 
     end = time.time()
 
     data_iter = enumerate(val_loader)
-    if not logger is None:
-        data_iter = logger.iteration_generator_wrapper(data_iter, val=True)
+
    if prof > 0:
        data_iter = utils.first_n(prof, data_iter)
 
@@ -558,19 +448,6 @@ def validate(
 
        it_time = time.time() - end
 
-        top1.record(to_python_float(prec1), bs)
-        if logger is not None:
-            logger.log_metric("val.top1", to_python_float(prec1), bs)
-            logger.log_metric("val.top5", to_python_float(prec5), bs)
-            logger.log_metric("val.loss", to_python_float(loss), bs)
-            logger.log_metric("val.compute_ips", calc_ips(bs, it_time - data_time))
-            logger.log_metric("val.total_ips", calc_ips(bs, it_time))
-            logger.log_metric("val.data_time", data_time)
-            logger.log_metric("val.compute_latency", it_time - data_time)
-            logger.log_metric("val.compute_latency_at95", it_time - data_time)
-            logger.log_metric("val.compute_latency_at99", it_time - data_time)
-            logger.log_metric("val.compute_latency_at100", it_time - data_time)
-
        loss_sum += to_python_float(loss)
        total_val_step += 1
 
@@ -582,7 +459,7 @@ def validate(
            )
            break
 
-    return [top1, loss_sum / total_val_step]
+    return loss_sum / total_val_step
 
 
 # Train loop {{{
@@ -647,8 +524,6 @@
            writer.add_scalar("train/summary/scalar/world_size", world_size, epoch)
            mlflow.log_metric("train/world_size", step=epoch, value=world_size)
 
-        if logger is not None:
-            logger.start_epoch()
        if not skip_training:
            total_train_step = train(
                train_loader,
@@ -668,7 +543,7 @@
            )
 
        if not skip_validation and not detector.is_preempted():
-            top1, val_loss = validate(
+            val_loss = validate(
                val_loader,
                model_and_loss,
                fp16,
@@ -678,19 +553,6 @@
                prof=prof,
                register_metrics=epoch == start_epoch,
            )
-            if not detector.is_preempted():
-                prec1, nimg = top1.get_val()
-                if writer:
-                    writer.add_scalar("val/summary/scalar/loss", val_loss, epoch)
-                    writer.add_scalar("val/summary/scalar/prec1", prec1, epoch)
-                mlflow.log_metric("val/loss", step=epoch, value=val_loss)
-                mlflow.log_metric("val/prec1", step=epoch, value=prec1)
-
-        if logger is not None:
-            print(
-                "Epoch ", epoch, " complete with is_preempted ", detector.is_preempted()
-            )
-            logger.end_epoch()
 
        save_ckpt = is_first_rank and (
            detector.is_preempted() or (epoch + 1) % save_checkpoint_epochs == 0
@@ -706,14 +568,8 @@
                save_ckpt = False
        print(f"save ckpt {save_ckpt}, ckpt dir {checkpoint_dir}.")
        if save_ckpt:
-            if not skip_validation and not detector.is_preempted():
-                is_best = logger.metrics["val.top1"]["meter"].get_epoch() > best_prec1
-                best_prec1 = max(
-                    logger.metrics["val.top1"]["meter"].get_epoch(), best_prec1
-                )
-            else:
-                is_best = False
-                best_prec1 = 0
+            is_best = False
+            best_prec1 = 0
 
            ckpt_epoch_index = epoch + 1 if not detector.is_preempted() else epoch
            utils.save_checkpoint(
diff --git a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/main.py b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/main.py
index 3600cef151..13c8f8d066 100644
--- a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/main.py
+++ b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/main.py
@@ -56,7 +56,6 @@
 )
 
 import image_classification.resnet as models
-import image_classification.logger as log
 from image_classification.smoothing import LabelSmoothing
 from image_classification.mixup import NLLMultiLabelSmooth, MixUpWrapper
 
@@ -64,7 +63,6 @@
 from image_classification.training import *
 from image_classification.utils import *
 
-import dllogger
 import torch.multiprocessing as mp
 
 import os
@@ -471,26 +469,6 @@ def _worker_init_fn(id):
        args.data, args.batch_size, 1000, False, workers=args.workers, fp16=args.fp16
    )
 
-    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
-        logger = log.Logger(
-            args.print_freq,
-            [
-                dllogger.StdOutBackend(
-                    dllogger.Verbosity.DEFAULT, step_format=log.format_step
-                ),
-                dllogger.JSONStreamBackend(
-                    dllogger.Verbosity.VERBOSE,
-                    os.path.join(args.workspace, args.raport_file),
-                ),
-            ],
-            last_epoch=start_epoch - 1,
-        )
-
-    else:
-        logger = log.Logger(args.print_freq, [], last_epoch=start_epoch - 1)
-
-    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
-
    optimizer = get_optimizer(
        list(model_and_loss.model.named_parameters()),
        args.fp16,
@@ -505,13 +483,11 @@ def _worker_init_fn(id):
    )
 
    if args.lr_schedule == "step":
-        lr_policy = lr_step_policy(
-            args.lr, [30, 60, 80], 0.1, args.warmup, logger=logger
-        )
+        lr_policy = lr_step_policy(args.lr, [30, 60, 80], 0.1, args.warmup, logger=None)
    elif args.lr_schedule == "cosine":
-        lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=logger)
+        lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=None)
    elif args.lr_schedule == "linear":
-        lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger)
+        lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=None)
 
    if args.amp:
        model_and_loss, optimizer = amp.initialize(
@@ -534,7 +510,7 @@ def _worker_init_fn(id):
        val_loader,
        args.epochs,
        args.fp16,
-        logger,
+        None,
        should_backup_checkpoint(args),
        args.save_checkpoint_epochs,
        use_amp=args.amp,
@@ -549,8 +525,7 @@ def _worker_init_fn(id):
        total_train_step=args.total_train_step,
    )
    exp_duration = time.time() - exp_start_time
-    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
-        logger.end()
+    print("Experiment ended")
 
    sys.stdout.flush()
 
diff --git a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/requirements.txt b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/requirements.txt
index d605e3bcc0..e69de29bb2 100644
--- a/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/requirements.txt
+++ b/cli/jobs/pipelines-with-components/image_classification_with_densenet/image_cnn_train/requirements.txt
@@ -1 +0,0 @@
-git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
diff --git a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/conda.yaml b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/conda.yaml
index b9e0a36d2d..a3c760b8de 100644
--- a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/conda.yaml
+++ b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/conda.yaml
@@ -6,8 +6,6 @@ dependencies:
   - python=3.8.12
   - pip=21.2.2
   - pip:
-    - --extra-index-url https://pypi.org/simple
-    - git+https://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
    - mldesigner==0.1.0b4
    - watchdog==0.10.3
    - torch==1.8.1
diff --git a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/__init__.py b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/__init__.py
index e1ec42a71c..f0758088ea 100644
--- a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/__init__.py
+++ b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/__init__.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from . import logger
 from . import dataloaders
 from . import training
 from . import utils
diff --git a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/logger.py b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/logger.py
deleted file mode 100644
index b7e845ab1d..0000000000
--- a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/logger.py
+++ /dev/null
@@ -1,311 +0,0 @@
-# Copyright (c) 2018-2019, NVIDIA CORPORATION
-# Copyright (c) 2017- Facebook, Inc
-#
-# All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions are met:
-#
-# * Redistributions of source code must retain the above copyright notice, this
-#   list of conditions and the following disclaimer.
-#
-# * Redistributions in binary form must reproduce the above copyright notice,
-#   this list of conditions and the following disclaimer in the documentation
-#   and/or other materials provided with the distribution.
-#
-# * Neither the name of the copyright holder nor the names of its
-#   contributors may be used to endorse or promote products derived from
-#   this software without specific prior written permission.
-#
-# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
-# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
-# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
-# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-from collections import OrderedDict
-import dllogger
-import numpy as np
-
-
-def format_step(step):
-    if isinstance(step, str):
-        return step
-    s = ""
-    if len(step) > 0:
-        s += "Epoch: {} ".format(step[0])
-    if len(step) > 1:
-        s += "Iteration: {} ".format(step[1])
-    if len(step) > 2:
-        s += "Validation Iteration: {} ".format(step[2])
-    if len(step) == 0:
-        s = "Summary:"
-    return s
-
-
-PERF_METER = lambda: Meter(AverageMeter(), AverageMeter(), AverageMeter())
-LOSS_METER = lambda: Meter(AverageMeter(), AverageMeter(), MinMeter())
-ACC_METER = lambda: Meter(AverageMeter(), AverageMeter(), MaxMeter())
-LR_METER = lambda: Meter(LastMeter(), LastMeter(), LastMeter())
-
-LAT_100 = lambda: Meter(QuantileMeter(1), QuantileMeter(1), QuantileMeter(1))
-LAT_99 = lambda: Meter(QuantileMeter(0.99), QuantileMeter(0.99), QuantileMeter(0.99))
-LAT_95 = lambda: Meter(QuantileMeter(0.95), QuantileMeter(0.95), QuantileMeter(0.95))
-
-
-class Meter(object):
-    def __init__(self, iteration_aggregator, epoch_aggregator, run_aggregator):
-        self.run_aggregator = run_aggregator
-        self.epoch_aggregator = epoch_aggregator
-        self.iteration_aggregator = iteration_aggregator
-
-    def record(self, val, n=1):
-        self.iteration_aggregator.record(val, n=n)
-
-    def get_iteration(self):
-        v, n = self.iteration_aggregator.get_val()
-        return v
-
-    def reset_iteration(self):
-        v, n = self.iteration_aggregator.get_data()
-        self.iteration_aggregator.reset()
-        if v is not None:
-            self.epoch_aggregator.record(v, n=n)
-
-    def get_epoch(self):
-        v, n = self.epoch_aggregator.get_val()
-        return v
-
-    def reset_epoch(self):
-        v, n = self.epoch_aggregator.get_data()
-        self.epoch_aggregator.reset()
-        if v is not None:
-            self.run_aggregator.record(v, n=n)
-
-    def get_run(self):
-        v, n = self.run_aggregator.get_val()
-        return v
-
-    def reset_run(self):
-        self.run_aggregator.reset()
-
-
-class QuantileMeter(object):
-    def __init__(self, q):
-        self.q = q
-        self.reset()
-
-    def reset(self):
-        self.vals = []
-        self.n = 0
-
-    def record(self, val, n=1):
-        if isinstance(val, list):
-            self.vals += val
-            self.n += len(val)
-        else:
-            self.vals += [val] * n
-            self.n += n
-
-    def get_val(self):
-        if not self.vals:
-            return None, self.n
-        return np.quantile(self.vals, self.q, interpolation="nearest"), self.n
-
-    def get_data(self):
-        return self.vals, self.n
-
-
-class MaxMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.max = None
-        self.n = 0
-
-    def record(self, val, n=1):
-        if self.max is None:
-            self.max = val
-        else:
-            self.max = max(self.max, val)
-        self.n = n
-
-    def get_val(self):
-        return self.max, self.n
-
-    def get_data(self):
-        return self.max, self.n
-
-
-class MinMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.min = None
-        self.n = 0
-
-    def record(self, val, n=1):
-        if self.min is None:
-            self.min = val
-        else:
-            self.min = max(self.min, val)
-        self.n = n
-
-    def get_val(self):
-        return self.min, self.n
-
-    def get_data(self):
-        return self.min, self.n
-
-
-class LastMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.last = None
-        self.n = 0
-
-    def record(self, val, n=1):
-        self.last = val
-        self.n = n
-
-    def get_val(self):
-        return self.last, self.n
-
-    def get_data(self):
-        return self.last, self.n
-
-
-class AverageMeter(object):
-    def __init__(self):
-        self.reset()
-
-    def reset(self):
-        self.n = 0
-        self.val = 0
-
-    def record(self, val, n=1):
-        self.n += n
-        self.val += val * n
-
-    def get_val(self):
-        if self.n == 0:
-            return None, 0
-        return self.val / self.n, self.n
-
-    def get_data(self):
-        if self.n == 0:
-            return None, 0
-        return self.val / self.n, self.n
-
-
-class Logger(object):
-    def __init__(self, print_interval, backends, verbose=False, last_epoch=-1):
-        self.epoch = last_epoch
-        self.iteration = -1
-        self.val_iteration = -1
-        self.metrics = OrderedDict()
-        self.backends = backends
-        self.print_interval = print_interval
-        self.verbose = verbose
-        dllogger.init(backends)
-
-    def log_parameter(self, data, verbosity=0):
-        dllogger.log(step="PARAMETER", data=data, verbosity=verbosity)
-
-    def register_metric(self, metric_name, meter, verbosity=0, metadata={}):
-        if self.verbose:
-            print("Registering metric: {}".format(metric_name))
-        self.metrics[metric_name] = {"meter": meter, "level": verbosity}
-        dllogger.metadata(metric_name, metadata)
-
-    def log_metric(self, metric_name, val, n=1):
-        self.metrics[metric_name]["meter"].record(val, n=n)
-
-    def start_iteration(self, val=False):
-        if val:
-            self.val_iteration += 1
-        else:
-            self.iteration += 1
-
-    def end_iteration(self, val=False):
-        it = self.val_iteration if val else self.iteration
-        if it % self.print_interval == 0:
-            metrics = {
-                n: m for n, m in self.metrics.items() if n.startswith("val") == val
-            }
-            step = (
-                (self.epoch, self.iteration)
-                if not val
-                else (self.epoch, self.iteration, self.val_iteration)
-            )
-
-            verbositys = {m["level"] for _, m in metrics.items()}
-            for ll in verbositys:
-                llm = {n: m for n, m in metrics.items() if m["level"] == ll}
-
-                dllogger.log(
-                    step=step,
-                    data={n: m["meter"].get_iteration() for n, m in llm.items()},
-                    verbosity=ll,
-                )
-
-            for n, m in metrics.items():
-                m["meter"].reset_iteration()
-
-            dllogger.flush()
-
-    def start_epoch(self):
-        self.epoch += 1
-        self.iteration = 0
-        self.val_iteration = 0
-
-        for n, m in self.metrics.items():
-            m["meter"].reset_epoch()
-
-    def end_epoch(self):
-        for n, m in self.metrics.items():
-            m["meter"].reset_iteration()
-
-        verbositys = {m["level"] for _, m in self.metrics.items()}
-        for ll in verbositys:
-            llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
-            dllogger.log(
-                step=(self.epoch,),
-                data={n: m["meter"].get_epoch() for n, m in llm.items()},
-            )
-
-    def end(self):
-        for n, m in self.metrics.items():
-            m["meter"].reset_epoch()
-
-        verbositys = {m["level"] for _, m in self.metrics.items()}
-        for ll in verbositys:
-            llm = {n: m for n, m in self.metrics.items() if m["level"] == ll}
-            dllogger.log(
-                step=tuple(), data={n: m["meter"].get_run() for n, m in llm.items()}
-            )
-
-        for n, m in self.metrics.items():
-            m["meter"].reset_epoch()
-
-        dllogger.flush()
-
-    def iteration_generator_wrapper(self, gen, val=False):
-        for g in gen:
-            self.start_iteration(val=val)
-            yield g
-            self.end_iteration(val=val)
-
-    def epoch_generator_wrapper(self, gen):
-        for g in gen:
-            self.start_epoch()
-            yield g
-            self.end_epoch()
diff --git a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/training.py b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/training.py
index 93c2b174f2..5245fe7794 100644
--- a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/training.py
+++ b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/image_classification/training.py
@@ -33,10 +33,8 @@
 import torch
 import torch.nn as nn
 from torch.autograd import Variable
-from . import logger as log
 from . import resnet as models
 from . import utils
-import dllogger
 
 try:
     from apex.parallel import DistributedDataParallel as DDP
@@ -189,16 +187,9 @@ def get_optimizer(
 
 
 def lr_policy(lr_fn, logger=None):
-    if logger is not None:
-        logger.register_metric(
-            "lr", log.LR_METER(), verbosity=dllogger.Verbosity.VERBOSE
-        )
-
     def _alr(optimizer, iteration, epoch):
         lr = lr_fn(iteration, epoch)
-        if logger is not None:
-            logger.log_metric("lr", lr)
         for param_group in optimizer.param_groups:
             param_group["lr"] = lr
         return lr
 
@@ -321,37 +312,6 @@ def train(
 ):
     print(f"training...")
     print(f"register_metrics {register_metrics}, logger {logger}.")
-    if register_metrics and logger is not None:
-        logger.register_metric(
-            "train.loss",
-            log.LOSS_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=LOSS_METADATA,
-        )
-        logger.register_metric(
-            "train.compute_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "train.total_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "train.data_time",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "train.compute_time",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
 
     step = get_train_step(
         model_and_loss,
@@ -367,8 +327,7 @@ def train(
     optimizer.zero_grad()
     last_train_step = total_train_step
     data_iter = enumerate(train_loader)
-    if logger is not None:
-        data_iter = logger.iteration_generator_wrapper(data_iter)
+
    if prof > 0:
        data_iter = utils.first_n(prof, data_iter)
 
@@ -414,12 +373,6 @@ def train(
            )
        total_train_step += 1
 
-        if logger is not None:
-            logger.log_metric("train.loss", to_python_float(loss), bs)
-            logger.log_metric("train.compute_ips", calc_ips(bs, it_time - data_time))
-            logger.log_metric("train.total_ips", calc_ips(bs, it_time))
-            logger.log_metric("train.data_time", data_time)
-            logger.log_metric("train.compute_time", it_time - data_time)
 
        end = time.time()
 
@@ -472,79 +425,16 @@ def validate(
 ):
     print(f"validating...")
     print(f"register_metrics {register_metrics}, logger {logger}.")
-    if register_metrics and logger is not None:
-        logger.register_metric(
-            "val.top1",
-            log.ACC_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=ACC_METADATA,
-        )
-        logger.register_metric(
-            "val.top5",
-            log.ACC_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=ACC_METADATA,
-        )
-        logger.register_metric(
-            "val.loss",
-            log.LOSS_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=LOSS_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "val.total_ips",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.DEFAULT,
-            metadata=IPS_METADATA,
-        )
-        logger.register_metric(
-            "val.data_time",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency",
-            log.PERF_METER(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency_at100",
-            log.LAT_100(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency_at99",
-            log.LAT_99(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
-        logger.register_metric(
-            "val.compute_latency_at95",
-            log.LAT_95(),
-            verbosity=dllogger.Verbosity.VERBOSE,
-            metadata=TIME_METADATA,
-        )
 
     step = get_val_step(model_and_loss)
 
-    top1 = log.AverageMeter()
     # switch to evaluate mode
     model_and_loss.eval()
 
     end = time.time()
 
     data_iter = enumerate(val_loader)
-    if not logger is None:
-        data_iter = logger.iteration_generator_wrapper(data_iter, val=True)
+
    if prof > 0:
        data_iter = utils.first_n(prof, data_iter)
 
@@ -558,19 +448,6 @@ def validate(
 
        it_time = time.time() - end
 
-        top1.record(to_python_float(prec1), bs)
-        if logger is not None:
-            logger.log_metric("val.top1", to_python_float(prec1), bs)
-            logger.log_metric("val.top5", to_python_float(prec5), bs)
-            logger.log_metric("val.loss", to_python_float(loss), bs)
-            logger.log_metric("val.compute_ips", calc_ips(bs, it_time - data_time))
-            logger.log_metric("val.total_ips", calc_ips(bs, it_time))
-            logger.log_metric("val.data_time", data_time)
-            logger.log_metric("val.compute_latency", it_time - data_time)
-            logger.log_metric("val.compute_latency_at95", it_time - data_time)
-            logger.log_metric("val.compute_latency_at99", it_time - data_time)
-            logger.log_metric("val.compute_latency_at100", it_time - data_time)
-
        loss_sum += to_python_float(loss)
        total_val_step += 1
 
@@ -582,7 +459,7 @@ def validate(
            )
            break
 
-    return [top1, loss_sum / total_val_step]
+    return loss_sum / total_val_step
 
 
 # Train loop {{{
@@ -621,6 +498,7 @@
    )
    if is_first_rank:
        ts = str(time.time())
+        # logdir = os.path.expanduser('~/tensorboard/{}/logs/'.format(os.environ['DLTS_JOB_ID']) + ts)
        logdir = os.path.expanduser(
            "~/tensorboard/{}/logs/".format(os.environ["AZUREML_RUN_ID"]) + ts
        )
@@ -646,8 +524,6 @@
            writer.add_scalar("train/summary/scalar/world_size", world_size, epoch)
            mlflow.log_metric("train/world_size", step=epoch, value=world_size)
 
-        if logger is not None:
-            logger.start_epoch()
        if not skip_training:
            total_train_step = train(
                train_loader,
@@ -667,7 +543,7 @@
            )
 
        if not skip_validation and not detector.is_preempted():
-            top1, val_loss = validate(
+            val_loss = validate(
                val_loader,
                model_and_loss,
                fp16,
@@ -677,19 +553,6 @@
                prof=prof,
                register_metrics=epoch == start_epoch,
            )
-            if not detector.is_preempted():
-                prec1, nimg = top1.get_val()
-                if writer:
-                    writer.add_scalar("val/summary/scalar/loss", val_loss, epoch)
-                    writer.add_scalar("val/summary/scalar/prec1", prec1, epoch)
-                mlflow.log_metric("val/loss", step=epoch, value=val_loss)
-                mlflow.log_metric("val/prec1", step=epoch, value=prec1)
-
-        if logger is not None:
-            print(
-                "Epoch ", epoch, " complete with is_preempted ", detector.is_preempted()
-            )
-            logger.end_epoch()
 
        save_ckpt = is_first_rank and (
            detector.is_preempted() or (epoch + 1) % save_checkpoint_epochs == 0
@@ -705,14 +568,8 @@
                save_ckpt = False
        print(f"save ckpt {save_ckpt}, ckpt dir {checkpoint_dir}.")
        if save_ckpt:
-            if not skip_validation and not detector.is_preempted():
-                is_best = logger.metrics["val.top1"]["meter"].get_epoch() > best_prec1
-                best_prec1 = max(
-                    logger.metrics["val.top1"]["meter"].get_epoch(), best_prec1
-                )
-            else:
-                is_best = False
-                best_prec1 = 0
+            is_best = False
+            best_prec1 = 0
 
            ckpt_epoch_index = epoch + 1 if not detector.is_preempted() else epoch
            utils.save_checkpoint(
diff --git a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/main.py b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/main.py
index 3600cef151..13c8f8d066 100644
--- a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/main.py
+++ b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/main.py
@@ -56,7 +56,6 @@
 )
 
 import image_classification.resnet as models
-import image_classification.logger as log
 from image_classification.smoothing import LabelSmoothing
 from image_classification.mixup import NLLMultiLabelSmooth, MixUpWrapper
 
@@ -64,7 +63,6 @@
 from image_classification.training import *
 from image_classification.utils import *
 
-import dllogger
 import torch.multiprocessing as mp
 
 import os
@@ -471,26 +469,6 @@ def _worker_init_fn(id):
        args.data, args.batch_size, 1000, False, workers=args.workers, fp16=args.fp16
    )
 
-    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
-        logger = log.Logger(
-            args.print_freq,
-            [
-                dllogger.StdOutBackend(
-                    dllogger.Verbosity.DEFAULT, step_format=log.format_step
-                ),
-                dllogger.JSONStreamBackend(
-                    dllogger.Verbosity.VERBOSE,
-                    os.path.join(args.workspace, args.raport_file),
-                ),
-            ],
-            last_epoch=start_epoch - 1,
-        )
-
-    else:
-        logger = log.Logger(args.print_freq, [], last_epoch=start_epoch - 1)
-
-    logger.log_parameter(args.__dict__, verbosity=dllogger.Verbosity.DEFAULT)
-
    optimizer = get_optimizer(
        list(model_and_loss.model.named_parameters()),
        args.fp16,
@@ -505,13 +483,11 @@ def _worker_init_fn(id):
    )
 
    if args.lr_schedule == "step":
-        lr_policy = lr_step_policy(
-            args.lr, [30, 60, 80], 0.1, args.warmup, logger=logger
-        )
+        lr_policy = lr_step_policy(args.lr, [30, 60, 80], 0.1, args.warmup, logger=None)
    elif args.lr_schedule == "cosine":
-        lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=logger)
+        lr_policy = lr_cosine_policy(args.lr, args.warmup, args.epochs, logger=None)
    elif args.lr_schedule == "linear":
-        lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=logger)
+        lr_policy = lr_linear_policy(args.lr, args.warmup, args.epochs, logger=None)
 
    if args.amp:
        model_and_loss, optimizer = amp.initialize(
@@ -534,7 +510,7 @@ def _worker_init_fn(id):
        val_loader,
        args.epochs,
        args.fp16,
-        logger,
+        None,
        should_backup_checkpoint(args),
        args.save_checkpoint_epochs,
        use_amp=args.amp,
@@ -549,8 +525,7 @@ def _worker_init_fn(id):
        total_train_step=args.total_train_step,
    )
    exp_duration = time.time() - exp_start_time
-    if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
-        logger.end()
+    print("Experiment ended")
 
    sys.stdout.flush()
 
diff --git a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/requirements.txt b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/requirements.txt
index d605e3bcc0..e69de29bb2 100644
--- a/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/requirements.txt
+++ b/sdk/python/jobs/pipelines/2d_image_classification_with_densenet/imagecnn_train/requirements.txt
@@ -1 +0,0 @@
-git+git://github.com/NVIDIA/dllogger.git@26a0f8f1958de2c0c460925ff6102a4d2486d6cc#egg=dllogger
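Note on the change (commentary, not part of the patch): with dllogger and the Logger/meter machinery removed, validate now returns only the mean validation loss, the per-epoch val/loss and val/prec1 TensorBoard/MLflow scalars are dropped, and checkpoints are always saved with is_best = False and best_prec1 = 0. The mlflow and writer plumbing in train_loop is untouched (e.g. mlflow.log_metric("train/world_size", ...)), so the val/loss curve could still be surfaced without reintroducing dllogger. A minimal sketch of such a follow-up, reusing the surrounding writer, epoch, and val_loss names from train_loop (illustrative only; not part of the reviewed change):

    import mlflow

    def log_val_loss(writer, epoch, val_loss):
        # Mirror of the deleted per-epoch logging, minus the dllogger meters:
        # write the scalar to TensorBoard (when a writer exists) and to MLflow,
        # using the same tags the patch removed.
        if writer:
            writer.add_scalar("val/summary/scalar/loss", val_loss, epoch)
        mlflow.log_metric("val/loss", step=epoch, value=val_loss)

Called right after val_loss = validate(...) in train_loop, this would restore the per-epoch validation-loss curve; restoring val/prec1 would additionally need validate to compute and return a top-1 average again.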