In [1]:
from IPython.display import display, HTML, Markdown
# Inject CSS so the notebook's .container element takes the full page width.
display(HTML("<style>.container { width:100% !important; }</style>"))
# Inject CSS that sets the height of scrolling output areas (div.output_scroll) to 44em.
display(HTML("<style>div.output_scroll { height: 44em; }</style>"))

In [2]:
import sys
import matplotlib

In [3]:
# Make the WatChMaL checkout importable from this notebook. The location can be
# overridden through the WATCHMAL_PATH environment variable; when it is unset the
# original hard-coded path is used, so existing setups keep working unchanged.
import os
WATCHMAL_PATH = os.environ.get('WATCHMAL_PATH', '/home/cjesus/Software/WatchMaL')
# Only append once so that re-running this cell in the same kernel does not
# accumulate duplicate sys.path entries.
if WATCHMAL_PATH not in sys.path:
    sys.path.append(WATCHMAL_PATH)

In [4]:
from analysis.event_display.cnn_mpmt_event_display import CNNmPMTEventDisplay

  print(f"Imported analysis code from WatChMaL repository with git version: {get_git_version(os.path.dirname(__file__))}")


Imported analysis code from WatChMaL repository with git version: None


In [5]:
# Show the documentation of the CNNmPMTEventDisplay constructor (signature and parameters).
help(CNNmPMTEventDisplay.__init__)

Help on function __init__ in module watchmal.dataset.cnn_mpmt.cnn_mpmt_dataset:

__init__(self, h5file, mpmt_positions_file, padding_type=None, transforms=None, collapse_arrays=False)
    Constructs a dataset for CNN data. Event hit data is read in from the HDF5 file and the PMT charge data is
    formatted into an event-display-like image for input to a CNN. Each pixel of the image corresponds to one mPMT
    module, with channels corresponding to each PMT within the mPMT. The mPMTs are placed in the image according to
    a mapping provided by the numpy array in the `mpmt_positions_file`.
    
    Parameters
    ----------
    h5file: string
        Location of the HDF5 file containing the event data
    mpmt_positions_file: string
        Location of a npz file containing the mapping from mPMT IDs to CNN image pixel locations
    transforms: sequence of string
        List of random transforms to apply to data before passing to CNN for data augmentation. Each element of the
        

In [6]:
# hydra imports
import hydra
from omegaconf import OmegaConf
from hydra.utils import instantiate, to_absolute_path

# torch imports
import torch
from torch.nn.parallel import DistributedDataParallel as DDP

import torch.multiprocessing as mp

# generic imports
import logging
import debugpy
import os
import numpy as np

import os
from hydra import initialize, initialize_config_module, initialize_config_dir, compose
from omegaconf import OmegaConf

from watchmal.utils.logging_utils import get_git_version

# Module-level logger registered under the name 'train'.
logger = logging.getLogger('train')

def main(config):
    """
    Run model using given config, spawn worker subprocesses as necessary

    Before anything runs, several config entries are overridden in place (gpu list, batch sizes of the
    train / validation / test data loaders, and the optimizer learning rate). The run dump directory is
    created if it does not exist yet, and a random seed is drawn if the config does not provide one.
    With more than one gpu in config.gpu_list one worker per gpu is spawned; otherwise the worker
    function is called directly in this process.

    Args:
        config  ... hydra config (must expose gpu_list, tasks, dump_path and seed; MASTER_PORT is optional)
    """

    # Notebook-local overrides of the composed config
    config.gpu_list = [0]#[0,1,2,3]
    config.tasks.train.data_loaders.train.batch_size=512
    config.tasks.train.data_loaders.validation.batch_size=512
    config.tasks.evaluate.data_loaders.test.batch_size=512
    config.tasks.train.optimizers.lr=0.001

    ngpus = len(config.gpu_list)
    is_distributed = ngpus > 1

    print(config.gpu_list)

    # Initialize process group env variables
    if is_distributed:
        os.environ['MASTER_ADDR'] = 'localhost'

        if 'MASTER_PORT' in config:
            master_port = config.MASTER_PORT
        else:
            master_port = 12355

        # Automatically select port based on base gpu
        master_port += config.gpu_list[0]
        os.environ['MASTER_PORT'] = str(master_port)

    # create run directory
    # An explicit existence check replaces the former bare `except:` around os.stat, which would also
    # have swallowed unrelated errors such as KeyboardInterrupt. exist_ok=True keeps the creation safe
    # if the directory appears between the check and the makedirs call.
    if not os.path.exists(config.dump_path):
        print("Creating a directory for run dump at : {}".format(config.dump_path))
        os.makedirs(config.dump_path, exist_ok=True)

    print("Dump path: {}".format(config.dump_path))

    # initialize seed
    if config.seed is None:
        # numpy call needed to fix pytorch issue that was patched in August 2020
        config.seed = np.random.randint(100000) #np.random.seed(torch.seed())

    if is_distributed:
        print("Using multiprocessing...")
        devids = ["cuda:{0}".format(x) for x in config.gpu_list]
        print("Using DistributedDataParallel on these devices: {}".format(devids))
        # mp.spawn passes the process index as the first argument (the worker's rank)
        mp.spawn(main_worker_function, nprocs=ngpus, args=(ngpus, is_distributed, config))
    else:
        print("Only one gpu found, not using multiprocessing...")
        main_worker_function(0, ngpus, is_distributed, config)

def main_worker_function(rank, ngpus_per_node, is_distributed, config):
    """
    Worker entry point: build the model on one GPU and run every task listed in the config.

    Args:
        rank            ... index of this process among the spawned processes; also the index into config.gpu_list
        ngpus_per_node  ... number of gpus in use, passed as the world size in multiprocessing mode
        is_distributed  ... boolean, True when running in multiprocessing mode
        config          ... hydra config providing the gpu_list, model, engine, data, tasks, seed and dump_path entries
    """
    print("rank: ", rank)
    # The rank doubles as the position of this worker's device in the gpu list
    device_id = config.gpu_list[rank]

    print("Running main worker function on device: {}".format(device_id))
    torch.cuda.set_device(device_id)

    # Join the process group (NCCL backend, rendezvous through environment variables)
    if is_distributed:
        torch.distributed.init_process_group(
            'nccl', init_method='env://', world_size=ngpus_per_node, rank=rank
        )

    # Build the model and move it to this worker's device
    network = instantiate(config.model).to(device_id)

    if is_distributed:
        # Swap batch norm layers for SyncBatchNorm, then wrap the model for distributed training
        synced_network = torch.nn.SyncBatchNorm.convert_sync_batchnorm(network)
        network = DDP(synced_network, device_ids=[device_id])

    # Build the engine that drives the tasks
    engine = instantiate(
        config.engine,
        model=network,
        rank=rank,
        gpu=device_id,
        dump_path=config.dump_path,
    )

    # Each configuration stage is a separate pass over all tasks, so that every
    # task's data loaders are set up before any optimizer, and every optimizer
    # before any scheduler.
    for _, settings in config.tasks.items():
        if 'data_loaders' in settings:
            engine.configure_data_loaders(config.data, settings.data_loaders, is_distributed, config.seed)

    for _, settings in config.tasks.items():
        if 'optimizers' in settings:
            engine.configure_optimizers(settings.optimizers)

    for _, settings in config.tasks.items():
        if 'scheduler' in settings:
            engine.configure_scheduler(settings.scheduler)

    # Run the tasks: each task name is looked up as a method on the engine
    for task_name, settings in config.tasks.items():
        run_task = getattr(engine, task_name)
        run_task(settings)


# Compose the "resnet_train.yaml" config through hydra's initialize/compose API
# and hand it to main() to launch the run.
# NOTE(review): config_path is a relative path — presumably resolved relative to
# this notebook's location; confirm before moving the notebook.
with initialize(version_base=None, config_path="../../Software/WatchMaL/config/"):
    cfg = compose(config_name="resnet_train.yaml")
    main(cfg)
    #print(cfg)

[0]
Dump path: ./outputs/
Only one gpu found, not using multiprocessing...
rank:  0
Running main worker function on device: 0


RuntimeError: No CUDA GPUs are available

In [None]:
# Compose the same "resnet_train.yaml" config again and print it for inspection,
# without launching a run. This rebinds the notebook-level name `cfg`.
with initialize(version_base=None, config_path="../../Software/WatchMaL/config/"):
    cfg = compose(config_name="resnet_train.yaml")
    print(cfg)

--- Logging error ---
Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 461, in dispatch_queue
    await self.process_one()
  File "/opt/conda/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 450, in process_one
    await dispatch(*args)
TypeError: object NoneType can't be used in 'await' expression

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1089, in emit
    self.flush()
  File "/opt/conda/lib/python3.8/logging/__init__.py", line 1069, in flush
    self.stream.flush()
BrokenPipeError: [Errno 32] Broken pipe
Call stack:
  File "/opt/conda/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/opt/conda/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/opt/conda/lib/python3.8/site-packages/ipykernel_laun