In [None]:
from functools import partial
import numpy as np
import numpy.random as npr
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
from torch.utils.data import DataLoader
import torchvision
import torchvision.transforms as transforms
import ray
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

%load_ext autoreload
%autoreload 2

This notebook performs a hyperparameter optimization (HPO) with `ray-tune` and `pytorch`.

In [None]:
def load_trainval_data(data_file, config):
    """Load the stored dataset splits and wrap train/validation in DataLoaders.

    The object read from ``data_file`` via ``torch.load`` is indexed
    positionally: element 0 is used as training data, element 1 as validation
    data. Any further elements (presumably a test split -- TODO confirm) are
    ignored here.

    Args:
        data_file: Path (or file-like object) accepted by ``torch.load``.
        config: Mapping providing ``'batch_size'`` for the training loader.

    Returns:
        Tuple ``(train_dataloader, val_dataloader)``. The training loader
        shuffles and uses ``config['batch_size']``; the validation loader keeps
        the stored order and uses a fixed batch size of 1024.
    """
    splits = torch.load(data_file)
    train_split, val_split = splits[0], splits[1]

    loaders = (
        DataLoader(train_split, batch_size=config['batch_size'], shuffle=True),
        DataLoader(val_split, batch_size=1024, shuffle=False),
    )
    return loaders

# data_file = '../local_data/TrainData/20230111-165428-R2B5_y13y16_vcg-fluxes_rho_fluct.torch_data'
# tloader,vloader = load_trainval_data(data_file, {'batch_size': 512})
# print(len(tloader))

In [None]:
# List every CUDA device visible to this kernel. Enumerating up to
# device_count() avoids the error a hard-coded index raises on machines with
# fewer GPUs, and the list length doubles as the device count.
gpu_names = [torch.cuda.get_device_name(idx) for idx in range(torch.cuda.device_count())]
gpu_names

In [5]:
from convection_param.NetworksTorch import Sequential, Unet, ResDNN, SeqConv

def train(config, epochs, checkpoint_dir=None, data_file=None, model=None, save_model=False):
    """Train one hyperparameter configuration and report losses to Ray Tune.

    Builds the network selected by ``model``, optimises it with Adam on an MSE
    loss, and after every epoch calls ``tune.report`` with the mean training
    and validation loss (``train_loss`` / ``val_loss``).

    Args:
        config: Mapping of sampled hyperparameters. ``'lr'`` and
            ``'batch_size'`` are always read; the remaining keys depend on the
            chosen architecture (see the constructor calls below).
        epochs: Number of passes over the training data.
        checkpoint_dir: Optional directory containing a ``"checkpoint"`` file
            holding a ``(model_state, optimizer_state)`` tuple to resume from.
        data_file: Path forwarded to ``load_trainval_data``.
        model: Architecture key: ``'seq'``, ``'unet'``, ``'resdnn'`` or
            ``'seqconv'``.
        save_model: Currently unused; kept so existing callers keep working.

    Raises:
        ValueError: If ``model`` is not one of the supported architecture keys.
    """
    print('Starting hpo training run')
    if model == 'seq':
        net = Sequential(input_dim=9*23,
                         output_dim=189,
                         n_hidden=config['n_hidden'],
                         n_layers=config['n_layers'],
                         activation=config['activation'],
                         bn=config['bn'])
    elif model == 'unet':
        net = Unet(n_channels=9,
                   n_classes=8,
                   output_channels_total=189,
                   n_levels=config['n_levels'],
                   n_features=config['n_features'],
                   column_height=23,
                   linear=False,
                   activation=config['activation'],
                   bn1=config['bn1'],
                   bn2=config['bn2'])
    elif model == 'resdnn':
        net = ResDNN(in_size=23*9,
                     out_size=189,
                     n_neurons=config['n_neurons'],
                     bn=config['batch_norm'],
                     n_layers_per_block=config['layers_per_block'],
                     n_levels=config['n_lvls'],
                     activation=config['activation'])
    elif model == 'seqconv':
        net = SeqConv(n_channels=9,
                      n_feature_channels=config['n_channels'],
                      column_height=23,
                      n_hidden=config['n_hidden'],
                      n_layers=config['n_layers'],
                      output_dim=189,
                      activation=config['activation'],
                      kernel_size=config['kernel_size'])
    else:
        # Fail early with a clear message instead of crashing later on
        # ``.to(device)`` with an unrelated AttributeError.
        raise ValueError(
            f"Unknown model {model!r}; expected one of 'seq', 'unet', 'resdnn', 'seqconv'")

    device = "cpu"
    if torch.cuda.is_available():
        # NOTE(review): presumably Ray Tune restricts the visible GPUs per
        # trial, so index 0 is this trial's device -- confirm.
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)

    net.to(device)

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(net.parameters(), lr=config['lr'])

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainloader, valloader = load_trainval_data(data_file, config)
    train_steps = len(trainloader)
    val_steps = len(valloader)

    for epoch in range(epochs):  # loop over the dataset multiple times
        net.train()
        train_loss = 0
        for X, y in trainloader:
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = net(X)
            loss = loss_fn(pred, y)
            train_loss += loss.item()

            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Mean of the per-batch losses over the epoch.
        train_loss /= train_steps

        # Validation loss: eval mode and no gradient tracking for the whole pass.
        val_loss = 0.0
        net.eval()
        with torch.no_grad():
            for X, y in valloader:
                X, y = X.to(device), y.to(device)
                pred = net(X)
                val_loss += loss_fn(pred, y).item()

        val_loss /= val_steps

        tune.report(train_loss=train_loss, val_loss=val_loss)
    print("Finished Training")

In [6]:
import numpy as np
import numpy.random as npr

def feat_lvl_dependence(n_lvl):
    """Return the three largest power-of-two feature counts for ``n_lvl`` levels.

    A fixed budget of 2048 channels is divided by ``2**n_lvl``; the largest
    power of two not exceeding that quotient and the two powers below it are
    returned in ascending order. The whole candidate array is returned -- no
    random pick is made here.

    Args:
        n_lvl: Number of levels (presumably Unet depth -- TODO confirm).

    Returns:
        ``numpy.ndarray`` with three integers, e.g. ``[64, 128, 256]`` for
        ``n_lvl=3``.
    """
    channel_cap = 2048
    top_exponent = int(np.floor(np.log2(channel_cap / 2**n_lvl)))
    exponents = np.arange(top_exponent - 2, top_exponent + 1)
    return 2**exponents

feat_lvl_dependence(3)

array([ 64, 128, 256])

In [8]:
# Search space for the 'seq' architecture; besides 'batch_size' and 'lr' the
# keys match what train() passes to the Sequential constructor.
config_seq = {
    "batch_size": tune.choice([512, 1024]),
    # Earlier alternative for n_hidden: tune.randint(100, 1500)
    "n_hidden": tune.choice([16, 32, 64, 128, 256, 512, 1024, 2048]),
    "activation": tune.choice([F.relu, F.selu, F.gelu, F.sigmoid, F.leaky_relu]),
    "bn": tune.choice([True, False]),
    "n_layers": tune.choice([1, 2, 3, 4, 5, 6]),
    "lr": tune.choice([0.1, 0.01, 0.001, 0.0003, 0.0001]),
}

# Best 'seq' configuration found so far (presumably meant as a
# points_to_evaluate seed for a search algorithm -- TODO confirm).
current_best_params_seq = [
    {
        "activation": F.leaky_relu,
        "batch_size": 1024,
        "bn": True,
        "n_hidden": 2048,
        "n_layers": 6,
        "lr": 0.0003,
    }
]

# Search space for the 'unet' architecture; besides 'batch_size' and 'lr' the
# keys match what train() passes to the Unet constructor.
config_unet = {
    "batch_size": tune.choice([512,1024]),
    # "n_levels": tune.randint(3,6),
    "n_levels": tune.choice([2,3,4,5]),
    # "n_features": tune.choice([16,32,64,128,264]),
    # n_features is conditioned on n_levels. feat_lvl_dependence returns the
    # whole candidate array, so draw exactly ONE value from it here; without
    # the draw every trial would receive a 3-element array as n_features.
    # np.atleast_1d keeps this correct even if the helper is later changed to
    # return a single value.
    "n_features": tune.sample_from(
        lambda spec: int(npr.choice(np.atleast_1d(feat_lvl_dependence(spec.config.n_levels))))),
    "bn1": tune.choice([True,False]),
    "bn2": tune.choice([True,False]),
    "activation": tune.choice([F.relu, F.selu, F.gelu, F.sigmoid, F.leaky_relu]),
    'lr': tune.choice([0.1, 0.01,0.001,0.0003,0.0001]),
}
# Best 'unet' configuration found so far; note that n_features is a scalar.
current_best_params_unet = [{
             "batch_size":512,
             "n_levels":2,
             "n_features":512,
             "bn1": False,
             "bn2": False,
             "activation": F.leaky_relu,
             'lr': 0.0001,
}]

# Search space for the 'resdnn' architecture; besides 'batch_size' and 'lr'
# the keys match what train() reads when building ResDNN. Activations are
# module instances here (nn.*), unlike the functional F.* used elsewhere.
config_resdnn = {
    "batch_size": tune.choice([512, 1024]),
    # Earlier alternative for n_neurons: tune.randint(100, 1500)
    "n_neurons": tune.choice([16, 32, 64, 128, 256, 512, 1024, 2048]),
    "batch_norm": tune.choice([True, False]),
    "layers_per_block": tune.choice([1, 2, 3, 4]),
    "n_lvls": tune.choice([2, 4, 8, 10, 14, 16]),
    "activation": tune.choice([nn.ReLU(), nn.SELU(), nn.GELU(), nn.Sigmoid(), nn.LeakyReLU()]),
    "lr": tune.choice([0.1, 0.01, 0.001, 0.0003, 0.0001]),
}

# Best 'resdnn' configuration found so far.
current_best_params_resdnn = [
    {
        "batch_size": 1024,
        "n_neurons": 2048,
        "batch_norm": True,
        "layers_per_block": 1,
        "n_lvls": 10,
        "activation": nn.ReLU(),
        "lr": 0.0003,
    }
]

# Search space for the 'seqconv' architecture; besides 'batch_size' and 'lr'
# the keys match what train() reads when building SeqConv.
config_seq_conv = {
    "batch_size": tune.choice([512, 1024]),
    "n_channels": tune.choice([1, 8, 32, 64, 256, 512, 1024]),
    # Earlier alternative for n_hidden: tune.randint(100, 1500)
    "n_hidden": tune.choice([16, 32, 64, 128, 256, 512, 1024, 2048]),
    "n_layers": tune.choice([0, 1, 2, 3, 4, 5]),
    "kernel_size": tune.choice([2, 3, 4, 5]),
    "activation": tune.choice([F.relu, F.selu, F.gelu, F.sigmoid, F.leaky_relu]),
    "lr": tune.choice([0.1, 0.01, 0.001, 0.0003, 0.0001]),
}

# Best 'seqconv' configuration found so far.
current_best_params_seq_conv = [
    {
        "batch_size": 512,
        "n_channels": 1024,
        "n_hidden": 1024,
        "n_layers": 1,
        "kernel_size": 5,
        "activation": F.gelu,
        "lr": 0.0001,
    }
]

def get_sp_size(search_space):
    """Return the number of distinct combinations in ``search_space``.

    Multiplies ``len(...)`` of every value in the mapping, so each value must
    support ``len``. An empty mapping yields 1.

    Args:
        search_space: Mapping from parameter name to a sized collection of
            candidate values.

    Returns:
        Product of the sizes of all values.
    """
    n_combinations = 1
    for domain in search_space.values():
        n_combinations *= len(domain)
    return n_combinations

# Print the number of combinations of each purely categorical search space.
# config_unet stays excluded as in the original (presumably because its
# sample_from entry has no length -- TODO confirm).
for _search_space in (config_seq, config_seq_conv, config_resdnn):
    print(get_sp_size(_search_space))

4800
67200
19200


In [None]:
# Architecture to tune; other valid keys are 'seq', 'resdnn' and 'seqconv'.
model = 'unet'
# Ray Tune results for the selected architecture are written below this directory.
log_dir = f'../logs_ray_torch/BestModelRetrain/{model}'
# Architecture key -> search space handed to tune.run.
model_conf_map = dict(
    seq=config_seq,
    seqconv=config_seq_conv,
    resdnn=config_resdnn,
    unet=config_unet,
)

def main(num_samples=10, max_num_epochs=10, grace_period=10, gpus_per_trial=1):
    """Run a Ray Tune search for the globally selected architecture.

    Reads the module-level ``model``, ``model_conf_map`` and ``log_dir``,
    launches ``tune.run`` with an ASHA scheduler minimising ``val_loss``, and
    prints the configuration and final validation loss of the best trial.

    Args:
        num_samples: Number of configurations sampled from the search space.
        max_num_epochs: Epochs per trial; also the scheduler's ``max_t``.
        grace_period: Scheduler ``grace_period`` passed to ``ASHAScheduler``.
        gpus_per_trial: GPUs requested per trial.
    """
    data_file = '../local_data/TrainData/20230131-171851-R2B5_y13y16_vcg-fluxes_rho_fluct.torch_data'
    metric = 'val_loss'
    mode = 'min'

    # Optional search algorithm (disabled):
    # hyperopt_search = HyperOptSearch(
    #     metric=metric, mode=mode, space=model_conf_map[model])#,
        # points_to_evaluate=current_best_params_resdnn)
    scheduler = ASHAScheduler(
        metric=metric,
        mode=mode,
        max_t=max_num_epochs,
        grace_period=grace_period,
        reduction_factor=2)
    # Column names must match the keys train() passes to tune.report;
    # the training loss is reported as "train_loss", not "loss".
    reporter = CLIReporter(
        metric_columns=["train_loss", "val_loss"])
    result = tune.run(
        partial(train, epochs=max_num_epochs, data_file=data_file, model=model),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=model_conf_map[model],
        num_samples=num_samples,
        scheduler=scheduler,
        # search_alg=hyperopt_search,
        progress_reporter=reporter,
        local_dir=log_dir)

    # Reuse metric/mode so the selection criterion cannot drift from the scheduler's.
    best_trial = result.get_best_trial(metric, mode, "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result[metric]))

if __name__ == "__main__":
    # NOTE(review): dashboard_host='0.0.0.0' binds the Ray dashboard to all
    # network interfaces; confirm this exposure is intended on shared machines.
    ray.init(dashboard_host = '0.0.0.0')
    try:
        # Full search: main(num_samples=50, max_num_epochs=100, grace_period=30, gpus_per_trial=1)
        main(num_samples=4, max_num_epochs=10, grace_period=10, gpus_per_trial=1)
    finally:
        # Always shut Ray down, even if the search fails, so the cell can be
        # re-run without hitting an already-initialised Ray instance.
        ray.shutdown()