In [1]:
#!accelerate launch --mixed_precision fp16 ./train_accel.py --config_name EXP_02

In [2]:
# !CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_HF_V1

In [3]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_HF_V2

In [4]:
# !CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_EMBED_V0

In [5]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_EMBED_V1

In [6]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_EMBED_V2

In [7]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name MATGRAPH

In [8]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name MATGRAPHV2

In [9]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_EMBED_V3

In [10]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_EMBED_V5

In [11]:
#!CUDA_VISIBLE_DEVICES=0 python train.py --config_name BASELINE_HF_V5

In [12]:
#!CUDA_VISIBLE_DEVICES=1 python train.py --config_name BASELINE_HF_V6

In [13]:
#!CUDA_VISIBLE_DEVICES=1 python train.py --config_name BASELINE_HF_V7

In [14]:
#!CUDA_VISIBLE_DEVICES=1 python train.py --config_name BASELINE_HF_V8

In [15]:
#!CUDA_VISIBLE_DEVICES=1 python train.py --config_name BASELINE_HF_V9

In [16]:
#!CUDA_VISIBLE_DEVICES=1 python eval.py --config_name BASELINE_HF_V8FTEVAL

In [17]:
def set_gpu_environ():
    """Restrict CUDA_VISIBLE_DEVICES to the GPU(s) with the least memory in use.

    Asks ``nvidia-smi`` for the used memory of every GPU and exports the
    indices of all GPUs tied for the minimum as a comma-separated list.
    Meant to be used in notebooks only.

    If ``nvidia-smi`` reports no GPU rows the environment is left untouched.
    Errors from launching ``nvidia-smi`` (missing binary, non-zero exit status)
    propagate from ``subprocess.check_output`` unchanged.
    """
    import os
    import subprocess

    report = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv']
    ).decode()
    # The first line is the CSV header. Skip it and any blank lines so the
    # result does not depend on whether the output ends with a newline.
    rows = [line.strip() for line in report.splitlines()[1:] if line.strip()]
    used_mib = [int(row.replace(" MiB", "")) for row in rows]
    if not used_mib:
        # Nothing to choose from; do not clobber an existing setting.
        return

    lowest = min(used_mib)  # computed once instead of once per GPU
    least_loaded = [str(idx) for idx, used in enumerate(used_mib) if used == lowest]

    # nvidia-smi numbers GPUs in PCI bus order, whereas CUDA's default
    # enumeration is fastest-first. Align the two (unless the user already
    # chose an order) so the indices exported below name the intended devices.
    os.environ.setdefault("CUDA_DEVICE_ORDER", "PCI_BUS_ID")
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(least_loaded)
# Runs when this cell executes, ahead of the later cell that imports torch.
set_gpu_environ()

In [18]:
import os
import random
import numpy as np
import torch
import config

def seed(seed=0):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also turns on cuDNN's deterministic flag and turns off its benchmark flag.

    Args:
        seed: value handed to every seeding call (defaults to 0).
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    

def train(cfg):
    """Build the training objects described by ``cfg`` and run ``cfg.FIT_FUNC``.

    Args:
        cfg: configuration object exposing MODEL_NAME, MODEL_WTS, OPT, LR, WD,
            LOSS_FUNC, SCHEDULER, WARM_UP_PCT, EPOCHS, FIT_FUNC, METRIC,
            FOLDER, EXP_NAME and DEVICE.

    Returns:
        None. Whatever ``cfg.FIT_FUNC`` returns is discarded.
    """
    seed()  # always the default seed, before the model is constructed
    custom_model = cfg.MODEL_NAME()
    if cfg.MODEL_WTS:
        print(f"Loading model weights from {cfg.MODEL_WTS}")
        # Load onto CPU first: a checkpoint saved from a GPU index that is not
        # visible in this process would otherwise fail to deserialize.
        # load_state_dict copies the values into the model's own parameters.
        state_dict = torch.load(cfg.MODEL_WTS, map_location="cpu")
        custom_model.load_state_dict(state_dict)

    opt = cfg.OPT(
        custom_model.parameters(), lr=cfg.LR, weight_decay=cfg.WD
    )
    loss_func = cfg.LOSS_FUNC()
    # NOTE(review): both step counts are derived from EPOCHS, so this
    # presumably expects one scheduler step per epoch — confirm against FIT_FUNC.
    scheduler = cfg.SCHEDULER(
        opt,
        num_warmup_steps=cfg.WARM_UP_PCT * cfg.EPOCHS,
        num_training_steps=cfg.EPOCHS,
    )

    cfg.FIT_FUNC(
        epochs=cfg.EPOCHS,
        model=custom_model,
        loss_fn=loss_func,
        opt=opt,
        metric=cfg.METRIC,
        config=cfg,
        folder=cfg.FOLDER/cfg.EXP_NAME,
        exp_name=f"{cfg.EXP_NAME}",
        device=cfg.DEVICE,
        sched=scheduler,
    )
    
def main(config_name):
    """Look up a configuration by name in the ``config`` module and train with it.

    Args:
        config_name: name of an attribute of the ``config`` module.

    Raises:
        AttributeError: if ``config`` has no attribute called ``config_name``.
        FileExistsError: if the experiment folder already exists.
    """
    # Plain attribute lookup: unlike eval(), this cannot execute arbitrary
    # code that happens to be embedded in the name.
    configs = getattr(config, config_name)
    print(f"Training with config: {configs.__dict__}")
    # No exist_ok: re-running an existing experiment name raises instead of
    # reusing its folder. NOTE(review): presumably deliberate to avoid
    # overwriting earlier results — confirm.
    os.makedirs(configs.FOLDER/configs.EXP_NAME)
    train(configs)

  warn(f"Failed to load image Python extension: {e}")


[1;34mgraphnet[0m: [32mINFO    [0m 2023-02-24 12:08:46 - get_logger - Writing log to [1mlogs/graphnet_20230224-120846.log[0m


In [None]:
# Start a training run with the config attribute named 'BASELINE_graph_V10'.
main('BASELINE_graph_V10')

Training with config: {'__module__': 'config', 'EXP_NAME': 'EXP_35', 'LOSS_FUNC': <class 'icecube.modelsgraph.gVonMisesFisher3DLoss'>, 'NUM_WORKERS': 22, 'MODEL_NAME': <class 'icecube.modelsgraph.EGNNModelV9'>, 'FIT_FUNC': <function gfit_shuflle at 0x7fd2739d3b90>, 'METRIC': <function gget_score_vector at 0x7fd2739d3ef0>, 'TRN_DATASET': <class 'icecube.graphdataset.GraphDasetV0'>, 'VAL_DATASET': <class 'icecube.graphdataset.GraphDasetV0'>, 'TRN_BATCH_RANGE': [[1, 100], [100, 200], [200, 300], [300, 400], [400, 500], [500, 600]], 'EPOCHS': 12, 'DEVICE': 'cuda:0', '__doc__': None}


[34m[1mwandb[0m: Currently logged in as: [33mdrhb[0m ([33mkaggle-hi[0m). Use [1m`wandb login --relogin`[0m to force relogin


epoch,train_loss,valid_loss,val_metric


trn_range: [1, 100]




Better model found at epoch 0 with value: 1.1558771133422852.
   epoch  train_loss  valid_loss     metric
0      0    2.582576    1.971129  1.1558771
trn_range: [100, 200]




Better model found at epoch 1 with value: 1.075687289237976.
   epoch  train_loss  valid_loss     metric
0      1    1.868299    1.722111  1.0756873
trn_range: [200, 300]




Better model found at epoch 2 with value: 1.0643270015716553.
   epoch  train_loss  valid_loss    metric
0      2    1.702401    1.674164  1.064327
trn_range: [300, 400]




Better model found at epoch 3 with value: 1.0614221096038818.
   epoch  train_loss  valid_loss     metric
0      3    1.657294    1.634328  1.0614221
trn_range: [400, 500]




Better model found at epoch 4 with value: 1.0651195049285889.
   epoch  train_loss  valid_loss     metric
0      4    1.626149    1.645543  1.0651195
trn_range: [500, 600]




Better model found at epoch 5 with value: 1.049655795097351.
   epoch  train_loss  valid_loss     metric
0      5    1.604195    1.582335  1.0496558
trn_range: [1, 100]




Better model found at epoch 6 with value: 1.0420535802841187.
   epoch  train_loss  valid_loss     metric
0      6    1.589404    1.573633  1.0420536
trn_range: [100, 200]




Better model found at epoch 7 with value: 1.0442147254943848.
   epoch  train_loss  valid_loss     metric
0      7     1.57626    1.588152  1.0442147
trn_range: [200, 300]




In [None]:
# Display the index of the currently selected CUDA device.
torch.cuda.current_device()

In [None]:
# NOTE(review): `k` is not defined in any visible cell, so this raises
# NameError on a fresh kernel — leftover scratch or an intentional stop; confirm.
k

In [None]:
# NOTE(review): `k` is not defined in any visible cell, so this raises
# NameError on a fresh kernel — leftover scratch or an intentional stop; confirm.
k