In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/opt/slh/icecube/')

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1,0"
#os.environ["CUDA_VISIBLE_DEVICES"]="1"
#os.environ["NCCL_P2P_DISABLE"] = "1"

In [2]:
import polars as pl
import pandas as pd
import gc
import os
import numpy as np
from icecube.fastai_fix import *
from tqdm.notebook import tqdm
from icecube.data_train import RandomChunkSampler,LenMatchBatchSampler,IceCubeCache, DeviceDataLoader
from icecube.loss import loss, loss_vms
from icecube.models import EncoderWithDirectionReconstructionV17

  warn(f"Failed to load image Python extension: {e}")


[1;34mgraphnet[0m: [32mINFO    [0m 2023-03-22 23:52:30 - get_logger - Writing log to [1mlogs/graphnet_20230322-235230.log[0m


In [3]:
# --- Run identity and data location ---------------------------------------
OUT = 'V17'         # run tag; also created as a directory and used as the Learner path
PATH = '../data/'   # first argument handed to every IceCubeCache dataset

# --- Batching ---------------------------------------------------------------
bs = 1024           # batch_size given to LenMatchBatchSampler
L = 192             # forwarded as `L=` to IceCubeCache (meaning defined in icecube.data_train)

# --- Misc -------------------------------------------------------------------
SEED = 2023         # value passed to seed_everything
NUM_WORKERS = 24    # NOTE(review): not referenced by the loaders in the visible cells

def seed_everything(seed):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus every CUDA device), exports ``PYTHONHASHSEED`` and configures
    cuDNN for reproducible kernel selection.

    Args:
        seed: Integer seed applied to all generators.

    Notes:
        * ``PYTHONHASHSEED`` is read at interpreter start-up, so setting it
          here only affects processes spawned afterwards.
        * ``cudnn.benchmark`` is disabled: autotuning may pick different
          kernels from run to run, which defeats ``cudnn.deterministic``.
    """
    # Local import so the function does not depend on a star import
    # happening to expose `random`.
    import random

    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed all visible GPUs, not just the current one (the model is wrapped
    # in nn.DataParallel). Silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Make sure the run directory exists (no error if it is already there),
# then fix all random seeds for this session.
os.makedirs(OUT, exist_ok=True)
seed_everything(SEED)

In [4]:
def WrapperAdamW(param_groups, **kwargs):
    """Optimizer factory wrapping ``torch.optim.AdamW`` in an ``OptimWrapper``.

    Args:
        param_groups: Parameter groups to optimise, as supplied by the Learner.
        **kwargs: Optimizer hyper-parameters (e.g. ``lr``, ``eps``). They are
            forwarded to ``OptimWrapper``; previously they were accepted but
            silently dropped, so a ``partial(WrapperAdamW, eps=...)`` had no
            effect on the created optimizer.

    Returns:
        The ``OptimWrapper`` instance built around ``torch.optim.AdamW``.
    """
    # NOTE(review): assumes OptimWrapper passes extra keyword arguments on to
    # the optimizer constructor (as fastai's OptimWrapper does) — confirm
    # against the version exposed by icecube.fastai_fix.
    return OptimWrapper(param_groups, torch.optim.AdamW, **kwargs)



In [5]:
fname = OUT

# --- Training pipeline ------------------------------------------------------
# Two views of the same training configuration: the full dataset feeds the
# DataLoader, while the `mask_only=True` instance is only handed to the sampler.
train_kwargs = dict(mode='train', L=L, reduce_size=0.125)
ds_train = IceCubeCache(PATH, **train_kwargs)
ds_train_len = IceCubeCache(PATH, mask_only=True, **train_kwargs)
sampler_train = RandomChunkSampler(ds_train_len, chunks=ds_train.chunks)
len_sampler_train = LenMatchBatchSampler(sampler_train, batch_size=bs, drop_last=True)
train_loader = torch.utils.data.DataLoader(
    ds_train,
    batch_sampler=len_sampler_train,
    num_workers=4,
    persistent_workers=True,
)
dl_train = DeviceDataLoader(train_loader)

# --- Validation pipeline ----------------------------------------------------
# Same layout as above, but sequential order, no dropped batch and no workers.
val_kwargs = dict(mode='eval', L=L)
ds_val = IceCubeCache(PATH, **val_kwargs)
ds_val_len = IceCubeCache(PATH, mask_only=True, **val_kwargs)
sampler_val = torch.utils.data.SequentialSampler(ds_val_len)
len_sampler_val = LenMatchBatchSampler(sampler_val, batch_size=bs, drop_last=False)
val_loader = torch.utils.data.DataLoader(
    ds_val,
    batch_sampler=len_sampler_val,
    num_workers=0,
)
dl_val = DeviceDataLoader(val_loader)

# --- Model and learner ------------------------------------------------------
data = DataLoaders(dl_train, dl_val)
# Wrapped in DataParallel; the GPUs available are the ones exposed through
# CUDA_VISIBLE_DEVICES in the first cell.
model = nn.DataParallel(EncoderWithDirectionReconstructionV17())

callbacks = [
    GradientClip(3.0),
    CSVLogger(),
    SaveModelCallback(monitor='loss', comp=np.less, every_epoch=True),
    GradientAccumulation(n_acc=4),
]

# Trained with `loss_vms`; `loss` is reported as the metric. Mixed precision
# is switched on through `.to_fp16()`.
learn = Learner(
    data,
    model,
    path=OUT,
    loss_func=loss_vms,
    cbs=callbacks,
    metrics=[loss],
    opt_func=partial(WrapperAdamW, eps=1e-7),
).to_fp16()




In [None]:
# Train for 8 epochs with the one-cycle schedule: peak learning rate 5e-4,
# weight decay 0.05, and pct_start=0.01.
learn.fit_one_cycle(
    8,
    lr_max=5e-4,
    wd=0.05,
    pct_start=0.01,
)

epoch,train_loss,valid_loss,loss,time
0,1.560018,1.555013,1.051564,2:16:48
1,1.483454,1.494784,1.02777,2:17:03
2,1.446636,1.458646,1.018197,2:17:00
3,1.396156,1.422105,0.997488,2:16:51
4,1.389978,1.392831,0.993274,2:16:56
5,1.341305,1.374504,0.986461,2:17:27
6,1.319786,1.361925,0.98131,2:17:18


  L = max(1,L // 16)


In [None]:
# Run the learner's validation pass; as the last expression of the cell,
# its return value is displayed as the cell output.
learn.validate()

In [None]:
#learn.save('TEMP')

In [None]:
# NOTE(review): bare one-letter expression, presumably an unfinished cell.
# `l` is not assigned in any visible cell — confirm and remove.
l