In [6]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('/opt/slh/icecube/')

import os
os.environ["CUDA_VISIBLE_DEVICES"]="1,0"
#os.environ["CUDA_VISIBLE_DEVICES"]="1"
#os.environ["NCCL_P2P_DISABLE"] = "1"

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
import polars as pl
import pandas as pd
import gc
import os
import numpy as np
from icecube.fastai_fix import *
from tqdm.notebook import tqdm
from icecube.data_train import RandomChunkSampler,LenMatchBatchSampler,IceCubeCache, DeviceDataLoader
from icecube.loss import loss, loss_vms
from icecube.models import EncoderWithDirectionReconstructionV18
from fastxtend.vision.all import EMACallback

In [8]:
OUT = 'V18FT'
PATH = '../data/'

NUM_WORKERS = 24
SEED = 2023
bs = 1024
L = 192

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(SEED)
os.makedirs(OUT, exist_ok=True)

In [9]:
def WrapperAdamW(param_groups,**kwargs):
    return OptimWrapper(param_groups,torch.optim.AdamW)



In [10]:
fname = OUT

ds_train = IceCubeCache(PATH, mode='train', L=L, reduce_size=0.125)
ds_train_len = IceCubeCache(PATH, mode='train', L=L, reduce_size=0.125, mask_only=True)
sampler_train = RandomChunkSampler(ds_train_len, chunks=ds_train.chunks)
len_sampler_train = LenMatchBatchSampler(sampler_train, batch_size=bs, drop_last=True)
dl_train = DeviceDataLoader(torch.utils.data.DataLoader(ds_train, 
            batch_sampler=len_sampler_train, num_workers=4, persistent_workers=True))

ds_val = IceCubeCache(PATH, mode='eval', L=L)
ds_val_len = IceCubeCache(PATH, mode='eval', L=L, mask_only=True)
sampler_val = torch.utils.data.SequentialSampler(ds_val_len)
len_sampler_val = LenMatchBatchSampler(sampler_val, batch_size=bs, drop_last=False)
dl_val= DeviceDataLoader(torch.utils.data.DataLoader(ds_val, batch_sampler=len_sampler_val,
            num_workers=0))

data = DataLoaders(dl_train,dl_val)
model = EncoderWithDirectionReconstructionV18()
model.load_state_dict(torch.load('/opt/slh/icecube/hb_training_loop/V18/models/model_7.pth'))
model = nn.DataParallel(model)
model = model.cuda()
learn = Learner(data, model,  path = OUT, loss_func=loss_vms,cbs=[GradientClip(3.0),CSVLogger(),EMACallback(),
            SaveModelCallback(monitor='loss',comp=np.less,every_epoch=True),
            GradientAccumulation(n_acc=4)],
            metrics=[loss], opt_func=partial(WrapperAdamW,eps=1e-7)).to_fp16()




In [None]:
learn.fit_one_cycle(8, 
                    lr_max=2e-5,
                    wd=0.05, 
                    pct_start=0.01, 
                    div=25,
                    div_final=25,
                    moms=(0.95,0.95,0.95))

epoch,train_loss,valid_loss,loss,time
0,1.336601,1.347999,0.97881,2:32:17
1,1.341409,1.346446,0.978643,2:32:33
2,1.314246,1.345321,0.97842,2:32:42
3,1.325136,1.344466,0.977938,2:32:55
4,1.326901,1.344888,0.977807,2:33:01
5,1.311469,1.344948,0.977622,2:33:11
6,1.31535,1.344665,0.977547,2:33:15


  L = max(1,L // 16)
  if not isinstance(inputs, collections.Container) or isinstance(inputs, torch.Tensor):


In [None]:
l