In [1]:
import os
import random
import numpy as np
import torch

def set_gpu_environ():
    """Expose only the least-loaded GPU(s) through CUDA_VISIBLE_DEVICES.

    Asks ``nvidia-smi`` for the memory currently used on every GPU, keeps
    every device whose usage equals the minimum, writes their indices
    (comma separated) to ``os.environ["CUDA_VISIBLE_DEVICES"]`` and prints
    that same string.
    Meant to be used in notebooks only.

    Raises:
        ValueError: if ``nvidia-smi`` returns no GPU rows, or a row cannot
            be parsed as an integer number of MiB.
        Exceptions from ``subprocess.check_output`` propagate unchanged
        (e.g. when ``nvidia-smi`` is missing or exits with an error).
    """
    import os
    import subprocess

    raw = subprocess.check_output(
        ['nvidia-smi', '--query-gpu=memory.used', '--format=csv']
    ).decode()
    # The first line is the CSV header. Blank lines are filtered instead of
    # blindly dropping the last element, so the final GPU row survives even
    # when the output does not end with a newline.
    rows = [line.strip() for line in raw.splitlines()[1:] if line.strip()]
    if not rows:
        raise ValueError("nvidia-smi reported no GPUs")
    utilization = [int(row.replace(" MiB", "")) for row in rows]
    # Compute the minimum once rather than once per device.
    lowest = min(utilization)
    free = [str(idx) for idx, used in enumerate(utilization) if used == lowest]
    set_visible = ",".join(free)
    os.environ["CUDA_VISIBLE_DEVICES"] = set_visible
    print(set_visible)
# Select the least-loaded GPU(s) now; the chosen indices are printed as this cell's output.
set_gpu_environ()

1


In [2]:
import sys
sys.path.append('/opt/slh/icecube/')
import config
import os
import time
import math
import pickle
from contextlib import nullcontext
from transformers.optimization import get_cosine_schedule_with_warmup
import numpy as np
import torch
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed import init_process_group, destroy_process_group
from icecube.modelsgraph import DynEdgeV1, gVonMisesFisher3DLoss, GraphxTransformerV4
from icecube.graphdataset import GraphDasetV0
from icecube.utils import gget_score_vector
from pathlib import Path
from datasets import load_dataset, load_from_disk, concatenate_datasets
from torch_geometric.loader import DataLoader as gDataLoader
import random
import os
import pandas as pd
from tqdm import tqdm
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint, LearningRateMonitor


  warn(f"Failed to load image Python extension: {e}")


graphnet: INFO     2023-03-09 14:26:59 - get_logger - Writing log to logs/graphnet_20230309-142659.log


In [3]:
class IceCubeModel(pl.LightningModule):
    """LightningModule that wires together a network, a loss and a validation metric.

    All constructor arguments (including anything passed through ``**kwargs``)
    are recorded with ``save_hyperparameters()`` and read back through
    ``self.hparams`` when the optimizer and scheduler are built.

    Args:
        model_name: zero-argument callable (e.g. a class) returning the network.
        loss_func: zero-argument callable (e.g. a class) returning the loss;
            the result is called as ``loss(out, batch.y)``.
        metric_func: callable invoked as ``metric_func(out, batch.y)`` during
            validation.
        dl_len: total number of scheduler steps, passed as
            ``num_training_steps`` to the cosine schedule.
        max_lr: learning rate given to AdamW.
        wd: weight decay given to AdamW.
        beta_1: first AdamW beta.
        beta_2: second AdamW beta.
        **kwargs: extra hyperparameters. ``configure_optimizers`` reads
            ``self.hparams.warmup_steps``, so ``warmup_steps`` is expected to
            be supplied here.
    """

    def __init__(
        self,
        model_name, 
        loss_func, 
        metric_func,
        dl_len,
        max_lr, 
        wd, 
        beta_1, 
        beta_2,
        **kwargs,
    ):
        super().__init__()
        self.save_hyperparameters()
        self.model = model_name()
        self.loss_func = loss_func()
        self.metric_func = metric_func

    def forward(self, batch):
        """Run the wrapped network on ``batch`` and return its output."""
        return self.model(batch)

    def training_step(self, batch, batch_idx):
        """Compute the loss for one batch, log it, and return it under the key "loss"."""
        out = self.forward(batch)
        loss = self.loss_func(out, batch.y)
        self.log_dict({"loss/train_step": loss})
        return {"loss": loss}

    def training_epoch_end(self, training_step_outputs):
        """Log the unweighted mean of the per-step training losses as "loss/train"."""
        avg_loss = torch.stack([x["loss"] for x in training_step_outputs]).mean()
        self.log("loss/train", avg_loss, sync_dist=True)

    def validation_step(self, batch, batch_idx):
        """Return the loss and metric for one validation batch.

        Returns:
            dict with keys "val_loss" and "metric".
        """
        out = self.forward(batch)
        loss = self.loss_func(out, batch.y)
        score = self.metric_func(out, batch.y)

        output = {
            "val_loss": loss,
            # as_tensor converts scalars/arrays and passes an existing tensor
            # through untouched; torch.tensor() on a tensor would copy it and
            # emit a copy-construct UserWarning.
            "metric": torch.as_tensor(score),
        }

        return output

    def validation_epoch_end(self, outputs):
        """Average the per-batch validation loss and metric and log both."""
        loss_val = torch.stack([x["val_loss"] for x in outputs]).mean()
        metric = torch.stack([x["metric"] for x in outputs]).mean()

        self.log_dict(
            {"loss/valid": loss_val, "metric": metric},
            prog_bar=True,
            sync_dist=True,
        )

    def configure_optimizers(self):
        """Build AdamW over the network's parameters plus a per-step cosine schedule with warmup.

        Reads ``max_lr``, ``wd``, ``beta_1``, ``beta_2``, ``warmup_steps`` and
        ``dl_len`` from ``self.hparams``.
        """
        optimizer = torch.optim.AdamW(self.model.parameters(),
                                  lr=self.hparams.max_lr, 
                                  weight_decay=self.hparams.wd, 
                                  betas=(self.hparams.beta_1, self.hparams.beta_2))

        scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.warmup_steps,
                num_training_steps=self.hparams.dl_len,
                )

        return {
            "optimizer": optimizer,
            # "interval": "step" asks Lightning to advance the scheduler per step rather than per epoch.
            "lr_scheduler": {"scheduler": scheduler, "interval": "step", "frequency": 1},
        }



In [4]:
def get_config(config_name):
    """Fetch an experiment configuration by name and make sure its output folder exists.

    Args:
        config_name: name of an attribute of the ``config`` module
            (for example ``'EXP_100'``).

    Returns:
        The attribute found on the ``config`` module. Its ``FOLDER`` and
        ``EXP_NAME`` attributes are used to build the output directory.

    Raises:
        AttributeError: if the ``config`` module has no such attribute.
    """
    import importlib

    # Resolve the module through the import system rather than the global name
    # `config`: the notebook rebinds that name to the selected experiment, which
    # would make a second call look the attribute up on the wrong object.
    config_module = importlib.import_module("config")
    # Plain attribute lookup; no need to eval a formatted string.
    configs = getattr(config_module, config_name)
    print(f"Training with config: {configs.__dict__}")
    # exist_ok lets the cell be re-run without failing on the existing folder.
    os.makedirs(configs.FOLDER / configs.EXP_NAME, exist_ok=True)
    return configs

In [5]:
# NOTE(review): this rebinds the name `config` -- until here the imported settings
# module -- to the selected experiment object; every later cell reads its settings from it.
config = get_config('EXP_100')

Training with config: {'__module__': 'config', 'FOLDER': Path('/opt/slh/icecube/RESULTS'), 'DATA_CACHE_DIR': Path('/opt/slh/icecube/data/hf_cashe'), 'EXP_NAME': 'EXP_100', 'TRN_BATCH_RANGE': (1, 650), 'VAL_BATCH_RANGE': (655, 656), 'METRIC': <function gget_score_vector at 0x7fc221164290>, 'TRN_DATASET': <class 'icecube.graphdataset.GraphDasetV0'>, 'VAL_DATASET': <class 'icecube.graphdataset.GraphDasetV0'>, 'BATCH_SIZE': 768, 'NUM_WORKERS': 22, 'PRESISTENT_WORKERS': True, 'LOSS_FUNC': <class 'icecube.modelsgraph.gVonMisesFisher3DLoss'>, 'MAX_LR': 0.0005, 'WD': 0.1, 'GRADIEN_ACCUMULATION_STEPS': 12, 'WARMUP_STEPS': 1000, 'BETA1': 0.9, 'BETA2': 0.95, 'MODEL': <class 'icecube.modelsgraph.GraphxTransformerV4'>, 'SCHEDULER': <function get_cosine_schedule_with_warmup at 0x7fbf7f5efd40>, '__dict__': <attribute '__dict__' of 'EXP_100' objects>, '__weakref__': <attribute '__weakref__' of 'EXP_100' objects>, '__doc__': None}


In [6]:

    
def _make_loader(dataset):
    """Wrap `dataset` in a graph DataLoader using the loader settings from `config`."""
    return gDataLoader(
        dataset,
        batch_size=config.BATCH_SIZE,
        shuffle=False,
        num_workers=config.NUM_WORKERS,
        pin_memory=True,
        persistent_workers=config.PRESISTENT_WORKERS,
    )


# --- Validation data: every cached batch file in VAL_BATCH_RANGE (end exclusive).
vld_pth = [
    load_from_disk(config.DATA_CACHE_DIR / f"batch_{i}.parquet")
    for i in range(config.VAL_BATCH_RANGE[0], config.VAL_BATCH_RANGE[1])
]
vld_pth = concatenate_datasets(vld_pth)
vld_ds = config.VAL_DATASET(vld_pth)
valid_dl = _make_loader(vld_ds)

# --- Training data: the batch files are concatenated in a random order; the
# loader itself does not shuffle (shuffle=False).
nums = list(range(config.TRN_BATCH_RANGE[0], config.TRN_BATCH_RANGE[1]))
random.shuffle(nums)
trn_pth = [
    load_from_disk(config.DATA_CACHE_DIR / f"batch_{i}.parquet") for i in nums
]
trn_pth = concatenate_datasets(trn_pth)
trn_ds = config.TRN_DATASET(trn_pth)
train_dl = _make_loader(trn_ds)

# Number of optimizer steps in one pass over train_dl. Rounded up because
# Lightning also steps the optimizer on the final, partial accumulation window;
# flooring would leave the LR schedule one step short.
dl_len = math.ceil(len(train_dl) / config.GRADIEN_ACCUMULATION_STEPS)




In [7]:
# Build the Lightning module from the experiment settings. `warmup_steps` is
# not a named constructor argument; it is forwarded through **kwargs.
md = IceCubeModel(
    model_name=config.MODEL,
    loss_func=config.LOSS_FUNC,
    metric_func=config.METRIC,
    dl_len=dl_len,
    max_lr=config.MAX_LR,
    wd=config.WD,
    beta_1=config.BETA1,
    beta_2=config.BETA2,
    warmup_steps=config.WARMUP_STEPS,
)
# Optional warm start from earlier weights (currently disabled):
#md.model.load_state_dict(torch.load('/opt/slh/icecube/RESULTS/EXP_25_FT/EXP_25_FT_2.pth'))
#md.load_from_checkpoint("");

In [None]:


# Experiment tracking on Weights & Biases, one run per experiment name.
wandb_logger = WandbLogger(
    project="ice",
    entity="kaggle-hi",
    name=config.EXP_NAME,
)

# Checkpoints go to the experiment folder: the 8 best by the logged "metric"
# value, plus the most recent one.
chekpoint_callback = ModelCheckpoint(
    dirpath=config.FOLDER / config.EXP_NAME,
    filename="{epoch:02d}-{metric:.4f}",
    monitor="metric",
    save_top_k=8,
    save_last=True,
)

# Record the learning rate at every step.
lr_monitor = LearningRateMonitor(logging_interval='step')

# Single-epoch, single-GPU, 16-bit run with gradient accumulation and clipping.
# Validation is triggered every len(train_dl)//8 training batches.
trainer = pl.Trainer(
    max_epochs=1,
    accelerator='gpu',
    devices=1,
    precision=16,
    accumulate_grad_batches=config.GRADIEN_ACCUMULATION_STEPS,
    gradient_clip_val=1.0,
    val_check_interval=len(train_dl)//8,
    logger=wandb_logger,
    callbacks=[chekpoint_callback, lr_monitor],
)

trainer.fit(
    model=md,
    train_dataloaders=train_dl,
    val_dataloaders=valid_dl,
)

[34m[1mwandb[0m: Currently logged in as: [33mdrhb[0m ([33mkaggle-hi[0m). Use [1m`wandb login --relogin`[0m to force relogin


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]