In [2]:
from pytorch_lightning import Trainer, seed_everything

seed_everything(42)

from pytorch_lightning import LightningModule, LightningDataModule
import torch
from pytorch_lightning import trainer, LightningModule
from torch.nn import functional as F
import torch
import torchmetrics
import timm

from torch import nn
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings

warnings.filterwarnings("ignore", "is_categorical_dtype")
warnings.filterwarnings("ignore", "use_inf_as_na")

Seed set to 42


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from torch.utils.data import (
    DataLoader,
    SequentialSampler,
    RandomSampler,
    WeightedRandomSampler,
    Dataset,
)
import numpy as np


class TableDataset(Dataset):
    def __init__(self, df, features: list, label: list, num_classes=2, y_type="bt"):
        super(Dataset, self).__init__()
        assert isinstance(df, pd.DataFrame)
        assert isinstance(features, list)
        assert isinstance(label, list)

        for feature in features + label:
            assert feature in df.columns

        self.df = df.dropna(subset=features + label)
        assert len(self.df) > 0
        self.features = features
        self.label = label
        self.num_classes = num_classes
        self.y_type = y_type
        self._init_dataset()

    def _init_dataset(self):
        X = torch.tensor(self.df[self.features].values).float()

        y = torch.tensor(self.df[self.label].values)
        if (self.num_classes != len(self.label)) and self.y_type == "bt":
            y = F.one_hot(
                torch.tensor(y).long(), num_classes=self.num_classes
            ).squeeze()

        self.X = X
        self.y = y

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


class DatasetModule(LightningDataModule):
    def __init__(
        self,
        train,
        test,
        batch_size=32,
        features: list = None,
        label: list = None,
        num_classes=2,
        y_type="bt",
        num_workers=4,
    ):
        super().__init__()

        self.batch_size = batch_size
        self.features = features
        self.label = label
        self.num_classes = num_classes
        self.y_type = y_type
        self.num_workers = num_workers

        self._init_dataset(train, test)

    def _init_dataset(self, train, test):
        train, val = train_test_split(train, test_size=0.2)
        print(
            f"Train : {train[self.label].value_counts()}\nval : {val[self.label].value_counts()}\nTest : {test[self.label].value_counts()}"
        )
        if self.y_type == "bt" and len(self.label) == 1:

            class_weights = dict(
                enumerate(
                    class_weight.compute_class_weight(
                        "balanced",
                        classes=np.arange(self.num_classes),
                        y=train[self.label[0]],
                    )
                )
            )
            self.class_weights = class_weights

        self.train = TableDataset(
            train, self.features, self.label, self.num_classes, self.y_type
        )
        self.validation = TableDataset(
            val, self.features, self.label, self.num_classes, self.y_type
        )
        self.test = TableDataset(
            test, self.features, self.label, self.num_classes, self.y_type
        )

    def train_dataloader(self):

        if self.y_type == "bt":
            train_class_weights = [
                self.class_weights[torch.argmax(i).item()] for i in self.train.y
            ]
            sampler = WeightedRandomSampler(
                train_class_weights, len(train_class_weights), replacement=True
            )
        else:
            sampler = RandomSampler(self.train)

        return DataLoader(
            self.train,
            batch_size=self.batch_size,
            sampler=sampler,
            drop_last=True,
            persistent_workers=True,
            num_workers=self.num_workers,
        )

    def val_dataloader(self):
        return DataLoader(
            self.validation,
            batch_size=self.batch_size,
            persistent_workers=True,
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.validation),
        )

    def test_dataloader(self):
        return DataLoader(
            self.test,
            batch_size=self.batch_size,
            persistent_workers=True,
            num_workers=self.num_workers,
            sampler=SequentialSampler(self.test),
        )

In [4]:
train_imputed = pd.read_pickle("result/part1/train_imputed.pkl")
test_imputed = pd.read_pickle("result/part1/test_imputed.pkl")

In [5]:
proteomics = test_imputed.columns[test_imputed.columns.tolist().index("C3") :].tolist()
risk_factors = [
    "age",
    "sex",
    "ldl_a",
    "hdl_a",
    "tc_a",
    "tg_a",
    "sbp_a",
    "BMI",
    "smoking",
    "prevalent_diabetes",
]

PRS = ["PRS"]
proteomics

['C3',
 'KLK7',
 'GCHFR',
 'NHLRC3',
 'APOD',
 'GAPDH',
 'TP53I3',
 'CPA4',
 'ANXA2',
 'GRSF1',
 'IL25',
 'HMMR',
 'MRPL52',
 'PAIP2B',
 'THAP12',
 'FOS',
 'FGF9',
 'PITHD1',
 'THSD1',
 'PTGES2',
 'DEFB103A_DEFB103B',
 'ATP1B4',
 'CYB5A',
 'UNC79',
 'SLC34A3',
 'TAGLN3',
 'SLIRP',
 'CLASP1',
 'PSMC3',
 'KIR3DL2',
 'BEX3',
 'PFDN4',
 'BCL7A',
 'SMC3',
 'SLC28A1',
 'CDC123',
 'GJA8',
 'NMRK2',
 'GATA3',
 'CPLX2',
 'RASGRF1',
 'FGF7',
 'ANKRA2',
 'RBM25',
 'LYZL2',
 'CDK1',
 'CREB3',
 'CREBZF',
 'IGLON5',
 'SHC1',
 'ZP4',
 'TMOD4',
 'CEP152',
 'MYH7B',
 'CEP350',
 'CDC25A',
 'TRIM26',
 'MANEAL',
 'MUCL3',
 'GIMAP8',
 'CYTH3',
 'PDXDC1',
 'CLINT1',
 'MAPRE3',
 'EVI2B',
 'STAU1',
 'PCNA',
 'DNAJA1',
 'JMJD1C',
 'GAGE2A',
 'GAD1',
 'IZUMO1',
 'PDCL2',
 'PDE1C',
 'STOML2',
 'BSND',
 'MAPK13',
 'PDIA2',
 'BTLA',
 'MLLT1',
 'TPRKB',
 'ARHGAP5',
 'BTNL10',
 'PHLDB2',
 'PDIA5',
 'ATF4',
 'PRAME',
 'TOP1MT',
 'KHDC3L',
 'DCUN1D2',
 'IL3',
 'DCLRE1C',
 'ERCC1',
 'DCDC2C',
 'VCPKMT',
 'SPRING1',
 'M

In [6]:
dataset = DatasetModule(
    train=train_imputed,
    test=test_imputed,
    features=proteomics,
    label=["incident_cad"],
    num_classes=2,
    batch_size=256,
)

Train : incident_cad
0.0             27209
1.0              1596
dtype: int64
val : incident_cad
0.0             6799
1.0              403
dtype: int64
Test : incident_cad
0.0             14599
1.0               833
dtype: int64


  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes
  torch.tensor(y).long(), num_classes=self.num_classes


In [7]:
for x, y in dataset.train_dataloader():
    print(x.shape, y.shape)
    print(torch.argmax(y, dim=1).sum())

torch.Size([256, 2911]) torch.Size([256, 2])
tensor(134)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(130)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(138)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(117)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(130)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(119)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(119)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(138)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(132)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(120)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(131)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(131)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(129)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(127)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(130)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(128)
torch.Size([256, 2911]) torch.Size([256, 2])
tensor(126)
torch.Size([256, 2911]) torch.S

In [8]:
import os
from torch import optim, nn, utils, Tensor
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor

import torch
import pytorch_lightning as pl

import torch.nn as nn
import torch.optim as optim
from collections import defaultdict


class LinearResBlock(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearResBlock, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
        self.layer_norm = nn.LayerNorm(output_size)

        torch.nn.init.kaiming_normal_(self.fc1.weight, nonlinearity="relu")  # <6>
        torch.nn.init.constant_(self.layer_norm.weight, 0.5)  # <7>
        torch.nn.init.zeros_(self.layer_norm.bias)

    def forward(self, x):
        out = self.fc1(x)

        out = self.layer_norm(out)
        out = torch.relu(out)
        return out + x


class FullyConnectedNet(pl.LightningModule):
    def __init__(
        self,
        hidden_size,
        features,
        output_size,
        num_resblocks=3,
        lr=1e-3,
        weight_decay=1e-2,
        weight=[1, 1],
        **kwargs,
    ):
        super(FullyConnectedNet, self).__init__()
        input_size = len(features)
        self.features = features
        self.norm = nn.LayerNorm(input_size)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.resblocks = nn.Sequential(
            *[LinearResBlock(hidden_size, hidden_size) for _ in range(num_resblocks)]
        )
        self.fc2 = nn.Linear(hidden_size, output_size)

        self.lr = lr
        self.weight_decay = weight_decay

        self.mertic = {
            "train_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
            "val_auc": torchmetrics.AUROC(num_classes=2, task="multiclass"),
        }
        self.history = defaultdict(dict)
        self.loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(weight).float())

    def forward(self, x):
        x = self.norm(x)
        out = torch.relu(self.fc1(x))
        out = self.resblocks(out)
        out = self.fc2(out)
        return out

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        outputs = self.forward(x)
        loss = self.loss_fn(outputs, y.squeeze(-1).float())

        self.mertic["train_auc"].update(
            torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
        )

        self.log("ptl/train_loss", loss, on_epoch=True, prog_bar=True, on_step=False)
        return loss

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        outputs = self.forward(x)
        loss = self.loss_fn(outputs, y.squeeze(-1).float())

        self.mertic["val_auc"].update(
            torch.softmax(outputs, dim=-1), torch.argmax(y, dim=1)
        )

        self.log("ptl/val_loss", loss, on_epoch=True, prog_bar=True)

    def on_train_epoch_end(self):

        auc = self.mertic["train_auc"].compute()
        self.log("ptl/train_auc", auc, prog_bar=True)

    def on_validation_epoch_end(self):
        auc = self.mertic["val_auc"].compute()
        self.log("ptl/val_auc", auc, prog_bar=True)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )
        return optimizer

    def predict_df(self, df, batch_size=256):

        for feature in self.features:
            assert feature in df.columns
        print(f"input df have NA: {df[self.features].isna().sum(axis=1).sum()}")
        df = df.copy().dropna(subset=self.features)

        predict_dataloader = DataLoader(
            torch.tensor(df[self.features].values).float(),
            batch_size=batch_size,
            persistent_workers=True,
            num_workers=4,
        )

        self.eval()
        pred = []
        with torch.no_grad():
            for x in predict_dataloader:
                y_hat = self.forward(x).cpu().detach()
                y_hat = torch.softmax(y_hat, dim=-1)[:, 1]

                pred.append(y_hat)
        pred = torch.cat(pred).numpy()
        df["pred"] = pred
        return df

In [9]:
used_fatures = proteomics + risk_factors + PRS
used_fatures

['C3',
 'KLK7',
 'GCHFR',
 'NHLRC3',
 'APOD',
 'GAPDH',
 'TP53I3',
 'CPA4',
 'ANXA2',
 'GRSF1',
 'IL25',
 'HMMR',
 'MRPL52',
 'PAIP2B',
 'THAP12',
 'FOS',
 'FGF9',
 'PITHD1',
 'THSD1',
 'PTGES2',
 'DEFB103A_DEFB103B',
 'ATP1B4',
 'CYB5A',
 'UNC79',
 'SLC34A3',
 'TAGLN3',
 'SLIRP',
 'CLASP1',
 'PSMC3',
 'KIR3DL2',
 'BEX3',
 'PFDN4',
 'BCL7A',
 'SMC3',
 'SLC28A1',
 'CDC123',
 'GJA8',
 'NMRK2',
 'GATA3',
 'CPLX2',
 'RASGRF1',
 'FGF7',
 'ANKRA2',
 'RBM25',
 'LYZL2',
 'CDK1',
 'CREB3',
 'CREBZF',
 'IGLON5',
 'SHC1',
 'ZP4',
 'TMOD4',
 'CEP152',
 'MYH7B',
 'CEP350',
 'CDC25A',
 'TRIM26',
 'MANEAL',
 'MUCL3',
 'GIMAP8',
 'CYTH3',
 'PDXDC1',
 'CLINT1',
 'MAPRE3',
 'EVI2B',
 'STAU1',
 'PCNA',
 'DNAJA1',
 'JMJD1C',
 'GAGE2A',
 'GAD1',
 'IZUMO1',
 'PDCL2',
 'PDE1C',
 'STOML2',
 'BSND',
 'MAPK13',
 'PDIA2',
 'BTLA',
 'MLLT1',
 'TPRKB',
 'ARHGAP5',
 'BTNL10',
 'PHLDB2',
 'PDIA5',
 'ATF4',
 'PRAME',
 'TOP1MT',
 'KHDC3L',
 'DCUN1D2',
 'IL3',
 'DCLRE1C',
 'ERCC1',
 'DCDC2C',
 'VCPKMT',
 'SPRING1',
 'M

In [10]:
from ray.train.lightning import (
    RayDDPStrategy,
    RayLightningEnvironment,
    RayTrainReportCallback,
    prepare_trainer,
)
from ray import tune
from ray.tune.schedulers import ASHAScheduler
from ray.train import RunConfig, ScalingConfig, CheckpointConfig
from ray.train.torch import TorchTrainer


def train_func(config):
    dataset = DatasetModule(
        train=train_imputed,
        test=test_imputed,
        features=used_fatures,
        label=["incident_cad"],
        num_classes=2,
        batch_size=config["batch_size"],
    )
    # model = FullyConnectedNet(
    #     input_size=len(proteomics),
    #     hidden_size=config["hidden_size"],
    #     output_size=2,
    #     lr=config["lr"],
    #     weight_decay=config["weight_decay"],
    #     weight=config["weight"],
    #     num_resblocks=config["num_resblocks"],
    # )
    model = FullyConnectedNet(**config)
    trainer = Trainer(
        devices="auto",
        strategy=RayDDPStrategy(),
        callbacks=[RayTrainReportCallback()],
        plugins=[RayLightningEnvironment()],
        enable_progress_bar=True,
    )
    trainer = prepare_trainer(trainer)
    trainer.fit(model, dataset)


search_space = {
    "features": used_fatures,
    "output_size": 2,
    #    "hidden_size": tune.choice([256, 128, 64]),
    "hidden_size": tune.choice([256, 512, 1024]),
    #    "lr": tune.loguniform(1e-4, 1e-1),
    "lr": 0.0074,
    "weight_decay": tune.loguniform(1e-4, 1e-2),
    #    "weight": tune.choice([[1, 1], [0.1, 1], [0.1, 10], [0.1, 100]]),
    "weight": [1, 1],
    "batch_size": 256,
    #    "batch_size": tune.choice([256]),
    "num_resblocks": tune.choice([1, 2, 3, 4, 5]),
}


# The maximum training epochs
num_epochs = 10

# Number of sampls from parameter space
num_samples = 50
scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)


scaling_config = ScalingConfig(
    num_workers=1, use_gpu=True, resources_per_worker={"CPU": 3, "GPU": 0.5}
)

run_config = RunConfig(
    checkpoint_config=CheckpointConfig(
        num_to_keep=2,
        checkpoint_score_attribute="ptl/val_auc",
        checkpoint_score_order="max",
    ),
)

# Define a TorchTrainer without hyper-parameters for Tuner
ray_trainer = TorchTrainer(
    train_func,
    scaling_config=scaling_config,
    run_config=run_config,
)


def tune_asha(num_samples=10):
    scheduler = ASHAScheduler(max_t=num_epochs, grace_period=1, reduction_factor=2)

    tuner = tune.Tuner(
        ray_trainer,
        param_space={"train_loop_config": search_space},
        tune_config=tune.TuneConfig(
            metric="ptl/val_auc",
            mode="max",
            num_samples=num_samples,
            scheduler=scheduler,
        ),
    )
    return tuner.fit()


results = tune_asha(num_samples=num_samples)


# results.get_best_result("ptl/val_auc")

best_result = results.get_best_result("ptl/val_auc")
best_params = best_result.config
best_result_epoch_dir = (
    best_result.get_best_checkpoint("ptl/val_auc", "max").path + "/checkpoint.ckpt"
)
best_model_state = torch.load(best_result_epoch_dir)
best_model = FullyConnectedNet(**best_params["train_loop_config"])
best_model.load_state_dict(best_model_state["state_dict"])
best_model

0,1
Current time:,2024-04-17 10:31:08
Running for:,00:04:13.18
Memory:,25.2/50.1 GiB

Trial name,status,loc,train_loop_config/hi dden_size,train_loop_config/nu m_resblocks,train_loop_config/we ight_decay,iter,total time (s),ptl/val_loss,ptl/val_auc,ptl/train_loss
TorchTrainer_f5695_00023,RUNNING,172.26.79.196:22643,256,2,0.000507831,,,,,
TorchTrainer_f5695_00024,PENDING,,512,1,0.000125219,,,,,
TorchTrainer_f5695_00025,PENDING,,256,3,0.00174492,,,,,
TorchTrainer_f5695_00026,PENDING,,1024,4,0.00895851,,,,,
TorchTrainer_f5695_00027,PENDING,,256,5,0.00294267,,,,,
TorchTrainer_f5695_00028,PENDING,,1024,1,0.000854914,,,,,
TorchTrainer_f5695_00029,PENDING,,1024,3,0.00146755,,,,,
TorchTrainer_f5695_00030,PENDING,,1024,4,0.000432065,,,,,
TorchTrainer_f5695_00031,PENDING,,256,4,0.00182457,,,,,
TorchTrainer_f5695_00032,PENDING,,512,5,0.000679674,,,,,


[36m(RayTrainWorker pid=16585)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=16464)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=16464)[0m - (ip=172.26.79.196, pid=16585) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=16585)[0m Train : incident_cad
[36m(RayTrainWorker pid=16585)[0m 0.0             27198
[36m(RayTrainWorker pid=16585)[0m 1.0              1607
[36m(RayTrainWorker pid=16585)[0m dtype: int64
[36m(RayTrainWorker pid=16585)[0m val : incident_cad
[36m(RayTrainWorker pid=16585)[0m 0.0             6810
[36m(RayTrainWorker pid=16585)[0m 1.0              392
[36m(RayTrainWorker pid=16585)[0m dtype: int64
[36m(RayTrainWorker pid=16585)[0m Test : incident_cad
[36m(RayTrainWorker pid=16585)[0m 0.0             14599
[36m(RayTrainWorker pid=16585)[0m 1.0               833
[36m(RayTrainWorker pid=16585)[0m dtype: int64


[36m(RayTrainWorker pid=16593)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=16585)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=16585)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=16585)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=16585)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=16585)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=16585)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:03<00:00,  0.54it/s]
[36m(RayTrainWorker pid=16593)[0m Train : incident_cad
[36m(RayTrainWorker pid=16593)[0m 1.0               833[32m [repeated 6x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/ray-logging.html#log-deduplication for more options.)[0m
[36m(RayTrainWorker pid=16593)[0m dtype: int64[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=16593)[0m val : incident_cad
[36m(RayTrainWorker pid=16593)[0m Test : incident_cad


[36m(RayTrainWorker pid=16585)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=16585)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=16593)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=16593)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=16593)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=16593)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainW

Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   5%|▍         | 5/111 [00:00<00:02, 37.02it/s, v_num=0]
Epoch 0:  11%|█         | 12/111 [00:00<00:01, 50.59it/s, v_num=0]
Epoch 0:  16%|█▌        | 18/111 [00:00<00:01, 54.46it/s, v_num=0]
Epoch 0:  24%|██▍       | 27/111 [00:00<00:01, 60.92it/s, v_num=0]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:04<00:04,  0.22it/s]
Epoch 0:  32%|███▏      | 36/111 [00:00<00:01, 67.28it/s, v_num=0]
                                                                           
Epoch 0:  33%|███▎      | 37/111 [00:00<00:01, 67.68it/s, v_num=0]
Epoch 0:  40%|███▉      | 44/111 [00:00<00:00, 69.41it/s, v_num=0]
Epoch 0:  44%|████▍     | 49/111 [00:00<00:00, 64.07it/s, v_num=0]
Epoch 0:  48%|████▊     | 53/111 [00:00<00:00, 62.19it/s, v_num=0]
Epoch 0:   2%|▏         | 2/111 [00:00<00:06, 16.99it/s, v_num=0]
Epoch 0:  53%|█████▎    | 59/111 [00:00<00:00, 60.85it/s, v_num=0]
Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Check

[36m(RayTrainWorker pid=16585)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=16593)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=16593)[0m 
[36m(RayTrainWorker pid=16593)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=16593)[0m -----------------------------------------------[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=16593)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=16593)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=16593)[0m 2 | resblocks | Sequential       | 1.1 M 
[36m(RayTrainWorker pid=16593)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=1

Epoch 0: 100%|██████████| 111/111 [00:02<00:00, 52.08it/s, v_num=0, ptl/val_loss=0.496, ptl/val_auc=0.743, ptl/train_loss=0.865]


[36m(RayTrainWorker pid=16585)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00000_0_hidden_size=512,num_resblocks=5,weight_decay=0.0051_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=16585)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 1:   1%|          | 1/111 [00:00<00:06, 16.77it/s, v_num=0, ptl/val_loss=0.496, ptl/val_auc=0.743, ptl/train_loss=0.865, ptl/train_auc=0.701]
Epoch 0:  96%|█████████▋| 107/111 [00:01<00:00, 63.80it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 63.35it/s, v_num=0]
Epoch 1:   5%|▍         | 5/111 [00:00<00:01, 61.28it/s, v_num=0, ptl/val_loss=0.481, ptl/val_auc=0.761, ptl/train_loss=0.848, ptl/train_auc=0.712]
Epoch 1:  55%|█████▍    | 61/111 [00:00<00:00, 68.46it/s, v_num=0, ptl/val_loss=0.496, ptl/val_auc=0.743, ptl/train_loss=0.865, ptl/train_auc=0.701]
Epoch 1:  63%|██████▎   | 70/111 [00:00<00:00, 70.26it/s, v_num=0, ptl/val_loss=0.496, ptl/val_auc=0.743, ptl/train_loss=0.865, ptl/train_auc=0.701]
Epoch 1:  93%|█████████▎| 103/111 [00:01<00:00, 67.53it/s, v_num=0, ptl/val_loss=0.496, ptl/val_auc=0.743, ptl/train_loss=0.865, ptl/train_auc=0.701]
Epoch 1: 100%|██████████| 111/111 [00:01<00:00, 68.67it/s, v_num=0, ptl/val_loss=0.496, ptl/val_auc=0.743, ptl/train_los

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 2:  23%|██▎       | 26/111 [00:00<00:01, 56.59it/s, v_num=0, ptl/val_loss=0.714, ptl/val_auc=0.763, ptl/train_loss=0.565, ptl/train_auc=0.739]
[36m(RayTrainWorker pid=16593)[0m [32m [repeated 13x across cluster][0m
Validation DataLoader 0:  86%|████████▌ | 24/28 [00:00<00:00, 96.42it/s] [A


[36m(RayTrainWorker pid=16593)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=16593)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 2:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.415, ptl/val_auc=0.776, ptl/train_loss=0.548, ptl/train_auc=0.712]          
Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 2:  14%|█▍        | 16/111 [00:00<00:01, 61.24it/s, v_num=0, ptl/val_loss=0.714, ptl/val_auc=0.763, ptl/train_loss=0.565, ptl/train_auc=0.739][32m [repeated 8x across cluster][0m
Epoch 2:  49%|████▊     | 54/111 [00:00<00:00, 60.75it/s, v_num=0, ptl/val_loss=0.714, ptl/val_auc=0.763, ptl/train_loss=0.565, ptl/train_auc=0.739][32m [repeated 13x across cluster][0m
Epoch 2:   5%|▌         | 6/111 [00:00<00:02, 51.80it/s, v_num=0, ptl/val_loss=0.415, ptl/val_auc=0.776, ptl/train_loss=0.548, ptl/train_auc=0.752][32m [repeated 9x across cluster][0m
Epoch 2:   9%|▉         | 10/111 [00:00<00:01, 65.26it/s, v_num=0, ptl/val_loss=0.714, ptl/val_auc=0.763, ptl/train_loss=0.565, ptl/train_auc=0.739][32m [repeated 5x across cluster][0m
Epoch 2:  56%|█████▌    | 62/111 [00:01<00:00, 61.93

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 3:  98%|█████████▊| 109/111 [00:01<00:00, 82.03it/s, v_num=0, ptl/val_loss=0.629, ptl/val_auc=0.776, ptl/train_loss=0.544, ptl/train_auc=0.767]
Epoch 3:  72%|███████▏  | 80/111 [00:01<00:00, 78.20it/s, v_num=0, ptl/val_loss=0.629, ptl/val_auc=0.776, ptl/train_loss=0.544, ptl/train_auc=0.767][32m [repeated 6x across cluster][0m


[36m(RayTrainWorker pid=16593)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=16585)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00000_0_hidden_size=512,num_resblocks=5,weight_decay=0.0051_2024-04-17_10-26-56/checkpoint_000003)[32m [repeated 6x across cluster][0m
[36m(RayTrainWorker pid=16593)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric

Epoch 4:   5%|▍         | 5/111 [00:00<00:02, 52.54it/s, v_num=0, ptl/val_loss=0.530, ptl/val_auc=0.782, ptl/train_loss=0.549, ptl/train_auc=0.766]  
Epoch 3: 100%|██████████| 111/111 [00:01<00:00, 67.84it/s, v_num=0, ptl/val_loss=0.466, ptl/val_auc=0.779, ptl/train_loss=0.524, ptl/train_auc=0.767]
Epoch 4:  27%|██▋       | 30/111 [00:00<00:01, 58.58it/s, v_num=0, ptl/val_loss=0.530, ptl/val_auc=0.782, ptl/train_loss=0.549, ptl/train_auc=0.766]
Epoch 4:  97%|█████████▋| 108/111 [00:01<00:00, 70.53it/s, v_num=0, ptl/val_loss=0.530, ptl/val_auc=0.782, ptl/train_loss=0.549, ptl/train_auc=0.766]
Epoch 4: 100%|██████████| 111/111 [00:01<00:00, 70.88it/s, v_num=0, ptl/val_loss=0.530, ptl/val_auc=0.782, ptl/train_loss=0.549, ptl/train_auc=0.766]
[36m(RayTrainWorker pid=16593)[0m [32m [repeated 12x across cluster][0m
Epoch 5:   1%|          | 1/111 [00:00<00:04, 23.04it/s, v_num=0, ptl/val_loss=0.562, ptl/val_auc=0.786, ptl/train_loss=0.533, ptl/train_auc=0.774]  
Epoch 4:  67%|██████▋   |

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 6:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.680, ptl/val_auc=0.788, ptl/train_loss=0.536, ptl/train_auc=0.774]          
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A[32m [repeated 4x across cluster][0m
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 135.24it/s][A[32m [repeated 100x across cluster][0m
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 136.95it/s][A[32m [repeated 12x across cluster][0m
Epoch 5: 100%|██████████| 111/111 [00:01<00:00, 75.10it/s, v_num=0, ptl/val_loss=0.680, ptl/val_auc=0.788, ptl/train_loss=0.533, ptl/train_auc=0.774][32m [repeated 2x across cluster][0m
Epoch 5: 100%|██████████| 111/111 [00:01<00:00, 70.50it/s, v_num=0, ptl/val_loss=0.680, ptl/val_auc=0.788, ptl/train_loss=0.536, ptl/train_auc=0.

[36m(RayTrainWorker pid=16585)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00000_0_hidden_size=512,num_resblocks=5,weight_decay=0.0051_2024-04-17_10-26-56/checkpoint_000006)[32m [repeated 4x across cluster][0m


Epoch 4: 100%|██████████| 111/111 [00:01<00:00, 60.85it/s, v_num=0, ptl/val_loss=0.562, ptl/val_auc=0.786, ptl/train_loss=0.533, ptl/train_auc=0.766]
Epoch 7:   3%|▎         | 3/111 [00:00<00:03, 32.23it/s, v_num=0, ptl/val_loss=0.484, ptl/val_auc=0.790, ptl/train_loss=0.523, ptl/train_auc=0.785]  
Epoch 7:   6%|▋         | 7/111 [00:00<00:02, 39.09it/s, v_num=0, ptl/val_loss=0.484, ptl/val_auc=0.790, ptl/train_loss=0.523, ptl/train_auc=0.785]
Epoch 7:  10%|▉         | 11/111 [00:00<00:02, 37.63it/s, v_num=0, ptl/val_loss=0.484, ptl/val_auc=0.790, ptl/train_loss=0.523, ptl/train_auc=0.785]
Epoch 7:  95%|█████████▍| 105/111 [00:01<00:00, 73.39it/s, v_num=0, ptl/val_loss=0.484, ptl/val_auc=0.790, ptl/train_loss=0.523, ptl/train_auc=0.785]
[36m(RayTrainWorker pid=16585)[0m [32m [repeated 7x across cluster][0m
Epoch 7: 100%|██████████| 111/111 [00:01<00:00, 74.80it/s, v_num=0, ptl/val_loss=0.484, ptl/val_auc=0.790, ptl/train_loss=0.523, ptl/train_auc=0.785]


You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 7: 100%|██████████| 111/111 [00:01<00:00, 64.05it/s, v_num=0, ptl/val_loss=0.645, ptl/val_auc=0.790, ptl/train_loss=0.532, ptl/train_auc=0.785]
Epoch 8:   4%|▎         | 4/111 [00:00<00:02, 46.84it/s, v_num=0, ptl/val_loss=0.645, ptl/val_auc=0.790, ptl/train_loss=0.532, ptl/train_auc=0.788]  
Epoch 6:  44%|████▍     | 49/111 [00:00<00:00, 78.16it/s, v_num=0, ptl/val_loss=0.680, ptl/val_auc=0.788, ptl/train_loss=0.536, ptl/train_auc=0.780][32m [repeated 3x across cluster][0m
Epoch 7:  15%|█▌        | 17/111 [00:00<00:02, 41.77it/s, v_num=0, ptl/val_loss=0.484, ptl/val_auc=0.790, ptl/train_loss=0.523, ptl/train_auc=0.785][32m [repeated 2x across cluster][0m
Epoch 7:  72%|███████▏  | 80/111 [00:01<00:00, 71.78it/s, v_num=0, ptl/val_loss=0.484, ptl/val_auc=0.790, ptl/train_loss=0.523, ptl/train_auc=0.785][32m [repeated 8x across cluster][0m
Epoch 8:  12%|█▏        | 13/111 [00:00<00:01, 66.64it/s, v_num=0, ptl/val_loss=0.645, ptl/val_auc=0.790, ptl/train_loss=0.532, ptl/train_a

[36m(RayTrainWorker pid=17107)[0m Setting up process group for: env:// [rank=0, world_size=1]


Epoch 9:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.788]          
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 135.15it/s][A[32m [repeated 9x across cluster][0m
Epoch 8: 100%|██████████| 111/111 [00:01<00:00, 67.07it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.532, ptl/train_auc=0.788][32m [repeated 2x across cluster][0m
Epoch 8: 100%|██████████| 111/111 [00:01<00:00, 64.09it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.788][32m [repeated 2x across cluster][0m


[36m(TorchTrainer pid=17053)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=17053)[0m - (ip=172.26.79.196, pid=17107) world_rank=0, local_rank=0, node_rank=0


Epoch 9:  59%|█████▊    | 65/111 [00:00<00:00, 76.51it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791]
Epoch 9:  76%|███████▌  | 84/111 [00:01<00:00, 79.78it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791][32m [repeated 6x across cluster][0m
Epoch 9:  83%|████████▎ | 92/111 [00:01<00:00, 79.53it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791]
Epoch 9:  87%|████████▋ | 97/111 [00:01<00:00, 79.13it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791]
Epoch 9:  93%|█████████▎| 103/111 [00:01<00:00, 76.25it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791]
Epoch 9:  94%|█████████▎| 104/111 [00:01<00:00, 76.34it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791]
Epoch 9: 100%|██████████| 111/111 [00:01<00:00, 75.95it/s, v_num=0

[36m(RayTrainWorker pid=16585)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00000_0_hidden_size=512,num_resblocks=5,weight_decay=0.0051_2024-04-17_10-26-56/checkpoint_000009)[32m [repeated 3x across cluster][0m
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


[36m(RayTrainWorker pid=17107)[0m Train : incident_cad
[36m(RayTrainWorker pid=17107)[0m dtype: int64
[36m(RayTrainWorker pid=17107)[0m val : incident_cad
[36m(RayTrainWorker pid=17107)[0m dtype: int64
[36m(RayTrainWorker pid=17107)[0m Test : incident_cad
[36m(RayTrainWorker pid=17107)[0m dtype: int64


[36m(RayTrainWorker pid=17107)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=17107)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=17107)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=17107)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=17107)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=17107)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=17107)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
[36m(RayTrainWorker pid=17107)[0m 1.0               833[32m [repeated 14x across cluster][0m
Epoch 9:  82%|████████▏ | 91/111 [00:01<00:00, 79.61it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791][32m [repeated 2x across cluster][0m
Epoch 9:  13%|█▎        | 14/111 [00:00<00:01, 61.81it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791][32m [repeated 2x across cluster][0m
Epoch 9:  67%|██████▋   | 74/111 [00:00<00:00, 78.16it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791][32m [repeated 5x across cluster][0m
Epoch 9:  50%|████▉     | 55/111 [00:00<00:00, 74.38it/s, v_num=0, ptl/val_loss=0.692, ptl/val_auc=0.791, ptl/train_loss=0.521, ptl/train_auc=0.791][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
V

[36m(RayTrainWorker pid=17107)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=17107)[0m 
[36m(RayTrainWorker pid=17107)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=17107)[0m -----------------------------------------------
[36m(RayTrainWorker pid=17107)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=17107)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=17107)[0m 2 | resblocks | Sequential       | 1.3 M 
[36m(RayTrainWorker pid=17107)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=17107)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=17107)[0m -----------------------------------------------
[36m(RayTrainWorker pid=17107)[0m 2.8 M     Trainable params
[36m(RayTrainWorker pid=17107)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=17107)[0m 2.8 M     Total params
[36m(RayTrainWorker pid=17107)[0m 11.287    Total estimate

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
                                                                           


[36m(RayTrainWorker pid=17107)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17107)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   7%|▋         | 8/111 [00:00<00:01, 62.31it/s, v_num=0]
Epoch 0:   7%|▋         | 8/111 [00:00<00:01, 62.16it/s, v_num=0]
Epoch 0:  15%|█▌        | 17/111 [00:00<00:01, 72.93it/s, v_num=0]
Epoch 0:  20%|█▉        | 22/111 [00:00<00:01, 66.80it/s, v_num=0]
Epoch 0:  29%|██▉       | 32/111 [00:00<00:01, 74.26it/s, v_num=0]
Epoch 0:  82%|████████▏ | 91/111 [00:01<00:00, 78.86it/s, v_num=0]
Epoch 0:  90%|█████████ | 100/111 [00:01<00:00, 79.48it/s, v_num=0]
Epoch 0:  96%|█████████▋| 107/111 [00:01<00:00, 79.10it/s, v_num=0]
Epoch 0:  97%|█████████▋| 108/111 [00:01<00:00, 79.28it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 79.91it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 70.23it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.750]
E

[36m(RayTrainWorker pid=17107)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17107)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00002_2_hidden_size=512,num_resblocks=5,weight_decay=0.0057_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=17317)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=17192)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=17192)[0m - (ip=172.26.79.196, pid=17317) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=17317)[0m Train : incident_cad
[36m(RayTrainWorker pid=17317)[0m dtype: int64
[36m(RayTrainWorker pid=17317)[0m val : incident_cad
[36m(RayTrainWorker pid=17317)[0m dtype: int64
[36m(RayTrainWorker pid=17317)[0m Test : incident_cad
[36m(RayTrainWorker pid=17317)[0m dtype: int64
[36m(RayTrainWorker pid=17317)[0m 1.0               833[32m [repeated 8x across cluster][0m
Epoch 0:  50%|█████     | 56/111 [00:00<00:00, 76.06it/s, v_num=0]
Epoch 0:  37%|███▋      | 41/111 [00:00<00:00, 76.67it/s, v_num=0]
Epoch 0:  89%|████████▉ | 99/111 [00:01<00:00, 79.63it/s, v_num=0][32m [repeated 6x across cluster][0m
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 173.04it/s][A[32m [repeated 25x across cluster][0m
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 175.30it/s][A[32m [repeated 3x across cluster][0m


[36m(RayTrainWorker pid=17317)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=17317)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=17317)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=17317)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=17317)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=17317)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=17317)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=17520)[0m Setting up process group for: env:// [rank=0, world_size=1]


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.45it/s]


[36m(RayTrainWorker pid=17317)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17317)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   4%|▎         | 4/111 [00:00<00:01, 57.08it/s, v_num=0]
                                                                           
Epoch 0:  13%|█▎        | 14/111 [00:00<00:01, 86.76it/s, v_num=0]
Epoch 0:  14%|█▎        | 15/111 [00:00<00:01, 85.50it/s, v_num=0]


[36m(TorchTrainer pid=17433)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=17433)[0m - (ip=172.26.79.196, pid=17520) world_rank=0, local_rank=0, node_rank=0


Epoch 0:  17%|█▋        | 19/111 [00:00<00:01, 70.69it/s, v_num=0]
Epoch 0:  23%|██▎       | 26/111 [00:00<00:01, 68.92it/s, v_num=0]
Epoch 0:  28%|██▊       | 31/111 [00:00<00:01, 69.38it/s, v_num=0]
Epoch 0:  34%|███▍      | 38/111 [00:00<00:01, 64.93it/s, v_num=0]
Epoch 0:  42%|████▏     | 47/111 [00:00<00:00, 68.66it/s, v_num=0]
Epoch 0:  51%|█████▏    | 57/111 [00:00<00:00, 72.38it/s, v_num=0]
Epoch 0:  52%|█████▏    | 58/111 [00:00<00:00, 72.88it/s, v_num=0]
Epoch 0:  61%|██████▏   | 68/111 [00:00<00:00, 76.51it/s, v_num=0]
Epoch 0:  62%|██████▏   | 69/111 [00:00<00:00, 76.69it/s, v_num=0]
Epoch 0:  72%|███████▏  | 80/111 [00:00<00:00, 80.07it/s, v_num=0]
Epoch 0:  81%|████████  | 90/111 [00:01<00:00, 81.66it/s, v_num=0]
Epoch 0:  91%|█████████ | 101/111 [00:01<00:00, 83.88it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 86.01it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
[36m(RayTrainWorker pid=17317)[0m 
Validation:   0%|          | 0/28 [00:00<

[36m(RayTrainWorker pid=17317)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


[36m(RayTrainWorker pid=17317)[0m 
Validation DataLoader 0:  64%|██████▍   | 18/28 [00:00<00:00, 190.92it/s][A
Validation DataLoader 0:  68%|██████▊   | 19/28 [00:00<00:00, 189.23it/s][A
Validation DataLoader 0:  71%|███████▏  | 20/28 [00:00<00:00, 187.79it/s][A
Validation DataLoader 0:  75%|███████▌  | 21/28 [00:00<00:00, 187.64it/s][A
Validation DataLoader 0:  79%|███████▊  | 22/28 [00:00<00:00, 187.15it/s][A
Validation DataLoader 0:  82%|████████▏ | 23/28 [00:00<00:00, 189.12it/s][A
Validation DataLoader 0:  86%|████████▌ | 24/28 [00:00<00:00, 191.42it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 191.34it/s][A
Validation DataLoader 0:  93%|█████████▎| 26/28 [00:00<00:00, 193.30it/s][A
Validation DataLoader 0:  96%|█████████▋| 27/28 [00:00<00:00, 194.98it/s][A
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 196.56it/s][A
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 75.45it/s, v_num=0, ptl/val_loss=0.374, ptl/val_auc=0.749]


[36m(RayTrainWorker pid=17317)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00003_3_hidden_size=1024,num_resblocks=1,weight_decay=0.0003_2024-04-17_10-26-56/checkpoint_000000)


Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 67.82it/s, v_num=0, ptl/val_loss=0.374, ptl/val_auc=0.749, ptl/train_loss=0.659]
[36m(RayTrainWorker pid=17520)[0m Train : incident_cad
[36m(RayTrainWorker pid=17520)[0m dtype: int64
[36m(RayTrainWorker pid=17520)[0m val : incident_cad
[36m(RayTrainWorker pid=17520)[0m dtype: int64
[36m(RayTrainWorker pid=17520)[0m Test : incident_cad
[36m(RayTrainWorker pid=17520)[0m dtype: int64


[36m(RayTrainWorker pid=17520)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=17520)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=17520)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=17520)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=17520)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=17520)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=17520)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=17520)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=17520)[0m 
[36m(RayTrainWorker pid=17520)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=17520)[0m -----------------------------------------------
[36m(RayTrainWorker pid=17520)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=17520)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=17520)[0m 2 | resblocks | Sequential       | 263 K 
[36m(RayTrainWorker pid=17520)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=17520)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=17520)[0m -----------------------------------------------
[36m(RayTrainWorker pid=17520)[0m 1.8 M     Trainable params
[36m(RayTrainWorker pid=17520)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=17520)[0m 1.8 M     Total params
[36m(RayTrainWorker pid=17520)[0m 7.069     Total estimate

                                                                           
[36m(RayTrainWorker pid=17520)[0m 1.0               833[32m [repeated 6x across cluster][0m


[36m(RayTrainWorker pid=17520)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17520)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   3%|▎         | 3/111 [00:00<00:02, 41.29it/s, v_num=0]
Epoch 0:   4%|▎         | 4/111 [00:00<00:04, 21.74it/s, v_num=0]
Epoch 0:   5%|▍         | 5/111 [00:00<00:04, 26.03it/s, v_num=0]
Epoch 0:  15%|█▌        | 17/111 [00:00<00:01, 57.75it/s, v_num=0]
Epoch 0:  24%|██▍       | 27/111 [00:00<00:01, 69.41it/s, v_num=0]
Epoch 0:  34%|███▍      | 38/111 [00:00<00:00, 76.53it/s, v_num=0]
Epoch 0:  43%|████▎     | 48/111 [00:00<00:00, 80.03it/s, v_num=0]
Epoch 0:  50%|█████     | 56/111 [00:00<00:00, 80.47it/s, v_num=0]
Epoch 0:  59%|█████▉    | 66/111 [00:00<00:00, 82.52it/s, v_num=0]
Epoch 0:  60%|██████    | 67/111 [00:00<00:00, 83.00it/s, v_num=0]
Epoch 0:  67%|██████▋   | 74/111 [00:00<00:00, 82.02it/s, v_num=0]
Epoch 0:  74%|███████▍  | 82/111 [00:01<00:00, 80.92it/s, v_num=0]
Epoch 0:  81%|████████  | 90/111 [00:01<00:00, 81.00it/s, v_num=0]
Epoch 0:  88%|████████▊ | 98/111 [00:01<00:00, 80.73it/s, v_num=0]
Epoch 0:  96%|█████████▋| 107/111 [00:01<00:00, 80.96it/s, v_num=

[36m(RayTrainWorker pid=17520)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17520)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17520)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00004_4_hidden_size=512,num_resblocks=1,weight_decay=0.0044_2024-04-17_10-26-56/checkpoint_000000)


Epoch 1:  10%|▉         | 11/111 [00:00<00:01, 60.06it/s, v_num=0, ptl/val_loss=0.421, ptl/val_auc=0.767, ptl/train_loss=0.623, ptl/train_auc=0.760]
Epoch 1:  13%|█▎        | 14/111 [00:00<00:01, 50.75it/s, v_num=0, ptl/val_loss=0.421, ptl/val_auc=0.767, ptl/train_loss=0.623, ptl/train_auc=0.760]
Epoch 1:  21%|██        | 23/111 [00:00<00:01, 59.85it/s, v_num=0, ptl/val_loss=0.421, ptl/val_auc=0.767, ptl/train_loss=0.623, ptl/train_auc=0.760]
Epoch 1:  29%|██▉       | 32/111 [00:00<00:01, 66.38it/s, v_num=0, ptl/val_loss=0.421, ptl/val_auc=0.767, ptl/train_loss=0.623, ptl/train_auc=0.760]
Epoch 1:  36%|███▌      | 40/111 [00:00<00:01, 67.80it/s, v_num=0, ptl/val_loss=0.421, ptl/val_auc=0.767, ptl/train_loss=0.623, ptl/train_auc=0.760]
Epoch 1:  37%|███▋      | 41/111 [00:00<00:01, 68.33it/s, v_num=0, ptl/val_loss=0.421, ptl/val_auc=0.767, ptl/train_loss=0.623, ptl/train_auc=0.760]
Epoch 1:  43%|████▎     | 48/111 [00:00<00:00, 68.82it/s, v_num=0, ptl/val_loss=0.421, ptl/val_auc=0.767, 

[36m(RayTrainWorker pid=17862)[0m Setting up process group for: env:// [rank=0, world_size=1]


Validation DataLoader 0:  71%|███████▏  | 20/28 [00:00<00:00, 193.22it/s][A
Validation DataLoader 0:  75%|███████▌  | 21/28 [00:00<00:00, 192.08it/s][A
Validation DataLoader 0:  79%|███████▊  | 22/28 [00:00<00:00, 192.27it/s][A
Validation DataLoader 0:  82%|████████▏ | 23/28 [00:00<00:00, 192.87it/s][A
Validation DataLoader 0:  86%|████████▌ | 24/28 [00:00<00:00, 193.23it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 195.77it/s][A
Validation DataLoader 0:  93%|█████████▎| 26/28 [00:00<00:00, 198.60it/s][A
Validation DataLoader 0:  96%|█████████▋| 27/28 [00:00<00:00, 200.04it/s][A
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 201.10it/s][A
Epoch 1: 100%|██████████| 111/111 [00:01<00:00, 64.19it/s, v_num=0, ptl/val_loss=0.778, ptl/val_auc=0.754, ptl/train_loss=0.562, ptl/train_auc=0.760]


[36m(RayTrainWorker pid=17520)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00004_4_hidden_size=512,num_resblocks=1,weight_decay=0.0044_2024-04-17_10-26-56/checkpoint_000001)
[36m(TorchTrainer pid=17736)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=17736)[0m - (ip=172.26.79.196, pid=17862) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=17862)[0m Train : incident_cad
[36m(RayTrainWorker pid=17862)[0m dtype: int64
[36m(RayTrainWorker pid=17862)[0m val : incident_cad
[36m(RayTrainWorker pid=17862)[0m dtype: int64
[36m(RayTrainWorker pid=17862)[0m Test : incident_cad
[36m(RayTrainWorker pid=17862)[0m dtype: int64
[36m(RayTrainWorker pid=17862)[0m 1.0               833[32m [repeated 10x across cluster][0m


[36m(RayTrainWorker pid=17862)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=17862)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=17862)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=17862)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=17862)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=17862)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=17862)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=17862)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=17862)[0m 
[36m(RayTrainWorker pid=17862)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=17862)[0m -----------------------------------------------
[36m(RayTrainWorker pid=17862)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=17862)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=17862)[0m 2 | resblocks | Sequential       | 791 K 
[36m(RayTrainWorker pid=17862)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=17862)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=17862)[0m -----------------------------------------------
[36m(RayTrainWorker pid=17862)[0m 2.3 M     Trainable params
[36m(RayTrainWorker pid=17862)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=17862)[0m 2.3 M     Total params
[36m(RayTrainWorker pid=17862)[0m 9.178     Total estimate

                                                                           


[36m(RayTrainWorker pid=17862)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17862)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   8%|▊         | 9/111 [00:00<00:01, 76.96it/s, v_num=0]
Epoch 0:   9%|▉         | 10/111 [00:00<00:01, 76.35it/s, v_num=0]
Epoch 0:  16%|█▌        | 18/111 [00:00<00:01, 78.59it/s, v_num=0]
Epoch 0:  24%|██▍       | 27/111 [00:00<00:01, 80.16it/s, v_num=0]
Epoch 0:  31%|███       | 34/111 [00:00<00:00, 77.47it/s, v_num=0]
Epoch 0:  37%|███▋      | 41/111 [00:00<00:00, 75.86it/s, v_num=0]
Epoch 0:  43%|████▎     | 48/111 [00:00<00:00, 75.60it/s, v_num=0]
Epoch 0:  50%|████▉     | 55/111 [00:00<00:00, 73.63it/s, v_num=0]
Epoch 0:  56%|█████▌    | 62/111 [00:00<00:00, 73.04it/s, v_num=0]
Epoch 0:  60%|██████    | 67/111 [00:00<00:00, 70.06it/s, v_num=0]
Epoch 0:  68%|██████▊   | 76/111 [00:01<00:00, 71.64it/s, v_num=0]
Epoch 0:  77%|███████▋  | 86/111 [00:01<00:00, 74.05it/s, v_num=0]


[36m(RayTrainWorker pid=18068)[0m Setting up process group for: env:// [rank=0, world_size=1]


Epoch 0:  86%|████████▌ | 95/111 [00:01<00:00, 75.28it/s, v_num=0]
Epoch 0:  95%|█████████▍| 105/111 [00:01<00:00, 76.84it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 77.95it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   4%|▎         | 1/28 [00:00<00:00, 224.07it/s][A
Validation DataLoader 0:   7%|▋         | 2/28 [00:00<00:00, 189.48it/s][A
Validation DataLoader 0:  11%|█         | 3/28 [00:00<00:00, 202.88it/s][A
Validation DataLoader 0:  14%|█▍        | 4/28 [00:00<00:00, 193.52it/s][A
[36m(RayTrainWorker pid=17862)[0m 
Validation DataLoader 0:  18%|█▊        | 5/28 [00:00<00:00, 168.64it/s][A
Validation DataLoader 0:  21%|██▏       | 6/28 [00:00<00:00, 174.27it/s][A
Validation DataLoader 0:  25%|██▌       | 7/28 [00:00<00:00, 172.72it/s][A
Validation DataLoader 0:  29%|██▊       | 8/28 [00:00<00:0

[36m(RayTrainWorker pid=17862)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=17862)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00005_5_hidden_size=512,num_resblocks=3,weight_decay=0.0050_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=17862)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 1:   5%|▍         | 5/111 [00:00<00:02, 43.58it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  12%|█▏        | 13/111 [00:00<00:01, 61.99it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  19%|█▉        | 21/111 [00:00<00:01, 68.11it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  20%|█▉        | 22/111 [00:00<00:01, 68.23it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  29%|██▉       | 32/111 [00:00<00:01, 76.07it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]


[36m(TorchTrainer pid=18005)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=18005)[0m - (ip=172.26.79.196, pid=18068) world_rank=0, local_rank=0, node_rank=0


Epoch 1:  33%|███▎      | 37/111 [00:00<00:01, 72.18it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  43%|████▎     | 48/111 [00:00<00:00, 75.87it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  53%|█████▎    | 59/111 [00:00<00:00, 80.85it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  60%|██████    | 67/111 [00:00<00:00, 80.76it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  61%|██████▏   | 68/111 [00:00<00:00, 80.75it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  68%|██████▊   | 75/111 [00:00<00:00, 80.49it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, ptl/train_loss=0.707, ptl/train_auc=0.751]
Epoch 1:  68%|██████▊   | 76/111 [00:00<00:00, 80.41it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.753, 

[36m(RayTrainWorker pid=17862)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00005_5_hidden_size=512,num_resblocks=3,weight_decay=0.0050_2024-04-17_10-26-56/checkpoint_000001)


Epoch 2:   7%|▋         | 8/111 [00:00<00:01, 77.81it/s, v_num=0, ptl/val_loss=0.548, ptl/val_auc=0.763, ptl/train_loss=0.557, ptl/train_auc=0.768]
Epoch 2:  17%|█▋        | 19/111 [00:00<00:00, 93.35it/s, v_num=0, ptl/val_loss=0.548, ptl/val_auc=0.763, ptl/train_loss=0.557, ptl/train_auc=0.768]
Epoch 2:  26%|██▌       | 29/111 [00:00<00:00, 95.56it/s, v_num=0, ptl/val_loss=0.548, ptl/val_auc=0.763, ptl/train_loss=0.557, ptl/train_auc=0.768]
Epoch 2:  32%|███▏      | 35/111 [00:00<00:00, 95.83it/s, v_num=0, ptl/val_loss=0.548, ptl/val_auc=0.763, ptl/train_loss=0.557, ptl/train_auc=0.768]
Epoch 2:  39%|███▊      | 43/111 [00:00<00:00, 84.86it/s, v_num=0, ptl/val_loss=0.548, ptl/val_auc=0.763, ptl/train_loss=0.557, ptl/train_auc=0.768]
Epoch 2:  40%|███▉      | 44/111 [00:00<00:00, 84.92it/s, v_num=0, ptl/val_loss=0.548, ptl/val_auc=0.763, ptl/train_loss=0.557, ptl/train_auc=0.768]
Epoch 2:  40%|███▉      | 44/111 [00:00<00:00, 84.88it/s, v_num=0, ptl/val_loss=0.548, ptl/val_auc=0.763, p

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Validation DataLoader 0:  57%|█████▋    | 16/28 [00:00<00:00, 83.67it/s][A
Validation DataLoader 0:  61%|██████    | 17/28 [00:00<00:00, 84.45it/s][A
Validation DataLoader 0:  64%|██████▍   | 18/28 [00:00<00:00, 84.17it/s][A
Validation DataLoader 0:  68%|██████▊   | 19/28 [00:00<00:00, 85.31it/s][A
Validation DataLoader 0:  71%|███████▏  | 20/28 [00:00<00:00, 84.69it/s][A
Validation DataLoader 0:  75%|███████▌  | 21/28 [00:00<00:00, 85.13it/s][A
Validation DataLoader 0:  79%|███████▊  | 22/28 [00:00<00:00, 86.46it/s][A
Validation DataLoader 0:  82%|████████▏ | 23/28 [00:00<00:00, 86.89it/s][A
Validation DataLoader 0:  86%|████████▌ | 24/28 [00:00<00:00, 88.16it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 88.30it/s][A
Validation DataLoader 0:  93%|█████████▎| 26/28 [00:00<00:00, 89.13it/s][A
Validation DataLoader 0:  96%|█████████▋| 27/28 [00:00<00:00, 89.16it/s][A
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 89.28it/s][A
Epoch 2: 100

[36m(RayTrainWorker pid=17862)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00005_5_hidden_size=512,num_resblocks=3,weight_decay=0.0050_2024-04-17_10-26-56/checkpoint_000002)


Epoch 2: 100%|██████████| 111/111 [00:01<00:00, 66.38it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.768]
Epoch 3:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.768]          
Epoch 3:   3%|▎         | 3/111 [00:00<00:03, 29.15it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:   7%|▋         | 8/111 [00:00<00:02, 42.07it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  11%|█         | 12/111 [00:00<00:02, 40.85it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  12%|█▏        | 13/111 [00:00<00:02, 41.46it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  16%|█▌        | 18/111 [00:00<00:02, 43.30it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, 



Epoch 3:  24%|██▍       | 27/111 [00:00<00:01, 43.89it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  30%|██▉       | 33/111 [00:00<00:01, 45.77it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]




Epoch 3:  37%|███▋      | 41/111 [00:00<00:01, 49.18it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  44%|████▍     | 49/111 [00:00<00:01, 52.28it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  50%|████▉     | 55/111 [00:01<00:01, 53.10it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  57%|█████▋    | 63/111 [00:01<00:00, 55.17it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  63%|██████▎   | 70/111 [00:01<00:00, 56.82it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]


[36m(RayTrainWorker pid=18068)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=18068)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=18068)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=18068)[0m HPU available: False, using: 0 HPUs


Epoch 3:  71%|███████   | 79/111 [00:01<00:00, 58.71it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  80%|████████  | 89/111 [00:01<00:00, 61.33it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
[36m(RayTrainWorker pid=17862)[0m [32m [repeated 9x across cluster][0m


[36m(RayTrainWorker pid=18068)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=18068)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=18068)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=18068)[0m Missing logger folder: /tmp/ray/session_2024-04-17_10-26-51_374774_23003/artifacts/2024-04-17_10-26-55/TorchTrainer_2024-04-17_10-26-51/working_dirs/TorchTrainer_f5695_00006_6_hidden_size=256,num_resblocks=3,weight_decay=0.0063_2024-04-17_10-26

Epoch 3:  88%|████████▊ | 98/111 [00:01<00:00, 63.22it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  98%|█████████▊| 109/111 [00:01<00:00, 66.03it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3:  99%|█████████▉| 110/111 [00:01<00:00, 66.35it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Epoch 3: 100%|██████████| 111/111 [00:01<00:00, 66.60it/s, v_num=0, ptl/val_loss=0.390, ptl/val_auc=0.756, ptl/train_loss=0.539, ptl/train_auc=0.780]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   4%|▎         | 1/28 [00:00<00:00, 299.53it/s][A
Validation DataLoader 0:   7%|▋         | 2/28 [00:00<00:00, 248.51it/s][A
Validation DataLoader 0:  11%|█         | 3/28 [00:00<00:00, 254.50it/s][A
Val

[36m(RayTrainWorker pid=17862)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00005_5_hidden_size=512,num_resblocks=3,weight_decay=0.0050_2024-04-17_10-26-56/checkpoint_000003)
[36m(RayTrainWorker pid=18068)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=18068)[0m 
[36m(RayTrainWorker pid=18068)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=18068)[0m -----------------------------------------------
[36m(RayTrainWorker pid=18068)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=18068)[0m 1 | fc1       | Linear           | 748 K 
[36m(RayTrainWorker pid=18068)[0m 2 | resblocks | Sequential       | 198 K 
[36m(RayTrainWorker pid=18068)[0m 3 | fc2       | Linear           | 514   
[36m(RayTrainWorker pid=18068)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=18068)[0m -----------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=18068)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=18068)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


                                                                           
Epoch 0:   2%|▏         | 2/111 [00:00<00:03, 31.43it/s, v_num=0]
Epoch 0:  12%|█▏        | 13/111 [00:00<00:01, 74.53it/s, v_num=0]
Epoch 0:  46%|████▌     | 51/111 [00:00<00:00, 87.50it/s, v_num=0]
Epoch 0:  82%|████████▏ | 91/111 [00:00<00:00, 91.61it/s, v_num=0]
Epoch 0:  92%|█████████▏| 102/111 [00:01<00:00, 93.13it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 93.81it/s, v_num=0]


[36m(RayTrainWorker pid=18068)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=18068)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00006_6_hidden_size=256,num_resblocks=3,weight_decay=0.0063_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=18068)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:  27%|██▋       | 30/111 [00:00<00:01, 79.85it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 77.91it/s, v_num=0, ptl/val_loss=0.964, ptl/val_auc=0.734, ptl/train_loss=0.630]


[36m(RayTrainWorker pid=18484)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=18299)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=18299)[0m - (ip=172.26.79.196, pid=18484) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=18484)[0m Train : incident_cad
[36m(RayTrainWorker pid=18484)[0m dtype: int64
[36m(RayTrainWorker pid=18484)[0m val : incident_cad
[36m(RayTrainWorker pid=18484)[0m dtype: int64
[36m(RayTrainWorker pid=18484)[0m Test : incident_cad
[36m(RayTrainWorker pid=18484)[0m dtype: int64
Epoch 0:  54%|█████▍    | 60/111 [00:00<00:00, 88.66it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 0:  72%|███████▏  | 80/111 [00:00<00:00, 90.35it/s, v_num=0][32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=18484)[0m 1.0               833[32m [repeated 10x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 167.89it/s][A[32m [repeated 25x across cluster][0m
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 174.41it/s][A[32m [repeated 

[36m(RayTrainWorker pid=18484)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=18484)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=18484)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=18484)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=18484)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=18484)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=18484)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(RayTrainWorker pid=18484)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=18484)[0m 
[36m(RayTrainWorker pid=18484)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=18484)[0m -----------------------------------------------
[36m(RayTrainWorker pid=18484)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=18484)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=18484)[0m 2 | resblocks | Sequential       | 791 K 
[36m(RayTrainWorker pid=18484)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=18484)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=18484)[0m -----------------------------------------------
[36m(RayTrainWorker pid=18484)[0m 2.3 M     Trainable params
[36m(RayTrainWorker pid=18484)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=18484)[0m 2.3 M     Total params
[36m(RayTrainWorker pid=18484)[0m 9.178     Total estimate

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  0.76it/s]
                                                                           


[36m(RayTrainWorker pid=18484)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=18484)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   2%|▏         | 2/111 [00:00<00:04, 24.56it/s, v_num=0]
Epoch 0:   6%|▋         | 7/111 [00:00<00:02, 37.01it/s, v_num=0]
Epoch 0:   9%|▉         | 10/111 [00:00<00:02, 34.37it/s, v_num=0]
[36m(RayTrainWorker pid=18607)[0m Train : incident_cad
[36m(RayTrainWorker pid=18607)[0m dtype: int64[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=18607)[0m val : incident_cad
[36m(RayTrainWorker pid=18607)[0m Test : incident_cad
[36m(RayTrainWorker pid=18607)[0m 1.0               833[32m [repeated 6x across cluster][0m
Epoch 0:  11%|█         | 12/111 [00:00<00:03, 32.08it/s, v_num=0]
Epoch 0:  14%|█▍        | 16/111 [00:00<00:03, 31.42it/s, v_num=0]
Epoch 0:  14%|█▍        | 16/111 [00:00<00:03, 31.39it/s, v_num=0]
Epoch 0:  19%|█▉        | 21/111 [00:00<00:02, 34.25it/s, v_num=0]
Epoch 0:  23%|██▎       | 26/111 [00:00<00:02, 36.63it/s, v_num=0]
Epoch 0:  24%|██▍       | 27/111 [00:00<00:02, 37.22it/s, v_num=0]
Epoch 0:  32%|███▏      | 35/111 [00:00<00:01, 4



Epoch 0:  78%|███████▊  | 87/111 [00:01<00:00, 55.92it/s, v_num=0]
Epoch 0:  86%|████████▋ | 96/111 [00:01<00:00, 58.09it/s, v_num=0]
Epoch 0:  95%|█████████▍| 105/111 [00:01<00:00, 59.83it/s, v_num=0]
Epoch 0:  98%|█████████▊| 109/111 [00:01<00:00, 58.58it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 58.59it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   4%|▎         | 1/28 [00:00<00:00, 134.60it/s][A
Validation DataLoader 0:   7%|▋         | 2/28 [00:00<00:00, 141.63it/s][A
Validation DataLoader 0:  11%|█         | 3/28 [00:00<00:00, 150.16it/s][A
Validation DataLoader 0:  14%|█▍        | 4/28 [00:00<00:00, 136.06it/s][A
Validation DataLoader 0:  18%|█▊        | 5/28 [00:00<00:00, 111.95it/s][A
Validation DataLoader 0:  21%|██▏       | 6/28 [00:00<00:00, 104.50it/s][A
Validation DataLoader 0:  25%|██▌   

[36m(RayTrainWorker pid=18484)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=18607)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=18607)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=18607)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=18607)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=18607)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=18607)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that h

Epoch 1:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738]          
Epoch 1:   7%|▋         | 8/111 [00:00<00:01, 62.34it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  14%|█▎        | 15/111 [00:00<00:01, 62.73it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  18%|█▊        | 20/111 [00:00<00:01, 59.95it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  19%|█▉        | 21/111 [00:00<00:01, 60.72it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  19%|█▉        | 21/111 [00:00<00:01, 60.63it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  25%|██▌       | 28/111 [00:00<00:01, 63.42it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738,

[36m(RayTrainWorker pid=18607)[0m 2 | resblocks | Sequential       | 1.1 M 


Epoch 1:  48%|████▊     | 53/111 [00:00<00:00, 61.37it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  57%|█████▋    | 63/111 [00:00<00:00, 64.72it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Epoch 1:  61%|██████▏   | 68/111 [00:01<00:00, 63.81it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  68%|██████▊   | 76/111 [00:01<00:00, 64.85it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch 1:  69%|██████▉   | 77/111 [00:01<00:00, 65.05it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/train_auc=0.758]
Epoch 1:  76%|███████▌  | 84/111 [00:01<00:00, 65.95it/s, v_num=0, ptl/val_loss=0.375, ptl/val_auc=0.752, ptl/train_loss=0.738, ptl/trai

[36m(RayTrainWorker pid=18484)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00007_7_hidden_size=512,num_resblocks=3,weight_decay=0.0002_2024-04-17_10-26-56/checkpoint_000001)
[36m(RayTrainWorker pid=18607)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=18607)[0m 
[36m(RayTrainWorker pid=18607)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=18607)[0m -----------------------------------------------[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=18607)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=18607)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=18607)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=18607)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=18607)[0m 2.6 M     Trainable params
[36m(RayTrainWorker pid

                                                                           


[36m(RayTrainWorker pid=18607)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=18607)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   5%|▌         | 6/111 [00:00<00:01, 53.51it/s, v_num=0]
[36m(RayTrainWorker pid=18484)[0m [32m [repeated 5x across cluster][0m
Epoch 0:  41%|████      | 45/111 [00:00<00:00, 73.56it/s, v_num=0]
Epoch 0:  35%|███▌      | 39/111 [00:00<00:00, 73.98it/s, v_num=0][32m [repeated 3x across cluster][0m
Epoch 0:  59%|█████▊    | 65/111 [00:00<00:00, 77.07it/s, v_num=0]
Epoch 0:  59%|█████▊    | 65/111 [00:00<00:00, 77.06it/s, v_num=0]
Epoch 0:  68%|██████▊   | 76/111 [00:00<00:00, 80.09it/s, v_num=0]
Epoch 0:  68%|██████▊   | 76/111 [00:00<00:00, 80.07it/s, v_num=0]
Epoch 0:  82%|████████▏ | 91/111 [00:01<00:00, 81.91it/s, v_num=0]
Epoch 0:  83%|████████▎ | 92/111 [00:01<00:00, 73.39it/s, v_num=0]
Epoch 0:  77%|███████▋  | 85/111 [00:01<00:00, 81.07it/s, v_num=0][32m [repeated 3x across cluster][0m
Epoch 0:  93%|█████████▎| 103/111 [00:01<00:00, 75.71it/s, v_num=0]
Epoch 0:  68%|██████▊   | 75/111 [00:00<00:00, 79.89it/s, v_nu

[36m(RayTrainWorker pid=19066)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=18607)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=18607)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00008_8_hidden_size=512,num_resblocks=4,weight_decay=0.0014_2024-04-17_10-26-56/checkpoint_000000)
[36m(TorchTrainer pid=18909)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=18909)[0m - (ip=172.26.79.196, pid=19066) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=19066)[0m Train : incident_cad
[36m(RayTrainWorker pid=19066)[0m dtype: int64
[36m(RayTrainWorker pid=19066)[0m val : incident_cad
[36m(RayTrainWorker pid=19066)[0m dtype: int64
[36m(RayTrainWorker pid=19066)[0m Test : incident_cad
[36m(RayTrainWorker pid=19066)[0m dtype: int64
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 159.11it/s][A[32m [repeated 3x across cluster][0m
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 67.28it/s, v_num=0, ptl/val_loss=0.877, ptl/val_auc=0.747]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 62.45it/s, v_num=0, ptl/val_loss=0.877, ptl/val_auc=0.747, ptl/train_loss=0.643]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 77.42it/s, v_num=0]
[36m(RayTrainWorker pid=19066)[0m 1.0               833[32m [repeated 9x across cluster][0m


[36m(RayTrainWorker pid=19066)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=19066)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=19066)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=19066)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=19066)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=19066)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=19066)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(TorchTrainer pid=19016)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=19016)[0m - (ip=172.26.79.196, pid=19123) world_rank=0, local_rank=0, node_rank=0


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  0.62it/s]
                                                                           


[36m(RayTrainWorker pid=19066)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19066)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   2%|▏         | 2/111 [00:00<00:02, 50.91it/s, v_num=0]
Epoch 0:  11%|█         | 12/111 [00:00<00:01, 83.51it/s, v_num=0]
[36m(RayTrainWorker pid=19123)[0m Train : incident_cad
[36m(RayTrainWorker pid=19123)[0m dtype: int64[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=19123)[0m val : incident_cad
[36m(RayTrainWorker pid=19123)[0m Test : incident_cad
[36m(RayTrainWorker pid=19123)[0m 1.0               833[32m [repeated 6x across cluster][0m
Epoch 0:  13%|█▎        | 14/111 [00:00<00:01, 84.72it/s, v_num=0]
Epoch 0:  16%|█▌        | 18/111 [00:00<00:01, 51.24it/s, v_num=0]
Epoch 0:  24%|██▍       | 27/111 [00:00<00:01, 59.91it/s, v_num=0]
Epoch 0:  33%|███▎      | 37/111 [00:00<00:01, 65.64it/s, v_num=0]
Epoch 0:  40%|███▉      | 44/111 [00:00<00:01, 66.85it/s, v_num=0]
Epoch 0:  47%|████▋     | 52/111 [00:00<00:00, 68.06it/s, v_num=0]
Epoch 0:  53%|█████▎    | 59/111 [00:00<00:00, 68.44it/s, v_num=0]
Epoch 0:  59%|█████▊    | 65/111 [00:00<00:00, 

[36m(RayTrainWorker pid=19123)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=19123)[0m 2 | resblocks | Sequential       | 1.1 M 
[36m(RayTrainWorker pid=19123)[0m 2.6 M     Trainable params
[36m(RayTrainWorker pid=19123)[0m 2.6 M     Total params
[36m(RayTrainWorker pid=19123)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=19123)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=19123)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=19123)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=19123)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=19123)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_

Validation DataLoader 0:  32%|███▏      | 9/28 [00:00<00:00, 51.04it/s][A
Validation DataLoader 0:  36%|███▌      | 10/28 [00:00<00:00, 52.86it/s][A
Validation DataLoader 0:  39%|███▉      | 11/28 [00:00<00:00, 56.21it/s][A
Validation DataLoader 0:  43%|████▎     | 12/28 [00:00<00:00, 58.63it/s][A
Validation DataLoader 0:  46%|████▋     | 13/28 [00:00<00:00, 61.35it/s][A
Validation DataLoader 0:  50%|█████     | 14/28 [00:00<00:00, 63.37it/s][A
Validation DataLoader 0:  54%|█████▎    | 15/28 [00:00<00:00, 66.10it/s][A
Validation DataLoader 0:  57%|█████▋    | 16/28 [00:00<00:00, 68.76it/s][A
Validation DataLoader 0:  61%|██████    | 17/28 [00:00<00:00, 71.46it/s][A
Validation DataLoader 0:  64%|██████▍   | 18/28 [00:00<00:00, 73.93it/s][A
Validation DataLoader 0:  68%|██████▊   | 19/28 [00:00<00:00, 75.90it/s][A
Validation DataLoader 0:  71%|███████▏  | 20/28 [00:00<00:00, 78.24it/s][A
Validation DataLoader 0:  75%|███████▌  | 21/28 [00:00<00:00, 80.66it/s][A
Validation Da

[36m(RayTrainWorker pid=19066)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19066)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00009_9_hidden_size=256,num_resblocks=1,weight_decay=0.0001_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=19066)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


                                                                           
Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=19123)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=19123)[0m 
[36m(RayTrainWorker pid=19123)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=19123)[0m -----------------------------------------------[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=19123)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=19123)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=19123)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=19123)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=19123)[0m 10.233    Total estimated model params size (MB)


Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:  47%|████▋     | 52/111 [00:00<00:00, 82.31it/s, v_num=0]
Epoch 0:  63%|██████▎   | 70/111 [00:00<00:00, 78.01it/s, v_num=0]
Epoch 0:  93%|█████████▎| 103/111 [00:01<00:00, 76.17it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 77.44it/s, v_num=0]
Epoch 0:  85%|████████▍ | 94/111 [00:01<00:00, 75.09it/s, v_num=0][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=19123)[0m [32m [repeated 6x across cluster][0m
                                                                         [A
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 66.60it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755]


[36m(RayTrainWorker pid=19123)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19123)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 61.84it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755, ptl/train_loss=0.819]
Epoch 0:  55%|█████▍    | 61/111 [00:00<00:00, 83.32it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 1:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755, ptl/train_loss=0.819]          
Epoch 0:  76%|███████▌  | 84/111 [00:01<00:00, 73.66it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 0:  62%|██████▏   | 69/111 [00:00<00:00, 84.43it/s, v_num=0][32m [repeated 3x across cluster][0m
Epoch 1:   9%|▉         | 10/111 [00:00<00:03, 27.61it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755, ptl/train_loss=0.819, ptl/train_auc=0.744]
Epoch 1:  13%|█▎        | 14/111 [00:00<00:03, 29.25it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755, ptl/train_loss=0.819, ptl/train_auc=0.744]
Epoch 0:  29%|██▉       | 32/111 [00:00<00:01, 75.03it/s, v_num=0]
Epoch 1:  16%|█▌        | 18/111 [00:00<00:02, 31.66it/s, v_nu

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 1: 100%|██████████| 111/111 [00:02<00:00, 46.63it/s, v_num=0, ptl/val_loss=0.389, ptl/val_auc=0.730, ptl/train_loss=0.819, ptl/train_auc=0.744]
Epoch 1: 100%|██████████| 111/111 [00:02<00:00, 45.06it/s, v_num=0, ptl/val_loss=0.389, ptl/val_auc=0.730, ptl/train_loss=0.418, ptl/train_auc=0.744]


[36m(RayTrainWorker pid=19123)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19123)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00010_10_hidden_size=512,num_resblocks=4,weight_decay=0.0002_2024-04-17_10-26-56/checkpoint_000001)[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=19123)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metri

[36m(RayTrainWorker pid=19572)[0m Train : incident_cad
[36m(RayTrainWorker pid=19572)[0m dtype: int64
[36m(RayTrainWorker pid=19572)[0m val : incident_cad
[36m(RayTrainWorker pid=19572)[0m dtype: int64
[36m(RayTrainWorker pid=19572)[0m Test : incident_cad
[36m(RayTrainWorker pid=19572)[0m dtype: int64
[36m(RayTrainWorker pid=19572)[0m 1.0               833[32m [repeated 10x across cluster][0m
Epoch 1:  89%|████████▉ | 99/111 [00:02<00:00, 48.70it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755, ptl/train_loss=0.819, ptl/train_auc=0.744][32m [repeated 2x across cluster][0m
Epoch 1:  72%|███████▏  | 80/111 [00:01<00:00, 44.02it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755, ptl/train_loss=0.819, ptl/train_auc=0.744]
Epoch 1:  60%|██████    | 67/111 [00:01<00:01, 41.71it/s, v_num=0, ptl/val_loss=0.435, ptl/val_auc=0.755, ptl/train_loss=0.819, ptl/train_auc=0.744][32m [repeated 2x across cluster][0m
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 144

[36m(RayTrainWorker pid=19572)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=19572)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=19572)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=19572)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=19572)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=19572)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=19572)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=19707)[0m Setting up process group for: env:// [rank=0, world_size=1]


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.55it/s]


[36m(TorchTrainer pid=19621)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=19621)[0m - (ip=172.26.79.196, pid=19707) world_rank=0, local_rank=0, node_rank=0
[36m(RayTrainWorker pid=19572)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19572)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Training: |          | 0/? [00:00<?, ?it/s]
Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   6%|▋         | 7/111 [00:00<00:01, 65.98it/s, v_num=0]
                                                                           
Epoch 0:  14%|█▍        | 16/111 [00:00<00:01, 77.63it/s, v_num=0]
Epoch 0:  23%|██▎       | 25/111 [00:00<00:01, 81.80it/s, v_num=0]
Epoch 0:  28%|██▊       | 31/111 [00:00<00:01, 75.04it/s, v_num=0]
Epoch 0:  34%|███▍      | 38/111 [00:00<00:00, 74.72it/s, v_num=0]
Epoch 0:  41%|████▏     | 46/111 [00:00<00:00, 75.07it/s, v_num=0]
Epoch 0:  48%|████▊     | 53/111 [00:00<00:00, 72.81it/s, v_num=0]
Epoch 0:  53%|█████▎    | 59/111 [00:00<00:00, 71.04it/s, v_num=0]
Epoch 0:  59%|█████▉    | 66/111 [00:00<00:00, 71.39it/s, v_num=0]
Epoch 0:  60%|██████    | 67/111 [00:00<00:00, 71.57it/s, v_num=0]
Epoch 0:  68%|██████▊   | 76/111 [00:01<00:00, 73.50it/s, v_num=0]
Epoch 0:  73%|███████▎  | 81/111 [00:01<00:00, 71.51it/s, v_num=0]
Epoch 0:  80%|████████  | 8

[36m(RayTrainWorker pid=19572)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19572)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00011_11_hidden_size=512,num_resblocks=3,weight_decay=0.0100_2024-04-17_10-26-56/checkpoint_000000)


Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 59.25it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757]
Epoch 1:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757]          


[36m(RayTrainWorker pid=19572)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 1:   4%|▎         | 4/111 [00:00<00:02, 36.66it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]
Epoch 1:  10%|▉         | 11/111 [00:00<00:01, 51.55it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]
Epoch 1:  11%|█         | 12/111 [00:00<00:01, 53.04it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]
Epoch 1:  11%|█         | 12/111 [00:00<00:01, 52.91it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]
Epoch 1:  18%|█▊        | 20/111 [00:00<00:01, 61.70it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]
[36m(RayTrainWorker pid=19707)[0m Train : incident_cad
[36m(RayTrainWorker pid=19707)[0m dtype: int64
[36m(RayTrainWorker pid=19707)[0m val : incident_cad
[36m(RayTrainWorker pid=19707)[0m dtype: int64
[36m(RayTrainWorker pid=19707)[0m Test : inc



Epoch 1:  91%|█████████ | 101/111 [00:01<00:00, 56.42it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]
Epoch 1:  96%|█████████▋| 107/111 [00:01<00:00, 56.58it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]




Epoch 1: 100%|██████████| 111/111 [00:01<00:00, 57.30it/s, v_num=0, ptl/val_loss=0.450, ptl/val_auc=0.753, ptl/train_loss=0.757, ptl/train_auc=0.722]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   4%|▎         | 1/28 [00:00<00:00, 125.09it/s][A
Validation DataLoader 0:   7%|▋         | 2/28 [00:00<00:00, 119.21it/s][A
Validation DataLoader 0:  11%|█         | 3/28 [00:00<00:00, 119.10it/s][A
Validation DataLoader 0:  14%|█▍        | 4/28 [00:00<00:00, 110.44it/s][A
Validation DataLoader 0:  18%|█▊        | 5/28 [00:00<00:00, 111.03it/s][A
Validation DataLoader 0:  21%|██▏       | 6/28 [00:00<00:00, 109.33it/s][A
Validation DataLoader 0:  25%|██▌       | 7/28 [00:00<00:00, 105.07it/s][A
Validation DataLoader 0:  29%|██▊       | 8/28 [00:00<00:00, 107.67it/s][A
Validation DataLoader 0:  32%|███▏      | 9/28 [00:00<00:00, 107.58it/s]

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Validation DataLoader 0:  61%|██████    | 17/28 [00:00<00:00, 109.67it/s][A
Validation DataLoader 0:  64%|██████▍   | 18/28 [00:00<00:00, 111.59it/s][A
Validation DataLoader 0:  68%|██████▊   | 19/28 [00:00<00:00, 111.15it/s][A
Validation DataLoader 0:  71%|███████▏  | 20/28 [00:00<00:00, 112.61it/s][A
Validation DataLoader 0:  75%|███████▌  | 21/28 [00:00<00:00, 111.21it/s][A
Validation DataLoader 0:  79%|███████▊  | 22/28 [00:00<00:00, 110.41it/s][A
Validation DataLoader 0:  82%|████████▏ | 23/28 [00:00<00:00, 111.53it/s][A
Validation DataLoader 0:  86%|████████▌ | 24/28 [00:00<00:00, 110.25it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 97.32it/s] [A
Validation DataLoader 0:  93%|█████████▎| 26/28 [00:00<00:00, 95.99it/s][A
Validation DataLoader 0:  96%|█████████▋| 27/28 [00:00<00:00, 92.78it/s][A
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 90.69it/s][A
Epoch 1: 100%|██████████| 111/111 [00:02<00:00, 48.58it/s, v_num=0, ptl/val_los

[36m(RayTrainWorker pid=19572)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00011_11_hidden_size=512,num_resblocks=3,weight_decay=0.0100_2024-04-17_10-26-56/checkpoint_000001)
[36m(RayTrainWorker pid=19707)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=19707)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=19707)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=19707)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=19707)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=19707)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `t

Epoch 1: 100%|██████████| 111/111 [00:02<00:00, 47.24it/s, v_num=0, ptl/val_loss=0.477, ptl/val_auc=0.766, ptl/train_loss=0.581, ptl/train_auc=0.722]
Epoch 2:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.477, ptl/val_auc=0.766, ptl/train_loss=0.581, ptl/train_auc=0.722]          
Epoch 2:   5%|▌         | 6/111 [00:00<00:01, 62.67it/s, v_num=0, ptl/val_loss=0.477, ptl/val_auc=0.766, ptl/train_loss=0.581, ptl/train_auc=0.741]
Epoch 2:  14%|█▎        | 15/111 [00:00<00:01, 74.39it/s, v_num=0, ptl/val_loss=0.477, ptl/val_auc=0.766, ptl/train_loss=0.581, ptl/train_auc=0.741]
Epoch 2:  19%|█▉        | 21/111 [00:00<00:01, 69.18it/s, v_num=0, ptl/val_loss=0.477, ptl/val_auc=0.766, ptl/train_loss=0.581, ptl/train_auc=0.741]
Epoch 2:  25%|██▌       | 28/111 [00:00<00:01, 67.87it/s, v_num=0, ptl/val_loss=0.477, ptl/val_auc=0.766, ptl/train_loss=0.581, ptl/train_auc=0.741]
Epoch 2:  32%|███▏      | 36/111 [00:00<00:01, 69.59it/s, v_num=0, ptl/val_loss=0.477, ptl/val_auc=0.766,

[36m(RayTrainWorker pid=19707)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=19707)[0m 
[36m(RayTrainWorker pid=19707)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=19707)[0m -----------------------------------------------
[36m(RayTrainWorker pid=19707)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=19707)[0m 1 | fc1       | Linear           | 748 K 
[36m(RayTrainWorker pid=19707)[0m 2 | resblocks | Sequential       | 66.3 K
[36m(RayTrainWorker pid=19707)[0m 3 | fc2       | Linear           | 514   
[36m(RayTrainWorker pid=19707)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=19707)[0m -----------------------------------------------
[36m(RayTrainWorker pid=19707)[0m 820 K     Trainable params
[36m(RayTrainWorker pid=19707)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=19707)[0m 820 K     Total params
[36m(RayTrainWorker pid=19707)[0m 3.284     Total estimate

Validation DataLoader 0:  68%|██████▊   | 19/28 [00:00<00:00, 64.73it/s][A
Validation DataLoader 0:  71%|███████▏  | 20/28 [00:00<00:00, 66.63it/s][A
Validation DataLoader 0:  75%|███████▌  | 21/28 [00:00<00:00, 68.27it/s][A
Validation DataLoader 0:  79%|███████▊  | 22/28 [00:00<00:00, 70.21it/s][A
Validation DataLoader 0:  82%|████████▏ | 23/28 [00:00<00:00, 72.00it/s][A
Validation DataLoader 0:  86%|████████▌ | 24/28 [00:00<00:00, 73.74it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 75.69it/s][A
Validation DataLoader 0:  93%|█████████▎| 26/28 [00:00<00:00, 77.69it/s][A
Validation DataLoader 0:  96%|█████████▋| 27/28 [00:00<00:00, 79.11it/s][A
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 81.01it/s][A
Epoch 2: 100%|██████████| 111/111 [00:01<00:00, 59.82it/s, v_num=0, ptl/val_loss=0.441, ptl/val_auc=0.772, ptl/train_loss=0.581, ptl/train_auc=0.741]
Epoch 2: 100%|██████████| 111/111 [00:01<00:00, 55.92it/s, v_num=0, ptl/val_loss=0.441, pt

[36m(RayTrainWorker pid=19572)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00011_11_hidden_size=512,num_resblocks=3,weight_decay=0.0100_2024-04-17_10-26-56/checkpoint_000002)


Epoch 3:   1%|          | 1/111 [00:00<00:05, 21.80it/s, v_num=0, ptl/val_loss=0.441, ptl/val_auc=0.772, ptl/train_loss=0.562, ptl/train_auc=0.741]  
Epoch 3:   1%|          | 1/111 [00:00<00:05, 21.69it/s, v_num=0, ptl/val_loss=0.441, ptl/val_auc=0.772, ptl/train_loss=0.562, ptl/train_auc=0.755]
Epoch 3:   7%|▋         | 8/111 [00:00<00:01, 57.02it/s, v_num=0, ptl/val_loss=0.441, ptl/val_auc=0.772, ptl/train_loss=0.562, ptl/train_auc=0.755]
Epoch 3:   8%|▊         | 9/111 [00:00<00:01, 59.06it/s, v_num=0, ptl/val_loss=0.441, ptl/val_auc=0.772, ptl/train_loss=0.562, ptl/train_auc=0.755]
Epoch 3:  14%|█▍        | 16/111 [00:00<00:01, 64.91it/s, v_num=0, ptl/val_loss=0.441, ptl/val_auc=0.772, ptl/train_loss=0.562, ptl/train_auc=0.755]
[36m(RayTrainWorker pid=19572)[0m [32m [repeated 14x across cluster][0m
Epoch 3:  23%|██▎       | 26/111 [00:00<00:01, 72.58it/s, v_num=0, ptl/val_loss=0.441, ptl/val_auc=0.772, ptl/train_loss=0.562, ptl/train_auc=0.755]
Epoch 3:  32%|███▏      | 35/111

[36m(RayTrainWorker pid=19707)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19707)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 3: 100%|██████████| 111/111 [00:01<00:00, 61.69it/s, v_num=0, ptl/val_loss=0.765, ptl/val_auc=0.760, ptl/train_loss=0.562, ptl/train_auc=0.755]
Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 


[36m(RayTrainWorker pid=19572)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00011_11_hidden_size=512,num_resblocks=3,weight_decay=0.0100_2024-04-17_10-26-56/checkpoint_000003)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 3: 100%|██████████| 111/111 [00:02<00:00, 55.30it/s, v_num=0, ptl/val_loss=0.765, ptl/val_auc=0.760, ptl/train_loss=0.548, ptl/train_auc=0.755]
Epoch 0:   6%|▋         | 7/111 [00:00<00:03, 29.36it/s, v_num=0]
Epoch 0:   8%|▊         | 9/111 [00:00<00:03, 26.13it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 0:  50%|█████     | 56/111 [00:00<00:00, 65.00it/s, v_num=0]
Epoch 0:  69%|██████▉   | 77/111 [00:01<00:00, 71.88it/s, v_num=0]
Epoch 0:  92%|█████████▏| 102/111 [00:01<00:00, 73.69it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 75.65it/s, v_num=0]


[36m(RayTrainWorker pid=19707)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19707)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 1:   1%|          | 1/111 [00:00<00:03, 30.92it/s, v_num=0, ptl/val_loss=0.494, ptl/val_auc=0.769, ptl/train_loss=0.595, ptl/train_auc=0.771]
Epoch 1:  13%|█▎        | 14/111 [00:00<00:01, 96.39it/s, v_num=0, ptl/val_loss=0.494, ptl/val_auc=0.769, ptl/train_loss=0.595, ptl/train_auc=0.771]
Epoch 1:  23%|██▎       | 25/111 [00:00<00:00, 101.67it/s, v_num=0, ptl/val_loss=0.494, ptl/val_auc=0.769, ptl/train_loss=0.595, ptl/train_auc=0.771]
Epoch 1:  36%|███▌      | 40/111 [00:00<00:00, 87.54it/s, v_num=0, ptl/val_loss=0.494, ptl/val_auc=0.769, ptl/train_loss=0.595, ptl/train_auc=0.771]
Epoch 0:  41%|████      | 45/111 [00:00<00:01, 59.35it/s, v_num=0]
Epoch 1:  46%|████▌     | 51/111 [00:00<00:00, 91.37it/s, v_num=0, ptl/val_loss=0.494, ptl/val_auc=0.769, ptl/train_loss=0.595, ptl/train_auc=0.771]
Epoch 0:  91%|█████████ | 101/111 [00:01<00:00, 73.48it/s, v_num=0][32m [repeated 3x across cluster][0m
Epoch 1:  54%|█████▍    | 60/111 [00:00<00:00, 91.06it/s, v_num=0, ptl/val_loss=0.

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 1: 100%|██████████| 111/111 [00:01<00:00, 75.93it/s, v_num=0, ptl/val_loss=0.639, ptl/val_auc=0.778, ptl/train_loss=0.536, ptl/train_auc=0.771]
Epoch 1:  36%|███▌      | 40/111 [00:00<00:00, 87.42it/s, v_num=0, ptl/val_loss=0.494, ptl/val_auc=0.769, ptl/train_loss=0.595, ptl/train_auc=0.771]
Epoch 2:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.639, ptl/val_auc=0.778, ptl/train_loss=0.536, ptl/train_auc=0.771]          
Epoch 1:  29%|██▉       | 32/111 [00:00<00:00, 90.60it/s, v_num=0, ptl/val_loss=0.494, ptl/val_auc=0.769, ptl/train_loss=0.595, ptl/train_auc=0.771][32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=19707)[0m [32m [repeated 12x across cluster][0m
Epoch 2:  19%|█▉        | 21/111 [00:00<00:01, 71.22it/s, v_num=0, ptl/val_loss=0.639, ptl/val_auc=0.778, ptl/train_loss=0.536, ptl/train_auc=0.787]
Epoch 2:  14%|█▍        | 16/111 [00:00<00:01, 79.33it/s, v_num=0, ptl/val_loss=0.639, ptl/val_auc=0.778, ptl/train_loss=0.536, ptl/train_a

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=19707)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00012_12_hidden_size=256,num_resblocks=1,weight_decay=0.0021_2024-04-17_10-26-56/checkpoint_000003)[32m [repeated 4x across cluster][0m


Epoch 4:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.511, ptl/val_auc=0.789, ptl/train_loss=0.517, ptl/train_auc=0.797]          
Epoch 4:   4%|▎         | 4/111 [00:00<00:02, 35.91it/s, v_num=0, ptl/val_loss=0.511, ptl/val_auc=0.789, ptl/train_loss=0.517, ptl/train_auc=0.802]
Epoch 4:  23%|██▎       | 25/111 [00:00<00:01, 61.02it/s, v_num=0, ptl/val_loss=0.511, ptl/val_auc=0.789, ptl/train_loss=0.517, ptl/train_auc=0.802]
Epoch 4:  30%|██▉       | 33/111 [00:00<00:01, 62.36it/s, v_num=0, ptl/val_loss=0.511, ptl/val_auc=0.789, ptl/train_loss=0.517, ptl/train_auc=0.802]
Epoch 4:  10%|▉         | 11/111 [00:00<00:01, 52.26it/s, v_num=0, ptl/val_loss=0.511, ptl/val_auc=0.789, ptl/train_loss=0.517, ptl/train_auc=0.802][32m [repeated 2x across cluster][0m
Epoch 4:  65%|██████▍   | 72/111 [00:00<00:00, 76.53it/s, v_num=0, ptl/val_loss=0.511, ptl/val_auc=0.789, ptl/train_loss=0.517, ptl/train_auc=0.802]
Epoch 4:  73%|███████▎  | 81/111 [00:01<00:00, 77.95it/s, v_num=0, p

[36m(RayTrainWorker pid=20129)[0m Setting up process group for: env:// [rank=0, world_size=1]
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 5: 100%|██████████| 111/111 [00:01<00:00, 65.78it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.808]
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 143.48it/s][A[32m [repeated 9x across cluster][0m
Epoch 6:   2%|▏         | 2/111 [00:00<00:03, 28.36it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]  
Epoch 4: 100%|██████████| 111/111 [00:01<00:00, 66.78it/s, v_num=0, ptl/val_loss=0.378, ptl/val_auc=0.786, ptl/train_loss=0.517, ptl/train_auc=0.802]
Epoch 6:  14%|█▎        | 15/111 [00:00<00:01, 52.78it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]
Epoch 6:  17%|█▋        | 19/111 [00:00<00:01, 52.77it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]
Epoch 6:  32%|███▏      | 35/111 [00:00<00:01, 58.05it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train

[36m(TorchTrainer pid=20071)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=20071)[0m - (ip=172.26.79.196, pid=20129) world_rank=0, local_rank=0, node_rank=0


Epoch 6:  74%|███████▍  | 82/111 [00:01<00:00, 72.53it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]
Epoch 6:  83%|████████▎ | 92/111 [00:01<00:00, 74.56it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]
Epoch 6:  83%|████████▎ | 92/111 [00:01<00:00, 74.55it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]
Epoch 6:  92%|█████████▏| 102/111 [00:01<00:00, 76.40it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]
Epoch 6: 100%|██████████| 111/111 [00:01<00:00, 78.77it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813]


[36m(RayTrainWorker pid=19707)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00012_12_hidden_size=256,num_resblocks=1,weight_decay=0.0021_2024-04-17_10-26-56/checkpoint_000006)[32m [repeated 3x across cluster][0m


Epoch 7:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.788, ptl/train_loss=0.489, ptl/train_auc=0.813]          
Epoch 7:  19%|█▉        | 21/111 [00:00<00:00, 100.10it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.788, ptl/train_loss=0.489, ptl/train_auc=0.816]
Epoch 7:  29%|██▉       | 32/111 [00:00<00:00, 100.03it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.788, ptl/train_loss=0.489, ptl/train_auc=0.816]
Epoch 7:  39%|███▊      | 43/111 [00:00<00:00, 101.59it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.788, ptl/train_loss=0.489, ptl/train_auc=0.816]
Epoch 7:  49%|████▊     | 54/111 [00:00<00:00, 102.70it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.788, ptl/train_loss=0.489, ptl/train_auc=0.816]
Epoch 6:   9%|▉         | 10/111 [00:00<00:01, 52.85it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813][32m [repeated 2x across cluster][0m
Epoch 7:  62%|██████▏   | 69/111 [00:00<00:00, 95.35it/s, v_num

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Epoch 7: 100%|██████████| 111/111 [00:01<00:00, 71.39it/s, v_num=0, ptl/val_loss=0.744, ptl/val_auc=0.785, ptl/train_loss=0.489, ptl/train_auc=0.816]


[36m(RayTrainWorker pid=20129)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=20129)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=20129)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=20129)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=20129)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=20129)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=20129)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Epoch 6:  40%|███▉      | 44/111 [00:00<00:01, 61.49it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813][32m [repeated 4x across cluster][0m
[36m(RayTrainWorker pid=19707)[0m [32m [repeated 13x across cluster][0m
Epoch 7:  58%|█████▊    | 64/111 [00:00<00:00, 102.50it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.788, ptl/train_loss=0.489, ptl/train_auc=0.816][32m [repeated 7x across cluster][0m
Epoch 6:  45%|████▌     | 50/111 [00:00<00:01, 60.97it/s, v_num=0, ptl/val_loss=0.445, ptl/val_auc=0.787, ptl/train_loss=0.494, ptl/train_auc=0.813][32m [repeated 3x across cluster][0m
Validation: |          | 0/? [00:00<?, ?it/s][A[32m [repeated 3x across cluster][0m
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A[32m [repeated 3x across cluster][0m
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A[32m [repeated 3x across cluster][0m
Validation DataLoader 0:  8

[36m(RayTrainWorker pid=20129)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=20129)[0m 
[36m(RayTrainWorker pid=20129)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=20129)[0m -----------------------------------------------
[36m(RayTrainWorker pid=20129)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=20129)[0m 1 | fc1       | Linear           | 3.0 M 
[36m(RayTrainWorker pid=20129)[0m 2 | resblocks | Sequential       | 1.1 M 
[36m(RayTrainWorker pid=20129)[0m 3 | fc2       | Linear           | 2.0 K 
[36m(RayTrainWorker pid=20129)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=20129)[0m -----------------------------------------------
[36m(RayTrainWorker pid=20129)[0m 4.1 M     Trainable params
[36m(RayTrainWorker pid=20129)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=20129)[0m 4.1 M     Total params
[36m(RayTrainWorker pid=20129)[0m 16.211    Total estimate

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.73it/s]
Epoch 7:  89%|████████▉ | 99/111 [00:01<00:00, 86.54it/s, v_num=0, ptl/val_loss=0.583, ptl/val_auc=0.788, ptl/train_loss=0.489, ptl/train_auc=0.816][32m [repeated 2x across cluster][0m


[36m(RayTrainWorker pid=20129)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=20129)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=19707)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00012_12_hidden_size=256,num_resblocks=1,weight_decay=0.0021_2024-04-17_10-26-56/checkpoint_000007)


Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   7%|▋         | 8/111 [00:00<00:02, 41.47it/s, v_num=0]
Epoch 0:  22%|██▏       | 24/111 [00:00<00:01, 59.29it/s, v_num=0]
Epoch 0:  22%|██▏       | 24/111 [00:00<00:01, 59.20it/s, v_num=0]
Epoch 0:  30%|██▉       | 33/111 [00:00<00:01, 64.70it/s, v_num=0]
Epoch 0:  38%|███▊      | 42/111 [00:00<00:00, 69.43it/s, v_num=0]
Epoch 0:  48%|████▊     | 53/111 [00:00<00:00, 74.54it/s, v_num=0]
Epoch 0:  62%|██████▏   | 69/111 [00:00<00:00, 74.75it/s, v_num=0]
Epoch 0:  72%|███████▏  | 80/111 [00:01<00:00, 77.82it/s, v_num=0]
Epoch 0:  83%|████████▎ | 92/111 [00:01<00:00, 80.99it/s, v_num=0]
Epoch 0:  92%|█████████▏| 102/111 [00:01<00:00, 82.47it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 84.27it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 71.78it/s, v_num=0, ptl/val_loss=0.422, ptl/val_auc=0.718]


[36m(RayTrainWorker pid=20129)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 65.08it/s, v_num=0, ptl/val_loss=0.422, ptl/val_auc=0.718, ptl/train_loss=0.678]


[36m(RayTrainWorker pid=20129)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00013_13_hidden_size=1024,num_resblocks=1,weight_decay=0.0004_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=20404)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=20222)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=20222)[0m - (ip=172.26.79.196, pid=20404) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=20404)[0m Train : incident_cad
[36m(RayTrainWorker pid=20404)[0m dtype: int64
[36m(RayTrainWorker pid=20404)[0m val : incident_cad
[36m(RayTrainWorker pid=20404)[0m dtype: int64
[36m(RayTrainWorker pid=20404)[0m Test : incident_cad
[36m(RayTrainWorker pid=20404)[0m dtype: int64
Epoch 0:  49%|████▊     | 54/111 [00:00<00:00, 75.12it/s, v_num=0]
[36m(RayTrainWorker pid=20404)[0m 1.0               833[32m [repeated 10x across cluster][0m
Epoch 0:  54%|█████▍    | 60/111 [00:00<00:00, 75.11it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 142.31it/s][A[32m [repeated 25x across cluster][0m
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 144.60it/s][A[32m [repeated 3x across cluster][0m
Epoch 0:  14%|█▎        | 15/111 [00:00<00:01, 50.89i

[36m(RayTrainWorker pid=20404)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=20404)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=20404)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=20404)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=20404)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=20404)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=20404)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(TorchTrainer pid=20456)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=20456)[0m - (ip=172.26.79.196, pid=20540) world_rank=0, local_rank=0, node_rank=0


                                                                           


[36m(RayTrainWorker pid=20404)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=20404)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   3%|▎         | 3/111 [00:00<00:02, 46.72it/s, v_num=0]
Epoch 0:  10%|▉         | 11/111 [00:00<00:01, 63.92it/s, v_num=0]
Epoch 0:  17%|█▋        | 19/111 [00:00<00:01, 71.74it/s, v_num=0]
Epoch 0:  25%|██▌       | 28/111 [00:00<00:01, 73.71it/s, v_num=0]
Epoch 0:  28%|██▊       | 31/111 [00:00<00:01, 69.68it/s, v_num=0]
Epoch 0:  32%|███▏      | 36/111 [00:00<00:01, 61.88it/s, v_num=0]
Epoch 0:  36%|███▌      | 40/111 [00:00<00:01, 58.28it/s, v_num=0]
Epoch 0:  41%|████      | 45/111 [00:00<00:01, 56.70it/s, v_num=0]
Epoch 0:  48%|████▊     | 53/111 [00:00<00:00, 58.83it/s, v_num=0]
Epoch 0:  55%|█████▍    | 61/111 [00:00<00:00, 61.20it/s, v_num=0]
Epoch 0:  56%|█████▌    | 62/111 [00:01<00:00, 61.50it/s, v_num=0]
Epoch 0:  64%|██████▍   | 71/111 [00:01<00:00, 64.15it/s, v_num=0]
Epoch 0:  69%|██████▉   | 77/111 [00:01<00:00, 64.90it/s, v_num=0]
Epoch 0:  74%|███████▍  | 82/111 [00:01<00:00, 62.42it/s, v_num=0]
Epoch 0:  82%|████████▏ | 91/111 [00:01<00:00, 63.87it/s, v_num

[36m(RayTrainWorker pid=20404)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0: 100%|██████████| 111/111 [00:02<00:00, 52.77it/s, v_num=0, ptl/val_loss=0.482, ptl/val_auc=0.748, ptl/train_loss=0.757]


[36m(RayTrainWorker pid=20404)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00014_14_hidden_size=512,num_resblocks=4,weight_decay=0.0004_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=20404)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=20540)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=20540)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=20540)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=20540)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=20540)[0m /home/xutingfeng/miniforge3/envs/ra

Sanity Checking: |          | 0/? [00:00<?, ?it/s]


[36m(RayTrainWorker pid=20540)[0m 2 | resblocks | Sequential       | 132 K 
[36m(RayTrainWorker pid=20540)[0m 3 | fc2       | Linear           | 514   
[36m(RayTrainWorker pid=20540)[0m 887 K     Trainable params
[36m(RayTrainWorker pid=20540)[0m 887 K     Total params
[36m(RayTrainWorker pid=20540)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=20540)[0m 
[36m(RayTrainWorker pid=20540)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=20540)[0m -----------------------------------------------[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=20540)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=20540)[0m 1 | fc1       | Linear           | 748 K 
[36m(RayTrainWorker pid=20540)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=20540)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=20540)[0m 3.549     Total estimated model params size (MB)


Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.17it/s]


[36m(RayTrainWorker pid=20540)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=20540)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   1%|          | 1/111 [00:00<00:03, 32.86it/s, v_num=0]
                                                                           [32m [repeated 5x across cluster][0m
Epoch 0:  10%|▉         | 11/111 [00:00<00:01, 82.77it/s, v_num=0]
Epoch 0:  21%|██        | 23/111 [00:00<00:00, 97.21it/s, v_num=0]
Epoch 0:  27%|██▋       | 30/111 [00:00<00:00, 90.19it/s, v_num=0]
Epoch 0:  32%|███▏      | 36/111 [00:00<00:00, 83.51it/s, v_num=0]
Epoch 0:  33%|███▎      | 37/111 [00:00<00:00, 82.53it/s, v_num=0]
Epoch 0:  33%|███▎      | 37/111 [00:00<00:00, 82.48it/s, v_num=0]
Epoch 0:  41%|████      | 45/111 [00:00<00:00, 82.42it/s, v_num=0]
Epoch 0:  46%|████▌     | 51/111 [00:00<00:00, 78.89it/s, v_num=0]
Epoch 0:  55%|█████▍    | 61/111 [00:00<00:00, 80.54it/s, v_num=0]
Epoch 0:  63%|██████▎   | 70/111 [00:00<00:00, 82.10it/s, v_num=0]
Epoch 0:  70%|███████   | 78/111 [00:00<00:00, 81.68it/s, v_num=0]
Epoch 0:  71%|███████   | 79/111 [00:00<00:00, 81.70it/s, v_num=0]
Epoch 0:  79%|██

[36m(RayTrainWorker pid=20540)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=20540)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00015_15_hidden_size=256,num_resblocks=2,weight_decay=0.0006_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=20952)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=20766)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=20766)[0m - (ip=172.26.79.196, pid=20952) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=20952)[0m Train : incident_cad
[36m(RayTrainWorker pid=20952)[0m dtype: int64
[36m(RayTrainWorker pid=20952)[0m val : incident_cad
[36m(RayTrainWorker pid=20952)[0m dtype: int64
[36m(RayTrainWorker pid=20952)[0m Test : incident_cad
[36m(RayTrainWorker pid=20952)[0m dtype: int64
[36m(RayTrainWorker pid=20952)[0m 1.0               833[32m [repeated 9x across cluster][0m


[36m(RayTrainWorker pid=20952)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=20952)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=20952)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=20952)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=20952)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=20952)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=20952)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=20952)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=20952)[0m 
[36m(RayTrainWorker pid=20952)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=20952)[0m -----------------------------------------------
[36m(RayTrainWorker pid=20952)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=20952)[0m 1 | fc1       | Linear           | 3.0 M 
[36m(RayTrainWorker pid=20952)[0m 2 | resblocks | Sequential       | 5.3 M 
[36m(RayTrainWorker pid=20952)[0m 3 | fc2       | Linear           | 2.0 K 
[36m(RayTrainWorker pid=20952)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=20952)[0m -----------------------------------------------
[36m(RayTrainWorker pid=20952)[0m 8.3 M     Trainable params
[36m(RayTrainWorker pid=20952)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=20952)[0m 8.3 M     Total params
[36m(RayTrainWorker pid=20952)[0m 33.037    Total estimate

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.31it/s]


[36m(RayTrainWorker pid=20952)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=20952)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   1%|          | 1/111 [00:00<00:05, 21.07it/s, v_num=0]
Epoch 0:   7%|▋         | 8/111 [00:00<00:01, 55.18it/s, v_num=0]
Epoch 0:  14%|█▍        | 16/111 [00:00<00:01, 63.66it/s, v_num=0]
                                                                           
Epoch 0:  20%|█▉        | 22/111 [00:00<00:01, 65.90it/s, v_num=0]
Epoch 0:  25%|██▌       | 28/111 [00:00<00:01, 60.32it/s, v_num=0]
Epoch 0:  32%|███▏      | 36/111 [00:00<00:01, 63.50it/s, v_num=0]
Epoch 0:  40%|███▉      | 44/111 [00:00<00:01, 65.48it/s, v_num=0]
Epoch 0:  47%|████▋     | 52/111 [00:00<00:00, 67.76it/s, v_num=0]
Epoch 0:  48%|████▊     | 53/111 [00:00<00:00, 68.03it/s, v_num=0]
Epoch 0:  48%|████▊     | 53/111 [00:00<00:00, 68.01it/s, v_num=0]
Epoch 0:  54%|█████▍    | 60/111 [00:00<00:00, 68.71it/s, v_num=0]
Epoch 0:  55%|█████▍    | 61/111 [00:00<00:00, 68.93it/s, v_num=0]
Epoch 0:  61%|██████▏   | 68/111 [00:00<00:00, 68.98it/s, v_num=0]
Epoch

[36m(RayTrainWorker pid=20952)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0: 100%|██████████| 111/111 [00:02<00:00, 44.09it/s, v_num=0, ptl/val_loss=0.751, ptl/val_auc=0.721, ptl/train_loss=0.987]


[36m(RayTrainWorker pid=20952)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00016_16_hidden_size=1024,num_resblocks=5,weight_decay=0.0093_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=21084)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=21084)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=21084)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=21084)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=21084)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=21084)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=21084)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=21084)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


                                                                           
Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
[36m(RayTrainWorker pid=20952)[0m [32m [repeated 3x across cluster][0m
Epoch 0:   5%|▌         | 6/111 [00:00<00:02, 49.73it/s, v_num=0]
Epoch 0:  14%|█▍        | 16/111 [00:00<00:01, 69.21it/s, v_num=0]
Epoch 0:  22%|██▏       | 24/111 [00:00<00:01, 72.47it/s, v_num=0]
Epoch 0:  30%|██▉       | 33/111 [00:00<00:01, 75.28it/s, v_num=0]
Epoch 0:  38%|███▊      | 42/111 [00:00<00:00, 77.43it/s, v_num=0]
Epoch 0:  45%|████▌     | 50/111 [00:00<00:00, 78.04it/s, v_num=0]
Epoch 0:  50%|████▉     | 55/111 [00:00<00:00, 73.98it/s, v_num=0]
Epoch 0:  57%|█████▋    | 63/111 [00:00<00:00, 74.07it/s, v_num=0]
Epoch 0:  63%|██████▎   | 70/111 [00:00<00:00, 73.49it/s, v_num=0]
Epoch 0:  68%|██████▊   | 76/111 [00:01<00:00, 72.14it/s, v_num=0]
Epoch 0:  75%|███████▍  | 83/111 [00:01<00:00, 71.51it/s, v_num=0]
Epoch 0:  80%|████████  | 89/111 [00:01<00:00, 70.42it/s, v_num

[36m(RayTrainWorker pid=21084)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=21084)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00017_17_hidden_size=1024,num_resblocks=5,weight_decay=0.0019_2024-04-17_10-26-56/checkpoint_000000)


Epoch 0: 100%|██████████| 111/111 [00:02<00:00, 53.57it/s, v_num=0, ptl/val_loss=1.200, ptl/val_auc=0.745, ptl/train_loss=1.150]


[36m(RayTrainWorker pid=21494)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=21313)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=21313)[0m - (ip=172.26.79.196, pid=21494) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=21494)[0m Train : incident_cad
[36m(RayTrainWorker pid=21494)[0m dtype: int64
[36m(RayTrainWorker pid=21494)[0m val : incident_cad
[36m(RayTrainWorker pid=21494)[0m dtype: int64
[36m(RayTrainWorker pid=21494)[0m Test : incident_cad
[36m(RayTrainWorker pid=21494)[0m dtype: int64
[36m(RayTrainWorker pid=21494)[0m 1.0               833[32m [repeated 9x across cluster][0m


[36m(RayTrainWorker pid=21494)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=21494)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=21494)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=21494)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=21494)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=21494)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=21494)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=21494)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=21494)[0m 
[36m(RayTrainWorker pid=21494)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=21494)[0m -----------------------------------------------
[36m(RayTrainWorker pid=21494)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=21494)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=21494)[0m 2 | resblocks | Sequential       | 791 K 
[36m(RayTrainWorker pid=21494)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=21494)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=21494)[0m -----------------------------------------------
[36m(RayTrainWorker pid=21494)[0m 2.3 M     Trainable params
[36m(RayTrainWorker pid=21494)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=21494)[0m 2.3 M     Total params
[36m(RayTrainWorker pid=21494)[0m 9.178     Total estimate

                                                                           


[36m(RayTrainWorker pid=21494)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=21494)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   1%|          | 1/111 [00:00<00:08, 12.62it/s, v_num=0]
[36m(RayTrainWorker pid=21628)[0m Train : incident_cad
[36m(RayTrainWorker pid=21628)[0m dtype: int64[32m [repeated 3x across cluster][0m
[36m(RayTrainWorker pid=21628)[0m val : incident_cad
[36m(RayTrainWorker pid=21628)[0m Test : incident_cad
[36m(RayTrainWorker pid=21628)[0m 1.0               833[32m [repeated 6x across cluster][0m
Epoch 0:   7%|▋         | 8/111 [00:00<00:02, 43.85it/s, v_num=0]
Epoch 0:  16%|█▌        | 18/111 [00:00<00:01, 61.74it/s, v_num=0]
Epoch 0:  22%|██▏       | 24/111 [00:00<00:01, 62.07it/s, v_num=0]
Epoch 0:  23%|██▎       | 25/111 [00:00<00:01, 62.38it/s, v_num=0]
Epoch 0:  28%|██▊       | 31/111 [00:00<00:01, 62.38it/s, v_num=0]
Epoch 0:  36%|███▌      | 40/111 [00:00<00:01, 66.33it/s, v_num=0]
Epoch 0:  43%|████▎     | 48/111 [00:00<00:00, 68.50it/s, v_num=0]
Epoch 0:  44%|████▍     | 49/111 [00:00<00:00, 68.32it/s, v_num=0]
Epoch 0:  47%|████▋     | 52/111 [00:00<00:00, 6



Epoch 0:  72%|███████▏  | 80/111 [00:01<00:00, 59.74it/s, v_num=0]
Epoch 0:  75%|███████▍  | 83/111 [00:01<00:00, 57.44it/s, v_num=0]




Epoch 0:  81%|████████  | 90/111 [00:01<00:00, 58.15it/s, v_num=0]
Epoch 0:  86%|████████▋ | 96/111 [00:01<00:00, 57.97it/s, v_num=0]
Epoch 0:  91%|█████████ | 101/111 [00:01<00:00, 57.50it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 59.24it/s, v_num=0]


[36m(RayTrainWorker pid=21628)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=21628)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=21628)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=21628)[0m HPU available: False, using: 0 HPUs


Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   4%|▎         | 1/28 [00:00<00:00, 34.11it/s][A
Validation DataLoader 0:   7%|▋         | 2/28 [00:00<00:00, 42.09it/s][A
Validation DataLoader 0:  11%|█         | 3/28 [00:00<00:00, 54.48it/s][A
Validation DataLoader 0:  14%|█▍        | 4/28 [00:00<00:00, 60.28it/s][A
Validation DataLoader 0:  18%|█▊        | 5/28 [00:00<00:00, 58.67it/s][A
Validation DataLoader 0:  21%|██▏       | 6/28 [00:00<00:00, 65.92it/s][A
Validation DataLoader 0:  25%|██▌       | 7/28 [00:00<00:00, 72.07it/s][A
Validation DataLoader 0:  29%|██▊       | 8/28 [00:00<00:00, 78.31it/s][A
Validation DataLoader 0:  32%|███▏      | 9/28 [00:00<00:00, 84.18it/s][A
Validation DataLoader 0:  36%|███▌      | 10/28 [00:00<00:00, 87.51it/s][A
Validation DataLoader 0:  39%|███▉      | 11/28 [00:00<00:00, 92.05it/s][A
Val

[36m(RayTrainWorker pid=21628)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=21628)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=21628)[0m [rank: 0] Seed set to 42
[36m(RayTrainWorker pid=21628)[0m Missing logger folder: /tmp/ray/session_2024-04-17_10-26-51_374774_23003/artifacts/2024-04-17_10-26-55/TorchTrainer_2024-04-17_10-26-51/working_dirs/TorchTrainer_f5695_00019_19_hidden_size=512,num_resblocks=4,weight_decay=0.0026_2024-04-17_10-2

Validation DataLoader 0:  86%|████████▌ | 24/28 [00:00<00:00, 129.33it/s][A
Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 130.69it/s][A
Validation DataLoader 0:  93%|█████████▎| 26/28 [00:00<00:00, 131.27it/s][A
Validation DataLoader 0:  96%|█████████▋| 27/28 [00:00<00:00, 132.79it/s][A
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 134.13it/s][A
Epoch 0: 100%|██████████| 111/111 [00:02<00:00, 52.14it/s, v_num=0, ptl/val_loss=0.940, ptl/val_auc=0.733]
Epoch 0: 100%|██████████| 111/111 [00:02<00:00, 50.01it/s, v_num=0, ptl/val_loss=0.940, ptl/val_auc=0.733, ptl/train_loss=0.721]


[36m(RayTrainWorker pid=21494)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00018_18_hidden_size=512,num_resblocks=3,weight_decay=0.0048_2024-04-17_10-26-56/checkpoint_000000)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=21628)[0m 2 | resblocks | Sequential       | 1.1 M 
[36m(RayTrainWorker pid=21628)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=21628)[0m 
[36m(RayTrainWorker pid=21628)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=21628)[0m -----------------------------------------------[32m [repeated 2x across cluster][0m
[36m(RayTrainWorker pid=21628)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=21628)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=21628)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=21628)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=21628)[0m 2.6 M     Trainable params
[36m(RayTrainWorker pid=21628)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=21628)[0m 2.6 M     Total params
[36m(RayTrainWorker pid=21628)[0m 10.233    Total estimated model params size (MB)


                                                                           


[36m(RayTrainWorker pid=21628)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=21628)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   2%|▏         | 2/111 [00:00<00:02, 44.78it/s, v_num=0]
[36m(RayTrainWorker pid=21494)[0m [32m [repeated 4x across cluster][0m
Epoch 0:  12%|█▏        | 13/111 [00:00<00:01, 83.99it/s, v_num=0]
Epoch 0:   3%|▎         | 3/111 [00:00<00:01, 56.45it/s, v_num=0]
Epoch 0:  21%|██        | 23/111 [00:00<00:01, 87.49it/s, v_num=0]
Epoch 0:  21%|██        | 23/111 [00:00<00:01, 87.42it/s, v_num=0]
Epoch 0:  29%|██▉       | 32/111 [00:00<00:00, 88.64it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 0:  30%|██▉       | 33/111 [00:00<00:00, 89.04it/s, v_num=0]
Epoch 0:  30%|██▉       | 33/111 [00:00<00:00, 88.99it/s, v_num=0]
Epoch 0:  41%|████▏     | 46/111 [00:00<00:00, 79.50it/s, v_num=0]
Epoch 0:  48%|████▊     | 53/111 [00:00<00:00, 78.35it/s, v_num=0]
Epoch 0:  34%|███▍      | 38/111 [00:00<00:00, 81.47it/s, v_num=0]
Epoch 0:  49%|████▊     | 54/111 [00:00<00:00, 78.36it/s, v_num=0]
Epoch 0:  56%|█████▌    | 62/111 [00:00<00:00, 78.21it/s, v_num=0]
Epoch 0:  62%|████

[36m(RayTrainWorker pid=22046)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(RayTrainWorker pid=21628)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=21628)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00019_19_hidden_size=512,num_resblocks=4,weight_decay=0.0026_2024-04-17_10-26-56/checkpoint_000000)
[36m(TorchTrainer pid=21861)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=21861)[0m - (ip=172.26.79.196, pid=22046) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=22046)[0m Train : incident_cad
[36m(RayTrainWorker pid=22046)[0m dtype: int64
[36m(RayTrainWorker pid=22046)[0m val : incident_cad
[36m(RayTrainWorker pid=22046)[0m dtype: int64
[36m(RayTrainWorker pid=22046)[0m Test : incident_cad
[36m(RayTrainWorker pid=22046)[0m dtype: int64
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 154.02it/s][A[32m [repeated 3x across cluster][0m
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 66.07it/s, v_num=0, ptl/val_loss=0.370, ptl/val_auc=0.747]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 60.94it/s, v_num=0, ptl/val_loss=0.370, ptl/val_auc=0.747, ptl/train_loss=0.783]
[36m(RayTrainWorker pid=22046)[0m 1.0               833[32m [repeated 8x across cluster][0m
Epoch 0:  86%|████████▋ | 96/111 [00:01<00:00, 79.71it/s, v_num=0][32m [repeated 3x across cluster][0m


[36m(RayTrainWorker pid=22046)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=22046)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=22046)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=22046)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=22046)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=22046)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=22046)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=22046)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22046)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


                                                                           
Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   6%|▋         | 7/111 [00:00<00:02, 42.95it/s, v_num=0]
Epoch 0:  14%|█▍        | 16/111 [00:00<00:01, 61.17it/s, v_num=0]
[36m(RayTrainWorker pid=22170)[0m Train : incident_cad
[36m(RayTrainWorker pid=22170)[0m 0.0             27242
[36m(RayTrainWorker pid=22170)[0m 1.0              1563
[36m(RayTrainWorker pid=22170)[0m dtype: int64
[36m(RayTrainWorker pid=22170)[0m val : incident_cad
[36m(RayTrainWorker pid=22170)[0m 0.0             6766
[36m(RayTrainWorker pid=22170)[0m 1.0              436
[36m(RayTrainWorker pid=22170)[0m dtype: int64
[36m(RayTrainWorker pid=22170)[0m Test : incident_cad
[36m(RayTrainWorker pid=22170)[0m 0.0             14599
[36m(RayTrainWorker pid=22170)[0m 1.0               833
[36m(RayTrainWorker pid=22170)[0m dtype: int64
Epoch 0:  15%|█▌        | 17/111 [00:00<00:01, 62.66it/s, v_num=0]
Epoch 0:  23%

[36m(RayTrainWorker pid=22046)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22046)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00020_20_hidden_size=1024,num_resblocks=1,weight_decay=0.0015_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=22046)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0: 100%|██████████| 111/111 [00:02<00:00, 47.05it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706]
Epoch 1:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706]          




Epoch 1:   5%|▌         | 6/111 [00:00<00:04, 26.08it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706, ptl/train_auc=0.758]
Epoch 1:  12%|█▏        | 13/111 [00:00<00:02, 39.12it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706, ptl/train_auc=0.758]
Epoch 1:  13%|█▎        | 14/111 [00:00<00:02, 41.06it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706, ptl/train_auc=0.758]
Epoch 1:  16%|█▌        | 18/111 [00:00<00:02, 40.44it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706, ptl/train_auc=0.758]
Epoch 1:  23%|██▎       | 26/111 [00:00<00:01, 48.00it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706, ptl/train_auc=0.758]
Epoch 1:  32%|███▏      | 35/111 [00:00<00:01, 53.93it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, ptl/train_loss=0.706, ptl/train_auc=0.758]
Epoch 1:  41%|████      | 45/111 [00:00<00:01, 59.37it/s, v_num=0, ptl/val_loss=0.794, ptl/val_auc=0.758, p

[36m(RayTrainWorker pid=22046)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00020_20_hidden_size=1024,num_resblocks=1,weight_decay=0.0015_2024-04-17_10-26-56/checkpoint_000001)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 151.12it/s][A
Epoch 1: 100%|██████████| 111/111 [00:01<00:00, 57.48it/s, v_num=0, ptl/val_loss=0.334, ptl/val_auc=0.732, ptl/train_loss=0.706, ptl/train_auc=0.758]
Epoch 1: 100%|██████████| 111/111 [00:02<00:00, 52.20it/s, v_num=0, ptl/val_loss=0.334, ptl/val_auc=0.732, ptl/train_loss=0.516, ptl/train_auc=0.758]


[36m(RayTrainWorker pid=22170)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=22170)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=22170)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=22170)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=22170)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=22170)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=22170)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
[36m(RayTrainWorker pid=22046)[0m [32m [repeated 6x across cluster][0m
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=22170)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=22170)[0m 
[36m(RayTrainWorker pid=22170)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=22170)[0m -----------------------------------------------
[36m(RayTrainWorker pid=22170)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=22170)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=22170)[0m 2 | resblocks | Sequential       | 263 K 
[36m(RayTrainWorker pid=22170)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=22170)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=22170)[0m -----------------------------------------------
[36m(RayTrainWorker pid=22170)[0m 1.8 M     Trainable params
[36m(RayTrainWorker pid=22170)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=22170)[0m 1.8 M     Total params
[36m(RayTrainWorker pid=22170)[0m 7.069     Total estimate

                                                                           


[36m(RayTrainWorker pid=22170)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22170)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   3%|▎         | 3/111 [00:00<00:02, 46.00it/s, v_num=0]
Epoch 0:  10%|▉         | 11/111 [00:00<00:01, 67.14it/s, v_num=0]
Epoch 0:  18%|█▊        | 20/111 [00:00<00:01, 74.49it/s, v_num=0]
Epoch 0:  28%|██▊       | 31/111 [00:00<00:00, 82.43it/s, v_num=0]
Epoch 0:  29%|██▉       | 32/111 [00:00<00:00, 83.48it/s, v_num=0]
Epoch 0:  39%|███▊      | 43/111 [00:00<00:00, 89.33it/s, v_num=0]
Epoch 0:  40%|███▉      | 44/111 [00:00<00:00, 89.52it/s, v_num=0]
Epoch 0:  50%|████▉     | 55/111 [00:00<00:00, 92.80it/s, v_num=0]
Epoch 0:  58%|█████▊    | 64/111 [00:00<00:00, 92.32it/s, v_num=0]
Epoch 0:  67%|██████▋   | 74/111 [00:00<00:00, 92.83it/s, v_num=0]
Epoch 0:  77%|███████▋  | 86/111 [00:00<00:00, 94.99it/s, v_num=0]
Epoch 0:  88%|████████▊ | 98/111 [00:01<00:00, 97.13it/s, v_num=0]
Epoch 0:  99%|█████████▉| 110/111 [00:01<00:00, 99.27it/s, v_num=0]
Validation: |          | 0/? [00:00<?, ?it/s][A
Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:  

[36m(RayTrainWorker pid=22170)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22170)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00021_21_hidden_size=512,num_resblocks=1,weight_decay=0.0007_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=22170)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 1:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.433, ptl/val_auc=0.750, ptl/train_loss=0.565]          
Epoch 1:   4%|▎         | 4/111 [00:00<00:04, 23.72it/s, v_num=0, ptl/val_loss=0.433, ptl/val_auc=0.750, ptl/train_loss=0.565, ptl/train_auc=0.808]
Epoch 1:  10%|▉         | 11/111 [00:00<00:02, 38.66it/s, v_num=0, ptl/val_loss=0.433, ptl/val_auc=0.750, ptl/train_loss=0.565, ptl/train_auc=0.808]
Epoch 1:  19%|█▉        | 21/111 [00:00<00:01, 53.75it/s, v_num=0, ptl/val_loss=0.433, ptl/val_auc=0.750, ptl/train_loss=0.565, ptl/train_auc=0.808]
Epoch 1:  29%|██▉       | 32/111 [00:00<00:01, 64.34it/s, v_num=0, ptl/val_loss=0.433, ptl/val_auc=0.750, ptl/train_loss=0.565, ptl/train_auc=0.808]
Epoch 1:  39%|███▊      | 43/111 [00:00<00:00, 72.35it/s, v_num=0, ptl/val_loss=0.433, ptl/val_auc=0.750, ptl/train_loss=0.565, ptl/train_auc=0.808]
Epoch 1:  47%|████▋     | 52/111 [00:00<00:00, 74.81it/s, v_num=0, ptl/val_loss=0.433, ptl/val_auc=0.750, ptl/train_loss=0.565,

[36m(RayTrainWorker pid=22170)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00021_21_hidden_size=512,num_resblocks=1,weight_decay=0.0007_2024-04-17_10-26-56/checkpoint_000001)
[36m(RayTrainWorker pid=22594)[0m Setting up process group for: env:// [rank=0, world_size=1]
[36m(TorchTrainer pid=22410)[0m Started distributed worker processes: 
[36m(TorchTrainer pid=22410)[0m - (ip=172.26.79.196, pid=22594) world_rank=0, local_rank=0, node_rank=0


[36m(RayTrainWorker pid=22594)[0m Train : incident_cad
[36m(RayTrainWorker pid=22594)[0m dtype: int64
[36m(RayTrainWorker pid=22594)[0m val : incident_cad
[36m(RayTrainWorker pid=22594)[0m dtype: int64
[36m(RayTrainWorker pid=22594)[0m Test : incident_cad
[36m(RayTrainWorker pid=22594)[0m dtype: int64
[36m(RayTrainWorker pid=22594)[0m 1.0               833[32m [repeated 11x across cluster][0m


[36m(RayTrainWorker pid=22594)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=22594)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=22594)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=22594)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=22594)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=22594)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
[36m(RayTrainWorker pid=22594)[0m [rank: 0]

Sanity Checking: |          | 0/? [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


[36m(RayTrainWorker pid=22594)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=22594)[0m 
[36m(RayTrainWorker pid=22594)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=22594)[0m -----------------------------------------------
[36m(RayTrainWorker pid=22594)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=22594)[0m 1 | fc1       | Linear           | 1.5 M 
[36m(RayTrainWorker pid=22594)[0m 2 | resblocks | Sequential       | 263 K 
[36m(RayTrainWorker pid=22594)[0m 3 | fc2       | Linear           | 1.0 K 
[36m(RayTrainWorker pid=22594)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=22594)[0m -----------------------------------------------
[36m(RayTrainWorker pid=22594)[0m 1.8 M     Trainable params
[36m(RayTrainWorker pid=22594)[0m 0         Non-trainable params
[36m(RayTrainWorker pid=22594)[0m 1.8 M     Total params
[36m(RayTrainWorker pid=22594)[0m 7.069     Total estimate

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:01<00:00,  1.52it/s]


[36m(RayTrainWorker pid=22594)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22594)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   0%|          | 0/111 [00:00<?, ?it/s] 
Epoch 0:   2%|▏         | 2/111 [00:00<00:06, 16.60it/s, v_num=0]
Epoch 0:   8%|▊         | 9/111 [00:00<00:02, 39.80it/s, v_num=0]
Epoch 0:  18%|█▊        | 20/111 [00:00<00:01, 59.38it/s, v_num=0]
                                                                           
Epoch 0:  29%|██▉       | 32/111 [00:00<00:01, 72.94it/s, v_num=0]
Epoch 0:  39%|███▊      | 43/111 [00:00<00:00, 79.92it/s, v_num=0]
Epoch 0:  40%|███▉      | 44/111 [00:00<00:00, 80.31it/s, v_num=0]
Epoch 0:  49%|████▊     | 54/111 [00:00<00:00, 83.35it/s, v_num=0]
Epoch 0:  53%|█████▎    | 59/111 [00:00<00:00, 79.27it/s, v_num=0]
Epoch 0:  54%|█████▍    | 60/111 [00:00<00:00, 78.87it/s, v_num=0]
Epoch 0:  61%|██████▏   | 68/111 [00:00<00:00, 78.74it/s, v_num=0]
Epoch 0:  62%|██████▏   | 69/111 [00:00<00:00, 79.10it/s, v_num=0]
Epoch 0:  70%|███████   | 78/111 [00:00<00:00, 79.77it/s, v_num=0]
Epoch 0:  78%|███████▊  | 87/111 [00:01<00:00, 81.18it/s, v_num=0]
Epoch

[36m(RayTrainWorker pid=22594)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22594)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00022_22_hidden_size=512,num_resblocks=1,weight_decay=0.0033_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=22594)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 1:   6%|▋         | 7/111 [00:00<00:01, 66.65it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
Epoch 1:   6%|▋         | 7/111 [00:00<00:01, 66.34it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
Epoch 1:  14%|█▍        | 16/111 [00:00<00:01, 77.50it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
Epoch 1:  20%|█▉        | 22/111 [00:00<00:01, 69.42it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
Epoch 1:  20%|█▉        | 22/111 [00:00<00:01, 69.36it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
Epoch 1:  26%|██▌       | 29/111 [00:00<00:01, 70.12it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
[36m(RayTrainWorker pid=22727)[0m Train : incident_cad
[36m(RayTrainWorker pid=22727)[0m dtype: int64
[



Epoch 1:  98%|█████████▊| 109/111 [00:01<00:00, 60.73it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
Epoch 1: 100%|██████████| 111/111 [00:01<00:00, 59.54it/s, v_num=0, ptl/val_loss=0.439, ptl/val_auc=0.757, ptl/train_loss=0.660, ptl/train_auc=0.758]
Validation: |          | 0/? [00:00<?, ?it/s][A




Validation:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   0%|          | 0/28 [00:00<?, ?it/s][A
Validation DataLoader 0:   4%|▎         | 1/28 [00:00<00:00, 47.50it/s][A
Validation DataLoader 0:   7%|▋         | 2/28 [00:00<00:00, 36.51it/s][A
Validation DataLoader 0:  11%|█         | 3/28 [00:00<00:00, 48.14it/s][A
Validation DataLoader 0:  14%|█▍        | 4/28 [00:00<00:00, 51.39it/s][A
Validation DataLoader 0:  18%|█▊        | 5/28 [00:00<00:00, 53.16it/s][A
Validation DataLoader 0:  21%|██▏       | 6/28 [00:00<00:00, 56.63it/s][A
Validation DataLoader 0:  25%|██▌       | 7/28 [00:00<00:00, 61.79it/s][A
Validation DataLoader 0:  29%|██▊       | 8/28 [00:00<00:00, 64.14it/s][A
Validation DataLoader 0:  32%|███▏      | 9/28 [00:00<00:00, 64.69it/s][A
Validation DataLoader 0:  36%|███▌      | 10/28 [00:00<00:00, 67.41it/s][A
Validation DataLoader 0:  39%|███▉      | 11/28 [00:00<00:00, 68.86it/s][A
Validation DataLoader 0:  43%|████▎     | 12/28 [00:

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


Validation DataLoader 0:  89%|████████▉ | 25/28 [00:00<00:00, 79.99it/s][A
Validation DataLoader 0:  93%|█████████▎| 26/28 [00:00<00:00, 79.96it/s][A
Validation DataLoader 0:  96%|█████████▋| 27/28 [00:00<00:00, 80.92it/s][A
Validation DataLoader 0: 100%|██████████| 28/28 [00:00<00:00, 81.31it/s][A
Epoch 1: 100%|██████████| 111/111 [00:02<00:00, 48.23it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.660, ptl/train_auc=0.758]
Epoch 2:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.558, ptl/train_auc=0.758]          


[36m(RayTrainWorker pid=22594)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00022_22_hidden_size=512,num_resblocks=1,weight_decay=0.0033_2024-04-17_10-26-56/checkpoint_000001)
[36m(RayTrainWorker pid=22727)[0m GPU available: True (cuda), used: True
[36m(RayTrainWorker pid=22727)[0m TPU available: False, using: 0 TPU cores
[36m(RayTrainWorker pid=22727)[0m IPU available: False, using: 0 IPUs
[36m(RayTrainWorker pid=22727)[0m HPU available: False, using: 0 HPUs
[36m(RayTrainWorker pid=22727)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/loops/utilities.py:73: `max_epochs` was not set. Setting it to 1000 epochs. To train without an epoch limit, set `max_epochs=-1`.
[36m(RayTrainWorker pid=22727)[0m You are using a CUDA device ('NVIDIA GeForce RTX 4060') that has Tensor Cores. To properly utilize them, you should set `t

Epoch 2:   5%|▌         | 6/111 [00:00<00:01, 60.17it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.558, ptl/train_auc=0.771]
Epoch 2:  11%|█         | 12/111 [00:00<00:01, 59.71it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.558, ptl/train_auc=0.771]
Epoch 2:  12%|█▏        | 13/111 [00:00<00:01, 60.21it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.558, ptl/train_auc=0.771]
Epoch 2:  12%|█▏        | 13/111 [00:00<00:01, 60.15it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.558, ptl/train_auc=0.771]
Epoch 2:  19%|█▉        | 21/111 [00:00<00:01, 65.17it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.558, ptl/train_auc=0.771]
Epoch 2:  26%|██▌       | 29/111 [00:00<00:01, 69.99it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, ptl/train_loss=0.558, ptl/train_auc=0.771]
Epoch 2:  32%|███▏      | 36/111 [00:00<00:01, 67.49it/s, v_num=0, ptl/val_loss=0.634, ptl/val_auc=0.760, p

[36m(RayTrainWorker pid=22594)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00022_22_hidden_size=512,num_resblocks=1,weight_decay=0.0033_2024-04-17_10-26-56/checkpoint_000002)
[36m(RayTrainWorker pid=22727)[0m LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
[36m(RayTrainWorker pid=22727)[0m 
[36m(RayTrainWorker pid=22727)[0m   | Name      | Type             | Params
[36m(RayTrainWorker pid=22727)[0m -----------------------------------------------
[36m(RayTrainWorker pid=22727)[0m 0 | norm      | LayerNorm        | 5.8 K 
[36m(RayTrainWorker pid=22727)[0m 1 | fc1       | Linear           | 748 K 
[36m(RayTrainWorker pid=22727)[0m 2 | resblocks | Sequential       | 132 K 
[36m(RayTrainWorker pid=22727)[0m 3 | fc2       | Linear           | 514   
[36m(RayTrainWorker pid=22727)[0m 4 | loss_fn   | CrossEntropyLoss | 0     
[36m(RayTrainWorker pid=22727)[0m ----------

Epoch 3:   6%|▋         | 7/111 [00:00<00:03, 32.39it/s, v_num=0, ptl/val_loss=0.711, ptl/val_auc=0.765, ptl/train_loss=0.531, ptl/train_auc=0.784]
Epoch 3:  11%|█         | 12/111 [00:00<00:02, 35.92it/s, v_num=0, ptl/val_loss=0.711, ptl/val_auc=0.765, ptl/train_loss=0.531, ptl/train_auc=0.784]
[36m(RayTrainWorker pid=22594)[0m [32m [repeated 12x across cluster][0m
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Epoch 3:  17%|█▋        | 19/111 [00:00<00:02, 42.64it/s, v_num=0, ptl/val_loss=0.711, ptl/val_auc=0.765, ptl/train_loss=0.531, ptl/train_auc=0.784]
Epoch 3:  24%|██▍       | 27/111 [00:00<00:01, 49.23it/s, v_num=0, ptl/val_loss=0.711, ptl/val_auc=0.765, ptl/train_loss=0.531, ptl/train_auc=0.784]
Epoch 3:  33%|███▎      | 37/111 [00:00<00:01, 56.77it/s, v_num=0, ptl/val_loss=0.711, ptl/val_auc=0.765, ptl/train_loss=0.531, ptl/train_auc=0.784]
Epoch 3:  41%|████      | 45/111 [00:00<00:01, 59.88it/s, v_num=0, ptl/val_loss=0.711, ptl/val_auc=0.765, ptl/tr

You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).
[36m(RayTrainWorker pid=22594)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00022_22_hidden_size=512,num_resblocks=1,weight_decay=0.0033_2024-04-17_10-26-56/checkpoint_000003)


Epoch 3: 100%|██████████| 111/111 [00:01<00:00, 57.36it/s, v_num=0, ptl/val_loss=0.530, ptl/val_auc=0.771, ptl/train_loss=0.539, ptl/train_auc=0.784]
                                                                           


[36m(RayTrainWorker pid=22727)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22727)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/val_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 0:   5%|▌         | 6/111 [00:00<00:01, 65.10it/s, v_num=0]
Epoch 0:  57%|█████▋    | 63/111 [00:00<00:00, 75.52it/s, v_num=0]
Epoch 0:  71%|███████   | 79/111 [00:01<00:00, 76.44it/s, v_num=0]
Epoch 0:  91%|█████████ | 101/111 [00:01<00:00, 74.60it/s, v_num=0]
Epoch 0:  95%|█████████▌| 106/111 [00:01<00:00, 72.22it/s, v_num=0]
Epoch 0: 100%|██████████| 111/111 [00:01<00:00, 73.06it/s, v_num=0]
Epoch 1:   0%|          | 0/111 [00:00<?, ?it/s, v_num=0, ptl/val_loss=0.602, ptl/val_auc=0.752, ptl/train_loss=0.588]          
Epoch 1:   7%|▋         | 8/111 [00:00<00:01, 71.32it/s, v_num=0, ptl/val_loss=0.602, ptl/val_auc=0.752, ptl/train_loss=0.588, ptl/train_auc=0.792]
Epoch 1:   7%|▋         | 8/111 [00:00<00:01, 71.09it/s, v_num=0, ptl/val_loss=0.602, ptl/val_auc=0.752, ptl/train_loss=0.588, ptl/train_auc=0.792]


[36m(RayTrainWorker pid=22727)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_loss', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.
[36m(RayTrainWorker pid=22727)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00023_23_hidden_size=256,num_resblocks=2,weight_decay=0.0005_2024-04-17_10-26-56/checkpoint_000000)
[36m(RayTrainWorker pid=22727)[0m /home/xutingfeng/miniforge3/envs/rapids-24.02/lib/python3.10/site-packages/pytorch_lightning/trainer/connectors/logger_connector/result.py:441: It is recommended to use `self.log('ptl/train_auc', ..., sync_dist=True)` when logging on epoch level in distributed setting to accumulate the metric across devices.


Epoch 1:  41%|████▏     | 46/111 [00:00<00:00, 87.79it/s, v_num=0, ptl/val_loss=0.602, ptl/val_auc=0.752, ptl/train_loss=0.588, ptl/train_auc=0.792]
Epoch 0:  29%|██▉       | 32/111 [00:00<00:01, 63.54it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 1:  66%|██████▌   | 73/111 [00:00<00:00, 87.45it/s, v_num=0, ptl/val_loss=0.602, ptl/val_auc=0.752, ptl/train_loss=0.588, ptl/train_auc=0.792]
Epoch 1:  69%|██████▉   | 77/111 [00:00<00:00, 82.72it/s, v_num=0, ptl/val_loss=0.602, ptl/val_auc=0.752, ptl/train_loss=0.588, ptl/train_auc=0.792]
[36m(RayTrainWorker pid=22727)[0m [32m [repeated 5x across cluster][0m
Epoch 0:  86%|████████▌ | 95/111 [00:01<00:00, 75.71it/s, v_num=0][32m [repeated 2x across cluster][0m
Epoch 1:  14%|█▍        | 16/111 [00:00<00:01, 76.70it/s, v_num=0, ptl/val_loss=0.602, ptl/val_auc=0.752, ptl/train_loss=0.588, ptl/train_auc=0.792][32m [repeated 3x across cluster][0m
Epoch 1:  92%|█████████▏| 102/111 [00:01<00:00, 80.98it/s, v_num=0, ptl/val_loss=

[36m(RayTrainWorker pid=22727)[0m Checkpoint successfully created at: Checkpoint(filesystem=local, path=/home/xutingfeng/ray_results/TorchTrainer_2024-04-17_10-26-51/TorchTrainer_f5695_00023_23_hidden_size=256,num_resblocks=2,weight_decay=0.0005_2024-04-17_10-26-56/checkpoint_000001)
You may want to consider increasing the `CheckpointConfig(num_to_keep)` or decreasing the frequency of saving checkpoints.
You can suppress this error by setting the environment variable TUNE_WARN_EXCESSIVE_EXPERIMENT_CHECKPOINT_SYNC_THRESHOLD_S to a smaller value than the current threshold (5.0).


In [38]:
results.get_dataframe().sort_values("ptl/val_auc")

Unnamed: 0,ptl/val_loss,ptl/val_auc,ptl/train_loss,epoch,step,timestamp,checkpoint_dir_name,should_checkpoint,done,training_iteration,...,config/train_loop_config/features,config/train_loop_config/output_size,config/train_loop_config/hidden_size,config/train_loop_config/lr,config/train_loop_config/weight_decay,config/train_loop_config/weight,config/train_loop_config/batch_size,config/train_loop_config/num_resblocks,ptl/train_auc,logdir
18,1.760173,0.469385,1.876581,0,111,1713250225,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,128,0.039256,0.000113,"[0.1, 10]",256,5,,b1adb_00018
2,14.133671,0.483149,4.743559,0,111,1713250064,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,256,0.019132,0.004371,"[0.1, 100]",256,5,,b1adb_00002
27,130.268478,0.493366,50.511715,0,111,1713250306,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,128,0.056077,0.008174,"[0.1, 100]",256,3,,b1adb_00027
46,0.742477,0.494961,1.494878,0,111,1713250502,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,256,0.060112,0.002438,"[1, 1]",256,3,,b1adb_00046
38,19.235968,0.497104,19.780983,0,111,1713250422,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,256,0.073539,0.001687,"[0.1, 10]",256,4,,b1adb_00038
40,2.810828,0.502995,22.12425,0,111,1713250441,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,128,0.04626,0.000291,"[0.1, 100]",256,4,,b1adb_00040
37,0.408884,0.506463,1.211488,0,111,1713250415,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,128,0.029619,0.003661,"[0.1, 10]",256,5,,b1adb_00037
26,0.747489,0.5159,1.141289,0,111,1713250301,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,64,0.079681,0.000378,"[1, 1]",256,4,,b1adb_00026
23,0.985663,0.528829,5.112461,0,111,1713250269,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,64,0.082075,0.004236,"[0.1, 100]",256,4,,b1adb_00023
22,9.694574,0.533914,9.452261,0,111,1713250263,checkpoint_000000,True,True,1,...,"[C3, KLK7, GCHFR, NHLRC3, APOD, GAPDH, TP53I3,...",2,64,0.061577,0.007046,"[0.1, 10]",256,3,,b1adb_00022


In [39]:
# results.get_best_result("ptl/val_auc")

best_result = results.get_best_result("ptl/val_auc")
best_params = best_result.config
best_result_epoch_dir = (
    best_result.get_best_checkpoint("ptl/val_auc", "max").path + "/checkpoint.ckpt"
)
best_model_state = torch.load(best_result_epoch_dir)
best_model = FullyConnectedNet(**best_params["train_loop_config"])
best_model.load_state_dict(best_model_state["state_dict"])
best_model

FullyConnectedNet(
  (norm): LayerNorm((2922,), eps=1e-05, elementwise_affine=True)
  (fc1): Linear(in_features=2922, out_features=256, bias=True)
  (resblocks): Sequential(
    (0): LinearResBlock(
      (fc1): Linear(in_features=256, out_features=256, bias=True)
      (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (1): LinearResBlock(
      (fc1): Linear(in_features=256, out_features=256, bias=True)
      (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
    (2): LinearResBlock(
      (fc1): Linear(in_features=256, out_features=256, bias=True)
      (layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    )
  )
  (fc2): Linear(in_features=256, out_features=2, bias=True)
  (loss_fn): CrossEntropyLoss()
)

In [40]:
best_params["train_loop_config"]

{'features': ['C3',
  'KLK7',
  'GCHFR',
  'NHLRC3',
  'APOD',
  'GAPDH',
  'TP53I3',
  'CPA4',
  'ANXA2',
  'GRSF1',
  'IL25',
  'HMMR',
  'MRPL52',
  'PAIP2B',
  'THAP12',
  'FOS',
  'FGF9',
  'PITHD1',
  'THSD1',
  'PTGES2',
  'DEFB103A_DEFB103B',
  'ATP1B4',
  'CYB5A',
  'UNC79',
  'SLC34A3',
  'TAGLN3',
  'SLIRP',
  'CLASP1',
  'PSMC3',
  'KIR3DL2',
  'BEX3',
  'PFDN4',
  'BCL7A',
  'SMC3',
  'SLC28A1',
  'CDC123',
  'GJA8',
  'NMRK2',
  'GATA3',
  'CPLX2',
  'RASGRF1',
  'FGF7',
  'ANKRA2',
  'RBM25',
  'LYZL2',
  'CDK1',
  'CREB3',
  'CREBZF',
  'IGLON5',
  'SHC1',
  'ZP4',
  'TMOD4',
  'CEP152',
  'MYH7B',
  'CEP350',
  'CDC25A',
  'TRIM26',
  'MANEAL',
  'MUCL3',
  'GIMAP8',
  'CYTH3',
  'PDXDC1',
  'CLINT1',
  'MAPRE3',
  'EVI2B',
  'STAU1',
  'PCNA',
  'DNAJA1',
  'JMJD1C',
  'GAGE2A',
  'GAD1',
  'IZUMO1',
  'PDCL2',
  'PDE1C',
  'STOML2',
  'BSND',
  'MAPK13',
  'PDIA2',
  'BTLA',
  'MLLT1',
  'TPRKB',
  'ARHGAP5',
  'BTNL10',
  'PHLDB2',
  'PDIA5',
  'ATF4',
  'PRAME',
  

In [41]:
test_imputed = best_model.predict_df(test_imputed)
test_imputed

input df have NA: 0


Unnamed: 0,eid,PRS,sex,height,weight,BSA,genotype_array,age,PC1,PC2,...,TGFBR3,CRTAC1,IGFBP7,SELE,VWF,NOTCH3,CNTN1,ENG,ICAM2,pred
19409,2883530.0,1.030583,1.0,171.0,64.2,1.746282,2,44.0,71.3002,-100.66700,...,-0.0087,-0.029539,0.022568,-0.027118,0.008048,0.004249,0.000619,0.001707,-0.026825,0.307764
19272,2867444.0,2.192278,0.0,165.0,55.8,1.599219,2,53.0,-12.4815,3.16181,...,0.1859,0.291950,0.147400,-0.120500,0.597300,0.115700,0.243300,0.127800,0.063400,0.553233
49865,5869793.0,0.653794,1.0,171.0,77.3,1.916181,2,62.0,-11.4721,2.20519,...,0.0516,0.369750,-0.155300,0.035500,-0.276700,-0.043900,0.195500,-0.111000,-0.990800,0.469619
39664,4880838.0,0.664819,0.0,163.0,84.4,1.954852,2,62.0,-11.1640,3.66252,...,-0.0127,0.393200,0.174500,0.035700,0.873200,0.236600,0.114200,0.134400,0.008700,0.095878
30555,3987428.0,0.826465,0.0,164.0,73.1,1.824859,1,66.0,-11.4666,2.77498,...,-0.5216,0.005050,-0.160200,0.181900,1.026700,-0.062150,-0.094500,-0.032700,0.213200,0.454483
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43327,5241912.0,1.085083,1.0,176.0,116.0,2.381409,2,45.0,-10.8083,4.46241,...,0.2348,-0.919950,0.803300,0.131600,0.481500,0.279800,-0.226200,0.262600,0.239400,0.859506
29129,3851862.0,1.294348,0.0,169.0,72.9,1.849932,2,40.0,-12.6549,3.40064,...,-0.3290,-0.251250,-0.787400,-0.919000,0.212700,-0.617800,0.123900,-0.124100,-0.940500,0.082682
1550,1144512.0,0.722791,1.0,191.0,96.6,2.263883,1,59.0,-12.7237,1.46547,...,0.1043,-0.284750,0.350300,1.608600,-0.341300,0.134100,-0.012000,0.226700,0.135200,0.406253
1888,1177099.0,1.335307,1.0,175.0,75.1,1.910679,2,63.0,-15.1573,7.36690,...,0.2172,0.172250,0.431300,0.121750,-0.754900,0.530700,0.244000,-0.018900,-0.053400,0.468948


In [1]:
cal_binary_metrics(test_imputed["incident_cad"], test_imputed["pred"])

NameError: name 'cal_binary_metrics' is not defined

In [17]:
import pandas as pd
from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
    f1_score,
    roc_curve,
    precision_recall_curve,
    auc,
)


from tqdm.rich import tqdm
import numpy as np

from statsmodels.stats.multitest import multipletests


import statsmodels.api as sm

from sklearn.metrics import (
    roc_auc_score,
    accuracy_score,
)
from scipy.stats import pearsonr, spearmanr


def generate_multipletests_result(df, pvalue_col="pvalue", alpha=0.05, method="fdr_bh"):
    df = df.copy()
    pvalue_series = df[pvalue_col]
    reject, pvals_corrected, _, _ = multipletests(
        pvalue_series, alpha=alpha, method="fdr_bh"
    )
    df["pval_corrected"] = pvals_corrected
    df["reject"] = reject
    return df


def find_best_cutoff(fpr, tpr, thresholds):
    diff = tpr - fpr
    Youden_index = np.argmax(diff)
    optimal_threshold = thresholds[Youden_index]
    optimal_FPR, optimal_TPR = fpr[Youden_index], tpr[Youden_index]
    return optimal_threshold, optimal_FPR, optimal_TPR


def cal_binary_metrics(y, y_pred):
    fpr, tpr, thresholds = roc_curve(y, y_pred)
    AUC = roc_auc_score(y, y_pred)
    # by best youden

    optim_threshold, optim_fpr, optim_tpr = find_best_cutoff(fpr, tpr, thresholds)
    y_pred_binary = (y_pred > optim_threshold).astype(int)
    ACC = accuracy_score(y, y_pred_binary)
    macro_f1 = f1_score(y, y_pred_binary, average="macro")
    sensitivity = optim_tpr
    specificity = 1 - optim_fpr
    precision, recall, _ = precision_recall_curve(y, y_pred)
    APR = auc(recall, precision)

    return {
        "AUC": AUC,
        "ACC": ACC,
        "Macro_F1": macro_f1,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "APR": APR,
    }


import torch.nn as nn
import torch.optim as optim
from tqdm.rich import tqdm

# 定义神经网络模型


class LinearResBlock(nn.Module):
    def __init__(self, input_size, output_size):
        super(LinearResBlock, self).__init__()
        self.fc1 = nn.Linear(input_size, output_size)
        self.batch_norm = nn.LayerNorm(output_size)

        torch.nn.init.kaiming_normal_(self.fc1.weight, nonlinearity="relu")  # <6>

        torch.nn.init.constant_(self.batch_norm.weight, 0.5)  # <7>
        torch.nn.init.zeros_(self.batch_norm.bias)

    def forward(self, x):
        out = self.fc1(x)

        out = self.batch_norm(out)
        out = torch.relu(out)
        return out + x


class FullyConnectedNet(nn.Module):
    def __init__(self, input_size, hidden_size, output_size, num_resblocks=3):
        super(FullyConnectedNet, self).__init__()
        self.norm = nn.LayerNorm(input_size)
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.resblocks = nn.Sequential(
            *[LinearResBlock(hidden_size, hidden_size) for _ in range(num_resblocks)]
        )
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = self.norm(x)
        out = torch.relu(self.fc1(x))
        out = self.resblocks(out)
        out = self.fc2(out)
        return out


# 定义训练函数
def train(model, dataset, criterion, optimizer, num_epochs):
    train_loader = dataset.train_dataloader()
    val_loader = dataset.test_dataloader()
    for epoch in range(num_epochs):
        running_loss = 0.0
        auroc = torchmetrics.AUROC(num_classes=2, task="multiclass")
        for inputs, labels in tqdm(
            train_loader, desc=f"Epoch {epoch+1}/{num_epochs}", total=len(train_loader)
        ):
            inputs, labels = inputs.to("cuda:0"), labels.to("cuda:0")
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels.squeeze(-1).float())
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            auroc.update(torch.softmax(outputs, dim=-1), torch.argmax(labels, dim=1))
        auc = auroc.compute()
        print(
            f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss/len(train_loader)}, AUC: {auc}"
        )
        if epoch % 1 == 0:
            test_auc = torchmetrics.AUROC(num_classes=2, task="multiclass")
            for inputs, labels in val_loader:
                inputs, labels = inputs.to("cuda:0"), labels.to("cuda:0")
                outputs = model(inputs)
                test_auc.update(
                    torch.softmax(outputs, dim=-1), torch.argmax(labels, dim=1)
                )
            print(f"Epoch {epoch+1}/{num_epochs}, Test AUC: {test_auc.compute()}")
    # test_auc = torchmetrics.AUROC(num_classes=2, task="multiclass")
    # for inputs, labels in dataset.test_dataloader():
    #     inputs, labels = inputs.to("cuda:0"), labels.to("cuda:0")
    #     outputs = model(inputs)
    #     test_auc.update(torch.softmax(outputs, dim=-1), torch.argmax(labels, dim=1))
    # print(f"Test AUC: {test_auc.compute()}")

In [None]:
from pytorch_lightning import Trainer, seed_everything

seed_everything(42)

In [None]:
# 设置训练参数
input_size = len(proteomics)  # 输入特征维度
hidden_size = 512  # 隐藏层维度
output_size = 2  # 输出类别数
learning_rate = 5e-4
batch_size = 256
num_epochs = 5


# 创建模型实例
best_model = FullyConnectedNet(
    input_size=len(proteomics), hidden_size=256, output_size=2, num_resblocks=6
)
best_model.to("cuda:0")
# 定义损失函数和优化器
criterion = nn.CrossEntropyLoss(weight=torch.Tensor([0.1, 100]).to("cuda:0"))
optimizer = optim.NAdam(best_model.parameters(), lr=learning_rate, weight_decay=5e-3)


# 开始训练
train(best_model, dataset, criterion, optimizer, num_epochs)

In [None]:
y_pred_list = []
y_list = []
AUC = torchmetrics.AUROC(num_classes=2, task="multiclass")

best_model.eval()
for x, y in dataset.test_dataloader():
    y_pred = best_model(x.to("cuda:0")).cpu().detach()
    y_pred_list.append(y_pred)
    y_list.append(y)
    AUC.update(torch.softmax(y_pred, dim=-1), torch.argmax(y, dim=1))

AUC_values = AUC.compute()
print(f"AUC: {AUC_values}")

In [None]:
y_pred = torch.softmax(torch.cat(y_pred_list), dim=-1)[:, 1].numpy()
y_true = torch.argmax(torch.cat(y_list), dim=1).numpy()

test_df = pd.DataFrame(
    {
        "y_pred": y_pred,
        "y_true": y_true,
    }
)

cal_binary_metrics(y_true, y_pred)

In [None]:
y_true

In [None]:
cal_binary_metrics(test_df["y_true"], test_df["y_pred"])