In [1]:
#!pip3 install -q  neptune

In [2]:
import os; os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import random
import torch
import numpy as np


def setup_reproducibility(seed):
    """Seed the Python, NumPy and PyTorch RNGs and set the cuDNN/matmul flags.

    Args:
        seed: Integer seed handed to ``random``, ``numpy.random`` and ``torch``
            (plus the current CUDA device when CUDA is available).

    Side effects:
        * cuDNN is switched to deterministic mode with benchmarking disabled.
        * ``torch.use_deterministic_algorithms`` is called with ``False``, i.e.
          the global deterministic-algorithms mode is left *off*.
        * float32 matmul precision is set to ``"high"``.
    """
    # Same seed for every generator the notebook draws from.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NOTE(review): mode=False disables the strict determinism checks, so
    # `warn_only` has nothing to act on here - confirm this is intended.
    torch.use_deterministic_algorithms(False, warn_only=True)
    torch.set_float32_matmul_precision("high")
    
# Notebook-wide seed; applied to all RNGs right away.
SEED = 7031
setup_reproducibility(SEED)

In [3]:
from collections import OrderedDict
from transformers import get_cosine_schedule_with_warmup
from scipy import signal
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from huggingface_hub import login, snapshot_download
from tqdm.auto import tqdm


def rest(t=4000):
    """Block the calling thread by sleeping in a loop with growing pauses.

    Iteration ``i`` (``i = 0 .. t - 1``) sleeps for ``i`` seconds, so the total
    wait grows quadratically with ``t``.

    Args:
        t: Number of sleep iterations. Previously this argument was ignored and
            the loop always ran 4000 times; the default keeps that behaviour.

    Returns:
        None.
    """
    import time

    # NOTE(review): each pause lasts `i` seconds (not a fixed interval) -
    # kept as in the original; confirm that the growing pause is intended.
    for i in range(t):
        time.sleep(i)
        

def average_state_dicts(state_dict_list):
    """Average several state dicts entry by entry.

    Args:
        state_dict_list: Non-empty sequence of mappings with identical keys
            whose values are tensors. The inputs are not modified.

    Returns:
        An ``OrderedDict`` with the keys of the first mapping (in its order).
        Each value is the float32 mean over all mappings, cast back to the
        dtype of the first mapping's entry.
    """
    count = len(state_dict_list)
    reference = state_dict_list[0]
    averaged = OrderedDict()

    for key in reference:
        # Accumulate in float32 so integer entries can be summed as well.
        total = 0
        for state in state_dict_list:
            total = total + state[key].float()
        averaged[key] = (total / count).to(dtype=reference[key].dtype)

    return averaged


def cuda_to_np(tensor):
    """Return the tensor's values as a NumPy array (moved to CPU, detached)."""
    host_tensor = tensor.cpu()
    detached = host_tensor.detach()
    return detached.numpy()


def get_scheduler(optimizer, train_dl, epochs):
    """Build a cosine learning-rate schedule with a 5% warm-up phase.

    Args:
        optimizer: Optimizer the schedule is attached to.
        train_dl: Sized object; ``len(train_dl) * epochs`` is used as the total
            number of training steps.
        epochs: Number of epochs.

    Returns:
        The scheduler created by ``get_cosine_schedule_with_warmup``.
    """
    steps_per_epoch = len(train_dl)
    num_steps = steps_per_epoch * epochs
    # The first 5% of all steps are warm-up steps.
    num_warmup = int(num_steps * 0.05)

    return get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=num_warmup,
        num_training_steps=num_steps,
    )


def get_stats(tensor, p=True, r=False, minmax=False):
    """Print and/or return summary statistics of ``tensor``.

    Args:
        tensor: Object exposing ``mean()``, ``std()`` and, when ``minmax`` is
            set, ``min()`` and ``max()``.
        p: Print the statistics when true.
        r: Return the statistics when true; otherwise ``None`` is returned.
        minmax: Also report minimum and maximum.

    Returns:
        ``(mean, std)`` or ``(min, max, mean, std)`` when ``r`` is true,
        otherwise ``None``.
    """
    if not minmax:
        avg, spread = tensor.mean(), tensor.std()
        if p:
            print(f"Mean: {avg}, Std: {spread}")
        return (avg, spread) if r else None

    lowest, highest = tensor.min(), tensor.max()
    avg, spread = tensor.mean(), tensor.std()
    if p:
        print(f"Min: {lowest}, Max: {highest} ,Mean: {avg}, Std: {spread}")
    return (lowest, highest, avg, spread) if r else None
    
    
def zscore(tensor, mean=None, std=None):
    """Standardise ``tensor`` as ``(tensor - mean) / (std + 1e-8)``.

    Args:
        tensor: Array/tensor supporting arithmetic, ``mean()`` and ``std()``.
        mean: Centre to subtract; computed from ``tensor`` when ``None``.
        std: Scale to divide by; computed from ``tensor`` when ``None``.

    Returns:
        The standardised values (same kind of object as the arithmetic yields).
    """
    center = tensor.mean() if mean is None else mean
    scale = tensor.std() if std is None else std
    # The small constant guards against division by zero.
    return (tensor - center) / (scale + 1e-8)


def get_model_size(model):
    """Print the number of parameters of ``model`` in millions."""
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    print(total_params / 1e6)
    

def get_index(iterable):
    """Draw one uniformly random valid position of a sized container."""
    last_position = len(iterable) - 1
    return random.randint(0, last_position)


def get_indices(iterable, n):
    """Draw ``n`` distinct random positions of a sized container."""
    positions = range(len(iterable))
    return random.sample(positions, n)


def split(inputs, targets, seed):
    """Shuffle and split inputs/targets 80/20 with a fixed random state.

    Args:
        inputs: Samples passed to ``train_test_split``.
        targets: Targets aligned with ``inputs``.
        seed: Value used as ``random_state``.

    Returns:
        Whatever ``train_test_split(inputs, targets, ...)`` returns.
    """
    split_options = dict(test_size=0.2, shuffle=True, random_state=seed)
    return train_test_split(inputs, targets, **split_options)


def show_waves(waves, dpi=100):
    """Plot every row of ``waves`` in its own wide figure.

    Args:
        waves: 2-D array; one figure is created per row, plotted against the
            sample index ``0 .. waves.shape[1] - 1``.
        dpi: Resolution of each figure.
    """
    sample_axis = np.arange(waves.shape[1])

    for row in range(waves.shape[0]):
        # 14 x 4 inches: wide and flat so each trace spans the output width.
        fig, ax = plt.subplots(figsize=(14, 4), dpi=dpi)
        ax.plot(sample_axis, waves[row], linewidth=1)
        ax.set(title=f"Wave {row + 1}", xlabel="Sample", ylabel="Amplitude")
        ax.grid(True)
        fig.tight_layout()

    plt.show()
    
    
def hf_ds_download(hf_token, repo_id):
    """Log in to the Hugging Face Hub and download a dataset snapshot.

    Args:
        hf_token: Token string; its first character is dropped before login.
        repo_id: Dataset repository to download.

    Returns:
        The value returned by ``snapshot_download`` for the dataset repo.
    """
    # NOTE(review): presumably callers pass the token with a one-character
    # prefix that must be stripped - confirm against the call site.
    token = hf_token[1:]
    login(token)
    return snapshot_download(repo_id, repo_type="dataset")


def get_spectra_features(X, b=False):
    """Baseline-correct spectra and stack Savitzky-Golay derivatives as channels.

    Each row of ``X`` has a cubic polynomial (fitted over the sample index)
    subtracted. First and second derivatives of the corrected rows are then
    computed with an 11-point, third-order Savitzky-Golay filter.

    Args:
        X: 2-D array, one spectrum per row.
        b: When true, the corrected spectra are included as the first channel.

    Returns:
        Array stacked along axis 1: ``[corrected, d1, d2]`` if ``b`` else
        ``[d1, d2]``.
    """
    positions = np.arange(X.shape[1])
    X_processed = np.zeros_like(X)

    # Baseline correction only; no per-spectrum SNV scaling is applied.
    for row in tqdm(range(X.shape[0])):
        coefficients = np.polyfit(positions, X[row], 3)
        X_processed[row] = X[row] - np.polyval(coefficients, positions)

    savgol_options = dict(window_length=11, polyorder=3, axis=1)
    deriv1 = signal.savgol_filter(X_processed, deriv=1, **savgol_options)
    deriv2 = signal.savgol_filter(X_processed, deriv=2, **savgol_options)

    channels = [X_processed, deriv1, deriv2] if b else [deriv1, deriv2]
    return np.stack(channels, axis=1)

In [4]:
import os

# Competition input directory; list its files together with their positions.
path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
files = os.listdir(path)
list(enumerate(files))

[(0, 'sample_submission.csv'),
 (1, 'timegate.csv'),
 (2, 'mettler_toledo.csv'),
 (3, 'kaiser.csv'),
 (4, 'anton_532.csv'),
 (5, 'transfer_plate.csv'),
 (6, '96_samples.csv'),
 (7, 'tornado.csv'),
 (8, 'tec5.csv'),
 (9, 'metrohm.csv'),
 (10, 'anton_785.csv')]

In [5]:
import pandas as pd


# Names of the per-instrument CSV files (without the ".csv" suffix) that are
# loaded and merged by the functions below.
dataset_names = ['anton_532', 'anton_785', 'kaiser', 'mettler_toledo', 'metrohm', 'tec5', 'timegate', 'tornado']

# Per-dataset lower wavenumber limit. The loaders never select spectral
# columns below this value for the given dataset.
lower_bounds = {
    'anton_532': 200,
    'anton_785': 100,
    'kaiser': -37,
    'mettler_toledo': 300,
    'metrohm': 200,
    'tec5': 85,
    'timegate': 200,
    'tornado': 300,
}


# Per-dataset upper wavenumber limit (counterpart of `lower_bounds`).
upper_bounds = {
    'anton_532': 3500,
    'anton_785': 2300,
    'kaiser': 1942,
    'mettler_toledo': 3350,
    'metrohm': 3350,
    'tec5': 3210,
    'timegate': 2000,
    'tornado': 3300,
}

def get_csv_dataset(
    dataset_name,
    lower_wn=-1000,
    upper_wn=10000,
    dtype=None,
):
    """Load one dataset CSV and cut its spectra to a wavenumber window.

    The file ``<DATA_PATH>/<dataset_name>.csv`` is read. All columns except the
    last five are treated as spectral columns whose header is the wavenumber;
    of the last five, the first four are labels and the final one is the
    cross-validation fold index.

    Args:
        dataset_name: Key into ``lower_bounds`` / ``upper_bounds`` and file stem.
        lower_wn: Requested lower wavenumber; raised to the dataset's bound.
        upper_wn: Requested upper wavenumber; lowered to the dataset's bound.
        dtype: NumPy dtype of the returned arrays (``np.float64`` if falsy).

    Returns:
        ``(spectra, labels, None, cv_folds, wavenumbers)`` where ``cv_folds``
        is a list of ``(train_indices, validation_indices)`` pairs, one per
        fold index in ``range(number of distinct fold values)``.
    """
    dtype = dtype or np.float64

    # Clamp the requested window to the range configured for this dataset.
    wn_low = max(lower_wn, lower_bounds[dataset_name])
    wn_high = min(upper_wn, upper_bounds[dataset_name])

    csv_path = os.path.join(DATA_PATH, '%s.csv' % dataset_name)
    df = pd.read_csv(csv_path)

    # Parse the spectral column headers once and reuse them below.
    all_wavenumbers = np.array([float(header) for header in df.columns[:-5]])
    in_window = np.logical_and(
        wn_low <= all_wavenumbers,
        all_wavenumbers <= wn_high,
    )

    spectra = df.iloc[:, :-5].iloc[:, in_window].values
    label = df.iloc[:, -5:-1].values
    cv_indices = df.iloc[:, -1].values
    all_indices = np.array(range(len(cv_indices)))

    cv_folds = []
    for fold_idx in range(len(set(cv_indices))):
        train_part = all_indices[cv_indices != fold_idx]
        validation_part = all_indices[cv_indices == fold_idx]
        cv_folds.append((train_part, validation_part))

    wavenumbers = all_wavenumbers[in_window]

    return (
        spectra.astype(dtype),
        label.astype(dtype),
        None,
        cv_folds,
        wavenumbers.astype(dtype),
    )

def load_joint_dataset(
    dataset_names,
    lower_wn=-1000,
    upper_wn=10000,
    dtype=None,
    leave_out_one_device=False,
):
    """Load several datasets, resample them onto one grid and build CV folds.

    Steps performed:
      1. The wavenumber window is narrowed to the intersection of the
         per-dataset bounds of all ``dataset_names``.
      2. Every dataset is loaded with ``get_csv_dataset`` for that window.
      3. Every spectrum is linearly interpolated (``np.interp``) onto the
         integer grid ``lower_wn .. upper_wn`` (inclusive).
      4. Each dataset is divided by its own global maximum and the datasets
         are concatenated along axis 0.
      5. Cross-validation folds over the concatenated rows are built.

    Args:
        dataset_names: Dataset keys (see ``lower_bounds`` / ``upper_bounds``).
        lower_wn: Requested lower wavenumber bound.
        upper_wn: Requested upper wavenumber bound.
        dtype: NumPy dtype forwarded to ``get_csv_dataset`` (``np.float64`` if
            falsy).
        leave_out_one_device: If true, one fold per dataset whose validation
            part is that dataset's full row range. Otherwise every per-dataset
            fold from the CSVs is used, shifted by the dataset's row offset.

    Returns:
        ``(spectra, labels, cv_folds, offsets)``: the normalised spectra, the
        first three label columns, a list of ``(train_idx, val_idx)`` pairs and
        the cumulative row offsets ``[0, n_0, n_0 + n_1, ...]``.
    """

    dtype = dtype or np.float64

    # Intersect the requested window with every dataset's supported range.
    lower_wn = max(
        lower_wn,
        *[lower_bounds[name] for name in dataset_names])
    upper_wn = min(
        upper_wn,
        *[upper_bounds[name] for name in dataset_names]
    )

    print("Lower WN: ", lower_wn)
    print("Upper WN: ", upper_wn)

    datasets = [
        get_csv_dataset(
            dataset_name,
            lower_wn=lower_wn,
            upper_wn=upper_wn,
            dtype=dtype,
        )
        for dataset_name in dataset_names
    ]

    # Common grid with a spacing of one wavenumber unit.
    joint_wns = np.arange(lower_wn, upper_wn + 1)
    print("Joint WNS: ", joint_wns)

    # NOTE(review): np.interp expects `xp` to be increasing - this assumes the
    # CSV spectral columns are ordered by ascending wavenumber; confirm.
    interpolated_data = [
        np.array([
            np.interp(
                joint_wns,
                xp=wns,
                fp=spectrum,
            )
            for spectrum in spectra
        ])
        for spectra, _, _, _, wns in datasets
    ]

    # One scale factor per dataset: its single largest interpolated value.
    normed_spectra = np.concatenate(
        [
            spectra / np.max(spectra)
            for spectra in interpolated_data
        ],
        axis=0,
    )

    # Row index at which each dataset starts in the concatenated array.
    dataset_offsets = np.concatenate(
        [
            [0],
            np.cumsum([len(one[0]) for one in datasets])[:-1]
        ]
    )

    num_items = sum((len(one[0]) for one in datasets))
    if leave_out_one_device:
        # One validation fold per dataset: rows [start, end) of that dataset.
        val_indices = [
            np.arange(start, end, 1)
            for start, end in zip(
                dataset_offsets,
                np.concatenate([dataset_offsets[1:], np.array([num_items])])
            )
        ]
    else:
        # Reuse each dataset's own folds, shifted into joint row numbering.
        val_indices = [
            val_idxs + offset
            for one, offset in zip(datasets, dataset_offsets)
            for train_idxs, val_idxs in one[3]
        ]

    all_indices = set(range(num_items))

    # Training part = every row that is not in the validation part. The set
    # difference is converted without sorting, so the order of the training
    # indices is whatever iteration order the set yields.
    cv_folds = [
        (np.array(list(all_indices - set(val_idxs))), val_idxs)
        for val_idxs in val_indices
    ]
    return (
        normed_spectra,
        # Only the first three label columns are kept.
        np.concatenate([one[1] for one in datasets])[:, :3],
        cv_folds,
        # Unlike the local `dataset_offsets`, this includes the final total.
        np.concatenate(
            [
                [0],
                np.cumsum([len(one[0]) for one in datasets])
            ]
        ),
    )

In [6]:
# Directory the CSV loader reads from (get_csv_dataset looks up DATA_PATH).
DATA_PATH = path
# Load all instruments listed in `dataset_names` as one joint dataset.
inputs, targets, cv_folds, dataset_offsets = load_joint_dataset(dataset_names)
# Cell output: shapes of the joint spectra and target arrays.
inputs.shape, targets.shape

Lower WN:  300
Upper WN:  1942
Joint WNS:  [ 300  301  302 ... 1940 1941 1942]


((2261, 1643), (2261, 3))

In [7]:
import random
import torch
from torch.utils.data import Dataset
import scipy.optimize


# Lookup from a torch floating dtype to the NumPy dtype used for the
# NumPy-side position grid (`SpectralDataset.dummy_wns`).
np_dtype_from_torch = {
    torch.float32: np.float32,
    torch.float64: np.float64,
}


class SpectralDataset(Dataset):
    """Spectra/concentration dataset with on-the-fly augmentation.

    Items are ``(spectrum, concentrations, weight)`` triples. Spectra and
    concentrations are standardised on construction; augmentation (linear
    slope/intercept change, multiplicative cosine baseline, small shift along
    the spectral axis, optional mixing of two random items) is applied on
    every ``__getitem__`` call.

    Args:
        spectra: Array-like convertible by ``torch.tensor``; indexed as a 3-D
            array ``(items, rows, points)`` (``shape[1]`` and ``shape[2]`` are
            used to build the position grid).
        concentrations: 2-D array-like ``(items, targets)``; NaN entries are
            ignored when computing the normalisation statistics.
        dtype: Torch dtype of the stored tensors; ``torch.float32`` if falsy.
        spectra_mean_std: Optional ``(mean, std)`` pair to normalise the
            spectra with; computed from ``spectra`` when ``None``.
        concentration_mean_std: Optional ``(means, stds)`` pair for the
            concentrations; computed per column when ``None`` (stds are
            floored at ``1e-3``).
        combine_spectra_range: If below ``1e-12`` items are returned one by
            one; otherwise every item is a weighted mix of two random items
            and this value is the upper bound of the random mixing weight.
        baseline_factor_bound: Upper bound of the random cosine amplitude.
        baseline_period_lower_bound: Lower bound of the random cosine period.
        baseline_period_upper_bound: Upper bound of the random cosine period.
        augment_slope_std: If positive, enables the slope/intercept
            augmentation; used as gamma scale (shape is its reciprocal).
        augment_intersept_std: Relative standard deviation of the intercept
            perturbation (parameter name kept as spelled for compatibility).
        rolling_bound: Maximum absolute shift (in points) applied per item.
        spectrum_rolling_sigma: Scale of the noise used to fill shifted-in
            points.
        augmentation_weight: Sample weight returned for mixed items.
        original_datapoint_weight: Probability of returning an unmixed item
            in the mixing branch.
    """

    def __init__(
        self,
        spectra,
        concentrations,
        dtype=None,
        spectra_mean_std=None,
        concentration_mean_std=None,
        combine_spectra_range=0.0,
        baseline_factor_bound=0.0,
        baseline_period_lower_bound=100.0,
        baseline_period_upper_bound=200.0,
        augment_slope_std=0.0,
        augment_intersept_std=0.0,
        rolling_bound=0,
        spectrum_rolling_sigma=0.0,
        augmentation_weight=0.1,
        original_datapoint_weight=1.,
    ):
        self.dtype = dtype or torch.float32
        self.combine_spectra_range = combine_spectra_range
        self.baseline_factor_bound = baseline_factor_bound
        self.augment_slope_std = augment_slope_std
        self.augment_intercept_std = augment_intersept_std
        self.baseline_period_lower_bound = baseline_period_lower_bound
        self.baseline_period_upper_bound = baseline_period_upper_bound
        self.rolling_bound = rolling_bound
        self.spectrum_rolling_sigma = spectrum_rolling_sigma
        # Use the resolved dtype (self.dtype), not the raw argument: with
        # dtype=None the raw argument would let torch infer float64 from
        # NumPy input while the rest of the dataset is float32.
        self.augmentation_weight = torch.tensor(
            augmentation_weight, dtype=self.dtype
        )
        self.original_dp_weight = original_datapoint_weight

        # normalize spectra
        spectra = torch.tensor(spectra, dtype=self.dtype)

        if spectra_mean_std is None:
            self.s_mean = torch.mean(spectra)
            self.s_std = torch.std(spectra)
        else:
            self.s_mean, self.s_std = spectra_mean_std

        self.spectra = torch.divide(
            torch.subtract(spectra, self.s_mean),
            self.s_std,
        )

        # Evenly spaced positions in [0, 1), one per spectral point, repeated
        # for every row of a single item. Kept as a NumPy array.
        self.dummy_wns = np.tile(
            np.arange(
                0., 1., 1. / self.spectra.shape[2],
                dtype=np_dtype_from_torch[self.dtype]
            )[None, :self.spectra.shape[2]],
            (self.spectra.shape[1], 1),
        )

        # normalize concentrations
        concentrations = torch.tensor(concentrations, dtype=self.dtype)
        if concentration_mean_std is None:
            self.concentration_means = torch.nanmean(concentrations, dim=0)

            # Per-column std over the non-NaN entries, floored at 1e-3.
            self.concentration_stds = torch.maximum(
                torch.tensor(
                    [
                        torch.std(col[torch.logical_not(torch.isnan(col))])
                        for col in concentrations.T
                    ]
                ),
                torch.tensor([1e-3] * concentrations.shape[1]),
            )
        else:
            self.concentration_means = concentration_mean_std[0]
            self.concentration_stds = concentration_mean_std[1]

        self.concentrations = torch.divide(
            torch.subtract(
                concentrations,
                self.concentration_means,
            ),
            self.concentration_stds,
        )

    def pick_two(self, max_idx=None):
        """Return two random indices (with replacement) below ``max_idx``.

        ``max_idx`` defaults to the dataset length when falsy.
        """
        max_idx = max_idx or len(self)
        return random.choices(range(max_idx), k=2)

    def __len__(self):
        """Number of items (rows of the concentration tensor)."""
        return len(self.concentrations)

    def augment_spectra(self, spectra):
        """Apply the random slope, baseline and shift augmentations.

        Args:
            spectra: Tensor holding one item (rows x points). It is not
                modified; all changes are made on copies.

        Returns:
            The augmented item as returned by ``roll_spectrum``.
        """
        if self.augment_slope_std > 0.0:

            def spectrum_approximation(x, slope, intercept):
                return (slope * x + intercept).reshape(-1, 1)[:, 0]

            # Least-squares line through the item over the position grid.
            slope, inter = scipy.optimize.curve_fit(
                spectrum_approximation,
                self.dummy_wns,
                spectra.reshape(-1, 1)[:, 0],
                p0=np.random.rand(2),
            )[0]

            # Gamma factor with shape 1/std and scale std has mean 1.
            new_slope = slope * (
                    np.random.gamma(
                        shape=1. / self.augment_slope_std,
                        scale=self.augment_slope_std,
                        size=1,
                    )
            )[0]
            new_intercept = inter * (
                1.0 + np.random.randn(1) * self.augment_intercept_std
            )[0]
            # Work on a copy: `spectra` may be a view into self.spectra (the
            # non-mixing branch of __getitem__), and the in-place update below
            # would otherwise permanently alter the stored data.
            spectra = spectra.clone()
            spectra += torch.tensor(
                (new_slope - slope)
            ) * self.dummy_wns + new_intercept - inter

        # Multiplicative cosine baseline with random amplitude, phase, period.
        factor = self.baseline_factor_bound * torch.rand(size=(1,))
        offset = torch.rand(size=(1,)) * 2.0 * torch.pi
        period = self.baseline_period_lower_bound + (
            self.baseline_period_upper_bound - self.baseline_period_lower_bound
        ) * torch.rand(size=(1,))
        permutations = factor * torch.cos(
            2.0 * torch.pi / period * self.dummy_wns + offset
        )
        return self.roll_spectrum(
            spectra + permutations * spectra,
            delta=random.randint(-self.rolling_bound, self.rolling_bound),
        )

    def roll_spectrum(self, spectra, delta):
        """Shift ``spectra`` by ``delta`` points along axis 1.

        The points that wrap around are overwritten with the adjacent edge
        value times ``1 + U[0, 1) * spectrum_rolling_sigma``.

        Returns:
            The result of ``np.roll`` with the wrapped points replaced.
        """
        num_spectra = spectra.shape[0]
        rolled_spectra = np.roll(spectra, delta, axis=1)
        if delta > 0:
            rolled_spectra[:, :delta] = (
                np.random.rand(num_spectra, delta) * self.spectrum_rolling_sigma + 1
            ) * rolled_spectra[:, delta:(delta + 1)]
        elif delta < 0:
            rolled_spectra[:, delta:] = (
                np.random.rand(num_spectra, -delta) * self.spectrum_rolling_sigma + 1
            ) * rolled_spectra[:, delta - 1:delta]
        return rolled_spectra

    def combine_k_items(self, indices, weights):
        """Weighted sum of the items at ``indices``.

        Args:
            indices: Item indices to combine.
            weights: 1-D tensor with one weight per index.

        Returns:
            ``(mixed_spectrum, mixed_concentrations)``.
        """
        return (
            # spectra
            torch.sum(
                torch.mul(weights[:, None, None], self.spectra[indices, :, :]),
                dim=0,
            ),
            # concentrations
            torch.sum(
                torch.mul(weights[:, None], self.concentrations[indices, :]),
                dim=0,
            )
        )

    def __getitem__(self, idx):
        """Return ``(spectrum, concentrations, sample_weight)``.

        With mixing disabled the item at ``idx`` is augmented and returned
        with weight 1. With mixing enabled ``idx`` is ignored: two random
        items are drawn and combined.
        """
        if self.combine_spectra_range < 1e-12:
            spectrum = self.spectra[idx]
            spectrum = self.augment_spectra(spectrum)
            return (
                spectrum,
                self.concentrations[idx],
                torch.tensor(1.0, dtype=self.dtype),
            )

        else:
            if random.random() < self.original_dp_weight:
                # Weight 1 on the first drawn item: an unmixed random sample.
                one_weight = 1.
                label_weight = torch.tensor(1.0, dtype=self.dtype)
            else:
                one_weight = random.uniform(0.0, self.combine_spectra_range)
                label_weight = self.augmentation_weight
            weights = torch.tensor([one_weight, (1 - one_weight)])
            # just pick two random indices
            indices = random.choices(range(len(self)), k=2)

            mixed_spectra, mixed_concentrations = self.combine_k_items(
                indices=indices,
                weights=weights,
            )
            mixed_spectra = self.augment_spectra(mixed_spectra)
            return mixed_spectra, mixed_concentrations, label_weight


# Hyperparameters for the model and the data augmentation.
# get_dataset() below reads 'baseline_factor_bound',
# 'baseline_period_lower_bound', 'baseline_period_span', 'augment_slope_std'
# and 'rolling_bound' from this dict.
# NOTE(review): 'original_datapoint_weight' is defined here but get_dataset()
# passes a literal 1.0 instead - confirm which value is intended.
config = {
    'initial_cnn_channels': 32,
    'cnn_channel_factor': 1.279574024454846,
    'num_cnn_layers': 8,
    'kernel_size': 3,
    'stride': 2,
    'activation_function': 'ELU',
    'fc_dropout': 0.10361700399831791,
    'lr': 0.001,
    'gamma': 0.9649606352621118,
    'baseline_factor_bound': 0.748262317340447,
    'baseline_period_lower_bound': 0.9703081695287203,
    'baseline_period_span': 19.79744237606427,
    'original_datapoint_weight': 0.4335003268130408,
    'augment_slope_std': 0.08171025264382692,
    'batch_size': 32,
    'fc_dims': 226,
    'rolling_bound': 2,
    'num_blocks': 2,
}

def get_dataset(inputs, targets, config, inputs_mean_std=None, targets_mean_std=None):
    """Wrap 2-D spectra and their targets in a float32 ``SpectralDataset``.

    Args:
        inputs: 2-D array of spectra; a singleton axis is inserted at
            position 1 before it is handed to the dataset.
        targets: Target values aligned with ``inputs``.
        config: Mapping providing ``baseline_factor_bound``,
            ``baseline_period_lower_bound``, ``baseline_period_span``,
            ``augment_slope_std`` and ``rolling_bound``.
        inputs_mean_std: Optional ``(mean, std)`` for the spectra.
        targets_mean_std: Optional ``(means, stds)`` for the targets.

    Returns:
        The configured ``SpectralDataset``. The mixing range (1.0), intercept
        std (0.0), rolling sigma (0.01), augmentation weight (0.1) and
        original-datapoint weight (1.0) are fixed here, not read from
        ``config``.
    """
    period_low = config["baseline_period_lower_bound"]
    period_high = config["baseline_period_lower_bound"] + config["baseline_period_span"]

    normalisation = dict(
        spectra_mean_std=inputs_mean_std,
        concentration_mean_std=targets_mean_std,
    )
    augmentation = dict(
        combine_spectra_range=1.0,
        baseline_factor_bound=config["baseline_factor_bound"],
        baseline_period_lower_bound=period_low,
        baseline_period_upper_bound=period_high,
        augment_slope_std=config["augment_slope_std"],
        augment_intersept_std=0.0,
        rolling_bound=config["rolling_bound"],
        spectrum_rolling_sigma=0.01,
        augmentation_weight=0.1,
        original_datapoint_weight=1.,
    )
    return SpectralDataset(
        spectra=inputs[:, None, :],
        concentrations=targets,
        dtype=torch.float32,
        **normalisation,
        **augmentation,
    )

In [8]:
from torch.utils.data import DataLoader


def build_loader(
    SEED,
    ds,
    train=True,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
):
    """Create a ``DataLoader`` with seeded shuffling and seeded workers.

    Args:
        SEED: Base seed for the loader's ``torch.Generator``.
        ds: Dataset to iterate over.
        train: Selects the generator seed: ``SEED`` when true, otherwise
            ``SEED + 5232``.
        batch_size, shuffle, num_workers, drop_last, pin_memory,
        persistent_workers: Forwarded unchanged to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """

    def seed_worker(worker_id):
        # Derive NumPy / `random` seeds from the seed torch gave this worker.
        derived_seed = torch.initial_seed() % 2**32
        np.random.seed(derived_seed)
        random.seed(derived_seed)

    loader_seed = SEED if train else SEED + 5232
    generator = torch.Generator()
    generator.manual_seed(loader_seed)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        persistent_workers=persistent_workers,
        worker_init_fn=seed_worker,
        generator=generator,
    )
    return DataLoader(ds, **loader_options)
    
    
def return_dls(train_ds, eval_ds, train_batch_size, eval_batch_size):
    """Build the training and evaluation loaders from the global ``SEED``.

    Both loaders run in the main process (``num_workers=0``), keep the last
    partial batch and pin memory. Only the training loader shuffles.

    Returns:
        ``(train_dl, eval_dl)``.
    """
    shared_options = dict(
        num_workers=0,
        drop_last=False,
        pin_memory=True,
        persistent_workers=False,
    )

    train_dl = build_loader(
        SEED,
        train_ds,
        train=True,
        batch_size=train_batch_size,
        shuffle=True,
        **shared_options,
    )
    eval_dl = build_loader(
        SEED,
        eval_ds,
        train=False,
        batch_size=eval_batch_size,
        shuffle=False,
        **shared_options,
    )

    return train_dl, eval_dl

In [9]:
import neptune


def setup_neptune():
    """Create (or resume) the Neptune run used for experiment tracking.

    Reads the notebook-level globals ``RESUME``, ``MODEL_NAME``, ``SEED``,
    ``LR``, ``WD``, ``EPOCHS``, ``BATCH_SIZE``, ``DROPOUT``,
    ``DROP_PATH_RATE`` and, when resuming, ``config``.

    The API token is taken from the ``NEPTUNE_API_TOKEN`` environment
    variable instead of being embedded in the notebook. The token that used
    to be hard-coded here has been exposed and should be revoked.

    Returns:
        The Neptune run object.
    """
    # None makes the Neptune client fall back to its own token lookup and
    # raise its usual error if no token is configured.
    api_token = os.environ.get("NEPTUNE_API_TOKEN")

    if not RESUME:
        neptune_run = neptune.init_run(
            project="arbaaz/kaggle-spect",
            name=MODEL_NAME,
            api_token=api_token,
        )

        # Collect all hyperparameters first and log them in one assignment,
        # so optional entries cannot interfere with the base set.
        h_parameters = {
            "seed": SEED,
            "model_name": MODEL_NAME,
            "optimizer_name": "nadam",
            "learning_rate": LR,
            "scheduler_name": "default",
            "weight_decay": WD,
            "num_epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
        }
        if DROPOUT:
            h_parameters["dropout"] = DROPOUT
        if DROP_PATH_RATE:
            h_parameters["drop_path_rate"] = DROP_PATH_RATE
        neptune_run["h_parameters"] = h_parameters
    else:
        # NOTE(review): this branch targets a different project than the one
        # used for new runs and reads `config.with_id` as an attribute, while
        # `config` is defined as a dict in this notebook - confirm both.
        neptune_run = neptune.init_run(
            project="arbaaz/crunchdao-structural-break",
            with_id=config.with_id,
            api_token=api_token,
        )

    return neptune_run

In [10]:
import torch.nn.functional as F
from torch.nn.modules.loss import _Loss
from sklearn.metrics import r2_score


def loss_fn(logits, targets):
    """Mean-squared error over all elements after flattening both tensors."""
    flat_predictions, flat_targets = logits.view(-1), targets.view(-1)
    return F.mse_loss(flat_predictions, flat_targets)


def metric_fn(logits, targets):
    """R^2 score for each of the first three target columns and overall.

    Args:
        logits: Predictions, 2-D tensor with at least three columns.
        targets: Ground truth with the same layout.

    Returns:
        ``(r2_col0, r2_col1, r2_col2, r2_all)`` where the last entry is
        ``r2_score`` evaluated on the full arrays.
    """
    y_pred = logits.cpu().detach().float().numpy()
    y_true = targets.cpu().detach().float().numpy()

    per_column = [r2_score(y_true[:, col], y_pred[:, col]) for col in range(3)]

    return (*per_column, r2_score(y_true, y_pred))


class MSEIgnoreNans(_Loss):
    """Sample-weighted mean-squared error that skips non-finite targets."""

    def forward(
        self,
        input: torch.Tensor,
        target: torch.Tensor,
        weights: torch.Tensor,
    ) -> torch.Tensor:
        """Compute the weighted MSE over the finite entries of ``target``.

        Args:
            input: Predictions, same shape as ``target`` (2-D).
            target: Targets; NaN/inf entries are excluded.
            weights: One weight per row, applied to each of that row's columns.

        Returns:
            Mean of ``weight * squared error`` over the finite target entries,
            or ``0.`` if that mean is not finite (e.g. no finite target).
        """
        finite = torch.isfinite(target)
        # Expand the per-row weights to one weight per target entry.
        entry_weights = torch.tile(weights[:, None], dims=(1, target.shape[1]))
        squared_error = torch.square(input[finite] - target[finite])
        mse = torch.mean(torch.mul(squared_error, entry_weights[finite]))

        fallback = torch.tensor(0.).to(target.device)
        return torch.where(torch.isfinite(mse), mse, fallback)

In [11]:
import torch.nn as nn


class ResidualBlock(nn.Module):
    """Two Conv1d + BatchNorm layers with a (possibly projected) skip path.

    The skip path is the identity unless ``stride != 1`` or the channel count
    changes; in that case it is a 1x1 convolution followed by batch norm.
    """

    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1):
        super().__init__()
        same_padding = kernel_size // 2

        self.conv1 = nn.Conv1d(
            in_channels, out_channels, kernel_size, stride, padding=same_padding
        )
        self.bn1 = nn.BatchNorm1d(out_channels)
        self.elu = nn.ELU()
        self.conv2 = nn.Conv1d(
            out_channels, out_channels, kernel_size, padding=same_padding
        )
        self.bn2 = nn.BatchNorm1d(out_channels)

        needs_projection = stride != 1 or in_channels != out_channels
        if needs_projection:
            # Match both length (stride) and channel count of the main path.
            self.shortcut = nn.Sequential(
                nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm1d(out_channels),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``ELU(main_path(x) + shortcut(x))``."""
        hidden = self.elu(self.bn1(self.conv1(x)))
        hidden = self.bn2(self.conv2(hidden))
        hidden += self.shortcut(x)
        return self.elu(hidden)
    

class ResNet(nn.Module):
    """ResNet-style 1-D CNN: stem, four residual stages, pooled regressor head.

    Args:
        dropout: Dropout probability used in the head.
        input_channels: Number of input channels.
        num_classes: Size of the output vector.
    """

    def __init__(self, dropout, input_channels=1, num_classes=3):
        super().__init__()
        stem_width = 64
        self.in_channels = stem_width  # running channel count for _make_layer

        self.conv1 = nn.Conv1d(input_channels, stem_width, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm1d(stem_width)
        # The attribute is called `elu` but holds a GELU activation.
        self.elu = nn.GELU()
        self.maxpool = nn.MaxPool1d(kernel_size=3, stride=2, padding=1)

        # (width, stride of the first block) for layer1 .. layer4.
        stage_specs = [(64, 1), (128, 2), (256, 2), (512, 2)]
        for number, (width, first_stride) in enumerate(stage_specs, start=1):
            setattr(self, f"layer{number}", self._make_layer(width, 2, stride=first_stride))

        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(512, 256),
            nn.ELU(),
            nn.Dropout(dropout),
            nn.Linear(256, num_classes),
        )

    def _make_layer(self, out_channels, num_blocks, stride):
        """Stack ``num_blocks`` residual blocks; only the first one strides."""
        blocks = []
        for block_stride in [stride] + [1] * (num_blocks - 1):
            blocks.append(ResidualBlock(self.in_channels, out_channels, stride=block_stride))
            self.in_channels = out_channels
        return nn.Sequential(*blocks)

    def forward(self, x):
        """Map a batch of spectra to ``num_classes`` outputs per sample."""
        x = self.maxpool(self.elu(self.bn1(self.conv1(x))))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            x = stage(x)
        pooled = self.avgpool(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.classifier(flat)

In [12]:
import math


class Identity(torch.nn.Module):
    """Pass-through module: ``forward`` hands its input back unchanged."""

    def forward(self, x):
        return x


# this is not a resnet yet
class ReZeroBlock(torch.torch.nn.Module):
    """Bottleneck-style 1-D block whose residual branch is gated by a scalar.

    Main branch: 1x1 conv -> norm -> act -> depthwise conv (``kernel_size``,
    ``stride``) -> norm -> act -> 1x1 conv -> norm. Skip branch
    (``down_sample``): a strided convolution when ``stride > 1``, otherwise
    the input itself, followed by a norm layer. The output is
    ``act(main * factor + skip)`` where ``factor`` is a learnable scalar
    initialised to 0.

    Args:
        in_channels: Channels of the input.
        out_channels: Channels of the output. With ``stride == 1`` the skip
            branch applies a norm over ``out_channels`` directly to the
            input, so ``in_channels`` must equal ``out_channels`` then.
        activation_function: Activation class; instantiated with
            ``inplace=True``.
        kernel_size: Kernel size of the depthwise and the skip convolution.
        stride: Stride of the depthwise and the skip convolution.
        dtype: Parameter dtype for all layers of the block (``None`` uses
            the torch default).
        norm_layer: Normalisation class; ``BatchNorm1d`` when ``None``.
    """

    def __init__(
        self,
        in_channels,
        out_channels,
        activation_function,
        kernel_size,
        stride,
        dtype,
        norm_layer=None,
    ):
        super(ReZeroBlock, self).__init__()
        if norm_layer is None:
            norm_layer = torch.torch.nn.BatchNorm1d

        self.kernel_size = kernel_size
        self.stride = stride
        # "Same" padding only for non-strided blocks; strided blocks use none.
        self.padding = divmod(kernel_size, 2)[0] if stride == 1 else 0

        # does not change spatial dimension
        self.conv1 = torch.nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            bias=False,
            dtype=dtype,
        )
        self.bn1 = norm_layer(out_channels, dtype=dtype)
        # Both self.conv2 and self.downsample layers
        # downsample the input when stride != 1
        self.conv2 = torch.nn.Conv1d(
            out_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            groups=out_channels,
            bias=False,
            dtype=dtype,
            padding=self.padding,
        )
        if stride > 1:
            down_conv = torch.nn.Conv1d(
                in_channels,
                out_channels,
                kernel_size=kernel_size,
                stride=stride,
                bias=False,
                dtype=dtype,
                # groups=out_channels,
            )
        else:
            down_conv = Identity()

        self.down_sample = torch.nn.Sequential(
            down_conv,
            # Pass dtype like every other layer of the block; without it the
            # skip-path norm stayed at the default dtype and broke blocks
            # built with a non-default dtype (e.g. float64).
            norm_layer(out_channels, dtype=dtype),
        )
        self.bn2 = norm_layer(out_channels, dtype=dtype)
        # does not change the spatial dimension
        self.conv3 = torch.nn.Conv1d(
            out_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            bias=False,
            dtype=dtype,
        )
        self.bn3 = norm_layer(out_channels, dtype=dtype)
        self.activation = activation_function(inplace=True)
        # Gate of the residual branch; 0 means the block starts as
        # activation(skip branch) only.
        self.factor = torch.torch.nn.parameter.Parameter(torch.tensor(0.0, dtype=dtype))

    def next_spatial_dim(self, last_spatial_dim):
        """Output length of the block for an input of ``last_spatial_dim``."""
        return math.floor(
            (last_spatial_dim + 2 * self.padding - self.kernel_size)
            / self.stride + 1
        )

    def forward(self, x):
        """Return ``activation(main_branch(x) * factor + down_sample(x))``."""
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.activation(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.activation(out)

        out = self.conv3(out)
        out = self.bn3(out)

        # not really the identity, but kind of
        identity = self.down_sample(x)

        return self.activation(out * self.factor + identity)


class ResNetEncoder(torch.nn.Module):
    """Sequential stack of ``ReZeroBlock`` layers.

    For every consecutive pair in ``cnn_encoder_channel_dims`` one block with
    the given ``stride`` changes the channel count, followed by
    ``num_blocks - 1`` stride-1 blocks at the new channel count. The length
    after every block is recorded in ``self.spatial_dims`` (starting with
    ``spectrum_size``).
    """

    def __init__(
        self,
        spectrum_size,
        cnn_encoder_channel_dims,
        activation_function,
        kernel_size,
        stride,
        dtype,
        num_blocks,
        verbose=False,
    ):
        super().__init__()

        self.spatial_dims = [spectrum_size]
        blocks = []

        def add_block(n_in, n_out, block_stride):
            # Create one block and track the length it produces.
            block = ReZeroBlock(
                in_channels=n_in,
                out_channels=n_out,
                activation_function=activation_function,
                kernel_size=kernel_size,
                stride=block_stride,
                dtype=dtype,
            )
            blocks.append(block)
            self.spatial_dims.append(block.next_spatial_dim(self.spatial_dims[-1]))

        channel_pairs = zip(
            cnn_encoder_channel_dims[:-1],
            cnn_encoder_channel_dims[1:],
        )
        for n_in, n_out in channel_pairs:
            add_block(n_in, n_out, stride)
            for _ in range(num_blocks - 1):
                add_block(n_out, n_out, 1)

        self.resnet_layers = torch.nn.Sequential(*blocks)
        if verbose:
            print("CNN Encoder Channel Dims: %s" % (cnn_encoder_channel_dims))
            print("CNN Encoder Spatial Dims: %s" % (self.spatial_dims))

    def forward(self, x):
        """Run ``x`` through all blocks in order."""
        return self.resnet_layers(x)


class ReZeroNet(torch.nn.Module):
    """CNN encoder (``ResNetEncoder``) followed by a fully connected head.

    Args:
        spectra_channels: Channels of the input spectra.
        spectra_size: Number of points per spectrum.
        initial_cnn_channels: Channels of the first encoder stage.
        cnn_channel_factor: Stage ``idx`` has
            ``int(initial_cnn_channels * cnn_channel_factor ** idx)`` channels.
        num_cnn_layers: Number of encoder stages.
        kernel_size: Kernel size of the encoder blocks.
        stride: Stride of the first block of every stage.
        activation_function: Name of an activation class in ``torch.nn``.
        fc_dims: List of layer widths of the head; the last entry is the
            output width. It is appended to the flattened encoder size, so it
            must be a list.
        fc_dropout: Dropout after hidden layer ``idx`` is
            ``fc_dropout / 2 ** idx``.
        dtype: Parameter dtype for the encoder and the head (``None`` uses
            the torch default).
        verbose: Print layer dimensions and the parameter count.
        fc_output_channels: The final layer emits
            ``fc_dims[-1] * fc_output_channels`` values.
        num_blocks: Blocks per encoder stage.
        **kwargs: Accepted and ignored.
    """

    def __init__(
        self,
        spectra_channels,
        spectra_size,
        initial_cnn_channels,
        cnn_channel_factor,
        num_cnn_layers,
        kernel_size,
        stride,
        activation_function,
        fc_dims,
        fc_dropout=0.0,
        dtype=None,
        verbose=False,
        fc_output_channels=1,
        num_blocks=1,
        **kwargs,
    ):
        super().__init__()
        self.fc_output_channels = fc_output_channels
        self.dtype = dtype or torch.float32

        activation_function = getattr(torch.nn, activation_function)

        # Setup CNN Encoder
        cnn_encoder_channel_dims = [spectra_channels] + [
            int(initial_cnn_channels * (cnn_channel_factor**idx))
            for idx in range(num_cnn_layers)
        ]
        self.cnn_encoder = ResNetEncoder(
            spectrum_size=spectra_size,
            cnn_encoder_channel_dims=cnn_encoder_channel_dims,
            activation_function=activation_function,
            kernel_size=kernel_size,
            stride=stride,
            num_blocks=num_blocks,
            dtype=dtype,
            verbose=verbose,
        )
        # First entry: size of the flattened encoder output (length x channels).
        self.fc_dims = [
            int(
                self.cnn_encoder.spatial_dims[-1]
            ) * int(cnn_encoder_channel_dims[-1])
        ] + fc_dims

        if verbose:
            print("Fc Dims: %s" % self.fc_dims)
        fc_layers = []
        for idx, (in_dim, out_dim) in enumerate(
                zip(self.fc_dims[:-2], self.fc_dims[1:-1])
        ):
            # Forward the same dtype the encoder received so the head matches
            # it; with dtype=None this is identical to the torch default.
            fc_layers.append(torch.nn.Linear(in_dim, out_dim, dtype=dtype))
            fc_layers.append(torch.nn.ELU())
            fc_layers.append(torch.nn.Dropout(fc_dropout / (2 ** idx)))
        fc_layers.append(
            torch.nn.Linear(
                self.fc_dims[-2],
                self.fc_dims[-1] * self.fc_output_channels,
                dtype=dtype,
            ),
        )
        self.fc_net = torch.nn.Sequential(*fc_layers)
        if verbose:
            num_params = sum(p.numel() for p in self.parameters())
            print("Number of Parameters: %s" % num_params)

    def forward(self, spectra):
        """Encode the spectra and apply the head.

        Returns:
            Tensor of shape ``(batch, fc_dims[-1])``, or
            ``(batch, fc_output_channels, fc_dims[-1])`` when
            ``fc_output_channels > 1``.
        """
        embeddings = self.cnn_encoder(spectra)
        forecast = self.fc_net(embeddings.view(-1, self.fc_dims[0]))
        if self.fc_output_channels > 1:
            forecast = forecast.reshape(
                -1, self.fc_output_channels, self.fc_dims[-1]
            )
        return forecast


In [13]:
import math
import torch


class ResZeroBlock(torch.nn.Module):
    """Combine two modules as ``skip_part(X) + factor * model_part(X)``.

    ``factor`` is a learnable scalar that starts at 0, so the block initially
    reproduces ``skip_part`` alone.
    """

    def __init__(self, skip_part, model_part):
        super().__init__()
        self.skip_part = skip_part
        self.model_part = model_part
        self.factor = torch.nn.parameter.Parameter(torch.tensor(0.))

    def forward(self, X):
        shortcut = self.skip_part(X)
        gated_branch = self.factor * self.model_part(X)
        return shortcut + gated_branch


class Identity(torch.nn.Module):
    """Module that returns its input as is."""

    def forward(self, X):
        unchanged = X
        return unchanged


def _halved_length(length):
    """Return the sequence length after one stride-2 stage of the network.

    Evaluates ``floor((length - 1) / 2 + 1)``, which is the output length of
    the stride-2 layers used below (``Conv1d`` with kernel 3 / padding 1,
    ``Conv1d`` with kernel 1 / no padding, and
    ``MaxPool1d(3, stride=2, padding=1)``).
    """
    return math.floor((length - 1) / 2 + 1)


def _separable_conv1d(in_channels, out_channels, dtype, bias=False):
    """Build the two-layer convolution pair repeated throughout the network.

    Returns a list ``[pointwise, depthwise]``: a kernel-size-1 convolution
    mapping ``in_channels`` to ``out_channels``, followed by a kernel-size-3,
    padding-1 convolution with ``groups=out_channels``.  Neither layer changes
    the sequence length.  The layers are constructed in list order so that
    random weight initialisation consumes the RNG in the same order as
    writing the two ``Conv1d`` calls inline.
    """
    return [
        torch.nn.Conv1d(
            in_channels,
            out_channels,
            kernel_size=1,
            dtype=dtype,
            bias=bias,
        ),
        torch.nn.Conv1d(
            out_channels,
            out_channels,
            kernel_size=3,
            groups=out_channels,
            padding=1,
            dtype=dtype,
            bias=bias,
        ),
    ]


class RamanXception(torch.nn.Module):
    """1-D convolutional network with ReZero residual blocks and an FC head.

    The convolutional part (``self.conv_net``) is a sequence of five stages:

    1. initial layers: one stride-2 ``Conv1d`` + ``BatchNorm1d`` + activation
       per entry of ``initial_channels`` (the first conv takes 1 channel);
    2. entry flow: one ``ResZeroBlock`` per entry of ``entry_channels``, each
       halving the sequence length;
    3. middle flow: ``num_mid_blocks`` length-preserving ``ResZeroBlock``s
       with an ``Identity`` skip path;
    4. exit flow: one downsampling ``ResZeroBlock`` for every entry of
       ``exit_channels`` except the last;
    5. final flow: a plain (non-residual) stage built from the last entry of
       ``exit_channels``.

    The flattened result is fed to ``self.fc_net`` (hidden ``Linear`` + ReLU +
    Dropout layers followed by ``output_layer``).

    Args:
        spectra_size: length of the input sequences.  It is used to
            pre-compute ``self.spatial_dimensions`` and ``self.fc_input_dim``;
            ``forward`` reshapes to ``(-1, self.fc_input_dim)``, so inputs of
            a different length will not map one row per sample.
        initial_channels: sequence of output channels for the initial convs.
        entry_channels: sequence of output channels for the entry-flow blocks.
        num_mid_blocks: number of middle-flow blocks.
        exit_channels: sequence of ``(mid_channels, out_channels)`` pairs with
            at least two entries (``exit_channels[-2]`` is indexed).
        num_concentrations: number of output columns.
        fc_dims: sequence of hidden layer widths of the head; may be empty.
        fc_dropout: dropout probability applied after every hidden layer.
        lower_bounds: optional tensor stored as a non-trainable parameter;
            defaults to ``-1000`` per output.  It is only stored here, it is
            not used in ``forward``.
        dtype: dtype of the convolution / linear parameters
            (default ``torch.float32``).
        activation_function: name of an activation class in ``torch.nn``.
        classification_idx: first output column that gets a sigmoid applied
            in ``forward``.  ``None`` means ``num_concentrations`` (no column
            is passed through the sigmoid).
        verbose: print the computed spatial and FC dimensions.
        **kwargs: accepted and ignored, so a larger config dict can be
            splatted into the constructor.
    """

    def __init__(
        self,
        spectra_size,
        initial_channels,
        entry_channels,
        num_mid_blocks,
        exit_channels,
        num_concentrations,
        fc_dims,
        fc_dropout,
        lower_bounds=None,
        dtype=None,
        activation_function='ReLU',
        classification_idx=None,
        verbose=False,
        **kwargs,
    ):
        super(RamanXception, self).__init__()

        # Fix: `classification_idx or num_concentrations` silently replaced a
        # legitimate index of 0 (sigmoid on every column) by the default.
        if classification_idx is None:
            classification_idx = num_concentrations
        self.classification_idx = classification_idx

        if lower_bounds is None:
            lower_bounds = torch.tensor([-1000] * num_concentrations)
        self.lower_bounds = torch.nn.parameter.Parameter(
            lower_bounds,
            requires_grad=False,
        )
        dtype = dtype or torch.float32
        activation_function = getattr(torch.nn, activation_function)
        self.spatial_dimensions = [spectra_size]

        # The channel specs are concatenated with lists below; copying them
        # into lists also lets callers pass tuples.
        initial_channels = list(initial_channels)
        entry_channels = list(entry_channels)
        fc_dims = list(fc_dims)

        # Initial layers: each conv halves the sequence length.
        initial_layers = torch.nn.Sequential()
        for idx, (in_channels, out_channels) in enumerate(
            zip([1] + initial_channels[:-1], initial_channels)
        ):
            initial_layers.add_module(
                'initial_%s' % idx,
                torch.nn.Conv1d(
                    in_channels,
                    out_channels,
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    dtype=dtype,
                    bias=False,
                ),
            )
            self.spatial_dimensions.append(
                _halved_length(self.spatial_dimensions[-1])
            )
            initial_layers.add_module(
                'initial_batch_%s' % idx,
                torch.nn.BatchNorm1d(out_channels, dtype=dtype),
            )
            initial_layers.add_module(
                'initial_activation_%s' % idx,
                activation_function(),
            )

        # Entry flow: the strided 1x1 skip conv and the max-pool at the end of
        # the residual branch both halve the sequence length.
        entry_flow = torch.nn.Sequential()
        for idx, (in_channels, out_channels) in enumerate(
            zip([initial_channels[-1]] + entry_channels[:-1], entry_channels)
        ):
            entry_flow.add_module(
                name='entry_flow_%s' % idx,
                module=ResZeroBlock(
                    skip_part=torch.nn.Conv1d(
                        in_channels,
                        out_channels,
                        kernel_size=1,
                        stride=2,
                        dtype=dtype,
                        bias=False,
                    ),
                    model_part=torch.nn.Sequential(
                        activation_function(),
                        *_separable_conv1d(in_channels, in_channels, dtype),
                        torch.nn.BatchNorm1d(in_channels, dtype=dtype),
                        activation_function(),
                        *_separable_conv1d(in_channels, out_channels, dtype),
                        torch.nn.BatchNorm1d(out_channels, dtype=dtype),
                        torch.nn.MaxPool1d(3, stride=2, padding=1),
                    ),
                ),
            )
            self.spatial_dimensions.append(
                _halved_length(self.spatial_dimensions[-1])
            )

        # Middle flow: channel count and sequence length stay constant.
        num_mid_channels = entry_channels[-1]
        middle_flow = torch.nn.Sequential()
        for idx in range(num_mid_blocks):
            middle_flow.add_module(
                name='middle_flow_%s' % idx,
                module=ResZeroBlock(
                    skip_part=Identity(),
                    model_part=torch.nn.Sequential(
                        activation_function(),
                        *_separable_conv1d(
                            num_mid_channels, num_mid_channels, dtype
                        ),
                        torch.nn.BatchNorm1d(num_mid_channels, dtype=dtype),
                        activation_function(),
                        *_separable_conv1d(
                            num_mid_channels, num_mid_channels, dtype
                        ),
                        torch.nn.BatchNorm1d(num_mid_channels, dtype=dtype),
                        activation_function(),
                        *_separable_conv1d(
                            num_mid_channels, num_mid_channels, dtype
                        ),
                    ),
                ),
            )
            self.spatial_dimensions.append(self.spatial_dimensions[-1])

        # Exit flow: every (mid, out) pair except the last one becomes a
        # downsampling residual block; its input is the previous block's out.
        exit_flow = torch.nn.Sequential()
        exit_inputs = (
            [num_mid_channels] + [out for _, out in exit_channels[:-2]]
        )
        for idx, (in_channels, (mid_channels, out_channels)) in enumerate(
            zip(exit_inputs, exit_channels[:-1])
        ):
            exit_flow.add_module(
                name='exit_flow_%s' % idx,
                module=ResZeroBlock(
                    skip_part=torch.nn.Conv1d(
                        in_channels,
                        out_channels,
                        kernel_size=1,
                        stride=2,
                        dtype=dtype,
                        bias=False,
                    ),
                    model_part=torch.nn.Sequential(
                        *_separable_conv1d(in_channels, mid_channels, dtype),
                        torch.nn.BatchNorm1d(mid_channels, dtype=dtype),
                        activation_function(),
                        *_separable_conv1d(mid_channels, out_channels, dtype),
                        torch.nn.BatchNorm1d(out_channels, dtype=dtype),
                        activation_function(),
                        torch.nn.MaxPool1d(
                            kernel_size=3,
                            stride=2,
                            padding=1,
                        ),
                    ),
                ),
            )
            self.spatial_dimensions.append(
                _halved_length(self.spatial_dimensions[-1])
            )

        # Last part of the exit flow: no skip path, no downsampling.
        final_in = exit_channels[-2][1]
        final_mid, final_out = exit_channels[-1][0], exit_channels[-1][1]
        final_flow = torch.nn.Sequential(
            *_separable_conv1d(final_in, final_mid, dtype),
            torch.nn.BatchNorm1d(final_mid, dtype=dtype),
            activation_function(),
            # These two convolutions keep their bias terms, unlike every other
            # convolution in the network; left as-is so the parameter set
            # (and therefore the state_dict keys) does not change.
            *_separable_conv1d(final_mid, final_out, dtype, bias=True),
            torch.nn.BatchNorm1d(final_out, dtype=dtype),
            activation_function(),
        )
        self.conv_net = torch.nn.Sequential(
            initial_layers,
            entry_flow,
            middle_flow,
            exit_flow,
            final_flow,
        )

        # Fully connected head on the flattened (channels * length) features.
        self.fc_input_dim = int(final_out * self.spatial_dimensions[-1])
        self.fc_net = torch.nn.Sequential()
        for idx, (in_dim, out_dim) in enumerate(
            zip([self.fc_input_dim] + fc_dims[:-1], fc_dims)
        ):
            self.fc_net.add_module(
                'fc_net_%s' % idx,
                torch.nn.Linear(
                    in_dim,
                    out_dim,
                    dtype=dtype,
                    bias=True,
                ),
            )
            self.fc_net.add_module(
                'fc_relu_%s' % idx,
                torch.nn.ReLU(),
            )
            self.fc_net.add_module(
                'fc_dropout_%s' % idx,
                torch.nn.Dropout(fc_dropout),
            )

        self.fc_net.add_module(
            'output_layer',
            torch.nn.Linear(
                # Fix: without hidden layers the output layer receives the
                # flattened conv features (fc_input_dim values per sample),
                # not just `out_channels` of them.
                fc_dims[-1] if fc_dims else self.fc_input_dim,
                num_concentrations,
                dtype=dtype,
                bias=True,
            ),
        )
        # Stored for callers; not applied in forward().
        self.softplus = torch.nn.Softplus()
        if verbose:
            print('Spatial dimensions: %s' % self.spatial_dimensions)
            print(
                'Fully Connected dimensions %s' % (
                        [self.fc_input_dim] + fc_dims
                )
            )

    def forward(self, x):
        """Run the network on ``x`` and return ``num_concentrations`` columns.

        ``x`` is passed to ``self.conv_net`` (whose first convolution expects
        one input channel), flattened to ``(-1, self.fc_input_dim)`` and fed
        to the head.  Columns before ``self.classification_idx`` are returned
        as produced by the head; the remaining columns are passed through a
        sigmoid.
        """
        features = self.conv_net(x)

        fc_output = self.fc_net(
            torch.reshape(features, (-1, self.fc_input_dim))
        )
        return torch.concat(
            [
                fc_output[:, :self.classification_idx],
                torch.sigmoid(fc_output[:, self.classification_idx:]),
            ],
            dim=1,
        )


# Hyper-parameters for RamanXception.  The scalar entries below are expanded
# further down into the list-valued arguments the constructor expects
# (`initial_channels`, `entry_channels`, `exit_channels`, `fc_dims`).
model_config = {
    'initial_channels': 8,
    'entry_channels_start': 17,
    'channel_factor': 1.5692504144354933,
    'entry_exit_length': 3,
    'num_mid_blocks': 4,
    'fc_dims': 101,
    'fc_dropout': 0.11748964300948816,
    'learning_rate': 0.001,
    'gamma': 0.9921697445978254,
    'batch_size': 21,
    'entropy_weight': 6.441421425536572,
    'uniform_sampling_range': 0.03803705551872033,
    'activation_function': 'ELU',
    'fake_weight': 0.032878013410751736,
    'just_scale_concentrations': True,
    'entry_factor': 1.5692504144354933,
    'exit_factor': 1.5692504144354933,
    'entry_length': 3,
    'exit_length': 3,
    'spectra_size': 1643,
    'dtype': torch.float32}

lr = model_config.get('learning_rate')
l2_reg = model_config.get('l2_reg', 0.)
gamma = model_config.get('gamma', 1.)

# Initial layers: two entries, the second with twice the channels.
model_config['initial_channels'] = [
    model_config['initial_channels'],
    2 * model_config['initial_channels'],
]

# Entry channel dimensions: start value followed by `entry_length` entries,
# each `entry_factor` times the previous one (truncated to int).
entry_channels_start = model_config['entry_channels_start']
entry_factor = model_config['entry_factor']
entry_length = model_config['entry_length']
entry_channels = [entry_channels_start]
for _ in range(entry_length):
    entry_channels.append(int(entry_factor * entry_channels[-1]))
model_config['entry_channels'] = entry_channels

# Exit channel dimensions: `exit_length` (mid, out) pairs.  Each pair is
# derived from a base width (the last entry width for the first pair, the
# previous pair's mid width afterwards), scaled by sqrt(exit_factor) for the
# mid width and by exit_factor for the out width.
exit_channels_start = entry_channels[-1]
exit_factor = model_config.get('exit_factor')
exit_length = model_config.get('exit_length')
exit_channels = [
    (
        int(exit_channels_start * math.sqrt(exit_factor)),
        int(exit_channels_start * exit_factor),
    )
]
for _ in range(1, exit_length):
    previous_mid = exit_channels[-1][0]
    exit_channels.append(
        (
            int(previous_mid * math.sqrt(exit_factor)),
            int(previous_mid * exit_factor),
        )
    )
model_config['exit_channels'] = exit_channels

# NOTE(review): this key is misspelled ("concnetrations"), so a constructor
# that takes **kwargs ignores it.  It is left untouched on purpose: the
# (commented-out) RamanXception call in the training cell passes
# `num_concentrations=3` explicitly next to `**model_config`, and a correctly
# spelled key here would make that call fail with a duplicate keyword.
model_config["num_concnetrations"] = 2

# Fix: wrap this dict's own 'fc_dims' value.  The original read
# `config['fc_dims']`, i.e. a different, unrelated dictionary.
model_config['fc_dims'] = [model_config['fc_dims']]

In [14]:
from tqdm.auto import tqdm


def train(
    model, 
    optimizer,
    device,
    amp_dtype,
    scheduler,
    train_dl,
    eval_dl,
    loss_fn,
    epochs,
    checkpoint_name,
    score=-float("inf"),
    neptune_run=None,
    p=True,
    checkpoint_dir="/kaggle/working",
):  
    """Train ``model`` and checkpoint the weights with the best eval score.

    Every epoch runs one pass over ``train_dl`` (forward/backward under
    autocast, optimizer step, scheduler step per batch) followed by one pass
    over ``eval_dl`` in inference mode.  Metrics come from the notebook-level
    ``metric_fn(logits, targets)``, which must return four values; the last
    one (named ``r2`` here) is the model-selection score.

    Args:
        model: module to train; called as ``model(inputs)``.
        optimizer: optimizer over the model parameters.
        device: device the batches are moved to.  A string (``"cuda"``,
            ``"cuda:1"``, ``"cpu"``) or a ``torch.device``; only its type is
            passed to autocast / GradScaler.
        amp_dtype: autocast dtype.  With ``torch.bfloat16`` a plain
            backward/step is used; any other value goes through a GradScaler.
        scheduler: learning-rate scheduler, stepped once per training batch.
        train_dl: iterable of ``(inputs, targets, weights)`` training batches.
        eval_dl: iterable of ``(inputs, targets, weights)`` eval batches.
        loss_fn: called as ``loss_fn(logits, targets, weights)``; must return
            a scalar tensor.
        epochs: number of epochs.
        checkpoint_name: file name of the checkpoint inside ``checkpoint_dir``.
        score: score to beat before the first checkpoint is written.
        neptune_run: optional run object; metrics are appended to it and it is
            stopped when training ends, including when it ends with an error.
        p: print a metrics line every fifth epoch.
        checkpoint_dir: directory the checkpoint is written to.

    Returns:
        The best eval score seen (or the initial ``score`` if never beaten).
    """
    # autocast / GradScaler want the bare device type ("cuda"), not an index
    # or a torch.device object.
    device_type = torch.device(device).type
    # bf16 does not go through loss scaling, so only build a scaler when the
    # scaling branch will actually be used.
    use_grad_scaler = amp_dtype != torch.bfloat16
    scaler = torch.amp.GradScaler(device_type) if use_grad_scaler else None
    checkpoint_path = os.path.join(checkpoint_dir, checkpoint_name)

    try:
        for epoch in tqdm(range(epochs), leave=False):
            model.train()
            total_loss = 0.0
            all_logits = []
            all_targets = []

            for inputs, targets, weights in train_dl:
                inputs = inputs.to(device, non_blocking=True)
                targets = targets.to(device, non_blocking=True)
                weights = weights.to(device, non_blocking=True)

                optimizer.zero_grad()
                with torch.amp.autocast(device_type=device_type, dtype=amp_dtype, cache_enabled=True):
                    logits = model(inputs)
                    loss = loss_fn(logits, targets, weights)

                if use_grad_scaler:
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()
                else:
                    loss.backward()
                    optimizer.step()

                scheduler.step()
                if neptune_run is not None:
                    neptune_run["lr_step"].append(scheduler.get_last_lr()[0])

                # Accumulate as a Python float so the running sum is not kept
                # in whatever (possibly half-precision) dtype the loss has.
                total_loss += loss.item()
                all_logits.append(logits.detach().cpu())
                all_targets.append(targets.detach().cpu())

            all_logits = torch.cat(all_logits)
            all_targets = torch.cat(all_targets)

            one, two, three, r2 = metric_fn(all_logits, all_targets)
            total_loss = total_loss / len(train_dl)

            model.eval()
            eval_total_loss = 0.0
            eval_all_logits = []
            eval_all_targets = []

            for inputs, targets, weights in eval_dl:
                inputs = inputs.to(device, non_blocking=True)
                targets = targets.to(device, non_blocking=True)
                weights = weights.to(device, non_blocking=True)

                with torch.inference_mode():
                    with torch.amp.autocast(device_type=device_type, dtype=amp_dtype, cache_enabled=True):
                        logits = model(inputs)
                        loss = loss_fn(logits, targets, weights)

                eval_total_loss += loss.item()
                eval_all_logits.append(logits.detach().cpu())
                eval_all_targets.append(targets.detach().cpu())

            eval_all_logits = torch.cat(eval_all_logits)
            eval_all_targets = torch.cat(eval_all_targets)

            eval_one, eval_two, eval_three, eval_r2 = metric_fn(eval_all_logits, eval_all_targets)
            eval_total_loss = eval_total_loss / len(eval_dl)

            # Keep only the best-scoring weights on disk.
            if eval_r2 > score:
                score = eval_r2
                data = {"state_dict": model.state_dict()}
                data["epoch"] = epoch
                data["score"] = score
                torch.save(data, checkpoint_path)

            if neptune_run is not None:
                neptune_run["train/loss"].append(total_loss)
                neptune_run["eval/loss"].append(eval_total_loss)
                neptune_run["train/r2"].append(r2)
                neptune_run["eval/r2"].append(eval_r2)
                neptune_run["train/one"].append(one)
                neptune_run["train/two"].append(two)
                neptune_run["train/three"].append(three)
                neptune_run["eval/one"].append(eval_one)
                neptune_run["eval/two"].append(eval_two)
                neptune_run["eval/three"].append(eval_three)

            if p and epoch % 5 == 0:
                print(
                    f"Epoch: {epoch}, "
                    f"train/loss: {total_loss:.4f}, "
                    f"eval/loss: {eval_total_loss:.4f}, "
                    f"train/r2: {r2:.4f}, "
                    f"eval/r2: {eval_r2:.4f}, "
                    f"train/one: {one:.4f}, "
                    f"train/two: {two:.4f}, "
                    f"train/three: {three:.4f}, "
                    f"eval/one: {eval_one:.4f}, "
                    f"eval/two: {eval_two:.4f}, "
                    f"eval/three: {eval_three:.4f} "
                )
    finally:
        # Close the tracking run even if an epoch raised.
        if neptune_run is not None:
            neptune_run.stop()
    return score

In [15]:
# Optimisation settings: number of epochs, AdamW learning rate and weight decay.
EPOCHS, LR, WD = 100, 1e-4, 1e-3

# Regularisation strengths (DROPOUT is passed to the model constructor in the
# training cell; DROP_PATH_RATE is only defined here).
DROPOUT, DROP_PATH_RATE = 0.5, 0.2

# Runtime switches.
RESUME = False
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Disabled overrides for the `config` dictionary, kept for reference.
#config["dtype"] = torch.float32
#config["spectra_size"] = 1643
#config["spectra_channels"] = 1
#config["fc_dims"] = [
#    config["fc_dims"],
#    int(config["fc_dims"] / 2),
#    3,
#]

# Loss object handed to train() as `loss_fn`.
mse_loss_function = MSEIgnoreNans()

In [16]:
from sklearn.model_selection import KFold


# 5-fold cross-validation: one freshly initialised model is trained per fold
# and its best eval score is collected in `scores`.
inputs_mean_std = []    # (fold, s_mean, s_std) of each training dataset
targets_mean_std = []   # (fold, concentration_means, concentration_stds)
scores = []
BATCH_SIZE = 32  # loop-invariant, so defined once instead of once per fold
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)
splits = kfold.split(inputs)

for fold, (train_idx, eval_idx) in enumerate(splits):
    MODEL_NAME = f"baseline.resnet.pretrain..fold.{fold}"
    checkpoint_name = f"baseline.resnet.pretrain.{fold}.pt"
    
    train_inputs = inputs[train_idx]
    train_targets = targets[train_idx]
    eval_inputs = inputs[eval_idx]
    eval_targets = targets[eval_idx]

    train_ds = get_dataset(train_inputs, train_targets, config)
    
    inputs_mean_std.append((fold, train_ds.s_mean, train_ds.s_std))
    targets_mean_std.append((fold, train_ds.concentration_means, train_ds.concentration_stds))
    
    # The eval dataset is handed the statistics of the training dataset
    # (presumably so it is normalised with them rather than with its own;
    # confirm in get_dataset).
    eval_ds = get_dataset(
        eval_inputs, 
        eval_targets, 
        config, 
        (train_ds.s_mean, train_ds.s_std), 
        (train_ds.concentration_means, train_ds.concentration_stds)
    )
    
    # Fourth argument is len(eval_ds); presumably the eval batch size, i.e.
    # the whole eval fold in one batch -- confirm in return_dls.
    train_dl, eval_dl = return_dls(train_ds, eval_ds, BATCH_SIZE, len(eval_ds))
    
    model = ResNet(dropout=DROPOUT).to(device)
    #model = ReZeroNet(**config).to(device)
    #model = convnextv2_atto().to(device)
    #model = SAINT(**model_config, classification_idx=3).to(device)
    #model = RamanXception(
    #    **model_config,
    #    classification_idx=3,
    #    num_concentrations=3
    #).to(device)
    
    # Fix: the recorded output of this cell shows the size followed by a
    # stray "None", i.e. get_model_size reports the size itself and returns
    # nothing, so wrapping it in print() only added the "None" line.
    if fold == 0: get_model_size(model)
    #print(model)    
    
    optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD, foreach=True)
    scheduler = get_scheduler(optimizer, train_dl, EPOCHS)
    
    score = train(
            model, 
            optimizer, 
            device,
            torch.float16,
            scheduler,
            train_dl, 
            eval_dl,
            mse_loss_function,
            EPOCHS,
            checkpoint_name,
            neptune_run=setup_neptune(),
        )
    
    scores.append(score)

3.980803
None




[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/arbaaz/kaggle-spect/e/KAG-300


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0, train/loss: 1.0523, eval/loss: 0.9207, train/r2: -0.0338, eval/r2: 0.0256, train/one: -0.0483, train/two: -0.0492, train/three: -0.0039, eval/one: 0.0061, eval/two: -0.0094, eval/three: 0.0801 
Epoch: 5, train/loss: 0.8439, eval/loss: 0.7757, train/r2: 0.1396, eval/r2: 0.1724, train/one: -0.0361, train/two: -0.0295, train/three: 0.4843, eval/one: -0.0617, eval/two: -0.0554, eval/three: 0.6344 
Epoch: 10, train/loss: 0.7993, eval/loss: 0.7113, train/r2: 0.2135, eval/r2: 0.2140, train/one: -0.0047, train/two: 0.0401, train/three: 0.6051, eval/one: 0.0025, eval/two: 0.0407, eval/three: 0.5987 
Epoch: 15, train/loss: 0.6858, eval/loss: 0.5732, train/r2: 0.3153, eval/r2: 0.3239, train/one: 0.0478, train/two: 0.1814, train/three: 0.7166, eval/one: 0.0425, eval/two: 0.2245, eval/three: 0.7046 
Epoch: 20, train/loss: 0.5240, eval/loss: 0.4705, train/r2: 0.4777, eval/r2: 0.4223, train/one: 0.2816, train/two: 0.3847, train/three: 0.7669, eval/one: 0.1711, eval/two: 0.3073, eval/three: 

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0, train/loss: 1.0143, eval/loss: 1.1384, train/r2: -0.0225, eval/r2: 0.0250, train/one: -0.0515, train/two: -0.0447, train/three: 0.0288, eval/one: -0.0103, eval/two: -0.0054, eval/three: 0.0906 
Epoch: 5, train/loss: 0.8640, eval/loss: 0.9330, train/r2: 0.1316, eval/r2: 0.1486, train/one: -0.0625, train/two: -0.0458, train/three: 0.5031, eval/one: -0.0630, eval/two: -0.0454, eval/three: 0.5544 
Epoch: 10, train/loss: 0.7742, eval/loss: 0.8643, train/r2: 0.2367, eval/r2: 0.2286, train/one: 0.0167, train/two: 0.0408, train/three: 0.6527, eval/one: -0.0283, eval/two: -0.0024, eval/three: 0.7165 
Epoch: 15, train/loss: 0.7263, eval/loss: 0.8188, train/r2: 0.2741, eval/r2: 0.3187, train/one: 0.0453, train/two: 0.0566, train/three: 0.7205, eval/one: 0.0886, eval/two: 0.1075, eval/three: 0.7599 
Epoch: 20, train/loss: 0.5758, eval/loss: 0.6536, train/r2: 0.4112, eval/r2: 0.3854, train/one: 0.2374, train/two: 0.2588, train/three: 0.7375, eval/one: 0.1364, eval/two: 0.2554, eval/three:

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0, train/loss: 1.0537, eval/loss: 1.0074, train/r2: -0.0438, eval/r2: -0.0048, train/one: -0.0468, train/two: -0.0736, train/three: -0.0111, eval/one: -0.0299, eval/two: -0.0345, eval/three: 0.0500 
Epoch: 5, train/loss: 0.8840, eval/loss: 0.8382, train/r2: 0.1223, eval/r2: 0.1688, train/one: -0.0256, train/two: -0.0619, train/three: 0.4544, eval/one: 0.0088, eval/two: -0.0129, eval/three: 0.5106 
Epoch: 10, train/loss: 0.7907, eval/loss: 0.6883, train/r2: 0.2046, eval/r2: 0.2723, train/one: -0.0201, train/two: 0.0155, train/three: 0.6184, eval/one: 0.0355, eval/two: 0.0230, eval/three: 0.7582 
Epoch: 15, train/loss: 0.6921, eval/loss: 0.7133, train/r2: 0.2890, eval/r2: 0.2932, train/one: 0.0491, train/two: 0.0984, train/three: 0.7196, eval/one: 0.0209, eval/two: 0.0741, eval/three: 0.7846 
Epoch: 20, train/loss: 0.5967, eval/loss: 0.6093, train/r2: 0.3989, eval/r2: 0.4012, train/one: 0.1642, train/two: 0.2866, train/three: 0.7458, eval/one: 0.1719, eval/two: 0.2220, eval/three:

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0, train/loss: 1.0458, eval/loss: 0.9344, train/r2: -0.0415, eval/r2: 0.0124, train/one: -0.0738, train/two: -0.0430, train/three: -0.0076, eval/one: -0.0297, eval/two: -0.0555, eval/three: 0.1226 
Epoch: 5, train/loss: 0.8793, eval/loss: 0.7240, train/r2: 0.1200, eval/r2: 0.1957, train/one: -0.0455, train/two: -0.0337, train/three: 0.4393, eval/one: 0.0307, eval/two: 0.0359, eval/three: 0.5205 
Epoch: 10, train/loss: 0.7893, eval/loss: 0.7368, train/r2: 0.2080, eval/r2: 0.2480, train/one: -0.0098, train/two: 0.0084, train/three: 0.6255, eval/one: 0.0055, eval/two: 0.0418, eval/three: 0.6968 
Epoch: 15, train/loss: 0.7544, eval/loss: 0.7983, train/r2: 0.2480, eval/r2: 0.2212, train/one: -0.0044, train/two: 0.0787, train/three: 0.6697, eval/one: 0.0221, eval/two: 0.0495, eval/three: 0.5921 
Epoch: 20, train/loss: 0.6565, eval/loss: 0.6893, train/r2: 0.3434, eval/r2: 0.2914, train/one: 0.0373, train/two: 0.2547, train/three: 0.7382, eval/one: 0.0579, eval/two: 0.0753, eval/three: 

  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0, train/loss: 1.0371, eval/loss: 1.0434, train/r2: -0.0447, eval/r2: -0.0434, train/one: -0.0933, train/two: -0.0596, train/three: 0.0187, eval/one: -0.0728, eval/two: 0.0030, eval/three: -0.0604 
Epoch: 5, train/loss: 0.8650, eval/loss: 0.8756, train/r2: 0.1245, eval/r2: 0.1089, train/one: -0.0567, train/two: -0.0259, train/three: 0.4562, eval/one: 0.0096, eval/two: -0.0349, eval/three: 0.3520 
Epoch: 10, train/loss: 0.8109, eval/loss: 0.7421, train/r2: 0.1967, eval/r2: 0.2121, train/one: -0.0235, train/two: 0.0156, train/three: 0.5979, eval/one: 0.0045, eval/two: 0.0031, eval/three: 0.6286 
Epoch: 15, train/loss: 0.7281, eval/loss: 0.7831, train/r2: 0.2618, eval/r2: 0.2605, train/one: 0.0094, train/two: 0.1337, train/three: 0.6422, eval/one: 0.0138, eval/two: 0.0675, eval/three: 0.7002 
Epoch: 20, train/loss: 0.6413, eval/loss: 0.6433, train/r2: 0.3458, eval/r2: 0.3786, train/one: 0.0600, train/two: 0.2707, train/three: 0.7067, eval/one: 0.0955, eval/two: 0.2853, eval/three: 

In [None]:
import os 
import torch


def get_ckpt(path):
    """Load and return the checkpoint object stored at ``path``.

    The file is read with ``weights_only=False``, i.e. it is fully unpickled
    and may execute arbitrary code -- only use this on checkpoints you trust.
    """
    checkpoint = torch.load(path, weights_only=False)
    return checkpoint


def get_ckpt_paths(keyword, output_dir="/kaggle/working"):
    """List checkpoint files in ``output_dir`` and print their metadata.

    A file is selected when its name contains ``keyword`` and does not
    contain ``"csv"``.  Each selected file is loaded once to print its path
    together with its ``"epoch"`` and ``"score"`` entries.

    Args:
        keyword: substring the file name must contain.
        output_dir: directory to scan.

    Returns:
        The selected paths, in sorted file-name order.

    Note:
        Files are read with ``weights_only=False`` (full unpickling), so this
        must only be pointed at trusted checkpoint directories.
    """
    ckpt_paths = []
    for file_name in sorted(os.listdir(output_dir)):
        if keyword not in file_name or "csv" in file_name:
            continue
        ckpt_path = os.path.join(output_dir, file_name)
        # Only the metadata is needed here, so keep any tensors on the CPU
        # instead of restoring them to the device they were saved from.
        ckpt = torch.load(ckpt_path, map_location="cpu", weights_only=False)
        print(ckpt_path, ckpt["epoch"], ckpt["score"])
        ckpt_paths.append(ckpt_path)

    return ckpt_paths

# Collect the checkpoints whose file name contains "no.augs"; the call also
# prints each checkpoint's path, epoch and score.
# NOTE(review): the training cell in this notebook writes checkpoints named
# "baseline.resnet.pretrain.{fold}.pt", which do not contain this keyword --
# presumably this targets files from an earlier run; confirm before re-running.
ckpt_paths = get_ckpt_paths("no.augs")
# Disabled: load the state dict of every checkpoint found above ...
#state_dicts = [get_ckpt(p)["state_dict"] for p in ckpt_paths]

# ... average them and save the averaged weights.
#avg_weights = average_state_dicts(state_dicts)
#torch.save(avg_weights, "/kaggle/working/avg_weights_data_fixed.pt")

/kaggle/working/resnet.pretrain.no.augs.fixed.0.pt 96 0.8834351766789243
/kaggle/working/resnet.pretrain.no.augs.fixed.1.pt 53 0.847152388976595
/kaggle/working/resnet.pretrain.no.augs.fixed.2.pt 59 0.8777620875263968
/kaggle/working/resnet.pretrain.no.augs.fixed.3.pt 69 0.8311964183406229
/kaggle/working/resnet.pretrain.no.augs.fixed.4.pt 80 0.8713698457573411
