In [9]:
import torch
import random
import numpy as np


SEED = 1000

def setup_reproducibility():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(False, warn_only=True)
    torch.set_float32_matmul_precision("high")

setup_reproducibility()

In [1]:
import os

path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
files = os.listdir(path)
[(i, files[i]) for i in range(len(files))]

[(0, 'sample_submission.csv'),
 (1, 'timegate.csv'),
 (2, 'mettler_toledo.csv'),
 (3, 'kaiser.csv'),
 (4, 'anton_532.csv'),
 (5, 'transfer_plate.csv'),
 (6, '96_samples.csv'),
 (7, 'tornado.csv'),
 (8, 'tec5.csv'),
 (9, 'metrohm.csv'),
 (10, 'anton_785.csv')]

In [2]:
import pandas as pd

df = pd.read_csv(os.path.join(path, files[5]))
df

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 2043,Unnamed: 2044,Unnamed: 2045,Unnamed: 2046,Unnamed: 2047,Unnamed: 2048,Analyte concentration,Glucose (g/L),Sodium Acetate (g/L),Magnesium Acetate (g/L)
0,sample1,[6293,7095,8325,9934,11917,14394,18925,34874,65535,...,1616,1024,1013,1067,1277,5618],sample1,4.619282,1.937172,1.052928
1,,[6505,7332,8482,10175,12132,14792,19594,35813,65535,...,1655,1004,1032,1049,1271,5756],sample2,5.782718,1.175902,1.214738
2,sample2,[6478,7158,8444,9979,11932,14503,19309,35118,65535,...,1651,1024,1009,1049,1275,5685],sample3,3.953448,1.350473,2.132459
3,,[6511,7308,8520,10205,12260,14777,19569,35825,65535,...,1623,1021,1008,1026,1250,5839],sample4,2.038084,0.948045,1.380240
4,sample3,[6561,7342,8562,10166,12202,14838,19593,35869,65535,...,1638,1010,1012,1047,1307,5801],sample5,4.978295,0.459765,2.539622
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
187,,[6652,7453,8641,10270,12168,15014,20000,36732,65535,...,1619,1026,1025,1078,1294,5834],,,,
188,sample95,[6798,7514,8786,10431,12372,15419,20547,37854,65535,...,1587,1035,1033,1063,1271,5985],,,,
189,,[6764,7534,8828,10532,12454,15504,20566,37705,65535,...,1640,1030,1006,1073,1280,5925],,,,
190,sample96,[6847,7545,8795,10452,12588,15515,20492,37710,65535,...,1667,1052,1019,1048,1266,6056],,,,


In [3]:
input_cols = df.columns[1:2049]
target_cols = df.columns[2050:]

In [4]:
targets  = df[target_cols].dropna().to_numpy()
targets[:5]

array([[4.61928175, 1.93717196, 1.0529281 ],
       [5.78271763, 1.17590196, 1.21473752],
       [3.95344763, 1.35047324, 2.13245934],
       [2.03808365, 0.94804543, 1.38023962],
       [4.97829455, 0.4597653 , 2.53962152]])

In [5]:
df = df[input_cols]
df['Unnamed: 1'] = df['Unnamed: 1'].str.replace('[\[\]]', '', regex=True).astype('int64')
df['Unnamed: 2048'] = df['Unnamed: 2048'].str.replace('[\[\]]', '', regex=True).astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Unnamed: 1'] = df['Unnamed: 1'].str.replace('[\[\]]', '', regex=True).astype('int64')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Unnamed: 2048'] = df['Unnamed: 2048'].str.replace('[\[\]]', '', regex=True).astype('int64')


In [6]:
import numpy as np

inputs = []
for i in range(0, len(df), 2):
    row1 = df.iloc[i].to_numpy()
    row2 = df.iloc[i+1].to_numpy()
    i = np.concatenate([row1, row2])
    inputs.append(i)
    
inputs = np.stack(inputs)
inputs.shape

(96, 4096)

In [7]:
from sklearn.model_selection import train_test_split


train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(
        inputs,
        targets,                      
        test_size=0.2,
        random_state=1000,
        shuffle=True
)

train_inputs.shape, eval_inputs.shape, train_targets.shape, eval_targets.shape

((76, 4096), (20, 4096), (76, 3), (20, 3))

In [8]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_inputs = scaler.fit_transform(train_inputs)
eval_inputs = scaler.transform(eval_inputs)

In [13]:
train_inputs = torch.tensor(train_inputs)
train_targets = torch.tensor(train_targets)
eval_inputs = torch.tensor(eval_inputs)
eval_targets = torch.tensor(eval_targets)

In [14]:
from torch.utils.data import TensorDataset

train_ds = TensorDataset(train_inputs, train_targets)
eval_ds = TensorDataset(eval_inputs, eval_targets)

In [18]:
from torch.utils.data import DataLoader


def build_loader(
    SEED,
    ds,
    train=True,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
):
    def seed_worker(worker_id):
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    generator = torch.Generator()
    generator.manual_seed(SEED if train else SEED+1)

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        persistent_workers=persistent_workers,
        worker_init_fn=seed_worker,
        generator=generator,
        #sampler=DistributedSampler(
        #    train_ds,
        #    shuffle=True,
        #    drop_last=True,
        #    seed=config.seed
        #)
    )
    
    
def return_dls(train_ds, eval_ds):
    train_dl = build_loader(
        SEED,
        train_ds,
        train=True,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        drop_last=True,
        pin_memory=True,
        persistent_workers=False,
    )

    eval_dl = build_loader(
        SEED,
        eval_ds,
        train=False,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=0,
        drop_last=True,
        pin_memory=True,
        persistent_workers=False,
    )
    
    return train_dl, eval_dl

In [20]:
import torch.nn.functional as F
from sklearn.metrics import r2_score


def loss_fn(logits, targets):
    logits = logits.view(-1)
    targets = targets.view(-1)
    return F.mse_loss(logits, targets)


def metric_fn(logits, targets):
    preds = logits.cpu().detach().numpy()
    targets = targets.cpu().detach().numpy()
    one = r2_score(targets[:, 0], preds[:, 0])
    two = r2_score(targets[:, 1], preds[:, 1])
    three = r2_score(targets[:, 2], preds[:, 2])
    mean_r2 = (one + two + three) / 3
    return one, two, three, mean_r2

In [None]:
import torch.nn as nn

class Model(nn.Module):
    """
    MLP optimized for Raman spectroscopy concentration prediction
    """
    def __init__(self, input_size=2048, hidden_sizes=[1024, 512, 256, 128], 
                 output_size=3, dropout_rate=0.3, use_batch_norm=True):
        super().__init__()
        
        self.input_size = input_size
        self.output_size = output_size
        self.use_batch_norm = use_batch_norm
        
        # Build the network layers
        layers = []
        prev_size = input_size
        
        for i, hidden_size in enumerate(hidden_sizes):
            # Linear layer
            layers.append(nn.Linear(prev_size, hidden_size))
            
            # Batch normalization
            if use_batch_norm:
                layers.append(nn.BatchNorm1d(hidden_size))
            
            # Activation
            layers.append(nn.ReLU())
            
            # Dropout (not on the last hidden layer)
            if i < len(hidden_sizes) - 1:
                layers.append(nn.Dropout(dropout_rate))
            
            prev_size = hidden_size
        
        # Output layer
        layers.append(nn.Linear(prev_size, output_size))
        
        self.network = nn.Sequential(*layers)
        
        # Initialize weights
        self._initialize_weights()
    
    def _initialize_weights(self):
        """Initialize network weights using Xavier/Glorot initialization"""
        for module in self.modules():
            if isinstance(module, nn.Linear):
                nn.init.xavier_normal_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
    
    def forward(self, x):
        return self.network(x)


In [None]:
from transformers import get_cosine_schedule_with_warmup
from tqdm.auto import tqdm



MODEL_NAME = "MLP.Baseline.AdamW"
EPOCHS = 100
BATCH_SIZE = 64
WD = 1e-3
LR = 1e-4
DROPOUT = 0.2
SCORE = float('-inf')
LOG = False
RESUME = False
device = "cuda" if torch.cuda.is_available() else "cpu"



model = Model().to(device)
#model = nn.DataParallel(model)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD, foreach=True)
scaler = torch.GradScaler(device)
train_dl, eval_dl = return_dls(train_ds, eval_ds)

total_training_steps = len(train_dl) * EPOCHS
warmup_steps = int(total_training_steps * 0.05)  # e.g. 5% warmup
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_training_steps
)

if LOG:
    neptune_run = setup_neptune()



for epoch in tqdm(range(EPOCHS)):
    model.train()
    total_loss = 0.0
    all_logits = []
    all_targets = []
    
    for inputs, targets in train_dl:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        
        with torch.autocast(device_type=device, dtype=torch.float16, cache_enabled=True):
            logits = model(inputs)
            loss = loss_fn(logits, targets)
            
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

        total_loss += loss.detach().cpu()
        all_logits.append(logits.detach().cpu())
        all_targets.append(targets.detach().cpu())
    
    all_logits = torch.stack(all_logits)
    all_targets = torch.stack(all_targets)
    
    one, two, three, r2 = metric_fn(all_logits, all_targets)
    total_loss = total_loss / len(train_dl)
    
    
    model.eval()
    eval_total_loss = 0.0
    eval_all_logits = []
    eval_all_targets = []

    for inputs, targets in eval_dl:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        
        with torch.inference_mode():
            with torch.autocast(device_type=device, dtype=torch.float16, cache_enabled=True):
                logits = model(inputs)
                loss = loss_fn(logits, targets)
                
        eval_total_loss += loss.detach().cpu()
        eval_all_logits.append(logits.detach().cpu())
        eval_all_targets.append(targets.detach().cpu())
    
    eval_all_logits = torch.stack(eval_all_logits)
    eval_all_targets = torch.stack(eval_all_targets)
    
    eval_one, eval_two, eval_three, eval_r2 = metric_fn(eval_all_logits, eval_all_targets)
    eval_total_loss = eval_total_loss / len(eval_dl)
    
    if r2 > SCORE:
        SCORE = r2
        data = {"state_dict": model.state_dict()}
        data["epoch"] = epoch 
        data["score"] = SCORE
        torch.save(data, "/kaggle/working/ckpt.pt")
    
    if LOG:
        neptune_run["train/loss"].append(LOSS)
        neptune_run["eval/loss"].append(EVAL_LOSS)
        neptune_run["train/auroc"].append(auroc)
        neptune_run["eval/auroc"].append(eval_auroc)
        
    print(
        f"Epoch: {epoch}, "
        f"train/loss: {total_loss:.4f}, "
        f"eval/loss: {eval_total_loss:.4f}, "
        f"train/r2: {r2:.4f}, "
        f"eval/r2: {eval_r2:.4f}, "
        f"train/one: {one:.4f}, "
        f"train/two: {two:.4f}, "
        f"train/three: {three:.4f}, "
        f"eval/one: {eval_one:.4f}, "
        f"eval/two: {eval_two:.4f}, "
        f"eval/three: {eval_three:.4f}, "
    )

In [None]:
#def load_test():
test = pd.read_csv(os.path.join(path, files[6]))

row1 = test.columns[1:].to_numpy().copy()
row1[-1] = "5611"
row1 = row1.astype(np.float64)


cols = test.columns[1:]
test = test[cols]
test[" 5611]"] = test[" 5611]"].str.replace('[\[\]]', '', regex=True).astype('int64')
test = test.to_numpy()

test = np.insert(test, 0, row1, axis=0)
test = test.reshape(-1, 2, 2048).reshape(96, 2*2048)
test = test.astype(np.float32)
test.shape, test.dtype

In [None]:
#def scale():
test = scaler.transform(test)
preds = xgb.predict(test)
preds.shape

In [None]:
#def prepare_test():
column_names = ['Glucose', 'Sodium Acetate', 'Magnesium Sulfate']
preds_df = pd.DataFrame(preds, columns=column_names)
preds_df.insert(0, 'ID', [i+1 for i in range(len(preds_df))])
preds_df

In [None]:
#def save_test():
preds_df.to_csv("baseline.csv", index=False)
f = pd.read_csv("/kaggle/working/xgb_baseline.csv")
f