In [1]:
#!pip3 install -qU xgboost optuna pandas scipy scikit-learn matplotlib tqdm 

In [1]:
import random
import torch
import numpy as np


SEED = 1000


def setup_reproducibility(seed=SEED, deterministic=False):
    """Seed every random number generator this notebook relies on.

    Parameters
    ----------
    seed : int, optional
        Seed passed to Python's ``random``, NumPy's global generator and
        PyTorch (CPU, plus every CUDA device when CUDA is available).
        Defaults to the notebook-wide ``SEED``.
    deterministic : bool, optional
        Forwarded to ``torch.use_deterministic_algorithms``. The default
        ``False`` preserves the notebook's historical behaviour. Pass ``True``
        to request deterministic kernels; because ``warn_only=True`` is
        always set, the request is made in warn-only mode.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # `warn_only` only has an effect when `deterministic` is True.
    torch.use_deterministic_algorithms(deterministic, warn_only=True)
    torch.set_float32_matmul_precision("high")


setup_reproducibility()

In [2]:
from sklearn.model_selection import train_test_split


def get_stats(tensor, p=True, r=False):
    """Summarise ``tensor`` by its minimum, maximum, mean and standard deviation.

    The statistics are taken over the whole object (no axis is passed).
    When ``p`` is true they are printed on a single line. When ``r`` is true
    they are returned as ``(min, max, mean, std)``; otherwise the function
    returns ``None``.
    """
    avg, spread = tensor.mean(), tensor.std()
    lowest, highest = tensor.min(), tensor.max()

    if p:
        print(f"Min: {lowest}, Max: {highest}, Mean: {avg}, Std: {spread}")

    return (lowest, highest, avg, spread) if r else None
    
    
def zscore(tensor, mean=None, std=None):
    """Standardise ``tensor`` as ``(tensor - mean) / (std + 1e-6)``.

    A statistic left as ``None`` is computed from ``tensor`` itself. Passing
    both lets statistics fitted on one split be reused on another. The
    ``1e-6`` added to the denominator guards against a zero ``std``.
    """
    center = tensor.mean() if mean is None else mean
    scale = tensor.std() if std is None else std
    return (tensor - center) / (scale + 1e-6)


def get_model_size(model):
    """Print the total number of parameters of ``model``, in millions."""
    n_params = sum(param.numel() for param in model.parameters())
    print(n_params / 1e6)
    

def get_index(iterable):
    """Return a random valid index into ``iterable``.

    The index is drawn from the global ``random`` generator and lies in
    ``[0, len(iterable) - 1]``, so ``iterable`` must support ``len()``.
    """
    last_position = len(iterable) - 1
    return random.randint(0, last_position)


def split(inputs, targets, seed):
    """Shuffle ``inputs``/``targets`` and hold out 20% of the samples.

    Thin wrapper around ``train_test_split``. It returns that function's
    result for the two arrays, which callers unpack as
    ``train_inputs, eval_inputs, train_targets, eval_targets``.
    ``seed`` is used as the ``random_state`` of the shuffle.
    """
    options = {"test_size": 0.2, "shuffle": True, "random_state": seed}
    return train_test_split(inputs, targets, **options)

In [3]:
import matplotlib.pyplot as plt


def show_waves(waves, dpi=100):
    """
    Plot every row of ``waves`` in its own wide figure.

    waves: numpy array of shape (3, N); one figure is created per row.
    dpi: resolution passed to each figure.
    """
    sample_axis = np.arange(waves.shape[1])

    # 14x4 inches: wide and fairly flat so each trace uses the full width.
    for number, wave in enumerate(waves, start=1):
        fig, ax = plt.subplots(figsize=(14, 4), dpi=dpi)
        ax.plot(sample_axis, wave, linewidth=1)
        ax.set(title=f"Wave {number}", xlabel="Sample", ylabel="Amplitude")
        ax.grid(True)
        fig.tight_layout()  # trim the margins
    plt.show()

In [4]:
from huggingface_hub import login, snapshot_download

# Set to True to (re-)download the dataset snapshot from the Hugging Face Hub.
DOWNLOAD_DATASET = False

if DOWNLOAD_DATASET:
    import os

    # Access tokens must never be committed with the notebook; read the token
    # from the environment instead of embedding it in the source.
    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        raise RuntimeError("Set the HF_TOKEN environment variable before downloading the dataset.")
    login(hf_token)
    repo_id = "ArbaazBeg/kaggle-spectogram"
    dataset_path = snapshot_download(repo_id, repo_type="dataset")
    # A bare expression inside an `if` block is not displayed, so print it.
    print(dataset_path)

In [5]:
#dataset_path

In [6]:
import os

# Local snapshot directory of the dataset. The indices displayed below are
# used by later cells to pick individual CSV files.
path = "/root/.cache/huggingface/hub/datasets--ArbaazBeg--kaggle-spectogram/snapshots/b61d17629d4886fcc89e5bd9ca022af4da493d73"
files = sorted(os.listdir(path))
list(enumerate(files))

[(0, '.gitattributes'),
 (1, '96_samples.csv'),
 (2, 'anton_532.csv'),
 (3, 'anton_785.csv'),
 (4, 'kaiser.csv'),
 (5, 'metrohm.csv'),
 (6, 'mettler_toledo.csv'),
 (7, 'sample_submission.csv'),
 (8, 'tec5.csv'),
 (9, 'timegate.csv'),
 (10, 'tornado.csv'),
 (11, 'transfer_plate.csv')]

In [7]:
import pandas as pd


# Load the labelled spectra (index 11 in the sorted file listing).
csv_path = os.path.join(path, files[11])
df = pd.read_csv(csv_path)

input_cols = df.columns[1:2049]
target_cols = df.columns[2050:]

# Keep only the rows that carry target values.
targets = df[target_cols].dropna().to_numpy()

# Take an explicit copy so the column assignments below write to an
# independent frame rather than to a slice of the original.
df = df[input_cols].copy()

# The first and last spectral columns are read as strings containing square
# brackets. Strip them and convert to integers. The pattern is a raw string
# so that `\[` and `\]` are not treated as (invalid) string escapes.
for bracket_col in ("Unnamed: 1", "Unnamed: 2048"):
    df[bracket_col] = df[bracket_col].str.replace(r"[\[\]]", "", regex=True).astype("int64")

# Average every consecutive pair of rows into a single 2048-point spectrum.
inputs = df.to_numpy().reshape(-1, 2, 2048)
inputs = inputs.mean(axis=1)

inputs.shape, targets.shape

  df['Unnamed: 1'] = df['Unnamed: 1'].str.replace("[\[\]]", "", regex=True).astype('int64')
  df['Unnamed: 2048'] = df['Unnamed: 2048'].str.replace("[\[\]]", "", regex=True).astype('int64')


((96, 2048), (96, 3))

In [8]:
from scipy import signal
from tqdm.auto import tqdm


def get_advanced_spectra_features(X):
    """Create multi-channel features from spectra.

    Each row of ``X`` is baseline-corrected by subtracting a cubic polynomial
    fitted to that row, then standardised to zero mean and unit standard
    deviation (SNV). First and second Savitzky-Golay derivatives
    (window 11, polynomial order 3) of the processed rows are then computed.

    Parameters
    ----------
    X : numpy array of shape (n_spectra, n_points)
        ``n_points`` must be at least 11 for the Savitzky-Golay window.

    Returns
    -------
    numpy array of shape (n_spectra, 3, n_points)
        Channels are: processed spectrum, 1st derivative, 2nd derivative.
    """
    X = np.asarray(X)
    # `np.zeros_like(X)` would inherit an integer dtype from integer input and
    # silently truncate the standardised values, so force a floating dtype
    # (float32 and float64 inputs keep their own dtype).
    X_processed = np.zeros(X.shape, dtype=np.result_type(X.dtype, np.float32))
    positions = np.arange(X.shape[1])  # loop-invariant abscissa for the fit

    # Baseline correction and SNV
    for i in tqdm(range(X.shape[0])):
        poly = np.polyfit(positions, X[i], 3)
        baseline = np.polyval(poly, positions)
        corrected_spec = X[i] - baseline
        X_processed[i] = (corrected_spec - corrected_spec.mean()) / (corrected_spec.std() + 1e-8)

    # Calculate derivatives
    deriv1 = signal.savgol_filter(X_processed, window_length=11, polyorder=3, deriv=1, axis=1)
    deriv2 = signal.savgol_filter(X_processed, window_length=11, polyorder=3, deriv=2, axis=1)

    # Stack as channels
    return np.stack([X_processed, deriv1, deriv2], axis=1)

# Replace the averaged spectra with their 3-channel feature representation.
# NOTE(review): this rebinds `inputs`, so the cell is not idempotent —
# re-running it would feed the already-transformed array back in.
inputs = get_advanced_spectra_features(inputs)
inputs.shape

  0%|          | 0/96 [00:00<?, ?it/s]

(96, 3, 2048)

In [9]:
# Flatten the three 2048-point channels into one 6144-long vector per sample
# and cast inputs and targets to float32.
inputs = inputs.reshape(-1, 3 * 2048).astype(np.float32)
targets = targets.astype(np.float32)

In [10]:
# 80/20 shuffled hold-out split, seeded with the notebook-wide SEED.
train_inputs, eval_inputs, train_targets, eval_targets = split(inputs, targets, SEED)

In [11]:
# Fit the normalisation statistics (one global mean/std over the whole array)
# on the training split only, then apply them to both splits.
mean, std = get_stats(train_inputs, p=False, r=True)[2:]
train_inputs = zscore(train_inputs, mean=mean, std=std)
eval_inputs = zscore(eval_inputs, mean=mean, std=std)

In [13]:
import optuna
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


def objective(trial):
    """Optuna objective: R2 of an XGBoost regressor on the hold-out split.

    Samples ``n_estimators``, ``max_depth`` and ``learning_rate`` from
    ``trial``, fits on the notebook-level ``train_inputs``/``train_targets``
    and returns ``r2_score`` computed on ``eval_inputs``/``eval_targets``.
    """
    # Suggest the parameters in a fixed order: n_estimators, max_depth,
    # learning_rate.
    searched = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 0.8, log=True),
    }
    fixed = {
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "random_state": SEED,
        "n_jobs": -1,
        "eval_metric": "rmse",
        "tree_method": "hist",
        "device": "cuda",
    }

    model = XGBRegressor(**searched, **fixed)
    # No early stopping is configured, so `eval_set` only records the metric.
    model.fit(
        train_inputs,
        train_targets,
        eval_set=[(eval_inputs, eval_targets)],
        verbose=False,
    )

    return r2_score(eval_targets, model.predict(eval_inputs))

# Seed the sampler so the sequence of suggested hyper-parameters is repeatable
# across runs; an unseeded sampler draws different trials every time.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=SEED),
)
study.optimize(objective, n_trials=500)

[I 2025-08-21 10:15:13,717] A new study created in memory with name: no-name-d49d2e0e-77be-4bc5-b638-c10b76e4553a
[I 2025-08-21 10:15:25,310] Trial 0 finished with value: 0.8887209892272949 and parameters: {'n_estimators': 588, 'max_depth': 9, 'learning_rate': 0.03845930070451266}. Best is trial 0 with value: 0.8887209892272949.
[I 2025-08-21 10:15:37,635] Trial 1 finished with value: 0.9089828133583069 and parameters: {'n_estimators': 536, 'max_depth': 9, 'learning_rate': 0.005896263859985883}. Best is trial 1 with value: 0.9089828133583069.
[I 2025-08-21 10:15:49,727] Trial 2 finished with value: 0.5599100589752197 and parameters: {'n_estimators': 599, 'max_depth': 8, 'learning_rate': 0.0025285768710404223}. Best is trial 1 with value: 0.9089828133583069.
[I 2025-08-21 10:15:56,265] Trial 3 finished with value: 0.8731076717376709 and parameters: {'n_estimators': 940, 'max_depth': 4, 'learning_rate': 0.23118046311115328}. Best is trial 1 with value: 0.9089828133583069.
[I 2025-08-21 1

KeyboardInterrupt: 

In [14]:
# Best hyper-parameters among the trials that finished (the search above was
# interrupted before all 500 trials ran).
study.best_params

{'n_estimators': 834, 'max_depth': 10, 'learning_rate': 0.004921367727649843}

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from tqdm.auto import tqdm


# Seeds to cross-validate with; currently only the notebook-wide SEED.
seeds = [SEED]#np.random.randint(0, 10**6, size=10).tolist()
# Per-target registry of fitted fold models, keyed "1".."3".
# Each entry is a tuple (fold R2 score, model, train mean, train std).
models = {"1": [], "2": [], "3": []}

# NOTE(review): this loop rebinds the notebook-level names `train_inputs`,
# `eval_inputs`, `train_targets`, `eval_targets`, `mean` and `std` that were
# set by the earlier hold-out cells.
for seed in tqdm(seeds):
    print(f"Seed {seed}")
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    scores_mean = []
    
    # One independent single-output model per target column.
    for i in range(3):
        # A fresh split generator is needed for every target because the
        # previous one has been consumed.
        splits = kf.split(inputs, targets)
        scores = []
        
        for j, (train_idx, eval_idx) in enumerate(splits):
            train_inputs, train_targets = inputs[train_idx], targets[train_idx, i]
            eval_inputs, eval_targets = inputs[eval_idx], targets[eval_idx, i]
            
            # Normalisation statistics come from the training fold only and
            # are stored with the model so they can be reused at inference.
            _, _, mean, std = get_stats(train_inputs, p=False, r=True)
            train_inputs = zscore(train_inputs, mean, std)
            eval_inputs = zscore(eval_inputs, mean, std)
            
            model = XGBRegressor(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=3,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed,
                n_jobs=-1,
                #reg_lambda=1.0,
                eval_metric="rmse",  
                #early_stopping_rounds=5,        
                tree_method="hist", 
                device="cuda",
            )
            
            # Early stopping is commented out above, so `eval_set` only
            # records the metric.
            model.fit(
                train_inputs, train_targets,
                eval_set=[(eval_inputs, eval_targets)],
                verbose=False,
            )
            
            preds = model.predict(eval_inputs)
            score = r2_score(eval_targets, preds)
            scores.append(score)
            # Keys are 1-based while `i` is 0-based.
            models[str(i+1)].append((score, model, mean, std))
        
        # `scores` is rebound from the list of fold scores to their mean.
        scores = np.mean(scores)
        scores_mean.append(scores)
        # The target index printed here is 0-based.
        print(f"Mean R2 (target {i}): {scores:.4f}")
    
    # Average of the three per-target means for this seed.
    scores_mean = np.mean(scores_mean)
    print(f"Final: {scores_mean:.4f}\n")

In [None]:
def get_best_models(models):
    """Pick the highest-scoring fold model for each of the three targets.

    Parameters
    ----------
    models : dict
        Maps ``"1"``, ``"2"`` and ``"3"`` to lists of
        ``(score, model, mean, std)`` tuples.

    Returns
    -------
    list of tuple
        One ``(model, score, mean, std)`` tuple per target, in key order.
        Note the element order differs from the input tuples. Ties go to
        the entry that appears first.

    Raises
    ------
    ValueError
        If a target has no candidate models. Previously this case silently
        reused the previous target's model, or failed with an
        ``UnboundLocalError`` for the first target.
    """
    best_models = []

    for i in range(1, 4):
        candidates = models[str(i)]
        if not candidates:
            raise ValueError(f"No candidate models available for target {i}")
        # `max` returns the first maximal entry, matching a strict `>` scan.
        score, best, mu, sigma = max(candidates, key=lambda entry: entry[0])
        best_models.append((best, score, mu, sigma))

    return best_models

# One (model, score, mean, std) tuple per target, chosen by fold R2.
best_models = get_best_models(models)

In [None]:
# Mean fold R2 per target, followed by the average over the three targets.
scores = {}
for i in range(1, 4):
    key = str(i)
    scores[key] = np.mean([entry[0] for entry in models[key]])

scores, np.mean([scores["1"], scores["2"], scores["3"]])

In [None]:
# R2 of the selected model for each target, and their average.
scores = [entry[1] for entry in best_models]
scores, np.mean(scores)

In [None]:
# Load the unlabelled test spectra (index 6 in the sorted file listing).
csv_path = os.path.join(path, files[6])
test_df = pd.read_csv(csv_path)


# pandas used the first line of the file as the header, so the first spectrum
# has to be rebuilt from the column names.
# NOTE(review): pandas renames duplicate header values (e.g. "123" becomes
# "123.1"), which would corrupt this row if the first spectrum contains
# repeated values. Consider reading with `header=None` once the file layout
# has been confirmed.
row1 = test_df.columns[1:].to_numpy().copy()
row1[-1] = "5611"  # last header cell is " 5611]"; drop the trailing bracket
row1 = row1.astype(np.float64)


cols = test_df.columns[1:]
# Explicit copy so the assignment below does not write to a slice.
test_df = test_df[cols].copy()
# Raw string: `\[` and `\]` are regex escapes, not string escapes.
test_df[" 5611]"] = test_df[" 5611]"].str.replace(r"[\[\]]", "", regex=True).astype("int64")
test = test_df.to_numpy()

# Put the recovered first spectrum back on top, then average every
# consecutive pair of rows.
test = np.insert(test, 0, row1, axis=0)
test = test.reshape(-1, 2, 2048).mean(axis=1)

get_stats(test)
test = get_advanced_spectra_features(test)
test = test.reshape(-1, 3 * 2048)
test.shape, test.dtype, get_stats(test)

In [None]:
# Ensemble prediction: for every target, average the predictions of all fold
# models stored for it.
preds = []

for i in range(3):
    local_preds = []
    for _, model, mean, std in models[str(i + 1)]:
        # Each fold model was trained on inputs z-scored with its own fold
        # statistics, so the test spectra need the same transform.
        # (Previously `t` was computed but the raw `test` array was predicted.)
        t = zscore(test, mean, std)
        p = model.predict(t)
        local_preds.append(p)

    local_preds = np.stack(local_preds).mean(axis=0)
    preds.append(local_preds)

# One column per target.
preds = np.column_stack(preds)
preds.shape

In [None]:
# Predict each target with its single best fold model, normalising the test
# spectra with the statistics stored alongside that model.
preds = []

for model, s, mean, std in best_models:
    print(mean, std)
    preds.append(model.predict(zscore(test.copy(), mean, std)))

preds = np.column_stack(preds)
preds.shape

In [None]:
# Assemble the submission table: a 1-based ID followed by one column per target.
column_names = ['Glucose', 'Sodium Acetate', 'Magnesium Sulfate']
preds_df = pd.DataFrame(preds, columns=column_names)
preds_df.insert(0, 'ID', list(range(1, len(preds_df) + 1)))
preds_df

In [None]:
name = "xgboost.concat.10.seeds.top1.mu.sigma.done.csv"
preds_df.to_csv(name, index=False)
# Read back from the path that was just written. The previous hard-coded
# "/kaggle/working/" prefix only matched when the working directory happened
# to be that folder.
f = pd.read_csv(name)
f