In [1]:
#!pip3 install -qU xgboost #pandas scipy scikit-learn matplotlib tqdm 

In [1]:
import random
import torch
import numpy as np


SEED = 1000

def setup_reproducibility():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    if torch.cuda.is_available(): torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(False, warn_only=True)
    torch.set_float32_matmul_precision("high")

setup_reproducibility()

In [2]:
from sklearn.model_selection import train_test_split
from huggingface_hub import login, snapshot_download
import matplotlib.pyplot as plt


def get_stats(tensor, p=True, r=False):
    mean, std = tensor.mean(), tensor.std()
    min, max =  tensor.min(), tensor.max()
    
    if p: print(f"Min: {min}, Max: {max}, Mean: {mean}, Std: {std}")
    if r: return min, max, mean, std
    
    
def zscore(tensor, mean=None, std=None):
    if mean is None: mean = tensor.mean()
    if std is None: std = tensor.std()
    return (tensor - mean) / (std + 1e-6)


def get_model_size(model):
    print(sum(p.numel() for p in model.parameters()) / 1e6)
    

def get_index(iterable):
    return random.randint(0, len(iterable) - 1)


def split(inputs, targets, seed):
    return train_test_split(
        inputs,
        targets, 
        test_size=0.2,
        shuffle=True, 
        random_state=seed
    ) 
    
    
def hf_ds_download(hf_token, repo_id):
    login(hf_token[1:])
    return snapshot_download(repo_id, repo_type="dataset")


def show_waves(waves, dpi=100):
    """
    waves: numpy array of shape (3, N)
    Creates three separate figures that stretch wide.
    """

    N = waves.shape[1]
    t = np.arange(N)

    # Wide aspect ratio; height modest so each window fills width
    for i in range(waves.shape[0]):
        fig = plt.figure(figsize=(14, 4), dpi=dpi)  # wide figure
        ax = fig.add_subplot(111)
        ax.plot(t, waves[i], linewidth=1)
        ax.set_title(f"Wave {i+1}")
        ax.set_xlabel("Sample")
        ax.set_ylabel("Amplitude")
        ax.grid(True)
        fig.tight_layout()  # reduce margins to use width
    plt.show()

In [None]:
import numpy as np
from scipy import signal
from scipy.signal import find_peaks, peak_widths
from scipy.stats import skew, kurtosis
from tqdm.auto import tqdm


def get_advanced_spectra_features(X, b=False):
    """Create multi-channel features from spectra: raw, 1st derivative, 2nd derivative."""
    X_processed = np.zeros_like(X)
    
    # Baseline correction and SNV
    for i in tqdm(range(X.shape[0])):
        poly = np.polyfit(np.arange(X.shape[1]), X[i], 3)
        baseline = np.polyval(poly, np.arange(X.shape[1]))
        corrected_spec = X[i] - baseline
        X_processed[i] = (corrected_spec - corrected_spec.mean()) / (corrected_spec.std() + 1e-8)
        
    # Calculate derivatives
    deriv1 = signal.savgol_filter(X_processed, window_length=11, polyorder=3, deriv=1, axis=1)
    deriv2 = signal.savgol_filter(X_processed, window_length=11, polyorder=3, deriv=2, axis=1)

    if b:
        return X_processed
    else:
        return np.stack([X_processed, deriv1, deriv2], axis=1)
    

def compute_statistical_features(spectra):
    """Compute basic statistical features from spectra."""
    return np.stack([
        np.mean(spectra, axis=1), np.std(spectra, axis=1),
        skew(spectra, axis=1), kurtosis(spectra, axis=1)
    ], axis=1)
    
    
def extract_peak_features(spectra):
    """Extract features based on spectral peaks."""
    features = []
    for spec in spectra:
        peaks, _ = find_peaks(spec, height=np.percentile(spec, 90), prominence=1)
        widths, _, _, _ = peak_widths(spec, peaks, rel_height=0.5)
        features.append([
            len(peaks),
            np.sum(spec[peaks]) if len(peaks) > 0 else 0,
            np.mean(spec[peaks]) if len(peaks) > 0 else 0,
            np.mean(widths) if len(widths) > 0 else 0,
        ])
    return np.array(features)

In [3]:
#HF_TOKEN = "xhf_XURkoNhwOIPtEdHfNeRpVkjEwKSkhtigFi"
#path = hf_ds_download(HF_TOKEN, repo_id="ArbaazBeg/kaggle-spectogram")

In [4]:
import os

#path = "/root/.cache/huggingface/hub/datasets--ArbaazBeg--kaggle-spectogram/snapshots/b61d17629d4886fcc89e5bd9ca022af4da493d73"
path = "/kaggle/input/dig-4-bio-raman-transfer-learning-challenge"
files = sorted(os.listdir(path))
[(i, files[i]) for i in range(len(files))]

[(0, '96_samples.csv'),
 (1, 'anton_532.csv'),
 (2, 'anton_785.csv'),
 (3, 'kaiser.csv'),
 (4, 'metrohm.csv'),
 (5, 'mettler_toledo.csv'),
 (6, 'sample_submission.csv'),
 (7, 'tec5.csv'),
 (8, 'timegate.csv'),
 (9, 'tornado.csv'),
 (10, 'transfer_plate.csv')]

In [23]:
import pandas as pd


csv_path = os.path.join(path, files[10])
df = pd.read_csv(csv_path)

input_cols = df.columns[1:2049]
target_cols = df.columns[2050:]

targets  = df[target_cols].dropna().to_numpy()

df = df[input_cols]
df['Unnamed: 1'] = df['Unnamed: 1'].str.replace("[\[\]]", "", regex=True).astype('int64')
df['Unnamed: 2048'] = df['Unnamed: 2048'].str.replace("[\[\]]", "", regex=True).astype('int64')

inputs = df.to_numpy().reshape(-1, 2, 2048)
inputs = inputs.mean(axis=1)

inputs.shape, targets.shape

((96, 2048), (96, 3))

In [None]:
i = get_advanced_spectra_features(inputs, True)
statistical_features = compute_statistical_features(i)
statistical_features.shape

  0%|          | 0/96 [00:00<?, ?it/s]

(96, 4)

In [10]:
inputs = get_advanced_spectra_features(inputs)
inputs.shape

  0%|          | 0/96 [00:00<?, ?it/s]

(96, 3, 2048)

In [11]:
inputs = inputs.reshape(-1, 3 * 2048).astype(np.float32)
inputs = np.hstack([inputs, peak_features, statistical_features])
targets = targets.astype(np.float32)
inputs.shape, targets.shape

((96, 6152), (96, 3))

In [12]:
train_inputs, eval_inputs, train_targets, eval_targets = split(inputs, targets, SEED)

In [13]:
_, _, mean, std = get_stats(train_inputs, p=True, r=True)
train_inputs = zscore(train_inputs, mean, std)
eval_inputs = zscore(eval_inputs, mean, std)

Min: -5.800850868225098, Max: 243.3925281028047, Mean: 0.03586188864101771, Std: 2.231636925819348


In [14]:
get_stats(train_inputs, True)
get_stats(eval_inputs, True)

Min: -2.6154389515150225, Max: 109.04845423112938, Mean: -1.261358032220765e-18, Std: 0.9999995518986353
Min: -1.9500487490491285, Max: 87.04192702291617, Mean: 0.00021977133176571292, Std: 1.0071908805508913


In [18]:
score = -float("inf")
best_params = {}

In [16]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score


class hparams:
    n_estimators = 4000
    learning_rate = 0.001
    max_depth = 10
    #reg_lambda = 0.0001
    
    
model = XGBRegressor(
    n_estimators=hparams.n_estimators,
    learning_rate=hparams.learning_rate,
    max_depth=hparams.max_depth,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=SEED,
    n_jobs=-1,
    #reg_lambda=hparams.reg_lambda,
    eval_metric="rmse",  
    #early_stopping_rounds=5,        
    tree_method="hist", 
    device="cuda",
)

model.fit(
    train_inputs, train_targets,
    eval_set=[(eval_inputs, eval_targets)],
    verbose=False,
)

preds = model.predict(eval_inputs)
r2 = r2_score(eval_targets, preds)

if r2 > score:    
    best_params["n_estimators"] = hparams.n_estimators
    best_params["learning_rate"] = hparams.learning_rate
    best_params["max_depth"] = hparams.max_depth
    #best_params["reg_lambda"] = hparams.reg_lambda
    print(best_params)
    print("previous", score)
    score = r2
    print("best", score) 
else:
    print("Failed", r2, "best", score)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




NameError: name 'best_params' is not defined

In [17]:
r2

0.9163453382992218

In [None]:
print("Failed", r2, "best", score) 

In [None]:
{'n_estimators': 4000,
 'learning_rate': 0.001,
 'max_depth': 10,
 'reg_lambda': 0.0001}

0.919169955244037

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
from tqdm.auto import tqdm


seeds = [SEED]#np.random.randint(0, 10**6, size=10).tolist()
models = {"1": [], "2": [], "3": []}

for seed in tqdm(seeds):
    print(f"Seed {seed}")
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    scores_mean = []
    
    for i in range(3):
        splits = kf.split(inputs, targets)
        scores = []
        
        for j, (train_idx, eval_idx) in enumerate(splits):
            train_inputs, train_targets = inputs[train_idx], targets[train_idx, i]
            eval_inputs, eval_targets = inputs[eval_idx], targets[eval_idx, i]
            
            _, _, mean, std = get_stats(train_inputs, p=False, r=True)
            train_inputs = zscore(train_inputs, mean, std)
            eval_inputs = zscore(eval_inputs, mean, std)
            
            model = XGBRegressor(
                n_estimators=200,
                learning_rate=0.05,
                max_depth=3,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=seed,
                n_jobs=-1,
                #reg_lambda=1.0,
                eval_metric="rmse",  
                #early_stopping_rounds=5,        
                tree_method="hist", 
                device="cuda",
            )
            
            model.fit(
                train_inputs, train_targets,
                eval_set=[(eval_inputs, eval_targets)],
                verbose=False,
            )
            
            preds = model.predict(eval_inputs)
            score = r2_score(eval_targets, preds)
            scores.append(score)
            models[str(i+1)].append((score, model, mean, std))
        
        scores = np.mean(scores)
        scores_mean.append(scores)
        print(f"Mean R2 (target {i}): {scores:.4f}")
    
    scores_mean = np.mean(scores_mean)
    print(f"Final: {scores_mean:.4f}\n")

In [None]:
def get_best_models(models):
    best_models = []
    
    for i in range(1, 4):
        score = float('-inf')
        for s, model, mean, std in models[str(i)]:
            if s > score:
                score = s
                best = model
                mu = mean
                sigma = std
        best_models.append((best, score, mu, sigma))
            
    return best_models

best_models = get_best_models(models)

In [None]:
scores = {"1": [], "2": [], "3": []}
for i in range(1, 4):
    for s, m, mean, std in models[str(i)]:
        scores[str(i)].append(s)    
    scores[str(i)] = np.mean(scores[str(i)])
    
scores, np.mean([scores["1"], scores["2"], scores["3"]])

In [None]:
scores = []
for m, s, mu, sigma in best_models:
    scores.append(s)
scores, np.mean(scores)

In [19]:
csv_path = os.path.join(path, files[0])
test_df = pd.read_csv(csv_path)


row1 = test_df.columns[1:].to_numpy().copy()
row1[-1] = "5611"
row1 = row1.astype(np.float64)


cols = test_df.columns[1:]
test_df = test_df[cols]
test_df[" 5611]"] = test_df[" 5611]"].str.replace('[\[\]]', '', regex=True).astype('int64')
test = test_df.to_numpy()

test = np.insert(test, 0, row1, axis=0)
test = test.reshape(-1, 2, 2048).mean(axis=1)

get_stats(test)
t = get_advanced_spectra_features(test, True)
test_statistical_features = compute_statistical_features(t)
test_peak_features = extract_peak_features(t)
test = get_advanced_spectra_features(test)
test = test.reshape(-1, 3 * 2048)
test = np.hstack([test, test_peak_features, test_peak_features])
test = zscore(test, mean, std)
test.shape, test.dtype, get_stats(test)

Min: 983.5, Max: 65535.0, Mean: 3623.216623942057, Std: 6772.114021862655


  0%|          | 0/96 [00:00<?, ?it/s]

  0%|          | 0/96 [00:00<?, ?it/s]

Min: -1.9732901959496465, Max: 87.351733101176, Mean: 0.005301608113692912, Std: 1.0965628607387299


((96, 6152), dtype('float64'), None)

In [20]:
preds = model.predict(test)
preds.shape

(96, 3)

In [None]:
preds = []

for i in range(3):
    local_preds = []
    for _, model, mean, std in models[str(i + 1)]:
        t = zscore(test.copy(), mean, std)
        p = model.predict(test)
        local_preds.append(p)
    
    local_preds = np.stack(local_preds).mean(axis=0)
    preds.append(local_preds)
    
preds = np.column_stack(preds)
preds.shape

In [None]:
preds = []

for model, s, mean, std in best_models:
    print(mean, std)
    t = zscore(test.copy(), mean, std)
    p = model.predict(t)
    preds.append(p)
    
preds = np.column_stack(preds)
preds.shape

In [21]:
column_names = ['Glucose', 'Sodium Acetate', 'Magnesium Sulfate']
preds_df = pd.DataFrame(preds, columns=column_names)
preds_df.insert(0, 'ID', [i+1 for i in range(len(preds_df))])
preds_df

Unnamed: 0,ID,Glucose,Sodium Acetate,Magnesium Sulfate
0,1,3.180426,0.841516,0.599093
1,2,6.543715,1.960217,1.715010
2,3,5.501711,0.486627,1.090748
3,4,3.456721,0.948714,0.590265
4,5,9.866538,0.977800,1.190017
...,...,...,...,...
91,92,5.677711,0.646986,1.292048
92,93,4.867382,0.491818,0.914359
93,94,4.728037,0.498267,0.867846
94,95,3.156774,1.472639,0.944631


In [22]:
name = "xgboost.9163.peak.features.stat.features.csv"
preds_df.to_csv(name, index=False)
f = pd.read_csv(f"{name}")
f

Unnamed: 0,ID,Glucose,Sodium Acetate,Magnesium Sulfate
0,1,3.180426,0.841516,0.599093
1,2,6.543715,1.960217,1.715010
2,3,5.501711,0.486627,1.090748
3,4,3.456721,0.948714,0.590265
4,5,9.866538,0.977800,1.190017
...,...,...,...,...
91,92,5.677711,0.646986,1.292048
92,93,4.867382,0.491818,0.914359
93,94,4.728037,0.498267,0.867846
94,95,3.156774,1.472639,0.944631
