In [None]:
!pip install -q jupyter_contrib_nbextensions
!jupyter contrib nbextension install --user
!jupyter nbextension enable hinterland/hinterland

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()

In [None]:
pip install -qU pytorch-tabnet scikit-learn neptune

In [None]:
!nvidia-smi

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub


arbaazbeg_bitgrit_air_pollution_path = kagglehub.dataset_download('arbaazbeg/bitgrit-air-pollution')
print('Data source import complete.')

In [None]:
import torch


def print_config(config):
    """Print every configuration attribute as a ``key: value`` line.

    Instance attributes are listed first, followed by the non-dunder
    attributes of the instance's class that the instance does not override.
    """
    merged = dict(config.__dict__)
    class_level = {
        name: val
        for name, val in config.__class__.__dict__.items()
        if not name.startswith('__') and name not in merged
    }
    merged.update(class_level)

    for name, val in merged.items():
        print(f"{name}: {val}")


class Config:
    """Central experiment configuration, stored as class-level attributes."""

    # Model
    model_name = "tabnet.kfold"
    input_dim = 6
    target_dim = 1

    # Device & reproducibility
    device = "cuda" if torch.cuda.is_available() else "cpu"
    seed = 5274

    # Optimization
    optimizer_name = "AdamW"
    lr = 1e-4
    weight_decay = 1e-3

    grad_accum = False
    grad_accum_steps = 1
    batch_size = 32  # // grad_accum_steps

    # Training schedule
    num_epochs = 100
    scheduler_name = "default"

    # Regularization
    dropout = 0.0
    drop_path_rate = 0.0
    label_smoothing = 0.0

    # Experiment tracking
    # SECURITY: the API token must never be committed with the notebook.
    # Leaving this as None makes neptune.init_run() fall back to the
    # NEPTUNE_API_TOKEN environment variable. The token that used to be
    # hard-coded here should be treated as leaked and revoked.
    neptune_token = None
    with_id = ""
    resume = False

config = Config()

In [None]:
import random
import numpy as np
import torch

def setup_reproducibility(config):
    """Seed the Python, NumPy and PyTorch RNGs from ``config.seed`` and set backend flags.

    cuDNN is put in deterministic, non-benchmark mode. Note that
    ``torch.use_deterministic_algorithms`` is called with ``False``, so
    deterministic-algorithm enforcement is left off.
    """
    seed = config.seed

    # Seed every CPU-side generator the pipeline draws from.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Seed all visible GPUs when CUDA is present.
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(False, warn_only=True)
    torch.set_float32_matmul_precision("high")

setup_reproducibility(config)

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


def load_csv(path, np=True):
    """Read a CSV, drop its ``id`` column and every row containing a missing value.

    Args:
        path: Location of the CSV file.
        np: When truthy, return a NumPy array; otherwise return the DataFrame.
            (This parameter shadows the ``numpy`` alias inside the function.)

    Returns:
        The cleaned table as an ndarray or a DataFrame.
    """
    frame = pd.read_csv(path).drop(columns=['id'])
    frame = frame.dropna()
    if np:
        return frame.to_numpy()
    return frame


def scale_data(data, scaler, scale_targets=False):
    """Fit ``scaler`` and split a 2-D array into inputs and a target column.

    The last column of ``data`` is treated as the target and returned with
    shape ``(-1, 1)``; all other columns are the inputs.

    Args:
        data: 2-D array whose last column is the target.
        scaler: Object exposing ``fit_transform``.
        scale_targets: If true, the scaler is fitted on (and applied to) the
            whole array, targets included; otherwise only the inputs are
            scaled and the targets are returned untouched.

    Returns:
        Tuple ``(inputs, targets)``.
    """
    if scale_targets:
        data = scaler.fit_transform(data)

    features, labels = data[:, :-1], data[:, -1].reshape(-1, 1)

    if not scale_targets:
        features = scaler.fit_transform(features)
    return features, labels

def split_data(inputs, targets,  seed, shuffle=True):
    """Split inputs/targets 80/20 into train and eval parts, cast to float32.

    Args:
        inputs: Feature array.
        targets: Target array aligned with ``inputs``.
        seed: Random state forwarded to ``train_test_split``.
        shuffle: Whether to shuffle before splitting.

    Returns:
        ``(train_inputs, train_targets, eval_inputs, eval_targets)``.
    """
    parts = train_test_split(
        inputs,
        targets,
        train_size=0.8,
        random_state=seed,
        shuffle=shuffle,
    )
    # train_test_split yields (x_train, x_eval, y_train, y_eval).
    x_train, x_eval, y_train, y_eval = (part.astype(np.float32) for part in parts)
    return x_train, y_train, x_eval, y_eval


train_path = arbaazbeg_bitgrit_air_pollution_path+"/train.csv"

data = load_csv(train_path)
scaler = StandardScaler()
inputs, targets = scale_data(data, scaler, scale_targets=False)
mean = targets.mean()
std = targets.std()
mean, std, inputs.shape, targets.shape

In [None]:
from torch.utils.data import Dataset


class TabularDataset(Dataset):
    """Map-style dataset pairing each input row with its target row."""

    def __init__(self, inputs, targets):
        """Store the two indexable containers; they must have equal length."""
        self.inputs, self.targets = inputs, targets
        n_inputs, n_targets = len(self.inputs), len(self.targets)
        assert n_inputs == n_targets, "len not eq"

    def __len__(self):
        """Number of samples (length of the inputs container)."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample = self.inputs[index]
        label = self.targets[index]
        return sample, label

ds = TabularDataset(inputs.astype(np.float32), targets.astype(np.float32))
len(ds)

In [None]:
from torch.utils.data import DataLoader


def build_loader(
    SEED,
    ds,
    train=True,
    batch_size=16,
    shuffle=False,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
):
    """Create a seeded ``DataLoader`` over ``ds``.

    The loader's generator is seeded with ``SEED`` for training loaders and
    ``SEED + 1`` otherwise. Each worker process re-seeds NumPy and ``random``
    from its own torch seed.

    Args:
        SEED: Base integer seed.
        ds: Dataset to iterate.
        train: Selects which seed the generator receives.
        batch_size, shuffle, num_workers, drop_last, pin_memory,
        persistent_workers: Forwarded unchanged to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    def _init_worker_rng(worker_id):
        # torch hands every worker a distinct seed; fold it into 32 bits for
        # the NumPy / random generators.
        derived = torch.initial_seed() % 2**32
        np.random.seed(derived)
        random.seed(derived)

    rng = torch.Generator()
    rng.manual_seed(SEED if train else SEED+1)

    loader_kwargs = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "num_workers": num_workers,
        "pin_memory": pin_memory,
        "drop_last": drop_last,
        "persistent_workers": persistent_workers,
        "worker_init_fn": _init_worker_rng,
        "generator": rng,
    }
    return DataLoader(ds, **loader_kwargs)
    
    
def return_dls(train_ds, eval_ds, seed):
    """Build the training and evaluation loaders for one split.

    Both loaders use ``config.batch_size`` and run in the main process
    (``num_workers=0``).

    Args:
        train_ds: Dataset used for optimisation (shuffled, incomplete last
            batch dropped).
        eval_ds: Dataset used for evaluation (fixed order, every sample kept).
        seed: Base seed forwarded to ``build_loader``.

    Returns:
        ``(train_dl, eval_dl)``.
    """
    train_dl = build_loader(
        seed,
        train_ds,
        train=True,
        batch_size=config.batch_size,
        shuffle=True,
        num_workers=0,
        drop_last=True,
        pin_memory=True,
        persistent_workers=False,
    )

    eval_dl = build_loader(
        seed,
        eval_ds,
        train=False,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=0,
        # Keep the trailing partial batch: dropping it would silently exclude
        # up to batch_size - 1 samples from every evaluation score.
        drop_last=False,
        pin_memory=True,
        persistent_workers=False,
    )

    return train_dl, eval_dl

In [None]:
import torch.nn as nn
from pytorch_tabnet.tab_network import TabNet
from pytorch_tabnet.utils import create_group_matrix


class Model(nn.Module):
    """TabNet backbone whose output is rescaled by a stored ``std`` and ``mean``.

    Args:
        mean: Value added to the backbone output (registered as a buffer).
        std: Value the backbone output is multiplied by (registered as a buffer).
    """

    def __init__(self, mean, std):
        super().__init__()

        # One singleton attention group per input feature. The group count is
        # derived from config.input_dim so it can never disagree with the
        # dimension handed to create_group_matrix / TabNet below.
        grp_list = [[i] for i in range(config.input_dim)]
        group_matrix = create_group_matrix(
            grp_list,
            config.input_dim,
        ).to(config.device)

        self.backbone = TabNet(
            input_dim=config.input_dim,
            output_dim=config.target_dim,
            group_attention_matrix=group_matrix
        ).to(config.device)

        # Buffers are saved in the state dict and follow .to(device) moves,
        # but are not trainable parameters.
        self.register_buffer('mean', torch.tensor(mean))
        self.register_buffer('std',  torch.tensor(std))

    def forward(self, x):
        """Run the backbone and map its first output through ``x * std + mean``."""
        # Only the first element of the backbone's return value is used.
        x = self.backbone(x)[0]
        return (x * self.std) + self.mean
        
#model = Model(mean, std).to(config.device)       
        
#optimizer = torch.optim.AdamW(
#    model.parameters(),
#    lr=config.lr,
#    weight_decay=config.weight_decay,
#    fused=True
#)

In [None]:
from transformers import get_cosine_schedule_with_warmup


def build_scheduler(train_dl, optimizer, config, ratio=0.05):
    """Create a cosine learning-rate schedule with a warmup phase.

    Args:
        train_dl: Training loader; its length is the number of steps per epoch.
        optimizer: Optimizer whose learning rate is scheduled.
        config: Object providing ``num_epochs``.
        ratio: Fraction of all training steps spent in warmup.

    Returns:
        The scheduler built by ``get_cosine_schedule_with_warmup``.
    """
    steps_per_epoch = len(train_dl)
    n_total = steps_per_epoch * config.num_epochs
    n_warmup = int(n_total * ratio)

    return get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=n_warmup,
        num_training_steps=n_total,
    )

In [None]:
from collections import OrderedDict


def remove_orig_mod(state_dict):
    """Return a copy of ``state_dict`` with a leading ``_orig_mod.`` stripped from keys.

    Keys that do not start with the prefix are copied unchanged; entry order
    is preserved.
    """
    prefix = "_orig_mod."
    return OrderedDict(
        (name[len(prefix):] if name.startswith(prefix) else name, tensor)
        for name, tensor in state_dict.items()
    )

In [None]:
import neptune

def setup_neptune(config):
    """Start a new Neptune run or reconnect to an existing one.

    When ``config.resume`` is false, a new run named ``config.model_name`` is
    created and the main hyper-parameters are logged under ``h_parameters``.
    Otherwise the run identified by ``config.with_id`` is reopened.

    Args:
        config: Configuration object (see the attributes read below).

    Returns:
        The Neptune run object.
    """
    # Both branches must address the same project: a run can only be reopened
    # in the project it was created in. The resume branch previously pointed
    # at an unrelated project.
    project = "arbaaz/bitgrit-air-pollution-1"

    if not config.resume:
        neptune_run = neptune.init_run(
            project=project,
            name=config.model_name,
            api_token=config.neptune_token
        )

        neptune_run["h_parameters"] = {
            "model_name": config.model_name,
            "optimizer_name": config.optimizer_name,
            "learning_rate": config.lr,
            "scheduler_name": config.scheduler_name,
            "weight_decay": config.weight_decay,
            "dropout": config.dropout,
            "label_smoothing": config.label_smoothing,
            "num_epochs": config.num_epochs,
            "batch_size": config.batch_size,
            "grad_accum_steps": config.grad_accum_steps,
        }
    else:
        neptune_run = neptune.init_run(
            project=project,
            with_id=config.with_id,
            api_token=config.neptune_token
        )

    return neptune_run

#neptune_run = setup_neptune(config)    

In [None]:
import os
import torch.nn.functional as F
from sklearn.metrics import root_mean_squared_error
    

def loss_fn(preds, targets):
    """Mean squared error between predictions and targets."""
    mse = F.mse_loss(preds, targets)
    return mse


def metric_fn(preds, targets):
    """Score ``exp(-RMSE / 100)`` computed on tensors (1.0 for a perfect fit)."""
    root_mse = F.mse_loss(preds, targets).sqrt()
    return torch.exp(-(root_mse / 100))


def metric_fn_np(preds, targets):
    """NumPy counterpart of the score: ``exp(-RMSE / 100)`` via scikit-learn's RMSE."""
    error = root_mean_squared_error(preds, targets)
    return np.exp(-(error / 100))

In [None]:
from tqdm.auto import tqdm
            

def return_logits(model, inputs):
    """Call ``model`` on ``inputs`` and return whatever it produces."""
    outputs = model(inputs)
    return outputs


def train_step(model, inputs, targets):
    """Forward pass plus loss/metric computation under float16 autocast.

    Returns:
        ``(mse, rmse)`` where ``mse`` comes from ``loss_fn`` and ``rmse`` from
        ``metric_fn``.
    """
    amp_ctx = torch.autocast(
        device_type=config.device,
        dtype=torch.float16,
        cache_enabled=True,
    )
    with amp_ctx:
        preds = return_logits(model, inputs)
        mse = loss_fn(preds, targets)
        rmse = metric_fn(preds, targets)
    return mse, rmse


def eval_step(model, inputs, targets):
    """Gradient-free forward pass plus loss/metric under float16 autocast.

    Returns:
        ``(mse, rmse)`` where ``mse`` comes from ``loss_fn`` and ``rmse`` from
        ``metric_fn``.
    """
    with torch.inference_mode(), torch.autocast(
        device_type=config.device, dtype=torch.float16, cache_enabled=True
    ):
        preds = return_logits(model, inputs)
        mse = loss_fn(preds, targets)
        rmse = metric_fn(preds, targets)
    return mse, rmse


def inference_step(model, inputs):
    """Switch ``model`` to eval mode and return its outputs for ``inputs``.

    The forward pass runs in inference mode under float16 autocast.
    """
    # TODO: the original author marked this eval() call with "FIX THIS".
    model.eval()
    with torch.inference_mode(), torch.autocast(
        device_type=config.device, dtype=torch.float16, cache_enabled=True
    ):
        preds = return_logits(model, inputs)
    return preds
    

def backward_step(loss, optimizer, scaler, scheduler):
    """Run one optimisation step: scaled backward, optimizer step, LR step.

    Args:
        loss: Scalar loss tensor to back-propagate.
        optimizer: Optimizer to step.
        scaler: Gradient scaler (the caller passes a ``torch.GradScaler``).
        scheduler: LR scheduler, advanced once per call.
    """
    # Back-propagate the scaled loss.
    scaler.scale(loss).backward()
    # Step the optimizer through the scaler, then let the scaler update itself.
    scaler.step(optimizer)
    scaler.update()
    # Clear gradients so the next batch starts from zero.
    optimizer.zero_grad()
    # The scheduler advances on every call.
    # NOTE(review): presumably this also happens when the scaler skipped the
    # optimizer step because of inf/NaN gradients — confirm this is intended.
    scheduler.step()
          
    
def reduce_outputs(outputs):
    """Average every scalar metric over a list of per-batch result dicts.

    Args:
        outputs: List of dicts mapping metric name -> tensor. The keys of the
            first dict decide which metrics are reduced; only 0-dim tensors
            are averaged, everything else is ignored.

    Returns:
        Dict mapping metric name -> Python float mean. Empty when ``outputs``
        is empty (e.g. every batch was skipped or the loader yielded nothing).
    """
    # Guard: indexing outputs[0] on an empty list would raise IndexError.
    if not outputs:
        return {}

    reduced = {}
    for key, first in outputs[0].items():
        if first.dim() == 0:
            stacked = torch.stack([batch[key] for batch in outputs])
            reduced[key] = stacked.mean().item()
    return reduced


def log_and_print(neptune_run, data, log=True, print=True):
    """Append metrics to a Neptune run and/or write an epoch summary line.

    Args:
        neptune_run: Mapping-like run object; ``neptune_run[key].append(value)``
            is called for every entry of ``data`` when ``log`` is true.
        data: Dict of metric name -> value. When ``print`` is true it must
            contain ``epoch``, ``train/mse``, ``eval/mse``, ``train/rmse`` and
            ``eval/rmse``.
        log: Whether to append to the run.
        print: Whether to emit the summary through ``tqdm.write``. (This
            parameter shadows the built-in ``print`` inside the function.)
    """
    if log:
        for name in data:
            neptune_run[name].append(data[name])

    if not print:
        return

    summary = (
        f"Epoch: {data['epoch']}, "
        f"train/mse: {data['train/mse']:.4f}, "
        f"eval/mse: {data['eval/mse']:.4f}, "
        f"train/rmse: {data['train/rmse']:.4f}, "
        f"eval/rmse: {data['eval/rmse']:.4f}, "
    )
    tqdm.write(summary)
        
        
def checkpoint(outputs, metric, score, epoch, model, ckpt_name):
    """Save the model when the monitored metric improves; return the best score.

    Args:
        outputs: Dict of epoch metrics; must contain ``metric`` and, when a
            checkpoint is written, ``train/rmse``.
        metric: Key in ``outputs`` to monitor (higher is better).
        score: Best value seen so far.
        epoch: Current epoch index, stored in the checkpoint metadata.
        model: Model whose state dict is saved.
        ckpt_name: File stem; the checkpoint goes to ``/ckpts/{ckpt_name}.pt``.

    Returns:
        The updated best score: the new value on improvement, otherwise the
        unchanged ``score``.
    """
    if outputs[metric] > score:
        score = outputs[metric]
        metadata = {
            "train_rmse": outputs["train/rmse"],
            "eval_rmse": score,
            "epoch": epoch
        }
        save_ckpt(model, metadata, f"/ckpts/{ckpt_name}.pt")

    # Always return the running best. Returning only inside the branch handed
    # None back on a non-improving epoch, which made the caller's next
    # comparison fail with a TypeError.
    return score


def save_ckpt(model, metadata, ckpt_path):
    """Write the model's state dict together with ``metadata`` to ``ckpt_path``.

    The saved object is a flat dict: ``state_dict`` plus every metadata entry.
    """
    payload = {"state_dict": model.state_dict(), **metadata}
    torch.save(payload, ckpt_path)

In [None]:
def train_one_epoch(
    model, 
    optimizer,
    scaler,  
    scheduler,
    train_dl,
    epoch,
    num_epochs,
    resume_epoch,
    resume, 
    log_lr_on_step,
    neptune_run
):
    """Train ``model`` for one pass over ``train_dl``.

    Args:
        model: Module to train; switched to train mode here.
        optimizer, scaler, scheduler: Forwarded to ``backward_step`` each batch.
        train_dl: Iterable yielding ``(inputs, targets)`` batches.
        epoch: Current epoch index (progress-bar label and resume check).
        num_epochs: Total epoch count (progress-bar label only).
        resume_epoch: When ``resume`` is true, epochs below this index are skipped.
        resume: Enables the skip logic above.
        log_lr_on_step: If true, the current LR is logged after every batch.
        neptune_run: Run object handed to ``log_and_print``.

    Returns:
        List with one dict per processed batch holding the detached CPU
        tensors ``train/mse`` and ``train/rmse``. Empty when every batch was
        skipped.
    """
    model.train()
    pbar = tqdm(
        train_dl,
        desc=f"Epoch: {epoch}/{num_epochs}",
        leave=False,
        dynamic_ncols=True,
        colour="red",
        position=1,
    )
    outputs = []

    for index, (inputs, targets) in enumerate(pbar):
        # On resume, already-finished epochs still iterate the loader but do
        # no work per batch.
        # NOTE(review): presumably this keeps the data order aligned with the
        # original run; the scheduler is not advanced for skipped batches — confirm.
        if resume and epoch < resume_epoch:
            continue
        
        inputs = inputs.to(config.device, non_blocking=True)
        targets = targets.to(config.device, non_blocking=True)
    
        # Forward pass, then backward + optimizer + scheduler step on the MSE.
        mse, rmse = train_step(model, inputs, targets)
        backward_step(mse, optimizer, scaler, scheduler)

        # Detach and move to CPU so the stored metrics do not hold the graph.
        outputs.append({
            "train/mse": mse.detach().cpu(),
            "train/rmse": rmse.detach().cpu(),
        })

        if log_lr_on_step:
            # Log only (print=False): the data lacks the keys the summary line needs.
            log_and_print(
                neptune_run,
                data={"STEP/lr": scheduler.get_last_lr()[0]}, 
                log=True, 
                print=False
            )
            
    return outputs
            
            
def evaluate(model, eval_dl):
    """Score ``model`` on every batch of ``eval_dl``.

    The model is switched to eval mode. Returns a list with one dict per
    batch holding the detached CPU tensors ``eval/mse`` and ``eval/rmse``.
    """
    model.eval()
    progress = tqdm(
        eval_dl,
        desc=f"Evaluating",
        leave=False,
        dynamic_ncols=True,
        colour="blue",
        position=1,
    )

    batch_scores = []
    for batch_inputs, batch_targets in progress:
        batch_inputs = batch_inputs.to(config.device, non_blocking=True)
        batch_targets = batch_targets.to(config.device, non_blocking=True)

        mse, rmse = eval_step(model, batch_inputs, batch_targets)
        batch_scores.append(
            {"eval/mse": mse.detach().cpu(), "eval/rmse": rmse.detach().cpu()}
        )

    return batch_scores

In [None]:
def train(
    model, 
    optimizer, 
    scheduler, 
    train_dl,
    eval_dl, 
    num_epochs, 
    resume_epoch,
    resume,
    log_lr_on_step,
    neptune_run,
    ckpt_name
):
    """Full training loop: train, evaluate, log and checkpoint every epoch.

    Args:
        model, optimizer, scheduler: Training objects.
        train_dl, eval_dl: Loaders for optimisation and evaluation.
        num_epochs: Number of epochs to iterate.
        resume_epoch, resume: Forwarded to ``train_one_epoch``.
        log_lr_on_step: Forwarded to ``train_one_epoch``.
        neptune_run: Run used for logging; stopped once the loop finishes.
        ckpt_name: File stem handed to ``checkpoint``.
    """
    os.makedirs("/ckpts", exist_ok=True)

    best_score = float('-inf')
    grad_scaler = torch.GradScaler(device=config.device)

    for epoch in tqdm(range(0, num_epochs), position=0, colour="green"):
        raw_train = train_one_epoch(
            model,
            optimizer,
            grad_scaler,
            scheduler,
            train_dl,
            epoch,
            num_epochs,
            resume_epoch,
            resume,
            log_lr_on_step,
            neptune_run,
        )
        raw_eval = evaluate(model, eval_dl)

        # Collapse per-batch tensors into per-epoch floats.
        train_metrics = reduce_outputs(raw_train)
        eval_metrics = reduce_outputs(raw_eval)

        epoch_summary = {**train_metrics, **eval_metrics, "epoch": epoch}
        log_and_print(neptune_run, epoch_summary)

        best_score = checkpoint(
            outputs=epoch_summary,
            metric="eval/rmse",
            score=best_score,
            epoch=epoch,
            model=model,
            ckpt_name=ckpt_name,
        )

    neptune_run.stop()


def start_training():
    """Run ``train`` with the notebook-level objects, always stopping the Neptune run.

    Relies on the module-level names ``model``, ``optimizer``, ``scheduler``,
    ``train_dl``, ``eval_dl`` and ``neptune_run`` being defined before the call.
    Failures are reported with a full traceback instead of being discarded;
    the exception is not re-raised so the notebook keeps running.
    """
    import traceback

    try:
        train(
            model, 
            optimizer, 
            scheduler, 
            train_dl,
            eval_dl, 
            num_epochs=config.num_epochs, 
            resume_epoch=0,
            resume=config.resume,
            log_lr_on_step=True,
            neptune_run=neptune_run,
            ckpt_name=config.model_name
        )
    except Exception:
        # Best-effort run, but never silent: a swallowed error here makes an
        # aborted training look like a finished one.
        traceback.print_exc()
    finally:
        neptune_run.stop()

In [None]:
from torch.utils.data import Subset
from sklearn.model_selection import KFold


def train_kfold(
    k,
    ds, 
    config
):
    """Train one fresh model per fold of a shuffled K-fold split of ``ds``.

    For every fold this sets ``config.model_name`` to ``tabnet.{k}fold.{fold}``
    (mutating ``config``), builds loaders, model, optimizer, scheduler and a
    Neptune run, then calls ``train``. A failing fold is reported with its
    traceback and the loop moves on to the next fold.

    Args:
        k: Number of folds.
        ds: Dataset to split; wrapped in ``Subset`` for each fold.
        config: Configuration object (seed, lr, weight decay, epochs, ...).
    """
    import traceback

    kf = KFold(n_splits=k, shuffle=True, random_state=config.seed)
    for fold, (train_indices, eval_indices) in enumerate(kf.split(ds)):
        # The per-fold name is used for both the Neptune run and the checkpoint file.
        config.model_name = f"tabnet.{k}fold.{fold}"
        
        train_indices = torch.tensor(train_indices)
        eval_indices = torch.tensor(eval_indices)
        
        train_ds = Subset(ds, train_indices)
        eval_ds = Subset(ds, eval_indices)
        
        train_dl, eval_dl = return_dls(train_ds, eval_ds, config.seed)
        
        # Fresh model / optimizer / scheduler so folds do not share weights or state.
        model = Model(mean, std).to(config.device)       
        
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=config.lr,
            weight_decay=config.weight_decay,
            fused=True
        )
        
        scheduler = build_scheduler(train_dl, optimizer, config, ratio=0.05)
        neptune_run = setup_neptune(config)
        
        try:
            train(
                model, 
                optimizer, 
                scheduler, 
                train_dl,
                eval_dl, 
                num_epochs=config.num_epochs, 
                resume_epoch=0,
                resume=config.resume,
                log_lr_on_step=True,
                neptune_run=neptune_run,
                ckpt_name=config.model_name
            )
        except Exception:
            # Keep going with the remaining folds, but surface the failure:
            # silently passing made a crashed fold indistinguishable from a
            # completed one.
            print(f"Fold {fold} failed:")
            traceback.print_exc()
        finally:
            neptune_run.stop()
            
        print()
        print()
        
        
train_kfold(5, ds, config)

In [None]:
test_path = arbaazbeg_bitgrit_air_pollution_path+"/test.csv"
test_inputs = load_csv(test_path)

# Apply the scaler that was fitted on the training features. Fitting a new
# scaler on the test set would standardise it with different statistics than
# the ones the model was trained on.
# NOTE(review): load_csv drops rows with missing values, so the number of
# predictions can be smaller than the number of rows in test.csv — confirm
# this matches the expected submission length.
test_inputs = scaler.transform(test_inputs)
test_inputs = torch.tensor(test_inputs).float()
test_inputs.shape

In [None]:
import os

os.listdir("/ckpts/")

In [None]:
# Default checkpoint location derived from the current model name.
ckpt_path = f"/ckpts/{config.model_name}.pt"
# NOTE(review): this hard-coded path immediately overrides the line above.
# train_kfold in this notebook saves files named "tabnet.{k}fold.{fold}.pt",
# so this file presumably comes from an earlier run — confirm it exists.
ckpt_path = "/ckpts/tabnet.86.repro.stopping.at.71.pt"
ckpt = torch.load(ckpt_path)
# Show the metadata that was stored next to the weights.
ckpt["train_rmse"], ckpt["eval_rmse"], ckpt["epoch"]

In [None]:
# `model` only ever exists as a local variable inside train_kfold, so build an
# instance here; otherwise this cell raises NameError on a fresh kernel.
# The mean/std buffers are overwritten by the values stored in the checkpoint.
model = Model(mean, std).to(config.device)
model.load_state_dict(ckpt["state_dict"])
#[param.device for param in model.parameters()]

In [None]:
logits = inference_step(model, inputs=test_inputs.to(config.device))
logits.shape

In [None]:
logits = F.relu(logits)

In [None]:
logits_np = logits.cpu().detach().double().numpy()
logits_np.shape

In [None]:
# Replace NaN predictions with 0.0 in the array the following cells actually
# use. The cleaned result used to be stored in an unused variable, so NaNs
# stayed in `logits_np`.
nan_count = int(np.isnan(logits_np).sum())
if nan_count:
    print(f"Replacing {nan_count} NaN prediction(s) with 0.0")
    logits_np = np.nan_to_num(logits_np, nan=0.0)

In [None]:
(logits_np < 0).any(), (logits_np < 0).sum()

In [None]:
logits_np.mean(), logits_np.std()

In [None]:
# Build the prediction table: a running row index plus the model outputs.
# NOTE(review): the index column is created without a name, so it is written
# out under the header "0" — confirm that is what the submission expects.
ids = list(range(len(logits_np)))
df_pred = pd.DataFrame(ids)
df_pred["pollution_value"] = logits_np

df_pred

In [None]:
config.model_name = "tabnet.86.repro.stopping.at.71.relu"
config.model_name

In [None]:
df_pred.to_csv(f"{config.model_name}.csv", index=False)