In [None]:
#!pip3 install -qU neptune tsfel pytorch-tabnet

In [None]:
def print_config(config):
    """Print every configuration attribute as ``name: value``, one per line.

    Instance attributes are listed first. Class-level attributes follow,
    skipping dunder names and anything already set on the instance.
    """
    merged = dict(config.__dict__)
    merged.update({
        name: value
        for name, value in config.__class__.__dict__.items()
        if not (name.startswith('__') or name in merged)
    })

    for name in merged:
        print(f"{name}: {merged[name]}")
        

def to_numpy(tensor):
    """Return ``tensor`` as a NumPy array after detaching it and moving it to the CPU."""
    host_tensor = tensor.detach().cpu()
    return host_tensor.numpy()

In [None]:
import os

import torch


class Config:
    """Notebook-wide settings, stored as class attributes.

    Credentials are read from environment variables instead of being written
    into the notebook. Both token attributes are ``None`` when the
    corresponding variable is unset.
    """

    model_name = "mlp.baseline"
    input_dim = 2048
    target_dim = 1

    # Resolved once, when the class body is executed.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    seed = 1000

    optimizer_name = "AdamW"
    lr = 1e-4
    weight_decay = 1e-3

    batch_size = 64  # // grad_accum_steps

    num_epochs = 100
    scheduler_name = "default"

    dropout = 0.4

    # SECURITY: never commit secrets. Export HF_TOKEN / NEPTUNE_API_TOKEN in
    # the environment before starting the kernel. Any token that was ever
    # stored in this notebook should be treated as leaked and rotated.
    hf_token = os.environ.get("HF_TOKEN")
    neptune_token = os.environ.get("NEPTUNE_API_TOKEN")
    with_id = ""
    resume = False

config = Config()

In [None]:
import random
import numpy as np


def setup_reproducibility(config):
    """Seed the Python, NumPy and PyTorch RNGs from ``config.seed`` and set backend flags.

    cuDNN is switched to deterministic mode with benchmarking disabled, and
    float32 matmul precision is set to ``"high"``.
    """
    seed = config.seed
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NOTE(review): the first argument is False, so global deterministic-algorithm
    # enforcement is explicitly left off here.
    torch.use_deterministic_algorithms(False, warn_only=True)
    torch.set_float32_matmul_precision("high")

# Apply the seeding and backend settings using the notebook-wide `config`.
setup_reproducibility(config)

In [None]:
from huggingface_hub import login, snapshot_download

# Authenticate against the Hugging Face Hub with the token held in `config`.
login(config.hf_token)
repo_id = "ArbaazBeg/crunchdao-structural-break-detection"
# `path` is the value returned by `snapshot_download`; the next cell appends file names to it.
path = snapshot_download(repo_id, repo_type="dataset")

In [None]:
import os

# Build file locations with os.path.join instead of manual "/" concatenation,
# so the result is well-formed regardless of trailing separators or platform.
inputs_path = os.path.join(path, "X_train.parquet")
targets_path = os.path.join(path, "y_train.parquet")
# Trailing expression: display the contents of the downloaded snapshot.
os.listdir(path)

In [None]:
import pandas as pd

def load_parquet(path):
    """Read the Parquet file at ``path`` with pandas and return the result."""
    frame = pd.read_parquet(path)
    return frame

# Read both Parquet files whose locations were built in the previous cell.
inputs_df = load_parquet(inputs_path)
targets_df = load_parquet(targets_path)

In [None]:
import pandas as pd
import numpy as np

# --- Feature engineering ---
def extract_features(df: pd.DataFrame) -> pd.Series:
    """Summarise how the ``value`` distribution differs between the two periods.

    Rows with ``period == 0`` form the "before" segment and rows with
    ``period == 1`` the "after" segment. The returned Series holds
    after-minus-before differences of several summary statistics, plus the
    mean and std ratios (denominator offset by 1e-8).
    """
    segment_before = df.loc[df["period"] == 0, "value"]
    segment_after = df.loc[df["period"] == 1, "value"]

    def spread(series):
        # Inter-quartile range: 75th percentile minus 25th percentile.
        return np.percentile(series, 75) - np.percentile(series, 25)

    mean_before, mean_after = segment_before.mean(), segment_after.mean()
    std_before, std_after = segment_before.std(), segment_after.std()

    summary = {}
    summary["mean_diff"] = mean_after - mean_before
    summary["std_diff"] = std_after - std_before
    summary["median_diff"] = segment_after.median() - segment_before.median()
    summary["iqr_diff"] = spread(segment_after) - spread(segment_before)
    summary["mean_ratio"] = mean_after / (mean_before + 1e-8)
    summary["std_ratio"] = std_after / (std_before + 1e-8)
    summary["skew_diff"] = segment_after.skew() - segment_before.skew()
    summary["kurtosis_diff"] = segment_after.kurtosis() - segment_before.kurtosis()
    summary["min_diff"] = segment_after.min() - segment_before.min()
    summary["max_diff"] = segment_after.max() - segment_before.max()

    return pd.Series(summary)


In [None]:
from tqdm.auto import tqdm

# Collect one feature Series per `id` group of `inputs_df`.
features = []
groups = inputs_df.groupby("id")

for i, g in tqdm(groups):
    f = extract_features(g)
    features.append(f)
    
# Rows follow the iteration order of the groupby above; the `id` itself is not
# stored in the resulting frame.
input_features = pd.DataFrame(features)
inputs = input_features.to_numpy()
input_features.head()

In [None]:
def preprocess_targets(target_df):
    """Stack the ``structural_breakpoint`` values of every ``id`` group into one float32 array.

    Groups are visited in the order produced by ``target_df.groupby("id")``;
    each group contributes one row to the stacked result.
    """
    per_id = [
        group["structural_breakpoint"].values.astype(np.float32)
        for _, group in target_df.groupby("id")
    ]
    return np.stack(per_id)

# Build the target array; the trailing expression displays its shape.
targets = preprocess_targets(targets_df)
targets.shape

In [None]:
from sklearn.model_selection import train_test_split

def split(inputs, targets, seed):
    """Make a stratified 80/20 train/eval split.

    Stratification uses the flattened ``targets``. The result is ordered as
    ``(train_inputs, train_targets, eval_inputs, eval_targets)``.
    """
    parts = train_test_split(
        inputs,
        targets,
        test_size=0.2,
        random_state=seed,
        stratify=targets.ravel(),
    )
    train_x, eval_x, train_y, eval_y = parts
    return train_x, train_y, eval_x, eval_y
    

# `data` is the 4-tuple returned by `split`: train inputs, train targets, eval inputs, eval targets.
data = split(inputs, targets, config.seed)
[data[i].shape for i in range(4)]

In [None]:
# Unpack the split in the order produced by `split`.
train_inputs, train_targets, eval_inputs, eval_targets = data

In [None]:
def quantile_and_clamp(train_inputs, eval_inputs):
    """Clamp both tensors to the 1st-99th percentile range of the training tensor.

    The two quantiles are computed over all elements of ``train_inputs`` and
    printed. The inputs are cloned first, so the caller's tensors are left
    untouched. Returns ``(clamped_train, clamped_eval)``.
    """
    train_copy = train_inputs.clone()
    eval_copy = eval_inputs.clone()

    low = torch.quantile(train_copy, 0.01).item()
    high = torch.quantile(train_copy, 0.99).item()

    print("1st percentile:", low)
    print("99th percentile:", high)

    return (
        train_copy.clamp(min=low, max=high),
        eval_copy.clamp(min=low, max=high),
    )


def return_stats(tensor, p=True):
    """Return ``(min, max, mean, std)`` of ``tensor``, printing them when ``p`` is true."""
    lowest, highest = tensor.min(), tensor.max()
    mean, std = tensor.mean(), tensor.std()

    if p:
        print(f"Min: {lowest}, Max: {highest}, Mean: {mean}, Std: {std}")

    return lowest, highest, mean, std

In [None]:
import torch.nn.functional as F
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler

class SequentialDataset(Dataset):
    """In-memory dataset that pairs each input row with its target row.

    Both arguments are converted to float tensors on construction and must
    have the same length.
    """

    def __init__(
        self,
        inputs,
        targets,
    ):
        input_tensor = torch.tensor(inputs).float()
        target_tensor = torch.tensor(targets).float()

        assert len(input_tensor) == len(target_tensor), "Length Error"
        self.inputs, self.targets = input_tensor, target_tensor

    def __len__(self):
        """Number of samples held by the dataset."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair stored at ``index``."""
        sample = (self.inputs[index], self.targets[index])
        return sample


# Fit the scaler on the training split only and reuse it to transform the evaluation split.
# NOTE: `train_inputs` / `eval_inputs` are rebound here, so re-running this cell
# fits on (and transforms) data that has already been scaled.
scaler = StandardScaler()
train_inputs = scaler.fit_transform(train_inputs)
eval_inputs = scaler.transform(eval_inputs)

# Wrap the scaled arrays and their targets as datasets.
train_ds = SequentialDataset(
    inputs=train_inputs, 
    targets=train_targets, 
)

eval_ds = SequentialDataset(
    inputs=eval_inputs, 
    targets=eval_targets, 
)

In [None]:
from torch.utils.data import DataLoader


def build_loader(
    SEED,
    ds,
    train=True,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
):
    """Create a ``DataLoader`` over ``ds`` with its own seeded generator.

    The generator is seeded with ``SEED`` when ``train`` is true and with
    ``SEED + 1`` otherwise. All remaining arguments are forwarded unchanged
    to ``DataLoader``.
    """

    def seed_worker(worker_id):
        # Seed NumPy and `random` in each worker from torch's initial seed.
        derived_seed = torch.initial_seed() % 2**32
        np.random.seed(derived_seed)
        random.seed(derived_seed)

    loader_seed = SEED if train else SEED + 1
    generator = torch.Generator()
    generator.manual_seed(loader_seed)

    loader_options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        persistent_workers=persistent_workers,
        worker_init_fn=seed_worker,
        generator=generator,
    )
    return DataLoader(ds, **loader_options)
    
    
def return_dls(train_ds, eval_ds):
    """Build the training and evaluation loaders from the global ``config``.

    Both loaders share batch size, worker and memory settings. Only the
    training loader shuffles. Returns ``(train_dl, eval_dl)``.
    """
    shared_options = dict(
        batch_size=config.batch_size,
        num_workers=0,
        # NOTE(review): drop_last=True also applies to the evaluation loader,
        # so a trailing partial batch is excluded from evaluation.
        drop_last=True,
        pin_memory=True,
        persistent_workers=False,
    )

    train_dl = build_loader(config.seed, train_ds, train=True, shuffle=True, **shared_options)
    eval_dl = build_loader(config.seed, eval_ds, train=False, shuffle=False, **shared_options)

    return train_dl, eval_dl

#train_dl, eval_dl = return_dls()

In [None]:
import torch.nn as nn
#from pytorch_tabnet.tab_network import TabNet
#from pytorch_tabnet.utils import create_group_matrix

# NOTE(review): this class uses `create_group_matrix` and `TabNet`, but the
# imports for both are commented out at the top of this cell, so instantiating
# it would raise NameError unless those names are defined elsewhere. Later
# `class Model` definitions in this notebook rebind the name `Model`.
class Model(nn.Module):
    """Wrapper around a `TabNet` backbone with a single output dimension."""

    def __init__(self, in_dimensions):
        super().__init__()
        
        # One single-feature group per input dimension.
        grp_list = [[i] for i in range(in_dimensions)]
        group_matrix = create_group_matrix(
            grp_list,
            in_dimensions,
        ).to(config.device)

        self.backbone = TabNet(
            input_dim=in_dimensions,
            output_dim=1,
            group_attention_matrix=group_matrix
        ).to(config.device)

    def forward(self, x):
        # Only the first element of the backbone's return value is used.
        x = self.backbone(x)[0]
        return x
    
    
class Model(nn.Module):
    """MLP with two 256-wide hidden blocks and a single-output linear head.

    Each hidden block is Linear -> LayerNorm -> GELU -> Dropout(0.6).
    """

    def __init__(self, in_dimensions):
        super().__init__()
        hidden_width = 256
        drop_prob = 0.6

        stack = [
            nn.Linear(in_dimensions, hidden_width),
            nn.LayerNorm(hidden_width),
            nn.GELU(),
            nn.Dropout(drop_prob),
            nn.Linear(hidden_width, hidden_width),
            nn.LayerNorm(hidden_width),
            nn.GELU(),
            nn.Dropout(drop_prob),
            nn.Linear(hidden_width, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the sequential stack and return the result."""
        return self.layers(x)
    

def return_optim(model):
    """Create a fused AdamW optimizer over ``model``'s parameters.

    Learning rate and weight decay come from the global ``config``.
    """
    adamw_options = {
        "lr": config.lr,
        "weight_decay": config.weight_decay,
        "fused": True,
    }
    return torch.optim.AdamW(model.parameters(), **adamw_options)


def count_model(model):
    """Print the model's total parameter count, expressed in millions."""
    total = 0
    for param in model.parameters():
        total += param.numel()
    print(total / 1e6)
    
    
def model_device(model):
    """Print the device of the model's first parameter; print nothing if it has none."""
    first_param = next(iter(model.parameters()), None)
    if first_param is not None:
        print(first_param.device)

In [None]:
from transformers import get_cosine_schedule_with_warmup


def return_scheduler(train_dl, optimizer, config):
    """Build a cosine schedule with warmup covering the whole training run.

    The total step count is ``len(train_dl) * config.num_epochs``; the first
    5% of those steps are warmup.
    """
    steps_per_epoch = len(train_dl)
    total_steps = steps_per_epoch * config.num_epochs

    return get_cosine_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=int(total_steps * 0.05),
        num_training_steps=total_steps,
    )

In [None]:
from collections import OrderedDict


def remove_orig_mod(state_dict):
    """Return a new ``OrderedDict`` with a leading ``"_orig_mod."`` removed from each key.

    Keys that do not start with that prefix are copied unchanged. The input
    mapping is not modified.
    """
    marker = "_orig_mod."
    return OrderedDict(
        (name[len(marker):] if name.startswith(marker) else name, tensor)
        for name, tensor in state_dict.items()
    )

In [None]:
import neptune

def setup_neptune(config):
    """Open a Neptune run for the project and return it.

    When ``config.resume`` is true the run identified by ``config.with_id``
    is opened. Otherwise a new run named ``config.model_name`` is created and
    the hyper-parameters are stored under ``"h_parameters"``.
    """
    project = "arbaaz/crunchdao-structural-break"

    if config.resume:
        return neptune.init_run(
            project=project,
            with_id=config.with_id,
            api_token=config.neptune_token,
        )

    run = neptune.init_run(
        project=project,
        name=config.model_name,
        api_token=config.neptune_token,
    )
    run["h_parameters"] = dict(
        model_name=config.model_name,
        optimizer_name=config.optimizer_name,
        learning_rate=config.lr,
        scheduler_name=config.scheduler_name,
        weight_decay=config.weight_decay,
        dropout=config.dropout,
        num_epochs=config.num_epochs,
        batch_size=config.batch_size,
    )
    return run

In [None]:
from sklearn.metrics import roc_auc_score


def loss_fn(logits, targets):
    """Binary cross-entropy with logits, computed over the flattened inputs."""
    flat_logits, flat_targets = logits.view(-1), targets.view(-1)
    return F.binary_cross_entropy_with_logits(flat_logits, flat_targets)


def metric_fn(logits, targets):
    """ROC AUC of the sigmoid of ``logits`` against ``targets``, both flattened."""
    probabilities = to_numpy(logits.sigmoid().view(-1))
    labels = to_numpy(targets.view(-1))
    return roc_auc_score(labels, probabilities)

In [None]:
from tqdm.auto import tqdm
            

def train_step(model, inputs, targets):
    """Run a forward pass and loss computation under float16 autocast.

    Returns ``(loss, logits, targets)``; ``targets`` is passed through as-is.
    """
    autocast_ctx = torch.autocast(
        device_type=config.device,
        dtype=torch.float16,
        cache_enabled=True,
    )
    with autocast_ctx:
        logits = model(inputs)
        loss = loss_fn(logits, targets)
    return loss, logits, targets


def eval_step(model, inputs, targets):
    """Forward pass and loss under inference mode and float16 autocast.

    Returns ``(loss, logits, targets)``; ``targets`` is passed through as-is.
    """
    with torch.inference_mode(), torch.autocast(
        device_type=config.device,
        dtype=torch.float16,
        cache_enabled=True,
    ):
        logits = model(inputs)
        loss = loss_fn(logits, targets)
    return loss, logits, targets


def inference_step(model, inputs):
    """Switch ``model`` to eval mode and return its logits for ``inputs``.

    The forward pass runs under inference mode and float16 autocast.
    """
    # The original author marked this call "FIX THIS". The reason is not
    # stated; the call leaves the model in eval mode for the caller.
    model.eval()
    with torch.inference_mode(), torch.autocast(
        device_type=config.device,
        dtype=torch.float16,
        cache_enabled=True,
    ):
        logits = model(inputs)
    return logits
    

def backward_step(loss, optimizer, scaler, scheduler):
    """Back-propagate the scaled loss, then step the optimizer and scheduler.

    Order of operations: scaled backward, optimizer step through the scaler,
    scaler update, gradient reset, scheduler step.
    """
    scaled_loss = scaler.scale(loss)
    scaled_loss.backward()

    scaler.step(optimizer)
    scaler.update()

    optimizer.zero_grad()
    scheduler.step()
          
    
def reduce_outputs(outputs):
    """Collapse a list of per-batch output dicts into one dict.

    0-dim tensors whose key mentions neither "logits" nor "targets" are
    averaged across batches (an unweighted mean of the per-batch values) and
    returned as Python floats. Logits and targets are concatenated into flat
    tensors by ``reduce_dimensional_outputs``.

    Side effect (kept for backward compatibility): the averaged scalar keys
    are deleted from every dict in ``outputs``.

    Args:
        outputs: list of dicts mapping names to tensors, one dict per batch.

    Returns:
        dict with the scalar means followed by the flattened logits/targets;
        an empty dict when ``outputs`` is empty.
    """
    if not outputs:
        return {}

    # The original condition used `or`, which is always true for any key that
    # does not contain both words, so the logits/targets exclusion never
    # applied. `and` expresses "neither logits nor targets".
    scalar_keys = [
        key
        for key, value in outputs[0].items()
        if value.dim() == 0 and "logits" not in key and "targets" not in key
    ]

    data = {
        key: torch.stack([batch[key] for batch in outputs]).mean().item()
        for key in scalar_keys
    }

    for batch in outputs:  # delete the scalar and loss keys
        for key in scalar_keys:
            del batch[key]

    return data | reduce_dimensional_outputs(outputs)


def reduce_dimensional_outputs(outputs):
    """Concatenate every "logits"/"targets" entry across batches into a flat tensor.

    Each batch tensor is flattened before concatenation, so batches of
    different sizes (for example a smaller final batch) and 0-dim tensors are
    both handled. Element order is batch order, then within-batch order.
    Returns an empty dict when ``outputs`` is empty.
    """
    if not outputs:
        return {}

    return {
        key: torch.cat([batch[key].reshape(-1) for batch in outputs])
        for key in outputs[0]
        if "logits" in key or "targets" in key
    }


def save_ckpt(model, metadata, ckpt_path):
    """Save the model's ``state_dict`` together with ``metadata`` to ``ckpt_path``.

    The state dict is stored under ``"state_dict"``; every metadata entry is
    stored under its own key in the same top-level dict.
    """
    payload = {"state_dict": model.state_dict(), **metadata}
    torch.save(payload, ckpt_path)


def log_and_print(neptune_run, data, log=True, print=True):
    """Append each entry of ``data`` to the Neptune run and/or write an epoch summary.

    Args:
        neptune_run: object indexed by series name whose items support ``append``.
        data: mapping of series name to value. When ``print`` is true it must
            contain ``epoch`` and the train/eval loss and auroc keys.
        log: append every ``data`` entry to ``neptune_run`` when true.
        print: write the one-line epoch summary via ``tqdm.write`` when true.
    """
    if log:
        for series_name in data:
            neptune_run[series_name].append(data[series_name])

    if not print:
        return

    summary = (
        f"Epoch: {data['epoch']}, "
        f"train/loss: {data['train/loss']:.4f}, "
        f"eval/loss: {data['eval/loss']:.4f}, "
        f"train/auroc: {data['train/auroc']:.4f}, "
        f"eval/auroc: {data['eval/auroc']:.4f}, "
    )
    tqdm.write(summary)

In [None]:
def train_one_epoch(
    model,
    optimizer,
    scaler,
    scheduler,
    train_dl,
    epoch,
    num_epochs,
    resume_epoch,
    resume,
    log_lr_on_step,
    neptune_run,
):
    """Train ``model`` for one pass over ``train_dl``.

    Returns a list with one dict per processed batch, holding the detached
    CPU copies of the loss, logits and targets under the ``train/*`` keys.
    When ``resume`` is true and ``epoch < resume_epoch`` every batch is
    skipped and the list is empty.
    """
    model.train()
    progress = tqdm(
        train_dl,
        desc=f"Epoch: {epoch}/{num_epochs}",
        leave=False,
        dynamic_ncols=True,
        colour="red",
        position=1,
    )
    step_records = []

    for batch_inputs, batch_targets in progress:
        # The loader is still iterated for skipped epochs; only the work is skipped.
        if resume and epoch < resume_epoch:
            continue

        batch_inputs = batch_inputs.to(config.device, non_blocking=True)
        batch_targets = batch_targets.to(config.device, non_blocking=True)

        loss, logits, step_targets = train_step(model, batch_inputs, batch_targets)
        backward_step(loss, optimizer, scaler, scheduler)

        record = {
            "train/loss": loss.detach().cpu(),
            "train/logits": logits.detach().cpu(),
            "train/targets": step_targets.detach().cpu(),
        }
        step_records.append(record)

        if not log_lr_on_step:
            continue

        # Log the learning rate reported by the scheduler after this step.
        log_and_print(
            neptune_run,
            data={"STEP/lr": scheduler.get_last_lr()[0]},
            log=True,
            print=False,
        )

    return step_records
            
            
def evaluate(model, eval_dl):
    """Run ``model`` in eval mode over ``eval_dl``.

    Returns a list with one dict per batch, holding the detached CPU copies
    of the loss, logits and targets under the ``eval/*`` keys.
    """
    model.eval()
    progress = tqdm(
        eval_dl,
        desc="Evaluating",
        leave=False,
        dynamic_ncols=True,
        colour="blue",
        position=1,
    )
    batch_records = []

    for batch_inputs, batch_targets in progress:
        batch_inputs = batch_inputs.to(config.device, non_blocking=True)
        batch_targets = batch_targets.to(config.device, non_blocking=True)
        loss, logits, step_targets = eval_step(model, batch_inputs, batch_targets)

        batch_records.append({
            "eval/loss": loss.detach().cpu(),
            "eval/logits": logits.detach().cpu(),
            "eval/targets": step_targets.detach().cpu(),
        })

    return batch_records

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    """Three-hidden-layer MLP (2064 -> 1024 -> 128) with dropout and LayerNorm.

    The input passes through dropout and LayerNorm first. Each hidden layer
    is then Linear -> GELU -> Dropout(0.2) -> LayerNorm, followed by a final
    linear layer producing ``n_classes`` outputs.

    The normalisation layers keep their historical ``bn*`` attribute names
    (they are ``nn.LayerNorm``) so that checkpoint keys stay unchanged.

    Args:
        in_features: size of each input vector.
        n_classes: size of the output vector.
    """

    def __init__(self, in_features, n_classes):
        super().__init__()

        self.dropout1 = nn.Dropout(0.2)
        self.bn1 = nn.LayerNorm(in_features)

        self.fc1 = nn.Linear(in_features, 2064)
        self.dropout2 = nn.Dropout(0.2)
        self.bn2 = nn.LayerNorm(2064)

        self.fc2 = nn.Linear(2064, 1024)
        self.dropout3 = nn.Dropout(0.2)
        self.bn3 = nn.LayerNorm(1024)

        self.fc3 = nn.Linear(1024, 128)
        # Previously assigned to `dropout3` a second time, which silently
        # replaced the module above. Dropout holds no parameters, so the
        # state_dict is unaffected by giving it its own name.
        self.dropout4 = nn.Dropout(0.2)
        self.bn4 = nn.LayerNorm(128)

        self.fc_out = nn.Linear(128, n_classes)

    def forward(self, x):
        """Return the raw outputs of the final linear layer for ``x``."""
        # Dropout + LayerNorm before the first linear layer
        x = self.dropout1(x)
        x = self.bn1(x)

        x = F.gelu(self.fc1(x))
        x = self.dropout2(x)
        x = self.bn2(x)

        x = F.gelu(self.fc2(x))
        x = self.dropout3(x)
        x = self.bn3(x)

        x = F.gelu(self.fc3(x))
        x = self.dropout4(x)
        x = self.bn4(x)

        x = self.fc_out(x)
        return x


In [None]:
import timm
import traceback


def train(
    model,
    optimizer,
    scheduler,
    train_dl,
    eval_dl,
    num_epochs,
    resume_epoch,
    resume,
    log_lr_on_step,
    neptune_run,
    ckpt_dir="/ckpts",
):
    """Run the train/evaluate loop and checkpoint the best model by eval AUROC.

    For every epoch the model is trained and evaluated, the per-batch outputs
    are reduced, AUROC is computed for both splits, and the scalar results
    are logged. The model is saved whenever the evaluation AUROC exceeds the
    best value seen so far. The Neptune run is stopped when the loop ends.

    Args:
        model, optimizer, scheduler: objects driving the optimisation.
        train_dl, eval_dl: training and evaluation data loaders.
        num_epochs: number of epochs to iterate over (starting at 0).
        resume_epoch, resume: forwarded to ``train_one_epoch``.
        log_lr_on_step: forwarded to ``train_one_epoch``.
        neptune_run: run object used for logging; stopped on completion.
        ckpt_dir: directory that receives ``<config.model_name>.pt``. It is
            created if missing. Defaults to the previous hard-coded
            ``"/ckpts"``.
    """
    os.makedirs(ckpt_dir, exist_ok=True)
    best_score = float('-inf')

    scaler = torch.GradScaler()
    pbar = tqdm(range(0, num_epochs), position=0, colour="green")

    for epoch in pbar:
        train_outputs = train_one_epoch(
            model,
            optimizer,
            scaler,
            scheduler,
            train_dl,
            epoch,
            num_epochs,
            resume_epoch,
            resume,
            log_lr_on_step,
            neptune_run,
        )
        eval_outputs = evaluate(model, eval_dl)

        train_outputs = reduce_outputs(train_outputs)
        eval_outputs = reduce_outputs(eval_outputs)

        train_outputs["train/auroc"] = metric_fn(train_outputs["train/logits"], train_outputs["train/targets"])
        eval_outputs["eval/auroc"] = metric_fn(eval_outputs["eval/logits"], eval_outputs["eval/targets"])

        outputs = train_outputs | eval_outputs
        outputs["epoch"] = epoch
        # Only scalar values are logged; drop the per-sample tensors.
        for tensor_key in ("train/logits", "eval/logits", "train/targets", "eval/targets"):
            del outputs[tensor_key]

        log_and_print(neptune_run, outputs)

        if outputs["eval/auroc"] > best_score:
            best_score = outputs["eval/auroc"]
            metadata = {
                "train_auroc": outputs["train/auroc"],
                "eval_auroc": best_score,
                "epoch": epoch,
            }
            save_ckpt(model, metadata, os.path.join(ckpt_dir, f"{config.model_name}.pt"))

    neptune_run.stop()
    
    
def start_training():
    """Build the model, optimizer, loaders, scheduler and Neptune run, then call `train`.

    Reads the notebook-level `train_ds`, `eval_ds` and `config`, and rebinds
    `config.batch_size` and `config.model_name` as a side effect. Exceptions
    raised by `train` are printed with a traceback rather than re-raised.
    """
    # NOTE(review): the input width (10) is hard-coded and `.cuda()` is called
    # directly, so `config.device` is not consulted here. The two-argument call
    # matches the `Model(in_features, n_classes)` definition.
    model = Model(10, 1).cuda()
    model = nn.DataParallel(model)
    count_model(model)
    
    optimizer = return_optim(model)
    
    # Overrides whatever batch size `config` held before this call.
    config.batch_size = 64
    train_dl, eval_dl = return_dls(train_ds, eval_ds)
    
    scheduler = return_scheduler(train_dl, optimizer, config)
    
    # The run name (and the checkpoint file name derived from it in `train`) includes the batch size.
    config.model_name = f"mlp.{config.batch_size}.FromKaggle"
    neptune_run = setup_neptune(config)
    
    print(model)
    print_config(config)    
    
    try:
        train(
            model, 
            optimizer, 
            scheduler, 
            train_dl,
            eval_dl, 
            num_epochs=config.num_epochs, 
            resume_epoch=0,
            resume=config.resume,
            log_lr_on_step=True,
            neptune_run=neptune_run
        )
    except Exception as e:
        print(f"Exception occurred: {e}")
        traceback.print_exc()
    finally:
        # `train` also calls `neptune_run.stop()` when it finishes normally, so
        # on success stop is invoked twice; this call covers the failure path.
        neptune_run.stop()
        
# Kick off the full training run when this cell executes.
start_training()