In [1]:
!pip3 install -qU neptune
#tsfel pytorch-tabnet

In [2]:
from collections import OrderedDict        


def to_numpy(tensor):
    return tensor.detach().cpu().numpy()


def return_stats(tensor, p=True):
    mean, std = tensor.mean(), tensor.std()
    min, max =  tensor.min(), tensor.max()

    if p:
        print(f"Min: {min}, Max: {max}, Mean: {mean}, Std: {std}")
        
    return min, max, mean, std


def remove_orig_mod(state_dict):
    new_state_dict = OrderedDict()
    prefix = "_orig_mod."
    for key, value in state_dict.items():
        if key.startswith(prefix):
            new_key = key[len(prefix):]
        else:
            new_key = key
        new_state_dict[new_key] = value
    return new_state_dict

In [3]:
import torch
import random
import numpy as np


SEED = 5274

def setup_reproducibility():
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.use_deterministic_algorithms(False, warn_only=True)
    torch.set_float32_matmul_precision("high")

setup_reproducibility()

In [None]:
from huggingface_hub import login, snapshot_download


login("ahf_uOkImkbEroqtIuyvGJrttTzaebfeIdPZID")
repo_id = "ArbaazBeg/crunchdao-structural-break-detection"
path = snapshot_download(repo_id, repo_type="dataset")

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

In [5]:
import os

inputs_path = path+"/X_train.parquet"
targets_path = path+"/y_train.parquet"
os.listdir(path)

['X_test.reduced.parquet',
 'y_train.parquet',
 '.gitignore',
 'y_test.reduced.parquet',
 '.gitattributes',
 'features.csv',
 'X_train.parquet']

In [6]:
import pandas as pd


def load_parquet(path):
    return pd.read_parquet(path)

inputs_df = load_parquet(inputs_path)
targets_df = load_parquet(targets_path)

In [7]:
import pandas as pd
import numpy as np



def extract_features(df: pd.DataFrame) -> pd.Series:
    # Separate the time series into 'before' and 'after' the boundary
    before = df[df["period"] == 0]["value"]
    after = df[df["period"] == 1]["value"]

    features = {
        "mean_diff": after.mean() - before.mean(),
        "std_diff": after.std() - before.std(),
        "median_diff": after.median() - before.median(),
        "iqr_diff": (
            np.percentile(after, 75) - np.percentile(after, 25)
        ) - (
            np.percentile(before, 75) - np.percentile(before, 25)
        ),
        "mean_ratio": after.mean() / (before.mean() + 1e-8),
        "std_ratio": after.std() / (before.std() + 1e-8),
        "skew_diff": after.skew() - before.skew(),
        "kurtosis_diff": after.kurtosis() - before.kurtosis(),
        "min_diff": after.min() - before.min(),
        "max_diff": after.max() - before.max(),
    }

    return pd.Series(features)

In [8]:
from tqdm.auto import tqdm

def prepare_inputs():
    features = []
    groups = inputs_df.groupby("id")

    for i, g in tqdm(groups):
        f = extract_features(g)
        features.append(f)
    
    input_features = pd.DataFrame(features)
    inputs = input_features.to_numpy()
    return inputs, input_features

#inputs, f = prepare_inputs()

In [9]:
#f.to_csv(f"{path}/features.csv" , index=False)
inputs = pd.read_csv(f"{path}/features.csv")
print(inputs.head())
inputs = inputs.to_numpy()

   mean_diff  std_diff  median_diff  iqr_diff  mean_ratio  std_ratio  \
0  -0.000008 -0.000111    -0.000217  0.000035    0.436447   0.984158   
1  -0.000218 -0.000489    -0.000112 -0.000220   -0.700515   0.806390   
2   0.001400  0.005678     0.000648  0.005029    4.598149   1.329721   
3  -0.000055  0.000898    -0.000264  0.000989    0.856212   1.107022   
4   0.000040  0.000094     0.000107  0.000093   -1.486097   1.028504   

   skew_diff  kurtosis_diff  min_diff  max_diff  
0  -0.106954      -0.339531  0.002323 -0.011145  
1  -1.157882       1.695049  0.003525 -0.014110  
2   0.516622       1.672687  0.002784  0.043154  
3   0.362957      -0.511544  0.012217 -0.016013  
4   0.090713       0.024389  0.001669  0.000556  


In [10]:
def preprocess_targets(target_df):
    targets = []
    for id, target in target_df.groupby("id"):
        target = target["structural_breakpoint"].values.astype(np.float32)
        targets.append(target)
    return np.stack(targets)    

targets = preprocess_targets(targets_df)
targets.shape

(10001, 1)

In [11]:
from sklearn.model_selection import train_test_split


def split(inputs, targets, seed):
    train_inputs, eval_inputs, train_targets, eval_targets = train_test_split(
        inputs,
        targets,                      
        test_size=0.2,
        random_state=seed,
        stratify=targets.ravel()
    )
    
    return (
        train_inputs,
        train_targets, 
        eval_inputs,
        eval_targets
    )

data = split(inputs, targets, SEED)
[data[i].shape for i in range(4)]

[(8000, 10), (8000, 1), (2001, 10), (2001, 1)]

In [12]:
train_inputs = data[0]
train_targets = data[1]
eval_inputs = data[2]
eval_targets = data[3]

In [13]:
import torch.nn.functional as F
from torch.utils.data import Dataset
from sklearn.preprocessing import StandardScaler


class SequentialDataset(Dataset):
    def __init__(
        self, 
        inputs, 
        targets,
    ):  
        inputs = torch.tensor(inputs).float()
        targets = torch.tensor(targets).float()
        
        assert len(inputs) == len(targets), "Length Error"
        self.inputs = inputs
        self.targets = targets
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):   
        return self.inputs[index], self.targets[index]


scaler = StandardScaler()
train_inputs = scaler.fit_transform(train_inputs)
eval_inputs = scaler.transform(eval_inputs)

train_ds = SequentialDataset(
    inputs=train_inputs, 
    targets=train_targets, 
)

eval_ds = SequentialDataset(
    inputs=eval_inputs, 
    targets=eval_targets, 
)

In [14]:
from torch.utils.data import DataLoader


def build_loader(
    SEED,
    ds,
    train=True,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
):
    def seed_worker(worker_id):
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    generator = torch.Generator()
    generator.manual_seed(SEED if train else SEED+1)

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        persistent_workers=persistent_workers,
        worker_init_fn=seed_worker,
        generator=generator,
        #sampler=DistributedSampler(
        #    train_ds,
        #    shuffle=True,
        #    drop_last=True,
        #    seed=config.seed
        #)
    )
    
    
def return_dls(train_ds, eval_ds):
    train_dl = build_loader(
        SEED,
        train_ds,
        train=True,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=0,
        drop_last=True,
        pin_memory=True,
        persistent_workers=False,
    )

    eval_dl = build_loader(
        SEED,
        eval_ds,
        train=False,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=0,
        drop_last=True,
        pin_memory=True,
        persistent_workers=False,
    )
    
    return train_dl, eval_dl

In [None]:
import neptune


def setup_neptune():
    if not RESUME:
        neptune_run = neptune.init_run(
            project="arbaaz/crunchdao-structural-break",
            name=MODEL_NAME,
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJlOGE2YjNiZS1mZGUyLTRjYjItYTg5Yy1mZWJkZTIzNzE1NmIifQ=="
        )

        neptune_run["h_parameters"] = {
            "seed": SEED,
            "model_name": MODEL_NAME,
            "optimizer_name": "adamw",
            "learning_rate": LR,
            "scheduler_name": "default",
            "weight_decay": WD,
            "dropout": DROPOUT,
            "num_epochs": EPOCHS,
            "batch_size": BATCH_SIZE,
        }
    else:
        neptune_run = neptune.init_run(
            project="arbaaz/crunchdao-structural-break",
            with_id=config.with_id,
            api_token="eyJhcGlfYWRkcmVzcyI6Imh0dHBzOi8vYXBwLm5lcHR1bmUuYWkiLCJhcGlfdXJsIjoiaHR0cHM6Ly9hcHAubmVwdHVuZS5haSIsImFwaV9rZXkiOiJlOGE2YjNiZS1mZGUyLTRjYjItYTg5Yy1mZWJkZTIzNzE1NmIifQ=="
        )

    return neptune_run

In [16]:
from sklearn.metrics import roc_auc_score


def loss_fn(logits, targets):
    logits = logits.view(-1)
    targets = targets.view(-1)
    return F.binary_cross_entropy_with_logits(logits, targets)


def metric_fn(logits, targets):
    preds = logits.sigmoid().view(-1)
    targets = targets.view(-1)
    preds = to_numpy(preds)
    targets = to_numpy(targets)
    return roc_auc_score(targets, preds)

In [17]:
import torch
import torch.nn as nn

class Model(nn.Module):
    def __init__(self, in_features, n_classes, dropout):
        super().__init__()

        self.layers = nn.Sequential(
            nn.Dropout(0.2),
            nn.LayerNorm(in_features),
            
            nn.Linear(in_features, 2064),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.LayerNorm(2064),

            nn.Linear(2064, 1024),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.LayerNorm(1024),

            nn.Linear(1024, 512),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.LayerNorm(512),
            
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Dropout(0.2),
            nn.LayerNorm(128),

            nn.Linear(128, n_classes)
        )

    def forward(self, x):
        return self.layers(x)


In [18]:
from transformers import get_cosine_schedule_with_warmup
from tqdm.auto import tqdm



MODEL_NAME = "mlp.baseline"
EPOCHS = 100
BATCH_SIZE = 64
WD = 1e-3
LR = 1e-4
DROPOUT = 0.2
SCORE = float('-inf')
LOG = True
RESUME = False
device = "cuda" if torch.cuda.is_available() else "cpu"



model = Model(10, 1, DROPOUT).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
scaler = torch.GradScaler(device)
train_dl, eval_dl = return_dls(train_ds, eval_ds)

total_training_steps = len(train_dl) * EPOCHS
warmup_steps = int(total_training_steps * 0.05)  # e.g. 5% warmup
scheduler = get_cosine_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_training_steps
)

if LOG:
    neptune_run = setup_neptune()



for epoch in tqdm(range(EPOCHS)):
    model.train()
    LOSS = 0.0
    LOGITS = []
    TARGETS = []
    
    for inputs, targets in train_dl:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        
        with torch.autocast(device_type=device, dtype=torch.float16, cache_enabled=True):
            logits = model(inputs)
            loss = loss_fn(logits, targets)
            
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()

        LOSS += loss.detach().cpu()
        LOGITS.append(logits.detach().cpu())
        TARGETS.append(targets.detach().cpu())
    
    LOGITS = torch.stack(LOGITS)
    TARGETS = torch.stack(TARGETS)
    
    auroc = metric_fn(LOGITS, TARGETS)
    LOSS = LOSS / len(train_dl)
    
    
    model.eval()
    EVAL_LOSS = 0.0
    EVAL_LOGITS = []
    EVAL_TARGETS = []

    for inputs, targets in eval_dl:
        inputs = inputs.to(device, non_blocking=True)
        targets = targets.to(device, non_blocking=True)
        
        with torch.inference_mode():
            with torch.autocast(device_type=device, dtype=torch.float16, cache_enabled=True):
                logits = model(inputs)
                loss = loss_fn(logits, targets)
                
        EVAL_LOSS += loss.detach().cpu()
        EVAL_LOGITS.append(logits.detach().cpu())
        EVAL_TARGETS.append(targets.detach().cpu())
    
    EVAL_LOGITS = torch.stack(EVAL_LOGITS)
    EVAL_TARGETS = torch.stack(EVAL_TARGETS)
    
    eval_auroc = metric_fn(EVAL_LOGITS, EVAL_TARGETS)
    EVAL_LOSS = EVAL_LOSS / len(eval_dl)
    
    if eval_auroc > SCORE:
        SCORE = eval_auroc
        data = {"state_dict": model.state_dict()}
        data["epoch"] = epoch 
        data["score"] = SCORE
        torch.save(data, "/kaggle/working/ckpt.pt")
    
    if LOG:
        neptune_run["train/loss"].append(LOSS)
        neptune_run["eval/loss"].append(EVAL_LOSS)
        neptune_run["train/auroc"].append(auroc)
        neptune_run["eval/auroc"].append(eval_auroc)
        
    print(
        f"Epoch: {epoch}, "
        f"train/loss: {LOSS:.4f}, "
        f"eval/loss: {auroc:.4f}, "
        f"train/auroc: {EVAL_LOSS:.4f}, "
        f"eval/auroc: {eval_auroc:.4f}, "
    )



[neptune] [info   ] Neptune initialized. Open in the app: https://app.neptune.ai/arbaaz/crunchdao-structural-break/e/CRUN-141


  0%|          | 0/100 [00:00<?, ?it/s]

Epoch: 0, train/loss: 0.6605, eval/loss: 0.5092, train/auroc: 0.6432, eval/auroc: 0.5484, 
Epoch: 1, train/loss: 0.6169, eval/loss: 0.5268, train/auroc: 0.6144, eval/auroc: 0.5392, 
Epoch: 2, train/loss: 0.6093, eval/loss: 0.5309, train/auroc: 0.6097, eval/auroc: 0.5476, 
Epoch: 3, train/loss: 0.6070, eval/loss: 0.5386, train/auroc: 0.6016, eval/auroc: 0.5515, 
Epoch: 4, train/loss: 0.6055, eval/loss: 0.5364, train/auroc: 0.6025, eval/auroc: 0.5428, 
Epoch: 5, train/loss: 0.6029, eval/loss: 0.5441, train/auroc: 0.6074, eval/auroc: 0.5372, 
Epoch: 6, train/loss: 0.6015, eval/loss: 0.5470, train/auroc: 0.6075, eval/auroc: 0.5388, 
Epoch: 7, train/loss: 0.6016, eval/loss: 0.5475, train/auroc: 0.6040, eval/auroc: 0.5495, 
Epoch: 8, train/loss: 0.6010, eval/loss: 0.5459, train/auroc: 0.5998, eval/auroc: 0.5409, 
Epoch: 9, train/loss: 0.5986, eval/loss: 0.5540, train/auroc: 0.6002, eval/auroc: 0.5424, 
Epoch: 10, train/loss: 0.5982, eval/loss: 0.5564, train/auroc: 0.6047, eval/auroc: 0.5377,

In [19]:
model

Model(
  (layers): Sequential(
    (0): Dropout(p=0.2, inplace=False)
    (1): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=10, out_features=2064, bias=True)
    (3): GELU(approximate='none')
    (4): Dropout(p=0.2, inplace=False)
    (5): LayerNorm((2064,), eps=1e-05, elementwise_affine=True)
    (6): Linear(in_features=2064, out_features=1024, bias=True)
    (7): GELU(approximate='none')
    (8): Dropout(p=0.2, inplace=False)
    (9): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (10): Linear(in_features=1024, out_features=512, bias=True)
    (11): GELU(approximate='none')
    (12): Dropout(p=0.2, inplace=False)
    (13): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
    (14): Linear(in_features=512, out_features=128, bias=True)
    (15): GELU(approximate='none')
    (16): Dropout(p=0.2, inplace=False)
    (17): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
    (18): Linear(in_features=128, out_features=1, bias=True)
  