In [1]:
import os

import torch


def print_config(config):
    """Print every setting of ``config`` as one ``key: value`` line.

    Instance attributes are listed first. Class-level attributes follow,
    skipping names that start with a double underscore and names already
    provided by the instance.
    """
    merged = dict(vars(config))

    # Class-level defaults only fill in names the instance does not override.
    class_level = {
        name: attr
        for name, attr in vars(type(config)).items()
        if not (name.startswith('__') or name in merged)
    }
    merged.update(class_level)

    for name, attr in merged.items():
        print(f"{name}: {attr}")


class Config:
    """Static hyperparameters and run settings for this experiment.

    All values are class attributes, so they can be read from either the
    class or an instance.
    """

    # Model
    model_name = "tabnet.86.repro.stopping.at.70"
    input_dim = 6
    target_dim = 1

    # Device & reproducibility
    # Evaluated once, when the class body runs.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    seed = 5274

    # Optimization
    optimizer_name = "AdamW"
    lr = 1e-4
    weight_decay = 1e-3

    batch_size = 1  # // grad_accum_steps

    # Training schedule
    num_epochs = 100
    scheduler_name = "default"

    # Regularization
    dropout = 0.0
    drop_path_rate = 0.0
    label_smoothing = 0.0

    # Experiment tracking
    # The Neptune API token is a credential and must not live in the notebook.
    # It is read from the NEPTUNE_API_TOKEN environment variable and falls back
    # to an empty string when the variable is unset.
    # NOTE(review): a token was previously committed here in plain text; it
    # should be revoked and rotated.
    neptune_token = os.environ.get("NEPTUNE_API_TOKEN", "")
    with_id = ""
    resume = False

# Settings instance read by the cells below (seeding and data-loader construction).
config = Config()

In [2]:
import random
import numpy as np


def setup_reproducibility(config, deterministic=False):
    """Seed the Python, NumPy and PyTorch RNGs and set backend flags.

    Args:
        config: Object exposing an integer ``seed`` attribute.
        deterministic: Forwarded as the mode of
            ``torch.use_deterministic_algorithms``. The default ``False``
            keeps the previous behaviour. Pass ``True`` to request
            deterministic implementations; with ``warn_only=True``, ops that
            lack one emit a warning instead of raising.
    """
    seed = config.seed

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # warn_only only matters when the mode is True; it is passed in both cases
    # so the default call is identical to the original.
    torch.use_deterministic_algorithms(deterministic, warn_only=True)
    torch.set_float32_matmul_precision("high")

# Seed the RNGs from config.seed.
setup_reproducibility(config)

In [3]:
import os

# Dataset directory. The STRUCTURAL_BREAK_DATA_DIR environment variable
# overrides the default, so the notebook can run on machines other than the
# one this absolute path came from.
PATH = os.environ.get("STRUCTURAL_BREAK_DATA_DIR", "/Users/arbaaz/Downloads/break/ds")
inputs_path = os.path.join(PATH, "X_train.parquet")
targets_path = os.path.join(PATH, "y_train.parquet")
# Display the directory contents as the cell output.
os.listdir(PATH)

['y_train.parquet',
 'X_train.parquet',
 '.gitignore',
 'X_test.reduced.parquet',
 'y_test.reduced.parquet']

In [4]:
import pandas as pd

def load_parquet(path):
    """Read the parquet file at ``path`` and return it as a pandas DataFrame."""
    frame = pd.read_parquet(path)
    return frame

# Load the training inputs and targets from the paths defined above.
inputs_df = load_parquet(inputs_path)
targets_df = load_parquet(targets_path)

In [5]:
from sklearn.model_selection import train_test_split

def split(inputs_df, targets_df, seed):
    """Split inputs and targets into train/eval parts by ``id``.

    The ids are divided 80/20 with ``train_test_split``, stratified on the
    ``structural_breakpoint`` column of ``targets_df``.

    Args:
        inputs_df: Frame whose index has a level named ``id``.
        targets_df: Frame that yields an ``id`` column after ``reset_index()``
            and contains a ``structural_breakpoint`` column.
        seed: Passed as ``random_state`` to ``train_test_split``.

    Returns:
        ``(train_inputs, train_targets, eval_inputs, eval_targets)``; the
        target frames are indexed by ``id`` and every frame is a copy.
    """
    targets_flat = targets_df.reset_index()

    train_ids, test_ids = train_test_split(
        targets_flat['id'],
        test_size=0.2,
        random_state=seed,
        stratify=targets_flat['structural_breakpoint'],
    )

    input_ids = inputs_df.index.get_level_values('id')
    target_ids = targets_flat['id']

    def _select_inputs(ids):
        # Keep the input rows whose `id` level belongs to the given ids.
        return inputs_df.loc[input_ids.isin(ids)].copy()

    def _select_targets(ids):
        # Keep the target rows for the given ids, re-indexed by `id`.
        return targets_flat[target_ids.isin(ids)].set_index('id').copy()

    return (
        _select_inputs(train_ids),
        _select_targets(train_ids),
        _select_inputs(test_ids),
        _select_targets(test_ids),
    )
    
# data = (train_inputs, train_targets, eval_inputs, eval_targets)
# NOTE(review): the split uses a literal seed of 1 rather than config.seed — confirm this is intentional.
data = split(inputs_df, targets_df, 1)

In [29]:
import torch
import torch.nn.functional as F


def preprocess_inputs(inputs_df, dimension=2048):
    """Turn each ``id`` group into a fixed-length two-channel float tensor.

    For every group produced by ``inputs_df.groupby("id")``, the ``value`` and
    ``period`` columns are stacked as two channels and resampled to
    ``dimension`` points with ``nearest-exact`` interpolation.

    Returns:
        Tensor of shape ``(num_groups, 2, dimension)``, ordered as ``groupby``
        yields the groups.
    """
    def _resample(group):
        channels = torch.stack(
            [
                torch.tensor(group["value"].values).float(),
                torch.tensor(group["period"].values).float(),
            ],
            dim=0,
        )
        # A leading batch axis is added for F.interpolate and removed afterwards.
        resized = F.interpolate(
            channels.unsqueeze(0), size=dimension, mode="nearest-exact"
        )
        return resized.squeeze(0)  # 2, SEQ LEN

    return torch.stack([_resample(group) for _, group in inputs_df.groupby("id")])

# data[0] / data[2] are the train / eval input frames returned by split().
train_inputs = preprocess_inputs(data[0])
eval_inputs = preprocess_inputs(data[2])
# Display both tensor shapes as the cell output.
train_inputs.shape, eval_inputs.shape

(torch.Size([8000, 2, 2048]), torch.Size([2001, 2, 2048]))

In [30]:
def preprocess_targets(target_df):
    """Collect the ``structural_breakpoint`` values of each ``id`` group.

    Each group's values are cast to int32 and become one row of the result.
    ``torch.stack`` requires every group to contain the same number of rows.

    Returns:
        int32 tensor with one row per group, ordered as ``groupby`` yields them.
    """
    labels = [
        torch.tensor(group["structural_breakpoint"].values.astype(np.int32))
        for _, group in target_df.groupby("id")
    ]
    return torch.stack(labels)

# data[1] / data[3] are the train / eval target frames returned by split().
train_targets = preprocess_targets(data[1])
eval_targets = preprocess_targets(data[3])
# Display both tensor shapes as the cell output.
train_targets.shape, eval_targets.shape

(torch.Size([8000, 1]), torch.Size([2001, 1]))

In [31]:
def preprocess_inputs_for_zscore(train_inputs, eval_inputs):
    """Clip channel 0 of both tensors to the training 1st-99th percentile range.

    The percentiles are computed from ``train_inputs[:, 0]`` only and printed.
    Both tensors are modified in place; the same objects are returned.

    Returns:
        ``(train_inputs, eval_inputs)`` after clipping.
    """
    train_channel = train_inputs[:, 0]
    lower = torch.quantile(train_channel, 0.01).item()
    upper = torch.quantile(train_channel, 0.99).item()

    print("1st percentile:", lower)
    print("99th percentile:", upper)

    # Both slices are views, so the in-place clamp writes through to the inputs.
    eval_inputs[:, 0].clamp_(min=lower, max=upper)
    train_channel.clamp_(min=lower, max=upper)

    return train_inputs, eval_inputs

In [32]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np

class SequentialDataset(Dataset):
    """In-memory dataset of ``(input, target)`` pairs with optional scaling.

    ``inputs`` and ``targets`` are cloned, so the caller's tensors are never
    modified. Scaling is applied only to index 0 along the second axis of
    ``inputs`` (``inputs[:, 0]``).

    Args:
        inputs: Tensor whose first axis indexes samples.
        targets: Tensor with the same length as ``inputs``.
        mean, std: Statistics used when ``zscore=True``.
        min, max: Statistics used when ``minmax=True``.
        minmax: Apply ``(x - min) / (max - min + 1e-8)``.
        zscore: Apply ``(x - mean) / std``. If both flags are set, z-scoring
            runs first and min-max scaling is applied to its result.

    Raises:
        AssertionError: If ``inputs`` and ``targets`` differ in length.
        ValueError: If a scaling mode is enabled without its statistics.
    """

    def __init__(
        self,
        inputs,
        targets,
        mean=None,
        std=None,
        min=None,
        max=None,
        minmax=False,
        zscore=False,
    ):
        # Validate up front so bad arguments fail before any tensor is copied.
        assert len(inputs) == len(targets), "Length Error"
        if zscore and (mean is None or std is None):
            raise ValueError("zscore=True requires both `mean` and `std`")
        if minmax and (min is None or max is None):
            raise ValueError("minmax=True requires both `min` and `max`")

        inputs = inputs.clone()
        targets = targets.clone()

        if zscore:
            inputs = self.perform_zscore(inputs, mean, std)
        if minmax:
            inputs = self.perform_minmax(inputs, min, max)

        self.inputs = inputs
        self.targets = targets

    def perform_zscore(self, inputs, mean, std):
        """Standardise ``inputs[:, 0]`` in place and return ``inputs``."""
        # One vectorised assignment replaces the per-sample Python loop.
        if len(inputs):
            inputs[:, 0] = (inputs[:, 0] - mean) / std
        return inputs

    def perform_minmax(self, inputs, min, max):
        """Min-max scale ``inputs[:, 0]`` in place and return ``inputs``."""
        # The 1e-8 term guards against division by zero when max == min.
        if len(inputs):
            inputs[:, 0] = (inputs[:, 0] - min) / (max - min + 1e-8)
        return inputs

    def __len__(self):
        """Return the number of samples."""
        return len(self.inputs)

    def __getitem__(self, index):
        """Return the ``(input, target)`` pair at ``index``."""
        return self.inputs[index], self.targets[index]
    

# Clip channel 0 of both tensors to the training percentile range.
# The helper modifies the tensors in place and returns the same objects.
train_inputs, eval_inputs = preprocess_inputs_for_zscore(train_inputs, eval_inputs)

# Scaling statistics come from the clipped training values only and are
# reused unchanged for the eval dataset.
x = train_inputs[:, 0]
mean, std = x.mean(), x.std()
# NOTE(review): `min` and `max` rebind the Python builtins for the rest of the
# notebook; the names are kept because later cells may read them.
min, max = x.min(), x.max()
print(min, max, mean, std)

# Both datasets share the same statistics and flags.
norm_kwargs = dict(
    mean=mean,
    std=std,
    min=min,
    max=max,
    zscore=True,
    minmax=False,
)

train_ds = SequentialDataset(inputs=train_inputs, targets=train_targets, **norm_kwargs)
eval_ds = SequentialDataset(inputs=eval_inputs, targets=eval_targets, **norm_kwargs)

1st percentile: -0.05925929918885231
99th percentile: 0.06588339805603027
tensor(-0.0593) tensor(0.0659) tensor(0.0004) tensor(0.0163)


In [33]:
from torch.utils.data import DataLoader


def build_loader(
    SEED,
    ds,
    train=True,
    batch_size=1,
    shuffle=False,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
):
    """Build a ``DataLoader`` with a seeded generator and seeded workers.

    Args:
        SEED: Base seed. The loader's generator is seeded with ``SEED`` when
            ``train`` is true and with ``SEED + 1`` otherwise.
        ds: Dataset to wrap.
        train: Selects which of the two seeds is used.
        batch_size, shuffle, num_workers, drop_last, pin_memory,
        persistent_workers: Forwarded unchanged to ``DataLoader``.

    Returns:
        The configured ``DataLoader``.
    """
    def seed_worker(worker_id):
        # Reseed NumPy and `random` in each worker from torch's initial seed.
        derived = torch.initial_seed() % 2**32
        np.random.seed(derived)
        random.seed(derived)

    loader_seed = SEED if train else SEED + 1
    generator = torch.Generator().manual_seed(loader_seed)

    options = dict(
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        persistent_workers=persistent_workers,
        worker_init_fn=seed_worker,
        generator=generator,
    )
    return DataLoader(ds, **options)
    
    
# NOTE(review): the training loader is built with shuffle=False — confirm that
# iterating the training set in a fixed order is intended.
train_dl = build_loader(
    config.seed,
    train_ds,
    train=True,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=0,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
)

# Evaluation keeps the final partial batch (drop_last=False), so no eval sample
# is skipped if config.batch_size is raised above 1. With batch_size == 1 this
# yields the same batches as before.
eval_dl = build_loader(
    config.seed,
    eval_ds,
    train=False,
    batch_size=config.batch_size,
    shuffle=False,
    num_workers=0,
    drop_last=False,
    pin_memory=True,
    persistent_workers=False,
)

In [34]:
# Sanity check: mean/std of channel 0 for the first sample of each of the
# first 11 training batches. zip(range(11), ...) bounds the loop, so the
# builtin `id` is no longer rebound as a loop counter.
for step, (batch_inputs, batch_targets) in zip(range(11), train_dl):
    print(batch_inputs[0, 0].mean(), batch_inputs[0, 0].std())

tensor(-0.0230) tensor(0.4322)
tensor(-0.0188) tensor(0.1495)
tensor(-0.0249) tensor(0.2040)
tensor(0.0809) tensor(1.0818)
tensor(-0.0132) tensor(1.8295)
tensor(-0.0174) tensor(0.0879)
tensor(-0.0038) tensor(1.1149)
tensor(-0.0220) tensor(0.0447)
tensor(0.0279) tensor(1.1830)
tensor(0.0634) tensor(1.0301)
tensor(0.0351) tensor(0.7787)




In [35]:
# Sanity check: mean/std of channel 0 for the first sample of each of the
# first 11 eval batches. zip(range(11), ...) bounds the loop, so the
# builtin `id` is no longer rebound as a loop counter.
for step, (batch_inputs, batch_targets) in zip(range(11), eval_dl):
    print(batch_inputs[0, 0].mean(), batch_inputs[0, 0].std())

tensor(0.0058) tensor(1.0911)
tensor(-0.0043) tensor(0.5305)
tensor(-0.0077) tensor(0.5707)
tensor(-0.0084) tensor(0.2564)
tensor(0.0114) tensor(1.2874)
tensor(-0.0233) tensor(0.3770)
tensor(0.0180) tensor(0.7214)
tensor(0.0364) tensor(0.8269)
tensor(0.0278) tensor(1.3101)
tensor(-0.0292) tensor(0.1756)
tensor(-0.0662) tensor(1.6382)


