In [107]:
import os

PATH = "/Users/arbaaz/Downloads/break/ds"
inputs_path = PATH+"/X_train.parquet"
targets_path = PATH+"/y_train.parquet"
os.listdir(PATH)

['y_train.parquet',
 'X_train.parquet',
 '.gitignore',
 'X_test.reduced.parquet',
 'y_test.reduced.parquet']

In [106]:
import pandas as pd

def load_parquet(path):
    return pd.read_parquet(path)

inputs_df = load_parquet(inputs_path)
targets_df = load_parquet(targets_path)

In [None]:
from sklearn.model_selection import train_test_split

In [109]:
targets_df.reset_index

<bound method DataFrame.reset_index of        structural_breakpoint
id                          
0                      False
1                      False
2                       True
3                      False
4                      False
...                      ...
9996                   False
9997                   False
9998                   False
9999                   False
10000                   True

[10001 rows x 1 columns]>

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset
import numpy as np

class SequentialDataset(Dataset):
    def __init__(
        self, 
        inputs_df, 
        target_df,
        minmax=False,
        zscore=False,
        dimension=2048
    ):  
        if zscore:
            inputs_df = self.perform_zscore(inputs_df)
        if minmax:
            inputs_df = self.perform_minmax(inputs_df)
        
        inputs = []    
        for id, seq in inputs_df.groupby("id"):
            value = seq["value"].values.astype(np.float32)
            period = seq["period"].values.astype(np.float32)
            input = torch.tensor((value, period)).unsqueeze(0)
            input = F.interpolate(input, size=dimension, mode="nearest-exact").squeeze(0)
            inputs.append(input) # 2, SEQ LEN
            
        targets = []
        for id, target in target_df.groupby("id"):
            target = target["structural_breakpoint"].values.astype(np.int32)
            targets.append(torch.tensor(target))
            
        assert len(inputs) == len(targets), "Length Error"
        self.inputs = inputs
        self.targets = targets
            
    def perform_zscore(self, inputs_df):
        inputs_df['value'] = inputs_df.groupby('id')['value'].transform(
            lambda x: (x - x.mean()) / (x.std() + 1e-8)
        )
        return inputs_df

    def perform_minmax(self, inputs_df):
        inputs_df['value'] = inputs_df.groupby('id')['value'].transform(
            lambda x: (x - x.min()) / (x.max() - x.min() + 1e-8)
        )
        return inputs_df
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):   
        return self.inputs[index], self.targets[index]

ds = SequentialDataset(inputs_df, targets_df, zscore=True, dimension=2048)

In [96]:
from torch.utils.data import DataLoader


def build_loader(
    SEED,
    ds,
    train=True,
    batch_size=16,
    shuffle=False,
    num_workers=4,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
):
    def seed_worker(worker_id):
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)

    generator = torch.Generator()
    generator.manual_seed(SEED if train else SEED+1)

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
        drop_last=drop_last,
        persistent_workers=persistent_workers,
        worker_init_fn=seed_worker,
        generator=generator,
        #sampler=DistributedSampler(
        #    train_ds,
        #    shuffle=True,
        #    drop_last=True,
        #    seed=config.seed
        #)
    )
    
    
train_dl = build_loader(
    1,
    ds,
    train=True,
    batch_size=4,
    shuffle=True,
    num_workers=0,
    drop_last=True,
    pin_memory=True,
    persistent_workers=False,
)

In [97]:
from tqdm.auto import tqdm

for i, t in tqdm(train_dl):
    pass



  0%|          | 0/2500 [00:00<?, ?it/s]