In [1]:
# default_exp dataloader

# Dataloader

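> The data generator module: loads the pickled space-time windows, splits them into train/validation/test ranges, and serves zero-padded batches of trajectory samples through a PyTorch `Dataset`/`DataLoader`.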

In [2]:
#hide
%reload_ext autoreload
%autoreload 2
from nbdev.showdoc import *
import warnings
warnings.filterwarnings("ignore")

In [4]:
# export
import os
import pandas as pd
import numpy as np
import random
from random import randrange
import pickle

import torch
from torch.utils.data import Dataset, DataLoader

from ti.prep import Transformer

In [5]:
# export
# Directory of this module; falls back to the current directory when
# `__file__` is not defined (e.g. when the cell runs inside a notebook kernel).
if '__file__' in globals():
    file_dir = os.path.dirname(os.path.realpath(__file__))
else:
    file_dir = './'


def getSTW(mode='sim'):
    '''Load the pickled space-time window list of a dataset.

    Args:
        mode: name of the sub-folder of `../data/` (relative to `file_dir`)
            that holds the `space_time_windows` pickle.

    Returns:
        The object stored in the pickle file.

    Raises:
        NotADirectoryError: if the `space_time_windows` file does not exist.
    '''
    window_file = os.path.join(file_dir, '../data/%s/' % mode, 'space_time_windows')
    if os.path.exists(window_file):
        # NOTE(security): unpickling can execute arbitrary code, so only load
        # window files that were produced by this project.
        with open(window_file, 'rb') as handle:
            return pickle.load(handle)
    print("Space time window doesn't exist create one first!: ", window_file)
    raise NotADirectoryError("Data folder not found")

In [6]:
# slow
# usage: getSTW
# Load the space-time windows stored for the chosen dataset mode and print
# how many entries the loaded object has. `mode` and `stw` are reused by the
# usage cells further down.
mode = 'sim'
stw = getSTW(mode=mode)
print(len(stw))

12


In [7]:
# export
def splitData(size):
    '''Split the indices `0 .. size-1` into contiguous train/val/test ranges.

    The first 80% (rounded down) go to training, the next 10% (rounded down)
    to validation, and whatever remains to testing.

    Args:
        size: total number of items to split.

    Returns:
        Tuple `(train_range, val_range, test_range)` of `range` objects.
    '''
    n_train = int(size * 0.8)
    n_val = int(size * 0.1)
    val_end = n_train + n_val
    # The test split takes every remaining index, so it always ends at `size`.
    train_part = range(n_train)
    val_part = range(n_train, val_end)
    test_part = range(val_end, size)
    return train_part, val_part, test_part

In [8]:
# slow
# usage: splitData
# Split the indices of the loaded windows into train / validation / test
# ranges; `train_range` and `val_range` feed the dataset usage cell below.
train_range, val_range, test_range = splitData(len(stw))
print(f'Range (train, val, test): {train_range, val_range, test_range}')

Range (train, val, test): (range(0, 9), range(9, 10), range(10, 12))


In [9]:
# export
class DatasetTraj(Dataset):
    '''Characterizes a dataset for PyTorch

    Every item is drawn at random as one of two kinds of sample:

    * positive (`y = 1`): a prefix of one trajectory, paired with a
      destination step taken from later in the *same* trajectory;
    * negative (`y = 0`): a prefix of one trajectory, paired with a
      destination step taken from a *different* trajectory of the same
      space-time window.

    Args:
        list_ids: indexable collection of ids served by this dataset. Each id
            indexes `space_time_window_list`; in `'sim'` mode it is also the
            CSV file name used for positive samples.
        space_time_window_list: indexable by id; each entry is a sequence of
            trajectory ids (file names of the per-trajectory CSVs).
        mode: `'sim'` reads CSVs from `../data/sim`, any other value reads
            from `../data/real` (both relative to `file_dir`).
    '''
    def __init__(self, list_ids, space_time_window_list, mode='sim'):
        self.list_ids = list_ids
        self.mode = mode
        self.space_time_window_list = space_time_window_list
        # NOTE: the attribute name is misspelled ("trasformer"); it is kept
        # unchanged so code outside this class reading it keeps working.
        self.trasformer = Transformer()

    def __len__(self):
        '''Denotes the total number of samples'''
        return len(self.list_ids)

    def _read_csv(self, name):
        '''Reads `<name>.csv` from the data folder that matches `self.mode`.'''
        folder = 'sim' if self.mode == 'sim' else 'real'
        return pd.read_csv(f'{file_dir}/../data/{folder}/{name}.csv')

    @staticmethod
    def _bounded_randrange(low, high, floor=0):
        '''`randrange(low, high)` with `low` raised to at least `floor`.

        Returns the (raised) lower bound instead of raising `ValueError` when
        the resulting range is empty, so short trajectories still give a sample.
        '''
        low = max(low, floor)
        return randrange(low, high) if high > low else low

    @classmethod
    def _destination_index(cls, total_steps):
        '''Picks the destination step index from the last ~30% of a trajectory.

        Raises:
            ValueError: if the trajectory has no steps.
        '''
        if total_steps == 0:
            raise ValueError('Cannot sample a destination from an empty trajectory')
        # Upper bound is exclusive, matching the original sampling range.
        return cls._bounded_randrange(int(0.7*total_steps), total_steps - 1)

    def __getitem__(self, index):
        '''Generates one sample of data

        Returns:
            `(x1, x2, y)` where `x1` is a non-empty trajectory prefix,
            `x2 = [org, dst]` holds two sequences as long as `x1`
            (assuming the transformer returns equally long outputs), and `y`
            is the label: 1 for a positive sample, 0 for a negative one.

        Raises:
            ValueError: if a loaded trajectory is empty, or (negative samples)
                if the window holds fewer than two trajectories.
        '''
        item_id = self.list_ids[index]
        is_positive = random.getrandbits(1) # label

        if is_positive:
            # Prefix and destination come from the same trajectory.
            if self.mode == 'sim':
                data = self._read_csv(str(item_id))
            else:
                window = self.space_time_window_list[item_id]
                tid = random.choice(window)
                data = self._read_csv(str(int(tid)))
            x1, org = self.trasformer.transform(data)
            dst_idx = self._destination_index(len(x1))
            dst = x1[dst_idx]
            # Keep between 25% and 90% of the steps before the destination,
            # but never fewer than one step: an empty prefix cannot be padded
            # or packed by the consumers of this dataset.
            c_range = self._bounded_randrange(int(.25*dst_idx), int(.9*dst_idx), floor=1)
            x1 = x1[:c_range]
            org = org[:c_range]
            dst = [dst] * len(org)
            x2 = [org, dst]
            y = 1
        else:
            # Destination from one trajectory, prefix from another one in the
            # same space-time window.
            window = self.space_time_window_list[item_id]
            pid, nid = random.sample(window, 2)
            pos_data = self._read_csv(str(int(pid)))
            neg_data = self._read_csv(str(int(nid)))
            pos_x1, _ = self.trasformer.transform(pos_data)
            neg_x1, neg_org = self.trasformer.transform(neg_data)

            neg_total_steps = len(neg_x1)
            if neg_total_steps == 0:
                raise ValueError('Cannot sample a prefix from an empty trajectory')
            dst = pos_x1[self._destination_index(len(pos_x1))]
            # 25%-90% of the negative trajectory, at least one step.
            c_range = self._bounded_randrange(
                int(.25*neg_total_steps), int(.9*neg_total_steps), floor=1)
            x1 = neg_x1[:c_range]
            org = [neg_org[0]] * len(x1)
            dst = [dst] * len(x1)
            x2 = [org, dst]
            y = 0

        return x1, x2, y
    

In [10]:
show_doc(DatasetTraj)

<h2 id="DatasetTraj" class="doc_header"><code>class</code> <code>DatasetTraj</code><a href="" class="source_link" style="float:right">[source]</a></h2>

> <code>DatasetTraj</code>(**\*`args`**, **\*\*`kwds`**) :: `Dataset`

Characterizes a dataset for PyTorch

In [11]:
# export
def zero_padding(batch):
    '''Pads batch of variable length with leading zeros

    Collate function for `DataLoader`: every sequence is left-padded with
    all-zero rows up to the longest `x1` sequence of the batch.

    Args:
        batch: list of samples `(x1, (x2_org, x2_dst), y)`, where `x1`,
            `x2_org` and `x2_dst` are lists of equally sized feature lists.

    Returns:
        `(x1_pad, (x2_org_pad, x2_dst_pad), y, x_seq_lens, max_seq_len)`:
        three float tensors built from the padded sequences, the labels as a
        plain list, the unpadded `x1` lengths, and the longest `x1` length.
    '''
    trajectories, origins, destinations, y = [], [], [], []
    for sample in batch:
        trajectories.append(sample[0])
        origins.append(sample[1][0])
        destinations.append(sample[1][1])
        y.append(sample[2])

    x_seq_lens = list(map(len, trajectories))
    max_seq_len = max(x_seq_lens)
    # Feature width is read from the first step of the first sequence.
    n_dim = len(trajectories[0][0])

    def left_pad(sequences):
        '''Prepends zero rows to each sequence and stacks them into a tensor.'''
        padded = []
        for seq in sequences:
            filler = np.zeros((max_seq_len - len(seq), n_dim)).tolist()
            padded.append(filler + seq)
        return torch.FloatTensor(padded)

    x1_pad = left_pad(trajectories)
    x2_org_pad = left_pad(origins)
    x2_dst_pad = left_pad(destinations)
    return x1_pad, (x2_org_pad, x2_dst_pad), y, x_seq_lens, max_seq_len

In [12]:
show_doc(zero_padding)

<h4 id="zero_padding" class="doc_header"><code>zero_padding</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>zero_padding</code>(**`batch`**)

Pads batch of variable length with leading zeros

In [17]:
# slow
# usage: DatasetTraj
# DataLoader settings shared by both generators; `zero_padding` collates the
# variable-length samples of a batch into padded tensors.
params = {'batch_size': 4, 'shuffle': True, 'collate_fn': zero_padding}

# Index ranges backing each split
partition = {'train': train_range, 'validation': val_range}

# Training dataset and its batch generator
training_set = DatasetTraj(partition['train'], stw, mode=mode)
training_generator = DataLoader(training_set, **params)

# Validation dataset and its batch generator
validation_set = DatasetTraj(partition['validation'], stw, mode=mode)
validation_generator = DataLoader(validation_set, **params)

In [18]:
# slow
# usage: Training Generator
# Smoke check: pull the first three batches from the training generator and
# print their shapes, labels and sequence lengths.
for batch_idx, (x1, x2, y, x_seq_lens, max_seq_len) in enumerate(training_generator):
    print('Batch')
    print(x1.shape)
    # x2 is the (origin, destination) pair returned by the collate function;
    # print both tensors' shapes (the destination shape was previously never shown).
    print(x2[0].shape, x2[1].shape)
    print(y)
    print(x_seq_lens)
    print(max_seq_len)
    if batch_idx >= 2:
        break

Batch
torch.Size([4, 11, 4])
torch.Size([4, 11, 4]) torch.Size([4, 11, 4])
[1, 1, 1, 1]
[2, 10, 3, 11]
11
Batch
torch.Size([4, 9, 4])
torch.Size([4, 9, 4]) torch.Size([4, 9, 4])
[0, 0, 0, 1]
[4, 9, 8, 3]
9
Batch
torch.Size([1, 6, 4])
torch.Size([1, 6, 4]) torch.Size([1, 6, 4])
[1]
[6]
6


# Export -

In [19]:
#hide
from nbdev.export import notebook2script
# Run nbdev's notebook export; the recorded output lists each notebook it converted.
notebook2script()

Converted 00_prep.ipynb.
Converted 01_dataloader.ipynb.
Converted 02_model.ipynb.
Converted 03_train.ipynb.
Converted index.ipynb.
