In [1]:
import torch
from torch.utils import data
from torchvision import transforms
import numpy as np

from utils import load_npz

import os
import time
import random

In [2]:
n_channel = 10

In [25]:
def generate_ids():
    """
    Descr: 
        Aim: To write and returns the partition (Train, Validation and Test) ids 
        with respect to the grid split index (range 1 - 100)
        
        - A random seed value is set within a random intger 1-10,
        - the set is spltted into 80:10:10,
        - save into a text file with the seed value used
    """
    # set a random seed value within the range 1 -10 
    start_time = time.time()
    seed_value = np.random.randint(0,10)
    np.random.seed(seed_value)
    # # block id range 1 - 100 (splitted grid)
    block_range = np.arange(1, 101)

    # Train, Validation and Test
    random.shuffle(block_range)
    train_id = block_range[:80] # 80%
    val_id = block_range[80:90] # 10%
    test_id = block_range[90:] # 10%
    print("Seed value: ", seed_value)
    
    if not os.path.exists("train_val_eval_seed_" + str(seed_value)+".txt"):
        with open("train_val_eval_seed_" + str(seed_value)+".txt", "w") as f:
            f.write("Training: " + str(list(train_id)) + "\n")
            f.write("Validation: " + str(list(val_id)) + "\n")
            f.write("Testing: " + str(list(test_id)) + "\n")
            f.close()
    print('Read set ids completed: %s second' % (time.time() - start_time))
generate_ids()

Seed value:  9
Read set ids completed: 0.0035910606384277344 second


In [3]:
def read_ids(seed_value):
    """
    Read ids from file
    """
    assert seed_value >= 0 and seed_value <= 10
    
    with open("train_val_eval_seed_" + str(seed_value)+".txt", "r") as f:
        lines = f.readlines()
        Train_ids = eval(lines[0].split(":")[1])
        Val_ids = eval(lines[1].split(":")[1])
        test_ids = eval(lines[2].split(":")[1])
    return Train_ids, Val_ids, test_ids

In [None]:
def compute_mean_std(source_sits, target_sits, case):
    """
    Descr: Compute mean and std for each channel
    Input: both SITS dataset(.npz) paths
            Case[1 - 3]:
            1 - concatenate both dataset, while 2 & 3 rep source and target respectively
    The data(from N,LxD) is reshaped into (N,D,L);
        where N - pixel, D - Bands (10), L - Time (33)
    
    """
    
    # case = 1: both, case = 2: target, case = 3: target
    if case == 1:
        sits = [source_sits, target_sits]
    elif case == 2:
        sits = source_sits
    elif case == 3:
        sits = target_sits
    else:
        print('Select case between 1-3')
        return None
    
    # if sits is a list, then it's a list of paths
    if isinstance(sits, list):
        # load data
        X_source = np.load(sits[0])['X']
        X_target = np.load(sits[1])['X']
        # concatenate the data
        X = np.concatenate((X_source, X_target), axis=0)
    # if sits is a string, then it's a path
    else: 
        with np.load(sits) as data:
            X = data['X']

    X = X.reshape(X.shape[0], n_channel, int(X.shape[1]/n_channel))
    # compute mean and std
    X_mean = np.mean(X, axis=(0,2))
    X_std = np.std(X, axis=(0,2))
    print('mean shape: ', X_mean.shape)
    print('std shape: ', X_std.shape)
    # save X_mean and X_std sepearately for sits as txt file
    np.savetxt(os.path.join('mean_'+ str(case) +'.txt'), X_mean)
    np.savetxt(os.path.join('std_'+ str(case) +'.txt'), X_std)

for i in [1,2,3]:
    start_time = time.time()
    source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
    target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"
    compute_mean_std(source_path, target_path, i)
    print("run time: ", time.time() - start_time)

In [34]:
class SITSData(data.Dataset):
    def __init__(self, case_: int,source_path, target_path, seed, partition='train', transform=None):
        self.case_ = case_
        self.source_path = source_path
        self.target_path = target_path
        self.seed = seed
        self.transform = transform
        
        # get partition ids using the read_id() func
        start_time = time.time()
        
        self.train_ids, self.val_ids, self.test_ids = read_ids(self.seed)
        print("read ids completed: %s second" % (time.time() - start_time))

        # select partition
        if partition == 'train':
            self.ids = self.train_ids
        elif partition == 'val':
            self.ids = self.val_ids
        elif partition == 'test':
            self.ids = self.test_ids
        else:
            raise ValueError('Invalid partition: {}'.format(partition))

        # sits = either source_path or target or both based on the case (1,2,3)
        # case = 1: both, case = 2: target, case = 3: target
        if self.case_ == 1:
            sits = [self.source_path, self.target_path]
        elif self.case_ == 2:
            sits = self.source_path
        elif self.case_ == 3:
            sits = self.target_path
        else:
            print('Wrong case!')

        if isinstance(sits, list):
            self.sits = sits
            print('reading files....')
            X_source, y_source, block_ids_source = load_npz(self.sits[0])
            X_target, y_target, block_ids_target = load_npz(self.sits[1])
            
            # concatenate the data
            start_time = time.time()
            data_source = np.concatenate((X_source, y_source[:, None], block_ids_source[:, None]), axis=1)
            data_target = np.concatenate((X_target, y_target[:, None], block_ids_target[:, None]), axis=1)
            print("Concatenating completed: %s seconds" % (time.time() - start_time))
            
            # filter by block_id
            start_time = time.time()
            data_source = data_source[np.isin(data_source[:, -1], self.ids)]
            data_target = data_target[np.isin(data_target[:, -1], self.ids)]
            print("filtering ids completed: %s seconds" % (time.time() - start_time))

            self.X_ = np.concatenate((data_source[:, :-2], data_target[:, :-2]), axis=0)
            self.y_ = np.concatenate((data_source[:, -2], data_target[:, -2]), axis=0)
            
            del X_source
            del y_source
            del block_ids_source
            del data_source
            del data_target
        else:
            self.sits = sits
            start_time = time.time()
            print('reading files....')
            X, y, block_ids = load_npz(self.sits)
            print("load npz: %s seconds" % (time.time() - start_time))
            print(X.dtype)
            
            # concatenate the data
            start_time = time.time()
            data_ = np.concatenate((X, y[:, None], block_ids[:, None]), axis=1)
            print("Concatenating completed: %s seconds" % (time.time() - start_time))

            # filter by block_id
            start_time = time.time()
            data_ = data_[np.isin(data_[:, -1], self.ids)]
            print("filtering ids completed: %s seconds" % (time.time() - start_time))
            
            self.X_ = data_[:, :-2]
            self.y_ = data_[:, -2]
            print("%s dataset shape: " % partition,self.X_.shape)
            
            del X
            del y
            del block_ids
            del data_

    def __len__(self):
        return len(self.y_)

    def __getitem__(self, idx):
        start_time = time.time()
        self.X = self.X_[idx]
        self.y = self.y_[idx]
        print("getting data: %s seconds" % ((time.time() - start_time)*100))

        start_time = time.time()
        self.X = np.array(self.X).astype('float32')
        self.y = np.array(self.y).astype('float32')
        print("conversion: %s seconds" % ((time.time() - start_time)*100))
        
        start_time = time.time()
        self.X = self.X.reshape(int(self.X.shape[0]/n_channel), n_channel)
        print("reshape data: %s seconds" % ((time.time() - start_time)*100))
        

        # transform
        start_time = time.time()
        if self.transform:
            self.X = self.transform(self.X)
        print("transform data: %s seconds" % ((time.time() - start_time)*100))
        print(self.X.shape)
        
        start_time = time.time()
        torch_x = torch.from_numpy(self.X)
        torch_y = torch.from_numpy(self.y)
        print("tensor: %s seconds" % ((time.time() - start_time)*100))
        
        return 0, 0

In [35]:
class standardize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, sample):
        return (sample - self.mean) / self.std

In [36]:
# testing for a single domain (case 2 or 3)
case = 2
#read mean and std files
mean = np.loadtxt('mean_'+str(case)+'.txt')
std = np.loadtxt('std_'+str(case)+'.txt')
seed = 0
transform = transforms.Compose([standardize(mean, std)])

# paths
source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"

# start_time = time.time()
print('train dataset........')
start_time = time.time()
train_dataset = SITSData(case, source_path, target_path, seed, partition='train', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Validation dataset.........')
start_time = time.time()
val_dataset = SITSData(case, source_path, target_path, seed, partition='val', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Test dataset........')
start_time = time.time()
test_dataset = SITSData(case, source_path, target_path, seed, partition='test', transform=transform)
print('total running time: %s' % (time.time() - start_time))

train dataset........
read ids completed: 0.0010828971862792969 second
reading files....
load npz: 68.20331120491028 seconds
uint16
Concatenating completed: 5.404128313064575 seconds
filtering ids completed: 16.801015377044678 seconds
train dataset shape:  (10784283, 330)
total running time: 90.4606704711914
Validation dataset.........
read ids completed: 0.0011830329895019531 second
reading files....
load npz: 68.02338194847107 seconds
uint16
Concatenating completed: 4.7882819175720215 seconds
filtering ids completed: 2.1778595447540283 seconds
val dataset shape:  (1606360, 330)
total running time: 75.014643907547
Test dataset........
read ids completed: 0.001016378402709961 second
reading files....
load npz: 69.27445673942566 seconds
uint16
Concatenating completed: 5.248713731765747 seconds
filtering ids completed: 2.356628894805908 seconds
test dataset shape:  (1701000, 330)
total running time: 76.92407989501953


In [45]:
batch_size = 1
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
val_loader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True)

In [44]:
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_val, y_val = next(iter(val_loader))
print('val dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_test, y_test = next(iter(test_loader))
print('test dataLoader time: ', time.time() - start_time)

getting data: 0.0024318695068359375 seconds
conversion: 0.007724761962890625 seconds
reshape data: 0.0017881393432617188 seconds
transform data: 0.007271766662597656 seconds
(33, 10)
tensor: 0.0030517578125 seconds
Train dataLoader time:  0.004945516586303711
getting data: 0.008893013000488281 seconds
conversion: 0.013375282287597656 seconds
reshape data: 0.0018835067749023438 seconds
transform data: 0.011587142944335938 seconds
(33, 10)
tensor: 0.01938343048095703 seconds
getting data: 0.00133514404296875 seconds
conversion: 0.0034093856811523438 seconds
reshape data: 0.0010728836059570312 seconds
transform data: 0.005078315734863281 seconds
(33, 10)
tensor: 0.00476837158203125 seconds
val dataLoader time:  0.45595383644104004
getting data: 0.009322166442871094 seconds
conversion: 0.016999244689941406 seconds
reshape data: 0.002574920654296875 seconds
transform data: 0.015234947204589844 seconds
(33, 10)
tensor: 0.020742416381835938 seconds
getting data: 0.002765655517578125 seconds
c

In [28]:
import timeit

In [33]:
%timeit next(iter(test_loader))

getting data: 0.05061626434326172 seconds
conversion: 0.013589859008789062 seconds
reshape data: 0.002384185791015625 seconds
transform data: 0.01819133758544922 seconds
(33, 10)
tensor: 0.021076202392578125 seconds
getting data: 0.04856586456298828 seconds
conversion: 0.004410743713378906 seconds
reshape data: 0.0016689300537109375 seconds
transform data: 0.005745887756347656 seconds
(33, 10)
tensor: 0.005125999450683594 seconds
getting data: 0.056672096252441406 seconds
conversion: 0.022411346435546875 seconds
reshape data: 0.0032663345336914062 seconds
transform data: 0.023484230041503906 seconds
(33, 10)
tensor: 0.02033710479736328 seconds
getting data: 0.0537872314453125 seconds
conversion: 0.0034332275390625 seconds
reshape data: 0.0010728836059570312 seconds
transform data: 0.004410743713378906 seconds
(33, 10)
tensor: 0.0025987625122070312 seconds
getting data: 0.04596710205078125 seconds
conversion: 0.0026702880859375 seconds
reshape data: 0.000858306884765625 seconds
transfor

In [46]:
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_val, y_val = next(iter(val_loader))
print('val dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_test, y_test = next(iter(test_loader))
print('test dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)

getting data: 0.007748603820800781 seconds
conversion: 0.008797645568847656 seconds
reshape data: 0.002002716064453125 seconds
transform data: 0.01201629638671875 seconds
(33, 10)
tensor: 0.009441375732421875 seconds
Train dataLoader time:  2.2028355598449707
getting data: 0.007271766662597656 seconds
conversion: 0.012230873107910156 seconds
reshape data: 0.00247955322265625 seconds
transform data: 0.010013580322265625 seconds
(33, 10)
tensor: 0.017881393432617188 seconds
getting data: 0.0013589859008789062 seconds
conversion: 0.0036478042602539062 seconds
reshape data: 0.0012636184692382812 seconds
transform data: 0.005316734313964844 seconds
(33, 10)
tensor: 0.00362396240234375 seconds
getting data: 0.0024557113647460938 seconds
conversion: 0.014710426330566406 seconds
reshape data: 0.0020742416381835938 seconds
transform data: 0.005984306335449219 seconds
(33, 10)
tensor: 0.004363059997558594 seconds
val dataLoader time:  0.5583875179290771
getting data: 0.008893013000488281 seconds

In [11]:
y_train

tensor([ 9,  9,  3,  5, 15,  6, 23, 16,  9,  9,  9,  9, 18,  9,  6,  6, 10,  8,
         2,  9,  9,  9,  8, 16,  9,  9, 23,  9,  8,  9,  5, 13, 10,  9,  2,  9,
         9,  9,  6,  2,  9,  5,  8,  5,  9,  9,  9,  9,  9,  2,  8, 23,  2,  9,
         5,  9, 23,  5,  9,  9,  2,  8,  9,  9,  5,  9,  9, 23, 10,  6,  9,  9,
         9,  8,  9,  9,  3, 10,  9,  8,  9,  5,  5,  6,  6, 10,  6,  2,  9,  6,
         8,  6,  9,  9,  9,  7,  9,  6,  2,  9,  9,  9,  5,  6,  9, 16,  9,  8,
         9,  9,  9,  9,  9,  9,  6,  7,  9,  9,  5,  7,  9,  9,  3, 15,  9, 23,
         9,  9], dtype=torch.int16)

In [None]:
print(x_train.shape)
x_train

In [14]:
# testing for a case 1 (combined both domains)
case = 1
#read mean and std files
mean = np.loadtxt('mean_'+str(case)+'.txt')
std = np.loadtxt('std_'+str(case)+'.txt')
seed = 0
transform = transforms.Compose([standardize(mean, std)])

# paths
source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"

# start_time = time.time()
print('train dataset........')
start_time = time.time()
train_dataset = SITSData(case, source_path, target_path, seed, partition='train', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Validation dataset.........')
start_time = time.time()
val_dataset = SITSData(case, source_path, target_path, seed, partition='val', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Test dataset........')
start_time = time.time()
test_dataset = SITSData(case, source_path, target_path, seed, partition='test', transform=transform)
print('total running time: %s' % (time.time() - start_time))

train dataset........
read ids completed: 0.001753091812133789 second
reading files....
total running time: 134.69096612930298
Validation dataset.........
read ids completed: 0.002171039581298828 second
reading files....
total running time: 116.40252137184143
Test dataset........
read ids completed: 0.0019958019256591797 second
reading files....
total running time: 117.6309289932251


In [15]:
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_val, y_val = next(iter(val_loader))
print('val dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_test, y_test = next(iter(test_loader))
print('test dataLoader time: ', time.time() - start_time)

Train dataLoader time:  1.5588064193725586
val dataLoader time:  0.37462925910949707
test dataLoader time:  0.3885047435760498
