In [None]:
import torch
from torch.utils import data
from torchvision import transforms
import numpy as np

from utils import load_npz

import os
import time
import random

In [2]:
n_channel = 10

In [3]:
def generate_ids():
    """
    Descr: 
        Aim: To write and returns the partition (Train, Validation and Test) ids 
        with respect to the grid split index (range 1 - 100)
        
        - A random seed value is set within a random intger 1-10,
        - the set is spltted into 80:10:10,
        - save into a text file with the seed value used
    """
    # set a random seed value within the range 1 -10 
    start_time = time.time()
    seed_value = np.random.randint(0,10)
    np.random.seed(seed_value)
    # # block id range 1 - 100 (splitted grid)
    block_range = np.arange(1, 101)

    # Train, Validation and Test
    random.shuffle(block_range)
    train_id = block_range[:80] # 80%
    val_id = block_range[80:90] # 10%
    test_id = block_range[90:] # 10%
    print("Seed value: ", seed_value)
    
    if not os.path.exists("./ids/train_val_eval_seed_" + str(seed_value)+".txt"):
        with open("./ids/train_val_eval_seed_" + str(seed_value)+".txt", "w") as f:
            f.write("Training: " + str(list(train_id)) + "\n")
            f.write("Validation: " + str(list(val_id)) + "\n")
            f.write("Testing: " + str(list(test_id)) + "\n")
            f.close()
    print('Read set ids completed: %s second' % (time.time() - start_time))
# generate_ids()

In [4]:
def read_ids(seed_value):
    """
    Read ids from file
    """
    assert seed_value >= 0 and seed_value <= 10
    
    with open("./ids/train_val_eval_seed_" + str(seed_value)+".txt", "r") as f:
        lines = f.readlines()
        Train_ids = eval(lines[0].split(":")[1])
        Val_ids = eval(lines[1].split(":")[1])
        test_ids = eval(lines[2].split(":")[1])
    return Train_ids, Val_ids, test_ids

In [5]:
def compute_mean_std(source_sits, target_sits, case):
    """
    Descr: Compute mean and std for each channel
    Input: both SITS dataset(.npz) paths
            Case[1 - 3]:
            1 - concatenate both dataset, while 2 & 3 rep source and target respectively
    The data(from N,LxD) is reshaped into (N,D,L);
        where N - pixel, D - Bands (10), L - Time (33)
    
    """
    
    # case = 1: both, case = 2: target, case = 3: target
    if case == 1:
        sits = [source_sits, target_sits]
    elif case == 2:
        sits = source_sits
    elif case == 3:
        sits = target_sits
    else:
        print('Select case between 1-3')
        return None
    
    # if sits is a list, then it's a list of paths
    if isinstance(sits, list):
        # load data
        X_source = np.load(sits[0])['X']
        X_target = np.load(sits[1])['X']
        # concatenate the data
        X = np.concatenate((X_source, X_target), axis=0)
    # if sits is a string, then it's a path
    else: 
        with np.load(sits) as data:
            X = data['X']

    X = X.reshape(X.shape[0], n_channel, int(X.shape[1]/n_channel))
    # compute mean and std
    X_mean = np.mean(X, axis=(0,2))
    X_std = np.std(X, axis=(0,2))
    print('mean shape: ', X_mean.shape)
    print('std shape: ', X_std.shape)
    # save X_mean and X_std sepearately for sits as txt file
    np.savetxt(os.path.join('mean_'+ str(case) +'.txt'), X_mean)
    np.savetxt(os.path.join('std_'+ str(case) +'.txt'), X_std)

# for i in [1,2,3]:
#     start_time = time.time()
#     source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
#     target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"
#     compute_mean_std(source_path, target_path, i)
#     print("run time: ", time.time() - start_time)

In [6]:
def compute_mean_stdv2(sits, domain='source'):
    """
    Descr: Compute mean and std for each channel
    Input: both SITS dataset(.npz) paths
    The data(from N,LxD) is reshaped into (N,D,L);
        where N - pixel, D - Bands (10), L - Time (33)
    """
    with np.load(sits) as data:
            X = data['X']

    X = X.reshape(X.shape[0], n_channel, int(X.shape[1]/n_channel))
    # compute mean and std
    X_mean = np.mean(X, axis=(0,2))
    X_std = np.std(X, axis=(0,2))
    print('mean shape: ', X_mean.shape)
    print('std shape: ', X_std.shape)
    # save X_mean and X_std sepearately for sits as txt file
    np.savetxt(os.path.join('./mean_std/', domain + '_mean.txt'), X_mean)
    np.savetxt(os.path.join('./mean_std/', domain + '_std.txt'), X_std)

source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"
# compute_mean_stdv2(source_path, 'source')
# compute_mean_stdv2(target_path, domain = 'target')

In [6]:
# class SITSData(data.Dataset):
#     def __init__(self, case_: int,source_path, target_path, seed, partition='train', transform=None):
#         self.case_ = case_
#         self.source_path = source_path
#         self.target_path = target_path
#         self.seed = seed
#         self.transform = transform
        
#         # get partition ids using the read_id() func
#         start_time = time.time()
        
#         self.train_ids, self.val_ids, self.test_ids = read_ids(self.seed)
#         print("read ids completed: %s second" % (time.time() - start_time))

#         # select partition
#         if partition == 'train':
#             self.ids = self.train_ids
#         elif partition == 'val':
#             self.ids = self.val_ids
#         elif partition == 'test':
#             self.ids = self.test_ids
#         else:
#             raise ValueError('Invalid partition: {}'.format(partition))

#         # sits = either source_path or target or both based on the case (1,2,3)
#         # case = 1: both, case = 2: target, case = 3: target
#         if self.case_ == 1:
#             sits = [self.source_path, self.target_path]
#         elif self.case_ == 2:
#             sits = self.source_path
#         elif self.case_ == 3:
#             sits = self.target_path
#         else:
#             print('Wrong case!')

#         if isinstance(sits, list):
#             self.sits = sits
#             print('reading files....')
#             X_source, y_source, block_ids_source = load_npz(self.sits[0])
#             X_target, y_target, block_ids_target = load_npz(self.sits[1])
            
#             # concatenate the data
#             start_time = time.time()
#             data_source = np.concatenate((X_source, y_source[:, None], block_ids_source[:, None]), axis=1)
#             data_target = np.concatenate((X_target, y_target[:, None], block_ids_target[:, None]), axis=1)
#             print("Concatenating completed: %s seconds" % (time.time() - start_time))
            
#             # filter by block_id
#             start_time = time.time()
#             data_source = data_source[np.isin(data_source[:, -1], self.ids)]
#             data_target = data_target[np.isin(data_target[:, -1], self.ids)]
#             print("filtering ids completed: %s seconds" % (time.time() - start_time))

#             self.X_ = np.concatenate((data_source[:, :-2], data_target[:, :-2]), axis=0)
#             self.y_ = np.concatenate((data_source[:, -2], data_target[:, -2]), axis=0)
            
#             del X_source
#             del y_source
#             del block_ids_source
#             del data_source
#             del data_target
#         else:
#             self.sits = sits
#             start_time = time.time()
#             print('reading files....')
#             X, y, block_ids = load_npz(self.sits)
#             print("load npz: %s seconds" % (time.time() - start_time))
#             print(X.dtype)
            
#             # concatenate the data
#             start_time = time.time()
#             data_ = np.concatenate((X, y[:, None], block_ids[:, None]), axis=1)
#             print("Concatenating completed: %s seconds" % (time.time() - start_time))

#             # filter by block_id
#             start_time = time.time()
#             data_ = data_[np.isin(data_[:, -1], self.ids)]
#             print("filtering ids completed: %s seconds" % (time.time() - start_time))
            
#             self.X_ = data_[:, :-2]
#             self.y_ = data_[:, -2]
#             print("%s dataset shape: " % partition,self.X_.shape)
            
#             del X
#             del y
#             del block_ids
#             del data_

#     def __len__(self):
#         return len(self.y_)

#     def __getitem__(self, idx):
#         start_time = time.time()
#         self.X = self.X_[idx]
#         self.y = self.y_[idx]
#         print("getting data: %s seconds" % ((time.time() - start_time)*100))

#         start_time = time.time()
#         self.X = np.array(self.X).astype('float32')
#         self.y = np.array(self.y).astype('float32')
#         print("conversion: %s seconds" % ((time.time() - start_time)*100))
        
#         start_time = time.time()
#         self.X = self.X.reshape(int(self.X.shape[0]/n_channel), n_channel)
#         print("reshape data: %s seconds" % ((time.time() - start_time)*100))
        

#         # transform
#         start_time = time.time()
#         if self.transform:
#             self.X = self.transform(self.X)
#         print("transform data: %s seconds" % ((time.time() - start_time)*100))
#         print(self.X.shape)
        
#         start_time = time.time()
#         torch_x = torch.from_numpy(self.X)
#         torch_y = torch.from_numpy(self.y)
#         print("tensor: %s seconds" % ((time.time() - start_time)*100))
        
#         return torch_x, torch_y

In [7]:
class SITSDatav2(data.Dataset):
    def __init__(self, sits, seed, partition='train', transform=None):
        
        self.sits = sits
        self.seed = seed
        self.transform = transform
        
        # get partition ids using the read_id() func
        start_time = time.time()
        self.train_ids, self.val_ids, self.test_ids = read_ids(self.seed)
        print("read ids completed: %s second" % (time.time() - start_time))

        # select partition
        if partition == 'train':
            self.ids = self.train_ids
        elif partition == 'val':
            self.ids = self.val_ids
        elif partition == 'test':
            self.ids = self.test_ids
        else:
            raise ValueError('Invalid partition: {}'.format(partition))
        
        start_time = time.time()
        print('reading files....')
        X, y, block_ids = load_npz(self.sits)
        print("load npz: %s seconds" % (time.time() - start_time))
        print(X.dtype)
        
        # concatenate the data
        start_time = time.time()
        data_ = np.concatenate((X, y[:, None], block_ids[:, None]), axis=1)
        print("Concatenating completed: %s seconds" % (time.time() - start_time))

        # filter by block_id
        start_time = time.time()
        data_ = data_[np.isin(data_[:, -1], self.ids)]
        print("filtering ids completed: %s seconds" % (time.time() - start_time))
        
        self.X_ = data_[:, :-2]
        self.y_ = data_[:, -2]
        print("%s dataset shape: " % partition,self.X_.shape)
        
        del X
        del y
        del block_ids
        del data_

    def __len__(self):
        return len(self.y_)

    def __getitem__(self, idx):
        start_time = time.time()
        self.X = self.X_[idx]
        self.y = self.y_[idx]
        print("getting data: %s seconds" % ((time.time() - start_time)*100))

        start_time = time.time()
        self.X = np.array(self.X).astype('float32')
        self.y = np.array(self.y).astype('float32')
        print("conversion: %s seconds" % ((time.time() - start_time)*100))
        
        start_time = time.time()
        self.X = self.X.reshape(int(self.X.shape[0]/n_channel), n_channel)
        print("reshape data: %s seconds" % ((time.time() - start_time)*100))

        # transform
        start_time = time.time()
        if self.transform:
            self.X = self.transform(self.X)
        print("transform data: %s seconds" % ((time.time() - start_time)*100))
        print(self.X.shape)
        
        start_time = time.time()
        torch_x = torch.from_numpy(self.X)
        torch_y = torch.from_numpy(self.y)
        print("tensor: %s seconds" % ((time.time() - start_time)*100))
        
        return torch_x, torch_y

In [8]:
class standardize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, sample):
        return (sample - self.mean) / self.std

In [9]:
mean = np.loadtxt('./mean_std/source_mean.txt')
std = np.loadtxt('./mean_std/source_std.txt')
seed = 0 
transform = transforms.Compose([standardize(mean, std)])

# paths
source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"

train_dataset = SITSDatav2(source_path, seed, partition='train', transform=transform)

read ids completed: 0.02151179313659668 second
reading files....
load npz: 99.5813729763031 seconds
uint16
Concatenating completed: 6.561253070831299 seconds
filtering ids completed: 13.308213472366333 seconds
train dataset shape:  (10784283, 330)


In [10]:
batch_size = 2
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True)

In [20]:
def recursive_todevice(x, device):
    if isinstance(x, torch.Tensor):
        return x.to(device)
    else:
        return [recursive_todevice(c, device) for c in x]

In [25]:
y_true = []
for i, (x,y) in enumerate(train_loader):
    # y_true.extend(list(map(int,y)))
    print(x)
    x = recursive_todevice(x)
    break

SyntaxError: invalid syntax (<ipython-input-25-3e0179226cff>, line 6)

In [18]:
y_true

[2, 9]

In [10]:
start_time = time.time()
train_iter = iter(train_loader)
next(train_iter)
next(train_iter)
print('Train dataLoader time: ', time.time() - start_time)

getting data: 0.06923675537109375 seconds
conversion: 0.0225067138671875 seconds
reshape data: 0.0034332275390625 seconds
transform data: 0.023627281188964844 seconds
(33, 10)
tensor: 0.02582073211669922 seconds
getting data: 0.05042552947998047 seconds
conversion: 0.006318092346191406 seconds
reshape data: 0.002574920654296875 seconds
transform data: 0.009632110595703125 seconds
(33, 10)
tensor: 0.0058650970458984375 seconds
getting data: 0.0095367431640625 seconds
conversion: 0.0041484832763671875 seconds
reshape data: 0.0021696090698242188 seconds
transform data: 0.009632110595703125 seconds

(33, 10)Train dataLoader time:  13.386313676834106
tensor: 0.00438690185546875 seconds
getting data: 0.0017642974853515625 secondsconversion: 0.0039577484130859375 seconds


reshape data: 0.0013589859008789062 secondstransform data: 0.0049591064453125 seconds
(33, 10)
tensor: 0.0026226043701171875 seconds


In [None]:
# # testing for a single domain (case 2 or 3)
# case = 2
# #read mean and std files
# mean = np.loadtxt('mean_'+str(case)+'.txt')
# std = np.loadtxt('std_'+str(case)+'.txt')
# seed = 0
# transform = transforms.Compose([standardize(mean, std)])

# # paths
# source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
# target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"

# # start_time = time.time()
# print('train dataset........')
# start_time = time.time()
# train_dataset = SITSData(case, source_path, target_path, seed, partition='train', transform=transform)
# print('total running time: %s' % (time.time() - start_time))
# print('Validation dataset.........')
# start_time = time.time()
# val_dataset = SITSData(case, source_path, target_path, seed, partition='val', transform=transform)
# print('total running time: %s' % (time.time() - start_time))
# print('Test dataset........')
# start_time = time.time()
# test_dataset = SITSData(case, source_path, target_path, seed, partition='test', transform=transform)
# print('total running time: %s' % (time.time() - start_time))

In [9]:
# batch_size = 128
# train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
# val_loader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)
# test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)

In [10]:
# start_time = time.time()
# train_iter = iter(train_loader)
# next(train_iter)
# next(train_iter)
# print('Train dataLoader time: ', time.time() - start_time)

getting data: 0.007700920104980469 seconds
conversion: 0.013184547424316406 seconds
reshape data: 0.0024318695068359375 seconds
transform data: 0.020194053649902344 seconds
(33, 10)
tensor: 0.030612945556640625 seconds
getting data: 0.005984306335449219 seconds
conversion: 0.008702278137207031 seconds
reshape data: 0.002193450927734375 seconds
transform data: 0.009965896606445312 seconds
(33, 10)
tensor: 0.005245208740234375 seconds
Train dataLoader time:  7.261138439178467


In [10]:
# start_time = time.time()
# next(train_iter)
# next(train_iter)
# print('Train dataLoader time: ', time.time() - start_time)

getting data: 0.002193450927734375 seconds
conversion: 0.004506111145019531 seconds
reshape data: 0.00133514404296875 seconds
transform data: 0.006103515625 seconds
(33, 10)
tensor: 0.006651878356933594 seconds
getting data: 0.001811981201171875 seconds
conversion: 0.0020265579223632812 seconds
reshape data: 0.0008344650268554688 seconds
transform data: 0.003528594970703125 seconds
(33, 10)
tensor: 0.002288818359375 seconds
Train dataLoader time:  0.00993800163269043


In [11]:
start_time = time.time()
val_iter = iter(val_loader)
next(val_iter)
next(val_iter)
print('Val dataLoader time: ', time.time() - start_time)

getting data: 0.0021219253540039062 seconds
conversion: 0.0030517578125 seconds
reshape data: 0.001239776611328125 seconds
transform data: 0.00514984130859375 seconds
(33, 10)
tensor: 0.0042438507080078125 seconds
getting data: 0.0013828277587890625 seconds
conversion: 0.001621246337890625 seconds
reshape data: 0.0007152557373046875 seconds
transform data: 0.0027418136596679688 seconds
(33, 10)
tensor: 0.0016689300537109375 seconds
Val dataLoader time:  0.19984221458435059


In [12]:
start_time = time.time()
next(val_iter)
next(val_iter)
print('Val dataLoader time: ', time.time() - start_time)

getting data: 0.0021696090698242188 seconds
conversion: 0.0024557113647460938 seconds
reshape data: 0.0008344650268554688 seconds
transform data: 0.004410743713378906 seconds
(33, 10)
tensor: 0.004839897155761719 seconds
getting data: 0.001239776611328125 seconds
conversion: 0.0017404556274414062 seconds
reshape data: 0.0006437301635742188 seconds
transform data: 0.0027894973754882812 seconds
(33, 10)
tensor: 0.0013828277587890625 seconds
Val dataLoader time:  0.008423566818237305


In [13]:
start_time = time.time()
test_iter = iter(test_loader)
next(test_iter)
next(test_iter)
print('Test dataLoader time: ', time.time() - start_time)

getting data: 0.0018835067749023438 seconds
conversion: 0.0031232833862304688 seconds
reshape data: 0.0011444091796875 seconds
transform data: 0.004649162292480469 seconds
(33, 10)
tensor: 0.004172325134277344 seconds
getting data: 0.000858306884765625 seconds
conversion: 0.0014066696166992188 seconds
reshape data: 0.00057220458984375 seconds
transform data: 0.0021696090698242188 seconds
(33, 10)
tensor: 0.0012159347534179688 seconds
Test dataLoader time:  0.2181689739227295


In [14]:
start_time = time.time()
next(test_iter)
next(test_iter)
print('Test dataLoader time: ', time.time() - start_time)

getting data: 0.0017881393432617188 seconds
conversion: 0.019669532775878906 seconds
reshape data: 0.000858306884765625 seconds
transform data: 0.0051021575927734375 seconds
(33, 10)
tensor: 0.004553794860839844 seconds
getting data: 0.0009059906005859375 seconds
conversion: 0.0015020370483398438 seconds
reshape data: 0.0006198883056640625 seconds
transform data: 0.0027418136596679688 seconds
(33, 10)
tensor: 0.00152587890625 seconds
Test dataLoader time:  0.0036427974700927734


In [None]:
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_val, y_val = next(iter(val_loader))
print('val dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_test, y_test = next(iter(test_loader))
print('test dataLoader time: ', time.time() - start_time)

In [28]:
import timeit

In [None]:
%timeit next(iter(test_loader))

In [None]:
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_val, y_val = next(iter(val_loader))
print('val dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_test, y_test = next(iter(test_loader))
print('test dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)

In [11]:
y_train

tensor([ 9,  9,  3,  5, 15,  6, 23, 16,  9,  9,  9,  9, 18,  9,  6,  6, 10,  8,
         2,  9,  9,  9,  8, 16,  9,  9, 23,  9,  8,  9,  5, 13, 10,  9,  2,  9,
         9,  9,  6,  2,  9,  5,  8,  5,  9,  9,  9,  9,  9,  2,  8, 23,  2,  9,
         5,  9, 23,  5,  9,  9,  2,  8,  9,  9,  5,  9,  9, 23, 10,  6,  9,  9,
         9,  8,  9,  9,  3, 10,  9,  8,  9,  5,  5,  6,  6, 10,  6,  2,  9,  6,
         8,  6,  9,  9,  9,  7,  9,  6,  2,  9,  9,  9,  5,  6,  9, 16,  9,  8,
         9,  9,  9,  9,  9,  9,  6,  7,  9,  9,  5,  7,  9,  9,  3, 15,  9, 23,
         9,  9], dtype=torch.int16)

In [None]:
print(x_train.shape)
x_train

In [14]:
# testing for a case 1 (combined both domains)
case = 1
#read mean and std files
mean = np.loadtxt('mean_'+str(case)+'.txt')
std = np.loadtxt('std_'+str(case)+'.txt')
seed = 0
transform = transforms.Compose([standardize(mean, std)])

# paths
source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"

# start_time = time.time()
print('train dataset........')
start_time = time.time()
train_dataset = SITSData(case, source_path, target_path, seed, partition='train', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Validation dataset.........')
start_time = time.time()
val_dataset = SITSData(case, source_path, target_path, seed, partition='val', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Test dataset........')
start_time = time.time()
test_dataset = SITSData(case, source_path, target_path, seed, partition='test', transform=transform)
print('total running time: %s' % (time.time() - start_time))

train dataset........
read ids completed: 0.001753091812133789 second
reading files....
total running time: 134.69096612930298
Validation dataset.........
read ids completed: 0.002171039581298828 second
reading files....
total running time: 116.40252137184143
Test dataset........
read ids completed: 0.0019958019256591797 second
reading files....
total running time: 117.6309289932251


In [15]:
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_val, y_val = next(iter(val_loader))
print('val dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_test, y_test = next(iter(test_loader))
print('test dataLoader time: ', time.time() - start_time)

Train dataLoader time:  1.5588064193725586
val dataLoader time:  0.37462925910949707
test dataLoader time:  0.3885047435760498
