In [1]:
import torch
from torch.utils import data
from torchvision import transforms
import numpy as np

from utils import load_npz

import os
import time
import random

In [2]:
n_channel = 10

In [25]:
def generate_ids():
    """
    Descr: 
        Aim: To write and returns the partition (Train, Validation and Test) ids 
        with respect to the grid split index (range 1 - 100)
        
        - A random seed value is set within a random intger 1-10,
        - the set is spltted into 80:10:10,
        - save into a text file with the seed value used
    """
    # set a random seed value within the range 1 -10 
    start_time = time.time()
    seed_value = np.random.randint(0,10)
    np.random.seed(seed_value)
    # # block id range 1 - 100 (splitted grid)
    block_range = np.arange(1, 101)

    # Train, Validation and Test
    random.shuffle(block_range)
    train_id = block_range[:80] # 80%
    val_id = block_range[80:90] # 10%
    test_id = block_range[90:] # 10%
    print("Seed value: ", seed_value)
    
    if not os.path.exists("train_val_eval_seed_" + str(seed_value)+".txt"):
        with open("train_val_eval_seed_" + str(seed_value)+".txt", "w") as f:
            f.write("Training: " + str(list(train_id)) + "\n")
            f.write("Validation: " + str(list(val_id)) + "\n")
            f.write("Testing: " + str(list(test_id)) + "\n")
            f.close()
    print('Read set ids completed: %s second' % (time.time() - start_time))
generate_ids()

Seed value:  9
Read set ids completed: 0.0035910606384277344 second


In [3]:
def read_ids(seed_value):
    """
    Read ids from file
    """
    assert seed_value >= 0 and seed_value <= 10
    
    with open("train_val_eval_seed_" + str(seed_value)+".txt", "r") as f:
        lines = f.readlines()
        Train_ids = eval(lines[0].split(":")[1])
        Val_ids = eval(lines[1].split(":")[1])
        test_ids = eval(lines[2].split(":")[1])
    return Train_ids, Val_ids, test_ids

In [None]:
def compute_mean_std(source_sits, target_sits, case):
    """
    Descr: Compute mean and std for each channel
    Input: both SITS dataset(.npz) paths
            Case[1 - 3]:
            1 - concatenate both dataset, while 2 & 3 rep source and target respectively
    The data(from N,LxD) is reshaped into (N,D,L);
        where N - pixel, D - Bands (10), L - Time (33)
    
    """
    
    # case = 1: both, case = 2: target, case = 3: target
    if case == 1:
        sits = [source_sits, target_sits]
    elif case == 2:
        sits = source_sits
    elif case == 3:
        sits = target_sits
    else:
        print('Select case between 1-3')
        return None
    
    # if sits is a list, then it's a list of paths
    if isinstance(sits, list):
        # load data
        X_source = np.load(sits[0])['X']
        X_target = np.load(sits[1])['X']
        # concatenate the data
        X = np.concatenate((X_source, X_target), axis=0)
    # if sits is a string, then it's a path
    else: 
        with np.load(sits) as data:
            X = data['X']

    X = X.reshape(X.shape[0], n_channel, int(X.shape[1]/n_channel))
    # compute mean and std
    X_mean = np.mean(X, axis=(0,2))
    X_std = np.std(X, axis=(0,2))
    print('mean shape: ', X_mean.shape)
    print('std shape: ', X_std.shape)
    # save X_mean and X_std sepearately for sits as txt file
    np.savetxt(os.path.join('mean_'+ str(case) +'.txt'), X_mean)
    np.savetxt(os.path.join('std_'+ str(case) +'.txt'), X_std)

for i in [1,2,3]:
    start_time = time.time()
    source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
    target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"
    compute_mean_std(source_path, target_path, i)
    print("run time: ", time.time() - start_time)

In [5]:
class SITSData(data.Dataset):
    def __init__(self, case_: int,source_path, target_path, seed, partition='train', transform=None):
        self.case_ = case_
        self.source_path = source_path
        self.target_path = target_path
        self.seed = seed
        self.transform = transform
        
        # get partition ids using the read_id() func
        start_time = time.time()
        
        self.train_ids, self.val_ids, self.test_ids = read_ids(self.seed)
        print("read ids completed: %s second" % (time.time() - start_time))

        # select partition
        if partition == 'train':
            self.ids = self.train_ids
        elif partition == 'val':
            self.ids = self.val_ids
        elif partition == 'test':
            self.ids = self.test_ids
        else:
            raise ValueError('Invalid partition: {}'.format(partition))

        # sits = either source_path or target or both based on the case (1,2,3)
        # case = 1: both, case = 2: target, case = 3: target
        if self.case_ == 1:
            sits = [self.source_path, self.target_path]
        elif self.case_ == 2:
            sits = self.source_path
        elif self.case_ == 3:
            sits = self.target_path
        else:
            print('Wrong case!')

        if isinstance(sits, list):
            self.sits = sits
            print('reading files....')
            X_source, y_source, block_ids_source = load_npz(self.sits[0])
            X_target, y_target, block_ids_target = load_npz(self.sits[1])
            
            # concatenate the data
            data_source = np.concatenate((X_source, y_source[:, None], block_ids_source[:, None]), axis=1)
            data_target = np.concatenate((X_target, y_target[:, None], block_ids_target[:, None]), axis=1)
            
            # filter by block_id
            data_source = data_source[np.isin(data_source[:, -1], self.ids)]
            data_target = data_target[np.isin(data_target[:, -1], self.ids)]

            self.X_ = np.concatenate((data_source[:, :-2], data_target[:, :-2]), axis=0)
            self.y_ = np.concatenate((data_source[:, -2], data_target[:, -2]), axis=0)
            
            del X_source
            del y_source
            del block_ids_source
            del data_source
            del data_target
        else:
            self.sits = sits
            start_time = time.time()
            print('reading files....')
            X, y, block_ids = load_npz(self.sits)
            print("load npz: %s seconds" % (time.time() - start_time))
            
            # concatenate the data
            start_time = time.time()
            data_ = np.concatenate((X, y[:, None], block_ids[:, None]), axis=1)
            print("Concatenating completed: %s seconds" % (time.time() - start_time))

            # filter by block_id
            start_time = time.time()
            data_ = data_[np.isin(data_[:, -1], self.ids)]
            print("filtering ids completed: %s seconds" % (time.time() - start_time))
            
            self.X_ = data_[:, :-2].astype('int16')
            self.y_ = data_[:, -2].astype('int16')
            print("%s dataset shape: " % partition,self.X_.shape)
            
            del X
            del y
            del block_ids
            del data_

    def __len__(self):
        return len(self.y_)

    def __getitem__(self, idx):
        self.X = self.X_[idx]
        self.y = self.y_[idx]
        self.X = self.X.reshape(int(self.X.shape[0]/n_channel), n_channel)

        # transform
        if self.transform:
            self.X = self.transform(self.X)

        # from uint16 to float32
        self.X = np.array(self.X)
        self.y = np.array(self.y)
        
        return torch.from_numpy(self.X), torch.from_numpy(self.y)

In [6]:
class standardize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, sample):
        return (sample - self.mean) / self.std

In [7]:
# testing for a single domain (case 2 or 3)
case = 2
#read mean and std files
mean = np.loadtxt('mean_'+str(case)+'.txt')
std = np.loadtxt('std_'+str(case)+'.txt')
seed = 0
transform = transforms.Compose([standardize(mean, std)])

# paths
source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"

# start_time = time.time()
print('train dataset........')
start_time = time.time()
train_dataset = SITSData(case, source_path, target_path, seed, partition='train', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Validation dataset.........')
start_time = time.time()
val_dataset = SITSData(case, source_path, target_path, seed, partition='val', transform=transform)
print('total running time: %s' % (time.time() - start_time))
print('Test dataset........')
start_time = time.time()
test_dataset = SITSData(case, source_path, target_path, seed, partition='test', transform=transform)
print('total running time: %s' % (time.time() - start_time))

read ids completed: 0.0010349750518798828 second
reading files....
load npz: 56.06220984458923 seconds
Concatenating completed: 4.451143980026245 seconds
filtering ids completed: 8.555372476577759 seconds
train dataset shape:  (10784283, 330)
read ids completed: 0.000713348388671875 second
reading files....
load npz: 55.75451397895813 seconds
Concatenating completed: 4.298412084579468 seconds
filtering ids completed: 1.3094518184661865 seconds
val dataset shape:  (1606360, 330)
read ids completed: 0.0011510848999023438 second
reading files....
load npz: 55.67504906654358 seconds
Concatenating completed: 4.300490140914917 seconds
filtering ids completed: 1.3801517486572266 seconds
test dataset shape:  (1701000, 330)
total running time: 195.5245544910431


In [8]:
batch_size = 128
train_loader = data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
val_loader = data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
test_loader = data.DataLoader(test_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)

In [10]:
start_time = time.time()
x_train,y_train= next(iter(train_loader))
print('Train dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_val, y_val = next(iter(val_loader))
print('val dataLoader time: ', time.time() - start_time)
start_time = time.time()
x_test, y_test = next(iter(test_loader))
print('test dataLoader time: ', time.time() - start_time)

Train dataLoader time:  1.366302490234375
val dataLoader time:  0.3278977870941162
test dataLoader time:  0.3547360897064209


In [11]:
y_train

tensor([ 9,  9,  3,  5, 15,  6, 23, 16,  9,  9,  9,  9, 18,  9,  6,  6, 10,  8,
         2,  9,  9,  9,  8, 16,  9,  9, 23,  9,  8,  9,  5, 13, 10,  9,  2,  9,
         9,  9,  6,  2,  9,  5,  8,  5,  9,  9,  9,  9,  9,  2,  8, 23,  2,  9,
         5,  9, 23,  5,  9,  9,  2,  8,  9,  9,  5,  9,  9, 23, 10,  6,  9,  9,
         9,  8,  9,  9,  3, 10,  9,  8,  9,  5,  5,  6,  6, 10,  6,  2,  9,  6,
         8,  6,  9,  9,  9,  7,  9,  6,  2,  9,  9,  9,  5,  6,  9, 16,  9,  8,
         9,  9,  9,  9,  9,  9,  6,  7,  9,  9,  5,  7,  9,  9,  3, 15,  9, 23,
         9,  9], dtype=torch.int16)

In [32]:
print(x_train.shape)
x_train

torch.Size([128, 33, 10])


tensor([[[-1.4253, -1.5608, -1.5151,  ..., -0.2972, -0.3814, -0.6342],
         [-1.4253, -1.5608, -1.5151,  ..., -0.2972, -0.3814, -0.6342],
         [-1.4253, -1.5608, -1.5151,  ..., -0.2972, -0.3814, -0.6342],
         ...,
         [-1.4138, -1.4792, -1.7155,  ...,  1.1339, -0.5750, -1.1722],
         [-1.3749, -1.4792, -1.6722,  ...,  0.5066, -0.4960, -1.0249],
         [-1.4282, -1.5105, -1.4263,  ..., -0.0293, -0.2797, -0.6789]],

        [[-0.5632, -0.1255, -0.0384,  ...,  0.4385,  1.4067,  1.3153],
         [-0.5632, -0.1255, -0.0384,  ...,  0.4385,  1.4067,  1.3153],
         [-0.5632, -0.1255, -0.0384,  ...,  0.4385,  1.4067,  1.3153],
         ...,
         [-0.6843, -0.1786, -0.0316,  ...,  0.4600,  1.3786,  1.1969],
         [-1.0880, -0.8915, -0.7876,  ..., -0.3716,  0.0719, -0.2238],
         [-1.0764, -0.8466, -0.8650,  ..., -0.0974,  0.1541, -0.2159]],

        [[-1.4931, -1.6492, -1.7553,  ...,  0.0200, -0.5533, -0.8657],
         [-1.4931, -1.6492, -1.7553,  ...,  0

In [None]:
case = 1
#read mean and std files
mean = np.loadtxt('mean_'+str(case)+'.txt')
std = np.loadtxt('std_'+str(case)+'.txt')

transform = transforms.Compose([standardize(mean, std)])

source_path = "../../../data/theiaL2A_zip_img/output/2018/2018_SITS_data.npz"
target_path = "../../../data/theiaL2A_zip_img/output/2019/2019_SITS_data.npz"
start_time = time.time()
dataset = SIts(case, source_path, target_path, partition='train', transform=transform)
print('total running time: %s' % (time.time() - start_time))