# Reads all the files in the "ice" subfolder and creates a huge 4D numpy array to train the CNN model

In [None]:
from os import listdir

#Initiating the input directory
#NOTE: forward slashes only, so the path also resolves on non-Windows systems (there a backslash is an
#ordinary filename character, not a separator); Windows accepts forward slashes as well
indir = r'../../data/ice'

#Finding all the processed geotiffs in the input directory
#The list is sorted because listdir returns entries in arbitrary order; sorting keeps the order in which
#the images are processed reproducible between runs and machines
files = sorted(f for f in listdir(indir) if f.startswith('S0_RS2') and f.endswith('.tif'))

#Prints out the number of files to filter
print('There are ' + str(len(files)) + ' files to filter...')

In [None]:
import numpy as np

#Function to cut a 2D band into N x N subsamples
def resamp(arr, N):
    """Split the 2D array *arr* into tiles and stack them into one array.

    The array is first cut into arr.shape[0] // N horizontal strips, and each
    strip is then cut into arr.shape[1] // N pieces. Tiles are returned strip
    by strip (top to bottom), left to right within a strip.

    Both dimensions of *arr* are expected to be exact multiples of N, so that
    every tile is N x N; the result then has shape (n_tiles, N, N).
    """
    tiles = [
        tile
        for strip in np.vsplit(arr, arr.shape[0] // N)
        for tile in np.hsplit(strip, arr.shape[1] // N)
    ]
    return np.array(tiles)

In [None]:
#Drawing the year of the test dataset at random from the eligible years (2011 to 2018)
#2019 is left out since it was the original test year
#2009 and 2010 are left out due to the small amount of images for those years
candidate_years = np.arange(2011, 2019)
test_year = np.random.choice(candidate_years)
print(test_year)

In [None]:
import gdal
from os.path import join
from datetime import datetime

#Initiating the output training data directory
outdir = r'../../data/ice'

#Initiating which class to assemble (NOTE: This step is necessary due to the large amount of data the code has to hold in
#memory)
#Possible inputs: ['ice', 'water']
cnn_class = 'ice'

#Specifying the sample size, e.g. number of pixels
numpix = 20

#Specifying the number of bands
numbands = 4

#Per-image subsample arrays are collected here and joined once after the loop (appending to one big array
#inside the loop would recopy all previously gathered data for every image)
#The empty seed entry keeps the final shape and dtype well-defined even if no image is selected
chunks = [np.empty([0,numpix,numpix,numbands], dtype=np.float16)]

#Bounds of the period held out for testing; images dated on either bound are held out as well
test_start = datetime(test_year,8,1)
test_end = datetime(test_year+1,8,1)

#Start filtering all images
for image in files:
    #The 7th underscore-separated field of the file name is read as a date in YYYYMMDD form
    datestr = image.split('_')[6]
    acquired = datetime(int(datestr[0:4]), int(datestr[4:6]), int(datestr[6:8]))

    #Images from test_year-08-01 up to (test_year+1)-08-01 are kept for testing the CNN model
    if test_start <= acquired <= test_end:
        continue

    print('Adding ' + acquired.strftime("%m/%d/%Y") + ' to the ' + cnn_class + ' training data array')

    #Reading the processed image
    ras = gdal.Open(join(indir, image))
    if ras is None:
        #gdal.Open signals failure by returning None unless GDAL exceptions are enabled
        raise RuntimeError('GDAL could not open ' + join(indir, image))

    #Number of whole numpix x numpix subsamples that fit along each image axis
    nx = ras.RasterXSize // numpix
    ny = ras.RasterYSize // numpix

    #Creating a temporary array
    temp_array = np.empty([nx*ny,numpix,numpix,numbands])

    #Subsampling the image by band
    for k in range(numbands):

        #Getting the band from the image (GDAL band numbers start at 1), cropped to a whole number of subsamples
        band = ras.GetRasterBand(k+1).ReadAsArray(0, 0, nx*numpix, ny*numpix)

        #Resampling the band into numpix x numpix subsamples
        temp_array[:,:,:,k] = resamp(band, numpix)

        del band

    #Finding the NoData subsamples: any NaN or any value equal to -90 in the first or second band
    #(-90 is presumably the NoData fill value of the processed images - TODO confirm)
    nodata = np.zeros(len(temp_array), dtype=bool)
    for b in (0, 1):
        values = temp_array[:,:,:,b]
        nodata |= (np.isnan(values) | (values == -90)).any(axis=(1,2))

    #Removing the NoData subsamples and storing the remaining ones in reduced precision
    chunks.append(temp_array[~nodata].astype(np.float16))
    del temp_array, ras

#Joining all per-image arrays into the final training data array
trarray = np.concatenate(chunks, axis=0)
del chunks

#Saving the Training Data
np.save(join(outdir, 'TrainingDataS0_' + cnn_class + '_rev.npy'), trarray)
        