# Calculate the mean for the bands of each field
*by Max*

Notebook to test how the data download works. For actually downloading the data please use the download_train_data.py or download_test_data.py respectively.

We first import all the needed modules.

In [1]:
# import the needed modules
import numpy as np
import pandas as pd
from scipy import stats
import os, random, pickle, time, glob, multiprocessing
from tqdm.auto import tqdm
from collections import OrderedDict

# import own modules from the src folder
import sys
sys.path.append('../src/')
from preprocessing_functions_notebooks import get_clm, calculate_band_mode

Then we set up the directory environment and load the metadata.

In [2]:
# Locations of the project data folder and of the raw per-field band archives.
DATA_DIR = '../data'
DIR_BANDS = f'{DATA_DIR}/bands-raw/'

# Load the field meta data and record for every field where its npz archive
# is stored: <DIR_BANDS><field_id>.npz
meta_data_file = f'{DATA_DIR}/meta_data_fields_bands.pkl'
df = pd.read_pickle(meta_data_file)
field_id_strings = df['field_id'].astype(str)
df['path'] = DIR_BANDS + field_id_strings + '.npz'

Here we extract the field data from the npz files and calculate the mean of each field for each band on each date. 

In [16]:
# Per-field results. Every list entry belongs to one field and holds one
# element (or one row, for the features) per date of that field.
field_ids = []
labels = []
dates = []
features = []
tile_ids = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    # np.load returns an NpzFile for .npz archives; the context manager closes
    # the underlying file handle as soon as the array has been read.
    with np.load(row.path) as npz_file:
        bands = npz_file['arr_0']

    # get_clm / calculate_band_mode come from preprocessing_functions_notebooks.
    # NOTE(review): presumably get_clm splits the cloud-mask layer off the band
    # array and calculate_band_mode returns one mask value per date - confirm there.
    bands, cloud = get_clm(bands)
    cloud_mode = calculate_band_mode(cloud)

    # Average over the first axis (the pixels, according to the array layout
    # described in the experiments section of this notebook).
    mean = np.mean(bands, axis=0)
    feature = mean.transpose(1, 0)                              # swap the two remaining axes -> one row per date
    feature = np.concatenate((feature, cloud_mode), axis=1)     # append the cloud-mask mode as an extra column
    n_rows = feature.shape[0]

    # Keep only the first 10 characters of every date (drops the time part).
    date = np.array([str(d)[:10] for d in row.dates])

    # Fail right here if dates and feature rows disagree; otherwise the columns
    # of the final table could end up misaligned without any error.
    if len(date) != n_rows:
        raise ValueError(
            f'field {row.field_id}: {len(date)} dates but {n_rows} feature rows'
        )

    features.append(feature)
    field_ids.append(np.repeat(row.field_id, n_rows))   # field id repeated once per date
    tile_ids.append(np.repeat(row.tile_id, n_rows))     # tile id repeated once per date
    labels.append(np.repeat(row.label, n_rows))         # label repeated once per date
    dates.append(date)

100%|██████████| 87092/87092 [05:56<00:00, 244.44it/s]


Finally we put all the data together into one data frame and save it as a CSV file.

In [17]:
# Stack the per-field pieces into flat arrays with one entry (or row) per field and date.
all_features = np.concatenate(features)
all_field_ids, all_tile_ids, all_dates, all_labels = (
    np.concatenate(chunks) for chunks in (field_ids, tile_ids, dates, labels)
)

# Start the table with the band columns, then place the identifying columns in front of them.
cols = ['B02', 'B03', 'B04', 'B08', 'B11', 'B12', 'CLM']
df_data = pd.DataFrame(all_features, columns=cols)
leading_columns = [
    ('field_id', all_field_ids),
    ('tile_id', all_tile_ids),
    ('date', all_dates),
    ('label', all_labels),
]
for position, (column_name, values) in enumerate(leading_columns):
    df_data.insert(position, column_name, values)

# Write the combined table to a CSV file, leaving out the row index.
csv_path = f'{DATA_DIR}/mean_band_perField_perDate.csv'
df_data.to_csv(csv_path, index=False)

---
### Numpy array experiments
This part is for experimentation with arrays in order to understand the manipulation of the arrays that is happening in the upper part.

In [3]:
# Small test array with shape (2, 3, 4): two outer blocks, three rows each, four values per row.
first_block = [[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 23, 12]]
second_block = [[13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]
arr = np.array([first_block, second_block])
print(arr.shape)
arr

(2, 3, 4)


array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 23, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]]])

In this test array we have 2 outer arrays, each containing 3 sub-arrays, and each sub-array has 4 entries.
This is the same structure that the per-field npz arrays have:
- We have for each pixel of the field a big array. 
- Each of these pixel arrays contains 7 band arrays. 
- And each of these band arrays contains a value for each date.

The next step is to transpose the big array with the sub arrays.
Or in terms of the npz arrays per field:
- We swap the positions of the pixel and band axes.

In [4]:
# Swap the first two axes and keep the last axis where it is.
arr_T = np.transpose(arr, axes=(1, 0, 2))
print(arr_T.shape)
arr_T

(3, 2, 4)


array([[[ 1,  2,  3,  4],
        [13, 14, 15, 16]],

       [[ 5,  6,  7,  8],
        [17, 18, 19, 20]],

       [[ 9, 10, 23, 12],
        [21, 22, 23, 24]]])

Then now we can easily access the single bands, for example the third like this:

In [5]:
# Take the third entry along the leading axis; integer indexing drops that axis.
band = arr_T[2, :, :]
print(band.shape)
band

(2, 4)


array([[ 9, 10, 23, 12],
       [21, 22, 23, 24]])

However, indexing dropped one dimension, which makes it impossible to transpose the data back into a layout we can work with. We therefore add the missing dimension back.

In [6]:
# Select the third entry again and put a new axis of length 1 in front of it.
band = arr_T[2][np.newaxis, ...]
print(band.shape)
band

(1, 2, 4)


array([[[ 9, 10, 23, 12],
        [21, 22, 23, 24]]])

Now we simply transpose the data back to the original axis order.

In [7]:
# Exchange the first two axes again; the last axis is left untouched.
band = np.swapaxes(band, 0, 1)
print(band.shape)
band

(2, 1, 4)


array([[[ 9, 10, 23, 12]],

       [[21, 22, 23, 24]]])

And we can calculate the mode or mean over all pixels for each band for each date.

In [8]:
# Show the original (untransposed) test array again as the starting point for the reductions below.
print(arr.shape)
arr

(2, 3, 4)


array([[[ 1,  2,  3,  4],
        [ 5,  6,  7,  8],
        [ 9, 10, 23, 12]],

       [[13, 14, 15, 16],
        [17, 18, 19, 20],
        [21, 22, 23, 24]]])

In [9]:
# Averaging along axis 0 collapses the outer (pixel) axis, which leaves one
# mean value per band (the rows) and per entry of the last axis (the columns).
mean = arr.mean(axis=0)
print(mean.shape)
mean

(3, 4)


array([[ 7.,  8.,  9., 10.],
       [11., 12., 13., 14.],
       [15., 16., 23., 18.]])

In [10]:
from scipy import stats
# This is the mode of all field pixels for each band (here the rows)
# stats.mode reduces along axis 0. Older SciPy releases keep that axis with
# length 1, newer ones (keepdims=False by default) drop it, so squeezing
# axis 0 is not portable. Reshaping to the shape of the remaining axes gives
# the same (3, 4) result with either behaviour.
mode = stats.mode(arr, axis=0)
mode_values = np.asarray(mode[0]).reshape(arr.shape[1:])
print(mode_values.shape)
mode_values

(3, 4)


array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 23, 12]])

In [11]:
print(band)
# Mode along axis 0 of the single-band array. Depending on the SciPy version
# the reduced axis is kept (length 1) or dropped, so squeezing axis 0 would
# either work or silently remove the wrong axis. Reshaping to the shape of
# the remaining axes always yields the (1, 4) result.
mode = stats.mode(band, axis=0)
mode_values = np.asarray(mode[0]).reshape(band.shape[1:])
print(mode_values.shape)
mode_values

[[[ 9 10 23 12]]

 [[21 22 23 24]]]
(1, 4)


array([[ 9, 10, 23, 12]])