In [167]:
# Import the needed modules
import numpy as np
import pandas as pd
from scipy import stats
import os,random,pickle,time,glob,sys, multiprocessing
from tqdm.auto import tqdm
from collections import OrderedDict

In [168]:
# Locations of the inputs: the data folder and the folder with the raw band archives (one .npz file per field)
DATA_DIR = '../data'
DIR_BANDS = f'{DATA_DIR}/images/bands-raw/'
# Read the field meta data (a pickle file, so it must come from a trusted source) and
# store, for every field, the path of its .npz archive in a new 'path' column
df = pd.read_pickle(f'{DATA_DIR}/meta_data_fields_bands.pkl')
df['path'] = DIR_BANDS + df['field_id'].astype(str) + '.npz'

In [169]:
def get_clm(bands):
    """Split the cloud mask off an array of bands.

    The cloud mask is taken to be the last entry along the band axis (axis 1).

    Args:
        bands (numpy.ndarray): Pixel values of one field, indexed as (pixel, band, date).

    Returns:
        bands (numpy.ndarray): The input without its last band, still indexed as (pixel, band, date).
        cloud (numpy.ndarray): The last band (the cloud mask), indexed as (pixel, 1, date).
    """
    band_major = bands.transpose(1, 0, 2)               # bring the band axis to the front
    clm_index = len(band_major) - 1                     # position of the cloud mask band
    cloud_major = band_major[clm_index][np.newaxis]     # cloud mask with a leading band axis of length one
    spectral_major = band_major[:clm_index]             # every band in front of the cloud mask
    # put the pixel axis back in front for both results
    return spectral_major.transpose(1, 0, 2), cloud_major.transpose(1, 0, 2)

def calculate_band_mode(band):
    """Calculates the mode over all pixel for a given band (or set of bands).

    The mode is taken along axis 0 (the pixel axis). The result does not depend on the
    installed SciPy version: older releases of ``scipy.stats.mode`` keep the reduced axis
    with length one, newer releases (default ``keepdims=False``) drop it. Both layouts
    are handled here.

    Args:
        band (numpy.ndarray): Array of pixel of the current field, indexed as
            (pixel, band, date).

    Returns:
        numpy.ndarray: Mode over all pixel for each date, in the form (dates, band).
    """
    mode = np.asarray(stats.mode(band, axis=0)[0])      # most frequent value along the pixel axis
    if mode.ndim == np.ndim(band):
        # older SciPy kept the reduced pixel axis as a length-one axis; remove exactly that axis
        mode = np.squeeze(mode, axis=0)
    return mode.transpose(1, 0)                         # reshapes the array into the form (dates, band)

In [170]:
# Per-field results; every list receives one array per field with one entry (or row) per date
field_ids = []
labels = []
dates = []
features = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    # np.load returns an archive object for .npz files; the context manager closes its
    # file handle as soon as the array has been read
    with np.load(row.path) as npz:
        bands = npz['arr_0']                            # expected layout: (pixel, band, date), cloud mask as last band

    bands, cloud = get_clm(bands)                       # get the cloud mask out of the bands and update
    cloud_mode = calculate_band_mode(cloud)             # mode of the cloud mask over all pixel, per date
    mean = np.mean(bands, axis=0)                       # mean over all pixel, per band and date
    feature = mean.transpose(1, 0)                      # switch the bands and dates -> (dates, bands)
    feature = np.concatenate((feature, cloud_mode), axis=1)     # add the mode of the cloud mask back into the features

    # keep only the day part of each date (first 10 characters of its string form)
    date = np.array([str(d)[:10] for d in row.dates])
    if len(date) != feature.shape[0]:
        # fail right here with the offending field instead of with a length error
        # when the data frame is assembled after the whole loop has run
        raise ValueError(
            f'field {row.field_id}: {len(date)} dates in the meta data '
            f'but {feature.shape[0]} dates in {row.path}'
        )

    features.append(feature)                                        # features of the current field
    field_ids.append(np.repeat(row.field_id, feature.shape[0]))     # field id repeated once per date
    labels.append(np.repeat(row.label, feature.shape[0]))           # label repeated once per date
    dates.append(date)                                              # dates of the current field

100%|██████████| 87092/87092 [05:48<00:00, 250.05it/s]


In [171]:
# Names of the feature columns: the mean of each spectral band followed by the cloud mask mode
cols = ['B02', 'B03', 'B04', 'B08', 'B11', 'B12', 'CLM']

# Join the per-field arrays into flat arrays with one entry (or row) per field and date
all_dates = np.concatenate(dates)
all_labels = np.concatenate(labels)
all_field_ids = np.concatenate(field_ids)
all_features = np.concatenate(features)

# Build the frame from the features and put the identifying columns in front
df_data = pd.DataFrame(data=all_features, columns=cols)
df_data.insert(loc=0, column='field_id', value=all_field_ids)
df_data.insert(loc=1, column='date', value=all_dates)
df_data.insert(loc=2, column='label', value=all_labels)

# Save the result without the row index
df_data.to_csv(f'{DATA_DIR}/mean_band_perField_perDate.csv', index=False)

In [172]:
# Show the assembled frame; the rendered output is abbreviated to its first and last rows
df_data

Unnamed: 0,field_id,date,label,B02,B03,B04,B08,B11,B12,CLM
0,1,2017-04-01,4,21.934084,29.180065,35.554661,62.490353,68.397102,46.040192,255.0
1,1,2017-04-11,4,14.844051,23.114147,30.607718,58.736336,73.435692,48.863342,0.0
2,1,2017-04-21,4,13.385852,21.596462,29.223473,57.065918,73.668808,49.313503,0.0
3,1,2017-05-01,4,15.408360,22.471062,29.371382,56.434082,71.057877,46.557877,0.0
4,1,2017-05-11,4,54.829582,65.739548,72.906754,95.672028,66.147911,58.643085,255.0
...,...,...,...,...,...,...,...,...,...,...
4301222,122736,2017-11-17,9,23.995344,40.734951,65.776855,92.758232,129.440308,111.960426,0.0
4301223,122736,2017-11-20,9,254.999664,254.366150,251.179916,248.818085,158.322586,111.898239,255.0
4301224,122736,2017-11-22,9,30.892584,45.352844,69.422012,98.832390,139.207855,120.539078,0.0
4301225,122736,2017-11-27,9,25.569672,42.917194,67.971733,94.232460,131.371460,114.247421,0.0
