In [1]:
# Import the needed modules
import numpy as np
import pandas as pd
import os,random,pickle,time,glob,sys, multiprocessing
from tqdm.auto import tqdm
from collections import OrderedDict

In [2]:
# Base data folder and the sub-folder that holds the raw band arrays (one .npz file per field)
DATA_DIR = '../data'
DIR_BANDS = f'{DATA_DIR}/train/bands-raw/' 
# Read the pickled field metadata, then derive each field's .npz path from its field_id
# NOTE(review): unpickling can execute arbitrary code -- only load this file from a trusted source
df = pd.read_pickle(f'{DATA_DIR}/train/field_meta_train.pkl')
field_id_text = df['field_id'].astype(str)
df['path'] = DIR_BANDS + field_id_text + '.npz'

In [26]:
# Preview the first rows of the field metadata frame
df.head()

Unnamed: 0,field_id,tile_id,label,dates,path
0,1,2171,4,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:...",../data/train/bands-raw/1.npz
1,2,1703,7,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:...",../data/train/bands-raw/2.npz
2,3,2214,6,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:...",../data/train/bands-raw/3.npz
3,4,2526,8,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:...",../data/train/bands-raw/4.npz
4,6,544,4,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:...",../data/train/bands-raw/6.npz


In [72]:
# Per-field accumulators: every list receives one array per field and is
# concatenated into a flat array after the loop.
field_ids = []
labels = []
dates = []
features = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Load inside a `with` block so the .npz archive's file handle is closed
    # deterministically instead of lingering until garbage collection while
    # a very large number of files is read.
    with np.load(row.path) as archive:
        bands = archive['arr_0']    # fully read into memory here, so it stays valid after the close
    n_dates = bands.shape[2]        # number of dates (last axis)

    # Average over axis 0 (the pixels of the field -- assumes a (pixels, bands, dates) layout)
    mean = np.mean(bands, axis=0)
    feature = mean.transpose(1, 0)  # -> (dates, bands): one row per date, one column per band

    # Keep only the first 10 characters of every timestamp, i.e. the YYYY-MM-DD part
    date = np.array([str(d)[:10] for d in row.dates])

    # Fail fast on inconsistent input: the metadata must list exactly one date per
    # row of `feature`, otherwise the flattened columns built later would be misaligned.
    if len(date) != n_dates:
        raise ValueError(
            f'field {row.field_id}: metadata lists {len(date)} dates '
            f'but {row.path} contains {n_dates}'
        )

    # Repeat the field id and label once per date so every feature row carries them
    field_id = np.repeat(row.field_id, n_dates)
    label = np.repeat(row.label, n_dates)

    features.append(feature)
    field_ids.append(field_id)
    labels.append(label)
    dates.append(date)

100%|██████████| 87092/87092 [08:23<00:00, 173.01it/s]


In [73]:
# Flatten the per-field lists into single arrays with one entry per (field, date) row
all_features, all_field_ids, all_labels, all_dates = (
    np.concatenate(chunks) for chunks in (features, field_ids, labels, dates)
)

# Band means become the value columns; the identifying columns are then placed in front
cols = ['B02', 'B03', 'B04', 'B08', 'B11', 'B12', 'CLM']
df_data = pd.DataFrame(all_features, columns=cols)
leading_columns = [
    ('field_id', all_field_ids),
    ('date', all_dates),
    ('label', all_labels),
]
for position, (name, values) in enumerate(leading_columns):
    df_data.insert(position, name, values)

# Persist the table as CSV without the row index
df_data.to_csv(f'{DATA_DIR}/mean_band_perField_perDate.csv', index=False)


In [74]:
# Display the assembled frame of mean band values per field and date
df_data

Unnamed: 0,field_id,date,label,B02,B03,B04,B08,B11,B12,CLM
0,1,2017-04-01,4,21.934084,29.180065,35.554661,62.490353,68.397102,46.040192,253.770096
1,1,2017-04-11,4,14.844051,23.114147,30.607718,58.736336,73.435692,48.863342,0.000000
2,1,2017-04-21,4,13.385852,21.596462,29.223473,57.065918,73.668808,49.313503,0.000000
3,1,2017-05-01,4,15.408360,22.471062,29.371382,56.434082,71.057877,46.557877,36.897106
4,1,2017-05-11,4,54.829582,65.739548,72.906754,95.672028,66.147911,58.643085,255.000000
...,...,...,...,...,...,...,...,...,...,...
4301222,122736,2017-11-17,9,23.995344,40.734951,65.776855,92.758232,129.440308,111.960426,0.000000
4301223,122736,2017-11-20,9,254.999664,254.366150,251.179916,248.818085,158.322586,111.898239,255.000000
4301224,122736,2017-11-22,9,30.892584,45.352844,69.422012,98.832390,139.207855,120.539078,0.000000
4301225,122736,2017-11-27,9,25.569672,42.917194,67.971733,94.232460,131.371460,114.247421,0.000000
