# Radiant Earth Spot the Crop Challenge
*Max Notebook*


In [1]:
# Import the needed modules
import os
import numpy as np
import pandas as pd
import datetime
import rasterio

In [16]:
# Read the train and test asset catalogues from their CSV files
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train_data.csv', './data/test_data.csv')
)

## Get an overview of the base data

In [19]:
# Preview the training catalogue (the notebook renders a truncated view of the rows)
train_df

Unnamed: 0,tile_id,datetime,satellite_platform,asset,file_path
0,2587,,,documentation,./data/ref_south_africa_crops_competition_v1_t...
1,2587,,,field_ids,./data/ref_south_africa_crops_competition_v1_t...
2,2587,,,field_info_train,./data/ref_south_africa_crops_competition_v1_t...
3,2587,,,labels,./data/ref_south_africa_crops_competition_v1_t...
4,2587,,,raster_values,./data/ref_south_africa_crops_competition_v1_t...
...,...,...,...,...,...
1024507,2198,2017-11-30T00:00:00Z,s2,B04,./data/ref_south_africa_crops_competition_v1_t...
1024508,2198,2017-11-30T00:00:00Z,s2,B08,./data/ref_south_africa_crops_competition_v1_t...
1024509,2198,2017-11-30T00:00:00Z,s2,B11,./data/ref_south_africa_crops_competition_v1_t...
1024510,2198,2017-11-30T00:00:00Z,s2,B12,./data/ref_south_africa_crops_competition_v1_t...


In [30]:
# Summarise column dtypes and non-null counts to spot columns with missing values
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1024512 entries, 0 to 1024511
Data columns (total 5 columns):
 #   Column              Non-Null Count    Dtype 
---  ------              --------------    ----- 
 0   tile_id             1024512 non-null  int64 
 1   datetime            1011262 non-null  object
 2   satellite_platform  1011262 non-null  object
 3   asset               1024512 non-null  object
 4   file_path           1024512 non-null  object
dtypes: int64(1), object(4)
memory usage: 39.1+ MB


In [20]:
# List the distinct asset types that occur in the catalogue
train_df['asset'].unique()

array(['documentation', 'field_ids', 'field_info_train', 'labels',
       'raster_values', 'B02', 'B03', 'B04', 'B08', 'B11', 'B12', 'CLM'],
      dtype=object)

In [None]:
# Show only the catalogue rows whose asset type is 'labels'
train_df[train_df['asset'] == 'labels']

In [39]:
# List the distinct values of the datetime column (missing entries show up as nan)
train_df['datetime'].unique()

array([nan, '2017-04-01T00:00:00Z', '2017-04-11T00:00:00Z',
       '2017-04-21T00:00:00Z', '2017-05-01T00:00:00Z',
       '2017-05-11T00:00:00Z', '2017-05-21T00:00:00Z',
       '2017-05-31T00:00:00Z', '2017-06-10T00:00:00Z',
       '2017-06-20T00:00:00Z', '2017-06-30T00:00:00Z',
       '2017-07-05T00:00:00Z', '2017-07-10T00:00:00Z',
       '2017-07-15T00:00:00Z', '2017-07-20T00:00:00Z',
       '2017-07-25T00:00:00Z', '2017-07-30T00:00:00Z',
       '2017-08-04T00:00:00Z', '2017-08-09T00:00:00Z',
       '2017-08-14T00:00:00Z', '2017-08-19T00:00:00Z',
       '2017-08-24T00:00:00Z', '2017-08-29T00:00:00Z',
       '2017-09-08T00:00:00Z', '2017-09-18T00:00:00Z',
       '2017-09-23T00:00:00Z', '2017-09-28T00:00:00Z',
       '2017-10-03T00:00:00Z', '2017-10-08T00:00:00Z',
       '2017-10-13T00:00:00Z', '2017-10-18T00:00:00Z',
       '2017-10-23T00:00:00Z', '2017-10-28T00:00:00Z',
       '2017-11-02T00:00:00Z', '2017-11-07T00:00:00Z',
       '2017-11-12T00:00:00Z', '2017-11-17T00:00:00Z',
     

In [34]:
# Parse the datetime strings into pandas timestamps, stored in a new column so the
# original string column stays available for the equality filters used later on
train_df['datetime_formatted'] = pd.to_datetime(train_df['datetime'])

In [35]:
# Preview the catalogue again to check the new datetime_formatted column
train_df

Unnamed: 0,tile_id,datetime,satellite_platform,asset,file_path,datetime_formatted
0,2587,,,documentation,./data/ref_south_africa_crops_competition_v1_t...,NaT
1,2587,,,field_ids,./data/ref_south_africa_crops_competition_v1_t...,NaT
2,2587,,,field_info_train,./data/ref_south_africa_crops_competition_v1_t...,NaT
3,2587,,,labels,./data/ref_south_africa_crops_competition_v1_t...,NaT
4,2587,,,raster_values,./data/ref_south_africa_crops_competition_v1_t...,NaT
...,...,...,...,...,...,...
1024507,2198,2017-11-30T00:00:00Z,s2,B04,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00
1024508,2198,2017-11-30T00:00:00Z,s2,B08,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00
1024509,2198,2017-11-30T00:00:00Z,s2,B11,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00
1024510,2198,2017-11-30T00:00:00Z,s2,B12,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00


## Load the data of the individual images

In [139]:
# For simplicity of this baseline model, we use only the first `n_obs` cloud free image(s) of
# every tile (the baseline text speaks of 5 images throughout the growing season; this notebook
# currently uses 1). You can choose to use all of them, select a few of them at specific
# intervals, or load as many as you want and interpolate between them to have a regular
# temporal frequency.

# Another assumption is that we are selecting the first cloud free images in the order in which
# their dates appear in train_df. Ideally, you should select the images across the different
# tiles with the same temporal frequency.
n_obs = 1

# Bands loaded for every selected observation; they become consecutive feature columns of X
bands = ['B02', 'B03', 'B04', 'B08', 'B11', 'B12']

# An observation is treated as cloud free when the maximum of its CLM raster is below this value.
# NOTE(review): the original comment spoke of "data below 1" while the code compared against 25 --
# confirm the intended threshold against the CLM value encoding.
clm_threshold = 25


def _asset_path(df, asset):
    """Return the file path of the first row of `df` whose asset column equals `asset`.

    Raises IndexError if `df` contains no such row.
    """
    return df[df['asset'] == asset]['file_path'].values[0]


def _read_band(file_path):
    """Read band 1 of the raster at `file_path`, closing the dataset afterwards."""
    # The context manager releases the file handle; thousands of rasters are opened in this cell
    with rasterio.open(file_path) as src:
        return src.read(1)


# Per-tile pieces are collected in lists and concatenated once at the end, because growing an
# array with np.append inside the loop copies everything accumulated so far on every iteration
X_parts = []
y_parts = []
field_id_parts = []

# Create an array of the tile ids
tile_ids_train = train_df['tile_id'].unique()

# Loop through each tile
for tile_id in tile_ids_train:
    tile_df = train_df[train_df['tile_id'] == tile_id]

    label_array = _read_band(_asset_path(tile_df, 'labels'))
    field_id_array = _read_band(_asset_path(tile_df, 'field_ids'))

    tile_date_times = tile_df[tile_df['satellite_platform'] == 's2']['datetime'].unique()

    tile_columns = []
    n_X = 0

    # Here we retrieve the cloud band, and if it is cloud free we load the other bands.
    # Otherwise we pass on to the next observation
    for date_time in tile_date_times:
        date_df = tile_df[tile_df['datetime'] == date_time]

        # Load the CLM data and look for the maximum value in the tile for the given date time
        clm_max = np.max(_read_band(_asset_path(date_df, 'CLM')))

        if clm_max < clm_threshold:
            n_X += 1

            # One flattened column per band, in the order given by `bands`
            for band in bands:
                tile_columns.append(_read_band(_asset_path(date_df, band)).flatten())

            print(date_time)
        if n_X == n_obs:
            break

    # A tile with too few cloud free observations would yield fewer feature columns than the
    # other tiles and could not be stacked with them, so it is left out entirely. Its labels
    # and field ids are left out as well to keep X, y and field_ids aligned row by row.
    if n_X < n_obs:
        print(f'Skipping tile {tile_id}: only {n_X} of {n_obs} cloud free observations found')
        continue

    X_parts.append(np.column_stack(tile_columns))
    y_parts.append(label_array.flatten())
    field_id_parts.append(field_id_array.flatten())

# Leading empty float arrays keep the float64 dtype that the np.append based version produced
# and make the concatenation well defined even if no tile was collected
X = np.concatenate([np.empty((0, len(bands) * n_obs))] + X_parts, axis=0)
y = np.concatenate([np.empty(0)] + y_parts)
field_ids = np.concatenate([np.empty(0)] + field_id_parts)

2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-01T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00:00:00Z
2017-04-21T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-01T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-14T00:00:00Z
2017-04-01T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-14T00:00:00Z
2017-04-11T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00:00:00Z
2017-04-01T00:00:00Z
2017-04-01T00:00:00Z
2017-04-11T00

In [None]:
# One row per pixel: the feature columns from X plus the pixel's label and field id
data_train = pd.DataFrame(X).assign(label=y.astype(int), field_id=field_ids)
# Keep only pixels with a non-zero label; the rows with label 0 are dropped here
data_train = data_train.loc[data_train['label'] != 0]
data_train

## Building the baseline model

In [None]:
# A field is made up of several pixels. The goal is to build a Random Forest (RF) model on the
# average pixel values of each field, so the rows are grouped by `field_id` and averaged;
# `as_index=False` keeps field_id as a regular column of the result
data_grouped = data_train.groupby('field_id', as_index=False).mean()
data_grouped

In [None]:
# Split train and test
# We use field_ids to split the data to train and test. Note that the test portion for training
# is different than the test portion provided as part of the competition.
train_per = 0.7

n_fields = len(data_grouped['field_id'])
# Fixed seed so the same fields are drawn on every run
np.random.seed(10)
train_fields = np.random.choice(data_grouped['field_id'], int(n_fields * train_per), replace=False)
# Every field that was not drawn for training goes to the test portion.
# np.isin replaces np.in1d, which is deprecated in NumPy 2.0
test_fields = data_grouped['field_id'][~np.isin(data_grouped['field_id'], train_fields)]