# Radiant Earth Spot the Crop Challenge
*Max Notebook*


In [3]:
# Import the needed modules
import os
from tqdm.auto import tqdm
import numpy as np
import pandas as pd
import datetime
import rasterio

In [7]:
# Directory layout: the training split lives under ./data/train and the raw
# band files sit in a `bands-raw` sub-folder of it.
OUTPUT_DIR = '/'.join(('./data', 'train'))
OUTPUT_DIR_BANDS = '/'.join((OUTPUT_DIR, 'bands-raw'))

In [8]:
# Load the base data: the asset table from a CSV file and the per-field
# metadata from a pickle.
# The cells further down refer to the asset table as `train_df`, so it is bound
# under that name here; `df_train` is kept as an alias so either spelling
# resolves after a fresh Restart & Run All.
# NOTE(review): assumes train_data.csv is the table later cells call `train_df` — confirm.
train_df = pd.read_csv(f'{OUTPUT_DIR}/train_data.csv')
df_train = train_df
# NOTE(security): unpickling can execute arbitrary code — only load pickle
# files that were produced by this project.
df_meta = pd.read_pickle(f'{OUTPUT_DIR}/field_meta_train.pkl')

In [17]:
# Inspect the per-field metadata table (columns shown: field_id, tile_id, label, dates).
df_meta

Unnamed: 0,field_id,tile_id,label,dates
0,1,2171,4,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:..."
1,2,1703,7,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:..."
2,3,2214,6,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:..."
3,4,2526,8,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:..."
4,6,544,4,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:..."
...,...,...,...,...
87087,122731,2298,4,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:..."
87088,122732,2225,5,"[2017-04-04T00:00:00.000000000, 2017-04-14T00:..."
87089,122733,1986,2,"[2017-04-01T00:00:00.000000000, 2017-04-11T00:..."
87090,122735,997,3,"[2017-04-01T00:00:00.000000000, 2017-04-04T00:..."


In [61]:
# Distinct numbers of acquisition dates recorded per field.
df_meta['dates'].apply(len).unique()

array([38, 76, 49, 51, 40, 55, 68, 56, 59, 70, 47, 41, 39, 48, 75, 50, 42,
       67, 73, 54, 62, 58, 74, 65, 37, 60])

In [62]:
# Same count after de-duplicating each field's dates; identical values to the
# plain length count mean no field lists a date twice.
df_meta['dates'].map(lambda field_dates: len(set(field_dates))).unique()

array([38, 76, 49, 51, 40, 55, 68, 56, 59, 70, 47, 41, 39, 48, 75, 50, 42,
       67, 73, 54, 62, 58, 74, 65, 37, 60])

## Get an overview of the base data

In [19]:
# Inspect the asset table (columns shown: tile_id, datetime, satellite_platform, asset, file_path).
train_df

Unnamed: 0,tile_id,datetime,satellite_platform,asset,file_path
0,2587,,,documentation,./data/ref_south_africa_crops_competition_v1_t...
1,2587,,,field_ids,./data/ref_south_africa_crops_competition_v1_t...
2,2587,,,field_info_train,./data/ref_south_africa_crops_competition_v1_t...
3,2587,,,labels,./data/ref_south_africa_crops_competition_v1_t...
4,2587,,,raster_values,./data/ref_south_africa_crops_competition_v1_t...
...,...,...,...,...,...
1024507,2198,2017-11-30T00:00:00Z,s2,B04,./data/ref_south_africa_crops_competition_v1_t...
1024508,2198,2017-11-30T00:00:00Z,s2,B08,./data/ref_south_africa_crops_competition_v1_t...
1024509,2198,2017-11-30T00:00:00Z,s2,B11,./data/ref_south_africa_crops_competition_v1_t...
1024510,2198,2017-11-30T00:00:00Z,s2,B12,./data/ref_south_africa_crops_competition_v1_t...


In [34]:
# Parse the raw `datetime` strings into pandas timestamps; rows without a datetime become NaT.
train_df['datetime_formatted'] = pd.to_datetime(train_df['datetime'])

In [None]:
# Time elapsed between the reference season start and each asset's acquisition.
# `datetime_formatted` is timezone-aware (the source strings end in 'Z', so they
# parse as UTC). Subtracting a tz-naive timestamp from a tz-aware column raises
# TypeError, so the reference point is created in the column's own timezone
# (`.dt.tz` is None for a naive column, which keeps that case working too).
start_of_season = pd.Timestamp('2016-10-01', tz=train_df['datetime_formatted'].dt.tz)
# The result is a Timedelta column (NaT where the row has no datetime), not an
# integer number of days.
train_df['days_till_season_start'] = train_df['datetime_formatted'] - start_of_season

In [35]:
# Keep only the rows whose acquisition month is October; rows with a missing
# datetime (NaT) never match.
train_df.loc[train_df['datetime_formatted'].dt.month.eq(10)]

Unnamed: 0,tile_id,datetime,satellite_platform,asset,file_path,datetime_formatted
0,2587,,,documentation,./data/ref_south_africa_crops_competition_v1_t...,NaT
1,2587,,,field_ids,./data/ref_south_africa_crops_competition_v1_t...,NaT
2,2587,,,field_info_train,./data/ref_south_africa_crops_competition_v1_t...,NaT
3,2587,,,labels,./data/ref_south_africa_crops_competition_v1_t...,NaT
4,2587,,,raster_values,./data/ref_south_africa_crops_competition_v1_t...,NaT
...,...,...,...,...,...,...
1024507,2198,2017-11-30T00:00:00Z,s2,B04,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00
1024508,2198,2017-11-30T00:00:00Z,s2,B08,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00
1024509,2198,2017-11-30T00:00:00Z,s2,B11,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00
1024510,2198,2017-11-30T00:00:00Z,s2,B12,./data/ref_south_africa_crops_competition_v1_t...,2017-11-30 00:00:00+00:00


## Building the baseline model

In [None]:
# A field is covered by many pixels. The Random Forest (RF) baseline works on
# one row per field, so every column is averaged over the pixels that share a
# `field_id`; `field_id` stays a regular column in the result.
data_grouped = data_train.groupby('field_id', as_index=False).mean()
data_grouped

In [None]:
# Split the fields into a training part and a hold-out part.
# The split is made on field_ids, so a field never ends up on both sides.
# Note that this hold-out part is different from the test portion provided as
# part of the competition.
train_per = 0.7

n_fields = len(data_grouped['field_id'])
# Fixed seed so the same fields are drawn on every run.
np.random.seed(10)
train_fields = np.random.choice(data_grouped['field_id'], int(n_fields * train_per), replace=False)
# Everything not drawn for training is held out. `Series.isin` replaces
# `np.in1d`, which is deprecated as of NumPy 2.0; the resulting mask is the same.
test_fields = data_grouped['field_id'][~data_grouped['field_id'].isin(train_fields)]