In [1]:
import json
import os
from collections import defaultdict

import numpy as np
import pandas as pd
import rasterio

In [2]:
# Root directory of the downloaded data, relative to this notebook.
data_dir = '../data/train_test'

# Common prefix shared by every collection name below.
main = 'ref_agrifieldnet_competition_v1'

# Collection names: source imagery, training labels and test labels.
source_collection, train_label_collection, test_label_collection = (
    f'{main}_source',
    f'{main}_labels_train',
    f'{main}_labels_test',
)

In [6]:
def _read_collection(label_collection):
    """Load and return the parsed ``collection.json`` of one label collection."""
    collection_path = f'{data_dir}/{main}/{label_collection}/collection.json'
    with open(collection_path) as handle:
        return json.load(handle)


train_json = _read_collection(train_label_collection)
test_json = _read_collection(test_label_collection)

# Show the top-level keys of both collection documents.
for collection in (train_json, test_json):
    print(collection.keys())

dict_keys(['type', 'stac_version', 'stac_extensions', 'id', 'title', 'description', 'license', 'sci:doi', 'sci:citation', 'extent', 'links'])
dict_keys(['type', 'stac_version', 'stac_extensions', 'id', 'title', 'description', 'license', 'sci:doi', 'sci:citation', 'extent', 'links'])


In [4]:
# Each tile id is the last underscore-separated token of the link href, cut at the first dot.
# NOTE(review): the first four links are skipped - presumably they are not tile entries; confirm against collection.json.
train_folder_ids = []
for link in train_json['links'][4:]:
    last_token = link['href'].split('_')[-1]
    train_folder_ids.append(last_token.split('.')[0])

# Every tile folder holds a field-id raster and a crop-label raster.
train_root = f'{data_dir}/{main}/{train_label_collection}'
train_field_paths = [
    f'{train_root}/{train_label_collection}_{folder_id}/field_ids.tif'
    for folder_id in train_folder_ids
]
train_label_paths = [
    f'{train_root}/{train_label_collection}_{folder_id}/raster_labels.tif'
    for folder_id in train_folder_ids
]

# Sanity check: first id and its two raster paths.
for first_entry in (train_folder_ids[0], train_field_paths[0], train_label_paths[0]):
    print(first_entry)

28852
../data/train_test/ref_agrifieldnet_competition_v1/ref_agrifieldnet_competition_v1_labels_train/ref_agrifieldnet_competition_v1_labels_train_28852/field_ids.tif
../data/train_test/ref_agrifieldnet_competition_v1/ref_agrifieldnet_competition_v1_labels_train/ref_agrifieldnet_competition_v1_labels_train_28852/raster_labels.tif


In [5]:
# Number of training tile ids parsed from the collection links.
len(train_folder_ids)

1165

In [7]:
# Each tile id is the last underscore-separated token of the link href, cut at the first dot.
test_folder_ids = [i['href'].split('_')[-1].split('.')[0] for i in test_json['links'][4:]]

# Field-id raster paths for the test tiles; both the directory and the folder
# prefix use the test label collection.
test_field_paths = [f'{data_dir}/{main}/{test_label_collection}/{test_label_collection}_{i}/field_ids.tif' for i in test_folder_ids]
len(test_field_paths)

707

In [None]:
# One row per training tile: its folder id and the path to its field-id raster.
competition_train_data = pd.DataFrame(
    {
        'unique_folder_id': train_folder_ids,
        'field_paths': train_field_paths,
    }
)
competition_train_data.head()

Unnamed: 0,unique_folder_id,field_paths
0,28852,../data/train_test/ref_agrifieldnet_competitio...
1,d987c,../data/train_test/ref_agrifieldnet_competitio...
2,ca1d4,../data/train_test/ref_agrifieldnet_competitio...
3,2ec18,../data/train_test/ref_agrifieldnet_competitio...
4,7575d,../data/train_test/ref_agrifieldnet_competitio...


In [None]:
# Field-id raster of the first training tile.
first_field_tif = f'{data_dir}/{main}/{train_label_collection}/{train_label_collection}_{train_folder_ids[0]}/field_ids.tif'
with rasterio.open(first_field_tif) as src:
    field_data = src.read()

# Shape is (bands, rows, cols); then list the distinct field ids with their pixel counts.
print(field_data.shape)
unique_fields, field_pixel_counts = np.unique(field_data, return_counts=True)
print((unique_fields, field_pixel_counts))

# Keep the first (index 0) band as a 2-D array.
field_data = field_data[0]

(1, 256, 256)
(array([   0,  756,  757, 1372, 1374], dtype=uint16), array([65461,    14,    21,    38,     2]))


In [None]:
# Accumulator keyed by field id; each key starts with an empty list on first access.
# `defaultdict` is imported with the other imports at the top of the notebook.
field_crops = defaultdict(list)

In [None]:
# Crop-label raster of the first training tile.
first_label_tif = f'{data_dir}/{main}/{train_label_collection}/{train_label_collection}_{train_folder_ids[0]}/raster_labels.tif'
with rasterio.open(first_label_tif) as src:
    crop_data = src.read()

# Shape is (bands, rows, cols); then list the distinct crop labels with their pixel counts.
print(crop_data.shape)
unique_crops, crop_pixel_counts = np.unique(crop_data, return_counts=True)
print((unique_crops, crop_pixel_counts))

# The leading axis has length 1, so index 0 yields the 2-D label grid.
crop_data = crop_data[0]

(1, 256, 256)
(array([0, 1, 5, 6], dtype=uint16), array([65461,     2,    38,    35]))


In [None]:
# Record, for every field id, the distinct crop labels found on its pixels.
flat_fields = field_data.ravel()
flat_crops = crop_data.ravel()

# Reduce the pixels to unique (field, crop) pairs instead of visiting every
# pixel in Python. Sorting the first-occurrence indices keeps the row-major
# encounter order that a pixel-by-pixel scan would produce.
pixel_pairs = np.stack([flat_fields, flat_crops], axis=1)
first_seen = np.unique(pixel_pairs, axis=0, return_index=True)[1]

for pixel in np.sort(first_seen):
    # field ids are stored as strings, crop labels as read from the raster
    field_id = str(flat_fields[pixel])
    field_crop = flat_crops[pixel]

    # membership test kept so re-running the cell does not duplicate labels
    if field_crop not in field_crops[field_id]:
        field_crops[field_id].append(field_crop)

print(field_crops)

defaultdict(<function <lambda> at 0x7f47fb539900>, {'0': [0], '757': [6], '756': [6], '1372': [5], '1374': [1]})


In [None]:
# Flatten the mapping into [field_id, crop_id] pairs.
# Each value is the list of distinct labels collected for that field; the first
# one is used as the field's label. In the printed mapping every field has
# exactly one label.
# NOTE(review): if a field ever carries several labels, only the first-seen one is kept.
field_crop_map = []
for field_key, crop_labels in field_crops.items():
    field_crop_map.append([field_key, crop_labels[0]])
print(field_crop_map)

[['0', 0], ['757', 6], ['756', 6], ['1372', 5], ['1374', 1]]


In [None]:
# Tabulate the [field_id, crop_id] pairs, one row per field.
field_crop = pd.DataFrame(
    data=field_crop_map,
    columns=['field_id', 'crop_id'],
)
field_crop.head()

Unnamed: 0,field_id,crop_id
0,0,0
1,757,6
2,756,6
3,1372,5
4,1374,1


In [None]:
# Show the rows whose field id is not '0' (display only; field_crop itself is unchanged).
# NOTE(review): id 0 presumably marks pixels outside any field - confirm.
field_crop.loc[field_crop['field_id'].ne('0')]

Unnamed: 0,field_id,crop_id
1,757,6
2,756,6
3,1372,5
4,1374,1


### Pixel-based Random Forest model

In [None]:
# Band names; each one corresponds to a `<band>.tif` file in a tile's source folder.
Full_bands = ['B01', 'B02', 'B03', 'B04','B05', 'B06', 'B07', 'B08','B8A', 'B09', 'B11', 'B12']

img_sh = 256  # tile side length in pixels (the rasters read above are 256 x 256)
n_selected_bands= len(Full_bands[:3])  # only the first three bands are used
n_obs = 1  # imagery per chip (no time series)

# Initial containers for the feature matrix, the current tile and the per-tile arrays.
X = np.empty((0, n_selected_bands * n_obs))
X_tile = np.empty((img_sh * img_sh, 0))
X_arrays = []

# Read band 1 of the first tile's field-id raster. The context manager closes
# the file handle as soon as the pixels are in memory.
with rasterio.open(competition_train_data['field_paths'].values[0]) as field_src:
    field_array = field_src.read(1)
field_array.shape

(256, 256)

In [None]:
# Start from an empty (0, 1) float array ...
field_ids = np.empty((0, 1))
print(field_ids.shape)

# ... and join it with the flattened field-id raster. Both inputs are flattened,
# so the result is 1-D, and the float dtype of the empty seed carries over to
# the ids (hence 756.0 etc. in the output).
field_ids = np.concatenate((field_ids.ravel(), field_array.flatten().ravel()))
print(field_ids.shape)
distinct_ids, id_counts = np.unique(field_ids, return_counts=True)
print((distinct_ids, id_counts))

(0, 1)
(65536,)
(array([   0.,  756.,  757., 1372., 1374.]), array([65461,    14,    21,    38,     2]))


In [91]:
# Open one reader per selected band for the first training tile.
# The readers are left open because the following cells read from them.
first_tile_id = competition_train_data["unique_folder_id"][0]
bands_src = []
for band in Full_bands[:3]:
    bands_src.append(
        rasterio.open(f'{data_dir}/{main}/{source_collection}/{source_collection}_{first_tile_id}/{band}.tif')
    )

In [93]:
# Inspect the opened band readers (one per selected band).
bands_src

[<open DatasetReader name='../data/train_test/ref_agrifieldnet_competition_v1/ref_agrifieldnet_competition_v1_source/ref_agrifieldnet_competition_v1_source_28852/B01.tif' mode='r'>,
 <open DatasetReader name='../data/train_test/ref_agrifieldnet_competition_v1/ref_agrifieldnet_competition_v1_source/ref_agrifieldnet_competition_v1_source_28852/B02.tif' mode='r'>,
 <open DatasetReader name='../data/train_test/ref_agrifieldnet_competition_v1/ref_agrifieldnet_competition_v1_source/ref_agrifieldnet_competition_v1_source_28852/B03.tif' mode='r'>]

In [98]:
# Pixel values of band index 1 from the first opened band file.
bands_src[0].read(1)

array([[43, 43, 44, ..., 44, 45, 45],
       [43, 43, 44, ..., 44, 45, 45],
       [43, 43, 44, ..., 44, 45, 45],
       ...,
       [44, 44, 43, ..., 44, 43, 43],
       [44, 44, 43, ..., 44, 43, 43],
       [44, 44, 43, ..., 44, 43, 43]], dtype=uint8)

In [99]:
# Read each band and turn it into a single column of pixel values, shape (n_pixels, 1).
bands_array = []
for band_reader in bands_src:
    band_pixels = band_reader.read(1).flatten()
    bands_array.append(band_pixels[:, np.newaxis])

In [100]:
# Place the band columns side by side: one row per pixel, one column per band.
X_tile = np.concatenate(bands_array, axis=1)
# Note: re-running this cell appends the same tile to X_arrays again.
X_arrays.append(X_tile)

In [102]:
# Stack the per-tile pixel matrices row-wise and label the columns by band name.
X = np.concatenate(X_arrays, axis=0)
data = pd.DataFrame(data=X, columns=Full_bands[:3])

In [103]:
# Preview the per-pixel band values.
data.head()

Unnamed: 0,B01,B02,B03
0,43,39,38
1,43,38,37
2,44,37,36
3,44,38,36
4,44,37,36


In [104]:
# Attach the field id of every pixel as an extra column.
data = data.assign(field_id=field_ids)

In [105]:
# Preview the band values together with the new field_id column.
data.head()

Unnamed: 0,B01,B02,B03,field_id
0,43,39,38,0.0
1,43,38,37,0.0
2,44,37,36,0.0
3,44,38,36,0.0
4,44,37,36,0.0


In [106]:
# Keep only the pixels whose field id is non-zero.
has_field = data['field_id'].ne(0)
data = data.loc[has_field]

In [108]:
# Per-field mean of every band (preview only; the result is not stored here).
(
    data
    .groupby(['field_id'])
    .mean()
    .reset_index()
)

Unnamed: 0,field_id,B01,B02,B03
0,756.0,42.642857,37.357143,35.857143
1,757.0,43.47619,38.666667,37.952381
2,1372.0,43.131579,39.578947,39.078947
3,1374.0,44.0,40.0,40.0


In [109]:
# Average the band values of all pixels belonging to the same field.
train_data_grouped = (
    data
    .groupby(['field_id'])
    .mean()
    .reset_index()
)
# Convert the float ids (e.g. 756.0) into strings such as '756', the same
# key format used by the field/crop table.
train_data_grouped['field_id'] = [
    str(int(raw_id)) for raw_id in train_data_grouped['field_id'].values
]

In [110]:
# Preview the per-field band means, now keyed by string field ids.
train_data_grouped.head()

Unnamed: 0,field_id,B01,B02,B03
0,756,42.642857,37.357143,35.857143
1,757,43.47619,38.666667,37.952381
2,1372,43.131579,39.578947,39.078947
3,1374,44.0,40.0,40.0


In [111]:
# Attach the crop label to every field's mean band values. The default inner
# join keeps only field ids present in both tables.
train_df = train_data_grouped.merge(field_crop, on='field_id', how='inner')

In [112]:
# Preview the merged table: band means plus crop_id per field.
train_df.head()

Unnamed: 0,field_id,B01,B02,B03,crop_id
0,756,42.642857,37.357143,35.857143,6
1,757,43.47619,38.666667,37.952381,6
2,1372,43.131579,39.578947,39.078947,5
3,1374,44.0,40.0,40.0,1
