# eo-learn Hoeffding tree example

### Requirements

To run the example you'll need a Sentinel Hub account.
If you do not have one, create it on the [Sentinel Hub webpage](https://services.sentinel-hub.com/oauth/subscription).

Once you have the account set up, log in to the [Sentinel Hub Configurator](https://apps.sentinel-hub.com/configurator/).
For this tutorial create a new configuration (`"Add new configuration"`) and set the configuration to be based on **Python scripts template**.

Then put the configuration's **instance ID** into the `sentinelhub` package's configuration file, following the [configuration instructions](http://sentinelhub-py.readthedocs.io/en/latest/configure.html).

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

from eolearn.core import EOTask, EOPatch, LinearWorkflow, FeatureType, OverwritePermission, LoadFromDisk, SaveToDisk
from eolearn.io import S2L1CWCSInput
from eolearn.geometry import VectorToRaster

from sentinelhub import BBox, transform_bbox, CRS, GeopediaFeatureIterator

import geopandas as gpd

import ml_rapids

Define bbox and time range.

BBox and time range are defined manually, for two patches in Slovenia.
For more examples of how to work with multiple EOPatches, look at the [eo-learn examples](https://eo-learn.readthedocs.io/en/latest/examples.html).

In [2]:
# Two patches in EPSG:32633 covering the same easting range. They share the
# northing 5142522.075033964 (upper bound of `bbox_0`, lower bound of `bbox_1`).
bbox_0 = BBox(
    ((543473.5912890476, 5139156.267383285), (546805.1886957382, 5142522.075033964)),
    crs='EPSG:32633',
)
bbox_1 = BBox(
    ((543473.5912890476, 5142522.075033964), (546805.1886957382, 5145887.882684642)),
    crs='EPSG:32633',
)

In [3]:
# Start and end date of the acquisition window, passed to the download task
# as its `time_interval` argument when a workflow is executed.
time_interval = [
    '2017-04-01',
    '2017-07-01',
]

## Define tasks to download data

First, we define a task to download Sentinel-2 data.

In [4]:
# Download task: requests the 'BANDS-S2-L1C' layer at a 10m x 10m resolution and
# stores the result as the DATA feature 'bands'.
add_data = S2L1CWCSInput(
    layer='BANDS-S2-L1C',
    feature=(FeatureType.DATA, 'bands'),
    resx='10m',
    resy='10m',
    maxcc=0.8,
)

# Persistence task: writes EOPatches into the 'eopatches' folder using the
# OVERWRITE_PATCH permission.
save = SaveToDisk(
    folder='eopatches',
    overwrite_permission=OverwritePermission.OVERWRITE_PATCH,
)

Next, we define the task that gets data from Geopedia. It downloads data about crop types for Slovenia in the year 2017.

In [5]:
class AddGeopediaVectorFeature(EOTask):
    """
    Fetch the vector features of a Geopedia layer and store them in an EOPatch.

    :param feature: Target feature of the EOPatch; it is parsed into a
        (feature type, feature name) pair.
    :param layer: Geopedia layer to query.
    :param drop_duplicates: If True, keep only the first of any rows that are
        identical in every column except 'geometry'.
    """
    def __init__(self, feature, layer, drop_duplicates=False):
        parsed_feature = self._parse_features(feature)
        self.feature_type, self.feature_name = next(parsed_feature())
        self.layer = layer
        self.drop_duplicates = drop_duplicates

    def execute(self, eopatch):
        """
        Query Geopedia with the bounding box of `eopatch` and attach the result.

        :param eopatch: EOPatch whose `bbox` defines the queried area.
        :return: The same EOPatch. The vector feature is only set when the
            query returned at least one record.
        """
        # the query uses the patch bounding box transformed to CRS.POP_WEB
        query_bbox = BBox.transform(eopatch.bbox, CRS.POP_WEB)
        records = list(GeopediaFeatureIterator(layer=self.layer, bbox=query_bbox))

        # nothing returned for this area: hand the patch back unchanged
        if not records:
            return eopatch

        frame = gpd.GeoDataFrame.from_features(records)
        # the records are tagged as EPSG:4326 and then reprojected to the CRS of the patch
        frame.crs = {'init': 'epsg:4326'}
        patch_crs = {'init': f'epsg:{eopatch.bbox.crs.value}'}
        frame = frame.to_crs(patch_crs)

        if self.drop_duplicates:
            # duplicates are detected on the attribute columns only, geometry is ignored
            unique_rows = frame.drop('geometry', axis=1).drop_duplicates()
            frame = frame.loc[unique_rows.index]

        eopatch[self.feature_type][self.feature_name] = frame
        return eopatch

# Geopedia layer with the Slovenia 2017 data
layer_id = 2038

# Store the layer as the timeless vector feature 'LPIS_2017', with duplicated
# attribute rows removed.
add_lpis = AddGeopediaVectorFeature(
    (FeatureType.VECTOR_TIMELESS, 'LPIS_2017'),
    layer=layer_id,
    drop_duplicates=True,
)

The downloaded data are in vector form, therefore we have to rasterize them.

In [6]:
# Rasterize the 'SIFRA_KMRS' column of the 'LPIS_2017' vector feature into a
# timeless mask with the same name; the raster shape is taken from the
# 'IS_DATA' mask feature.
# NOTE(review): uint8 can only hold values 0-255 - confirm that all
# 'SIFRA_KMRS' codes fit into that range.
rasterization_task = VectorToRaster(
    vector_input=(FeatureType.VECTOR_TIMELESS, 'LPIS_2017'),
    raster_feature=(FeatureType.MASK_TIMELESS, 'LPIS_2017'),
    values_column='SIFRA_KMRS',
    raster_shape=(FeatureType.MASK, 'IS_DATA'),
    raster_dtype=np.uint8,
)

## Using streaming models

In [7]:
# EOTask for fitting.
class StreamingModelFit(EOTask):
    """
    Task that fits a model on a single time frame of an EOPatch.

    :param time: Index of the time frame of the data feature used for fitting.
    :param feature_name: Name of the feature in `eopatch.data` holding the inputs.
    :param label_name: Name of the feature in `eopatch.mask_timeless` holding the labels.
    :param model: Learner exposing `fit(X, y)`. If None (default), a new
        `ml_rapids.HoeffdingTree` is created for this task instance.
    """
    def __init__(self, time=0, feature_name='bands', label_name='land_cov', model=None):
        self.time = time
        self.feature_name = feature_name
        self.label_name = label_name
        # The default learner is built here, per instance. A default of
        # `model=ml_rapids.HoeffdingTree()` in the signature is evaluated only once,
        # so every task created without a model would share one learner object.
        self.learner = ml_rapids.HoeffdingTree() if model is None else model

    def fit_eopatch(self, eopatch):
        """
        Fit the learner on the pixels of the selected time frame.

        :param eopatch: EOPatch providing `data[feature_name]` and
            `mask_timeless[label_name]`.
        """
        X = eopatch.data[self.feature_name][self.time]
        Y = eopatch.mask_timeless[self.label_name]

        # one sample per pixel: the first two axes of the frame are flattened and
        # the last axis is kept as the per-sample feature vector
        n_pixels = X.shape[0] * X.shape[1]
        X_train = X.reshape(n_pixels, X.shape[2])
        Y_train = Y.reshape(n_pixels).astype(int)

        self.learner.fit(X_train, Y_train)

    def execute(self, eopatch):
        """
        Fit the learner on `eopatch` and return the EOPatch unchanged.
        """
        self.fit_eopatch(eopatch)
        return eopatch

    def get_model(self):
        """
        Return the learner held by this task.
        """
        return self.learner

In [8]:
# EOTask for prediction.
class StreamingModelPredict(EOTask):
    """
    Task that predicts labels for a single time frame of an EOPatch.

    :param time: Index of the time frame of the data feature used for prediction.
    :param feature_name: Name of the feature in `eopatch.data` holding the inputs.
    :param label_name: Base name of the output; predictions are stored under
        `label_name + '_predict'`.
    :param model: Learner exposing `predict(X)`. If None (default), a new
        `ml_rapids.HoeffdingTree` is created for this task instance; pass an
        already fitted model to reuse it.
    """
    def __init__(self, time=0, feature_name='bands', label_name='land_cov', model=None):
        self.time = time
        self.feature_name = feature_name
        self.label_name = label_name
        # The default learner is built here, per instance. A default of
        # `model=ml_rapids.HoeffdingTree()` in the signature is evaluated only once,
        # so every task created without a model would share one learner object.
        self.learner = ml_rapids.HoeffdingTree() if model is None else model

    def predict_eopatch(self, eopatch):
        """
        Predict a label for every pixel of the selected time frame.

        :param eopatch: EOPatch providing `data[feature_name]`.
        :return: The same EOPatch with the predictions added as a
            MASK_TIMELESS feature named `label_name + '_predict'`.
        """
        X = eopatch.data[self.feature_name][self.time]

        # one sample per pixel: the first two axes of the frame are flattened and
        # the last axis is kept as the per-sample feature vector
        height, width = X.shape[0], X.shape[1]
        X_predict = X.reshape(height * width, X.shape[2])

        Y_predict = self.learner.predict(X_predict)
        # back to the image layout, with a trailing axis of length 1
        Y_predict = Y_predict.reshape(height, width, 1)

        eopatch.add_feature(FeatureType.MASK_TIMELESS, self.label_name + '_predict', Y_predict)

        return eopatch

    def execute(self, eopatch):
        """
        Add the predictions to `eopatch` and return it.
        """
        return self.predict_eopatch(eopatch)

    def get_model(self):
        """
        Return the learner held by this task.
        """
        return self.learner

Define task for fitting. Used model is Hoeffding tree from ml-rapids.

In [9]:
# Fit on the first time frame of 'bands', using the rasterized 'LPIS_2017' mask as labels.
fit_task = StreamingModelFit(
    time=0,
    feature_name='bands',
    label_name='LPIS_2017',
    model=ml_rapids.HoeffdingTree(),
)

Define the workflow for fitting and execute it. The same workflow can be executed again on further EOPatches, and the `HoeffdingTree()` model will update itself with each new EOPatch.

In [10]:
# Order of tasks: download -> add LPIS vectors -> rasterize -> fit.
# `save` is not part of this workflow; append it as the last task to also
# write the fitted-on patch to disk.
workflow_fit = LinearWorkflow(add_data, add_lpis, rasterization_task, fit_task)

In [11]:
# Execution arguments, keyed by the task that receives them.
# When `save` is part of the workflow, also add: save: {'eopatch_folder': 'test'}
extra_param = {
    add_data: {'bbox': bbox_0, 'time_interval': time_interval},
}
workflow_fit.execute(extra_param)

WorkflowResults(
  Dependency(StreamingModelFit):
    EOPatch(
      data: {
        bands: numpy.ndarray(shape=(15, 337, 333, 13), dtype=float32)
      }
      mask: {
        IS_DATA: numpy.ndarray(shape=(15, 337, 333, 1), dtype=bool)
      }
      scalar: {}
      label: {}
      vector: {}
      data_timeless: {}
      mask_timeless: {
        LPIS_2017: numpy.ndarray(shape=(337, 333, 1), dtype=uint8)
      }
      scalar_timeless: {}
      label_timeless: {}
      vector_timeless: {
        LPIS_2017: geopandas.GeoDataFrame(columns=['GERK_PID', 'GERK_POVR', 'KMG_MID', 'KRA_MEJ', 'LETO', 'POLJINA_ID', 'POLJ_LETO', 'PONO35', 'PONO50', 'POVR_POLJI', 'RABA_ID', 'SIFINFOGIS', 'SIFRA_KMRS', 'geometry'], length=643, crs=epsg:32633)
      }
      meta_info: {
        maxcc: 0.8
        service_type: 'wcs'
        size_x: '10m'
        size_y: '10m'
        time_difference: datetime.timedelta(days=-1, seconds=86399)
        time_interval: ['2017-04-01', '2017-07-01']
      }
      bbox: BB

Get model and use it in future workflows.

In [12]:
# Take the learner held by `fit_task`, i.e. the object fitted by `workflow_fit`.
model = fit_task.get_model()

Define prediction task with previously fitted model.

In [13]:
# Predict on the first time frame of 'bands'; the result is stored as the
# timeless mask 'LPIS_2017_predict'.
predict_task = StreamingModelPredict(
    time=0,
    feature_name='bands',
    label_name='LPIS_2017',
    model=model,
)

Define the workflow for prediction and execute it. To predict on more than one EOPatch, execute the same workflow once for each patch.

In [14]:
# Order of tasks: download -> add LPIS vectors -> rasterize -> predict -> save.
workflow_predict = LinearWorkflow(add_data, add_lpis, rasterization_task, predict_task, save)

In [15]:
# Execution arguments, keyed by the task that receives them: the second
# bounding box for the download and the target sub-folder for saving.
extra_param = {
    add_data: {'bbox': bbox_1, 'time_interval': time_interval},
    save: {'eopatch_folder': 'predicted'},
}
workflow_predict.execute(extra_param)

WorkflowResults(
  Dependency(SaveToDisk):
    EOPatch(
      data: {
        bands: numpy.ndarray(shape=(15, 337, 333, 13), dtype=float32)
      }
      mask: {
        IS_DATA: numpy.ndarray(shape=(15, 337, 333, 1), dtype=bool)
      }
      scalar: {}
      label: {}
      vector: {}
      data_timeless: {}
      mask_timeless: {
        LPIS_2017: numpy.ndarray(shape=(337, 333, 1), dtype=uint8)
        LPIS_2017_predict: numpy.ndarray(shape=(337, 333, 1), dtype=int32)
      }
      scalar_timeless: {}
      label_timeless: {}
      vector_timeless: {
        LPIS_2017: geopandas.GeoDataFrame(columns=['GERK_PID', 'GERK_POVR', 'KMG_MID', 'KRA_MEJ', 'LETO', 'POLJINA_ID', 'POLJ_LETO', 'PONO35', 'PONO50', 'POVR_POLJI', 'RABA_ID', 'SIFINFOGIS', 'SIFRA_KMRS', 'geometry'], length=427, crs=epsg:32633)
      }
      meta_info: {
        maxcc: 0.8
        service_type: 'wcs'
        size_x: '10m'
        size_y: '10m'
        time_difference: datetime.timedelta(days=-1, seconds=86399)
      

## Use predictions

We load the saved EOPatch and use it to evaluate the model.

In [16]:
# Load the EOPatch from the 'predicted' sub-folder of the 'eopatches' folder.
eopatch = EOPatch.load('./eopatches/predicted/')

In [17]:
# Flatten the rasterized labels and the predictions into 1-D arrays
original = eopatch.mask_timeless['LPIS_2017'].ravel()
predicted = eopatch.mask_timeless['LPIS_2017_predict'].ravel()

# Score the predictions against the rasterized labels
print('Accuracy:', accuracy_score(original, predicted))

Accuracy: 0.6485773607435329
