# Training Data Extraction

This notebook has been tested and used in Google Colab only.

The end result of this notebook is a set of CSV files (one per map cell) with the following structure:
```
,volume,agbm,coords,B2_mean,B2_std,B2_max,B2_min,B3_mean,B3_std,B3_max,B3_min,B4_mean,B4_std,B4_max,B4_min,B8_mean,B8_std,B8_max,B8_min
0,77.45,75368.88236931057,"[[[22.047808501617933, 62.14306685369401], [22.048114271765243, 62.143077814760005], [22.048090848918434, 62.14322090081012], [22.047785077339235, 62.14320993967812], [22.047808501617933, 62.14306685369401]]]",295.8791366906476,9.85900603509299,307.0,280.0,479.49856115107923,24.66090022687755,504.0,439.0,310.19784172661883,20.90550166822127,332.0,275.0,2533.8589928057563,124.24491941322992,2680.0,2344.0
1,106.94,113532.58180660434,"[[[22.048114271765243, 62.143077814760005], [22.04842004216451, 62.143088775152], [22.048396620749585, 62.14323186126803], [22.048090848918434, 62.14322090081012], [22.048114271765243, 62.143077814760005]]]",275.5542774982027,9.959292143521045,280.0,257.0,443.4392523364486,9.433981132056603,453.0,431.0,265.90797987059665,4.69041575982343,275.0,263.0,2438.8856937455066,58.532042506647585,2486.0,2344.0
...
```

In [None]:
# Install dependencies that are missing from Google Colab
!pip install geopandas zipfile_deflate64

In [None]:
import io
from pathlib import Path
import threading
import concurrent.futures

import ee
import geopandas as gpd
import numpy as np
from google.colab import files

import requests
import zipfile_deflate64 as zipfile
from lxml import html

In [None]:
# Authenticate with Earth Engine, then initialize the client library
ee.Authenticate()
ee.Initialize()

## Sentinel

In [None]:
# Recommended settings for satellite image filtering
# Upper limit for CLOUDY_PIXEL_PERCENTAGE when filtering the S2 SR collection
CLOUD_FILTER = 60
# s2cloudless probability above which a pixel is flagged as cloud
CLD_PRB_THRESH = 40
# B8 cutoff (multiplied by the SR band scale) below which a pixel counts as dark
NIR_DRK_THRESH = 0.15
# Cloud-shadow projection distance; multiplied by 10 before the distance transform
CLD_PRJ_DIST = 2
# Dilation of the cloud/shadow mask; used as BUFFER*2/20 in focalMax
BUFFER = 100

In [None]:
def get_s2_img(aoi, start_date='2021-07-01', end_date='2021-08-01'):
    """Build a cloud/shadow-masked median image for an area of interest.

    Args:
        aoi: Geometry used to filter the image collections.
        start_date: Start of the date filter.
        end_date: End of the date filter.

    Returns:
        The median of the collection after the cloud/shadow mask has been
        added to and applied on every image.
    """
    collection = get_s2_sr_cld_col(aoi, start_date, end_date)
    with_masks = collection.map(add_cld_shdw_mask)
    masked = with_masks.map(apply_cld_shdw_mask)
    return masked.median()

In [None]:
def get_s2_sr_cld_col(aoi, start_date, end_date):
    """Return the S2 SR collection with matching s2cloudless images attached.

    Args:
        aoi: Geometry used to filter both collections by bounds.
        start_date: Start of the date filter.
        end_date: End of the date filter.

    Returns:
        An ee.ImageCollection of SR images; each one carries its first
        matching cloud-probability image in the 's2cloudless' property.
    """
    # S2 SR scenes over the area, in the date range, under the cloud limit.
    cloud_limit = ee.Filter.lte('CLOUDY_PIXEL_PERCENTAGE', CLOUD_FILTER)
    sr_scenes = (
        ee.ImageCollection('COPERNICUS/S2_SR')
        .filterBounds(aoi)
        .filterDate(start_date, end_date)
        .filter(cloud_limit)
    )

    # s2cloudless scenes over the same area and date range.
    cloud_prob_scenes = (
        ee.ImageCollection('COPERNICUS/S2_CLOUD_PROBABILITY')
        .filterBounds(aoi)
        .filterDate(start_date, end_date)
    )

    # Pair the two collections on the 'system:index' property.
    same_index = ee.Filter.equals(
        leftField='system:index',
        rightField='system:index',
    )
    joined = ee.Join.saveFirst('s2cloudless').apply(
        primary=sr_scenes,
        secondary=cloud_prob_scenes,
        condition=same_index,
    )
    return ee.ImageCollection(joined)

In [None]:
def add_shadow_bands(img):
    """Add 'dark_pixels', 'cloud_transform' and 'shadows' bands to an image.

    Args:
        img: Image that already has the 'SCL', 'B8' and 'clouds' bands and
            the 'MEAN_SOLAR_AZIMUTH_ANGLE' property.

    Returns:
        The input image with the three shadow-related bands appended.
    """
    # Pixels whose SCL value is not 6 are treated as non-water.
    land = img.select('SCL').neq(6)

    # Dark NIR pixels that are not water are the potential shadow pixels.
    sr_band_scale = 1e4
    nir_cutoff = NIR_DRK_THRESH*sr_band_scale
    dark_pixels = img.select('B8').lt(nir_cutoff).multiply(land).rename('dark_pixels')

    # Direction in which shadows are cast from clouds (assumes UTM projection).
    solar_azimuth = ee.Number(img.get('MEAN_SOLAR_AZIMUTH_ANGLE'))
    shadow_azimuth = ee.Number(90).subtract(solar_azimuth)

    # Project the cloud band along that direction for CLD_PRJ_DIST*10.
    first_band_projection = img.select(0).projection()
    transform = img.select('clouds').directionalDistanceTransform(shadow_azimuth, CLD_PRJ_DIST*10)
    cld_proj = (
        transform.reproject(crs=first_band_projection, scale=100)
        .select('distance')
        .mask()
        .rename('cloud_transform')
    )

    # Shadows are where the projected clouds overlap the dark pixels.
    shadows = cld_proj.multiply(dark_pixels).rename('shadows')

    return img.addBands(ee.Image([dark_pixels, cld_proj, shadows]))


def add_cloud_bands(img):
    """Add the cloud 'probability' band and a binary 'clouds' band.

    Args:
        img: Image whose 's2cloudless' property holds the cloud
            probability image.

    Returns:
        The input image with 'probability' and 'clouds' bands appended.
    """
    # Probability band of the attached s2cloudless image.
    probability = ee.Image(img.get('s2cloudless')).select('probability')

    # Pixels above the probability threshold are marked as cloud.
    cloud_flag = probability.gt(CLD_PRB_THRESH).rename('clouds')

    return img.addBands(ee.Image([probability, cloud_flag]))


def apply_cld_shdw_mask(img):
    """Mask out cloud/shadow pixels and keep only the 'B.*' bands.

    Args:
        img: Image that has a 'cloudmask' band.

    Returns:
        The bands matching 'B.*', with their mask updated by the inverted
        'cloudmask' band.
    """
    # Inverted cloud mask: the original flags clouds/shadow, Not() flips it.
    clear_sky = img.select('cloudmask').Not()

    # Reflectance bands only, with the inverted mask applied.
    reflectance = img.select('B.*')
    return reflectance.updateMask(clear_sky)


def add_cld_shdw_mask(img):
    """Add the combined cloud/shadow 'cloudmask' band to an image.

    Args:
        img: Image accepted by add_cloud_bands.

    Returns:
        The image with the cloud bands, the shadow bands and the final
        'cloudmask' band appended.
    """
    # Cloud component bands first, then the shadow component bands.
    with_components = add_shadow_bands(add_cloud_bands(img))

    # A pixel is flagged when it is cloud, shadow, or both (sum > 0).
    cloud_band = with_components.select('clouds')
    shadow_band = with_components.select('shadows')
    flagged = cloud_band.add(shadow_band).gt(0)

    # Remove small cloud-shadow patches and dilate what remains by BUFFER.
    # 20 m scale is for speed, and assumes clouds don't require 10 m precision.
    cleaned = flagged.focalMin(2).focalMax(BUFFER*2/20)
    first_band_projection = img.select([0]).projection()
    cloudmask = cleaned.reproject(crs=first_band_projection, scale=20).rename('cloudmask')

    return with_components.addBands(cloudmask)

## Bands and Biomass Functions

In [None]:
# Extract band statistics from Sentinel data
def hila_bands(x, band):
    """Compute mean, std, max and min of one band over a polygon.

    Args:
        x: Polygon coordinates passed to ee.Geometry.Polygon.
        band: Name of the band to summarise.

    Returns:
        A list [mean, std, max, min] of the values obtained with
        getNumber(band) from each region reduction.
    """
    aoi = ee.Geometry.Polygon(x)
    band_img = get_s2_img(aoi).select(band)

    # The order of the reducers fixes the order of the returned statistics.
    reducers = (
        ee.Reducer.mean(),
        ee.Reducer.stdDev(),
        ee.Reducer.max(),
        ee.Reducer.min(),
    )

    stats = []
    for reducer in reducers:
        reduced = band_img.reduceRegion(
            reducer=reducer,
            geometry=aoi,
            scale=10,  # A nominal scale in meters of the projection to work in
        )
        stats.append(reduced.getNumber(band))
    return stats

In [None]:
#Approximating above ground biomass

#https://www.silvafennica.fi/article/184
def biomass_pine(x):
    """Estimate the pine biomass of one grid row.

    Args:
        x: Mapping or row with the keys 'meandiameterpine',
            'meanheightpine' and 'stemcountpine'.

    Returns:
        exp(model value) multiplied by the stem count.
    """
    stem_count = x['stemcountpine']
    height = x['meanheightpine']
    diameter = 2 + x['meandiameterpine']*1.25

    diameter_term = 9.547*(diameter/(diameter+12))
    height_term = 3.241*(height/(height+20))

    # Model value for the average stem; exponentiated below.
    log_stem_biomass = -3.198 + diameter_term + height_term

    # Scale the per-stem value by the number of stems.
    return np.exp(log_stem_biomass)*stem_count

def biomass_spruce(x):
    """Estimate the spruce biomass of one grid row.

    Args:
        x: Mapping or row with the keys 'meandiameterspruce',
            'meanheightspruce' and 'stemcountspruce'.

    Returns:
        exp(model value) multiplied by the stem count.
    """
    stem_count = x['stemcountspruce']
    height = x['meanheightspruce']
    diameter = 2 + x['meandiameterspruce']*1.25

    diameter_term = 9.482*(diameter/(diameter+20))
    height_term = 0.469*(np.log(height))

    # Model value for the average stem; exponentiated below.
    log_stem_biomass = -1.808 + diameter_term + height_term
    return np.exp(log_stem_biomass)*stem_count

#https://www.silvafennica.fi/article/236
#estimate for deciduous trees
def biomass_birch(x):
    """Estimate the deciduous (birch model) biomass of one grid row.

    Reads the deciduous columns of the row. Reading the pine columns here
    would count pine twice and ignore deciduous trees entirely.

    Args:
        x: Mapping or row with the keys 'meandiameterdeciduous',
            'meanheightdeciduous' and 'stemcountdeciduous'.

    Returns:
        exp(model value) multiplied by the stem count.
    """
    d_S = 2 + x['meandiameterdeciduous']*1.25
    h = x['meanheightdeciduous']
    stm = x['stemcountdeciduous']

    # Model value for the average stem; exponentiated below.
    mean_sbm = -3.654 + 10.582*(d_S/(d_S+12)) + 3.018*(h/(h+22))
    agbm = np.exp(mean_sbm)*stm
    return agbm


def est_agbm(x):
    """Return the summed pine, spruce and birch biomass estimates for a row.

    Args:
        x: Mapping or row passed unchanged to each species function.
    """
    return biomass_pine(x) + biomass_spruce(x) + biomass_birch(x)

In [None]:
#extract band values for the hila-grids
#Collect Numbers to List, evaluate once to save time in getInfo() calls
def extract_bands(coords, band, chunk=500):
    """Extract band statistics for every polygon, one chunk at a time.

    Args:
        coords: Series of polygon coordinate lists (one entry per grid cell).
        band: Name of the band to extract.
        chunk: Number of polygons evaluated per getInfo() call. The default
            was lowered from 1000 to 500 because of out-of-memory errors.

    Returns:
        A list with one entry per polygon, each being the evaluated result
        of hila_bands for that polygon.
    """
    ln = len(coords)
    res = []

    # Due to the user memory limit, the data is processed in chunks.
    for i in range(0, ln, chunk):
        lim = min(i + chunk, ln)

        tmp = coords[i:lim].apply(lambda x: hila_bands(x, band))
        # One getInfo() call per chunk instead of one per polygon.
        res.extend(ee.List(list(tmp)).getInfo())

    return res

In [None]:
# Convert GeoPandas polygons to list of coordinates for EE
def conv_Polygon(poly):
    """Return the exterior ring of a polygon as a nested coordinate list.

    Args:
        poly: Object exposing exterior.coords.xy as a pair (xs, ys).

    Returns:
        A list holding one ring, which is a list of [x, y] pairs.
    """
    xs, ys = poly.exterior.coords.xy
    # dstack pairs each x with its y and adds the outer ring dimension.
    stacked = np.dstack((xs, ys))
    return stacked.tolist()

## Forest Grid Download
Download grid data from the Forest Center for specific map cells.

In [None]:
# Finds all available map cell download links
def get_hila_links():
    """Scrape the map cell directory listing and return download links.

    Returns:
        A list of absolute URLs, one per link on the listing page, without
        the first link (which points to the parent directory).

    Raises:
        requests.HTTPError: If the listing page returns an HTTP error status.
        requests.Timeout: If the server does not respond in time.
    """
    root_url = 'https://aineistot.metsaan.fi'
    karttalehti_url = root_url + '/avoinmetsatieto/Hila/Karttalehti/'
    # A timeout keeps the notebook from hanging forever on a stalled server.
    res = requests.get(karttalehti_url, timeout=60)
    # Fail loudly instead of scraping links from an error page.
    res.raise_for_status()
    page = html.fromstring(res.content)
    # Get all href links
    hrefs = page.xpath('//a/@href')
    # Skip first link (link to parent directory) and add root url to the rest
    return [root_url + href for href in hrefs[1:]]

In [None]:
# Downloads a link and extracts it to the specified directory
def download_hila(link, out_dir):
    """Download a zip archive and extract it into out_dir.

    Args:
        link: URL of the zip archive.
        out_dir: Target directory (a Path or a string).

    Returns:
        Path of the first archive member inside out_dir.

    Raises:
        requests.HTTPError: If the download returns an HTTP error status.
        requests.Timeout: If the server does not respond in time.
    """
    out_dir = Path(out_dir)
    # A timeout keeps the notebook from hanging forever on a stalled server.
    r = requests.get(link, timeout=60)
    # Fail loudly instead of handing an error page to the zip reader.
    r.raise_for_status()
    # The context manager closes the archive once extraction is done.
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        z.extractall(out_dir)
        first_member = z.namelist()[0]
    file_path = out_dir/first_member
    return file_path

In [None]:
# Reads and transforms a grid data file
def read_hila(path):
    """Load a grid file and reduce it to volume, biomass and coordinates.

    Args:
        path: Path of the grid data file, as accepted by gpd.read_file.

    Returns:
        A frame with the columns 'volume', 'agbm' and 'coords'; the
        geometries are converted to EPSG:4326 before the coordinates are taken.
    """
    source_columns = [
        'volume',
        'geometry',
        'meandiameterpine',
        'meanheightpine',
        'stemcountpine',
        'meandiameterspruce',
        'meanheightspruce',
        'stemcountspruce',
        'meandiameterdeciduous',
        'meanheightdeciduous',
        'stemcountdeciduous',
    ]
    grid = gpd.read_file(path)[source_columns]
    grid = grid.to_crs(epsg=4326)

    # Coordinate lists for Earth Engine, then the per-row biomass estimate.
    grid['coords'] = grid['geometry'].map(conv_Polygon)
    grid['agbm'] = grid.apply(est_agbm, axis=1)

    return grid[['volume', 'agbm', 'coords']]

In [None]:
# Download links for all available map cells
all_links = get_hila_links()

You can visually select specific map cells here: https://tiedostopalvelu.maanmittauslaitos.fi/tp/kartta?lang=en

Press `+ Elevation model (2)` and `+ Elevation model 2 m`. This will show the names of the map cells, when you zoom close enough.

When you click a cell, its name will appear on the right, where you can copy it. Paste the names into the list below.

In [None]:
# Names of map cells to use; a download link is selected when it contains one of these names
cells = ['T4312C', 'Q4233E', 'N4243C', 'P5333E']

In [None]:
# Keep only the download links that contain one of the selected cell names
links = [link for link in all_links if any(cell in link for cell in cells)]
links

In [None]:
# Download every selected link into out_dir and remember the extracted file paths
out_dir = Path('hila-map')
out_dir.mkdir(exist_ok=True)
paths = []
for link in links:
    paths.append(download_hila(link, out_dir=out_dir))
paths

## Satellite Data Extraction
Join Sentinel images with the coordinates of forest grids and extract the band information.

In [None]:
# Bands for which statistics are extracted
bands = ['B2', 'B3', 'B4', 'B8']

One map cell takes ~2h to extract.

In [None]:
for path in paths:
    print(f'Extracting S2 data for {path.stem}')
    gdf = read_hila(path)

    # Submit one extraction job per band; leaving the with-block waits for all of them.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(extract_bands, gdf['coords'], band) for band in bands]

    # Futures are in the same order as bands, so they can be paired directly.
    for band, future in zip(bands, futures):
        stat_columns = [f'{band}_mean', f'{band}_std', f'{band}_max', f'{band}_min']
        gdf[stat_columns] = future.result()

    # One CSV per map cell, named after the input file.
    gdf.to_csv(f'{path.stem}.csv')