# Processing the final dataset



In [2]:
import ee
import geemap
from gee_0_utils import *

# Set up the session and read the shared project settings.
# NOTE(review): initialize() and ProjectConfig are presumably provided by the
# `gee_0_utils` star import above — confirm.
initialize()
config = ProjectConfig()
roi = config.roi  # used below to clip the soil layers and to bound sampling
data_folder = config.data_folder  # prefix of the ee.Image(...) paths built below

In [2]:
def keep_rare_lu_types(unified_data):
    """Mask out pixels where none of the listed land-use bands is non-zero.

    Args:
        unified_data: image containing the ``lulc_sum_*`` bands listed below.

    Returns:
        ``unified_data`` with its mask updated so that only pixels holding a
        non-zero value in at least one of those bands remain unmasked.
    """
    lu_codes = (20, 21, 35, 39, 40, 41, 46, 48, 9)
    lu_bands = [f'lulc_sum_{code}' for code in lu_codes]

    # Collapse the selected bands into a single band that is non-zero
    # wherever any of them is non-zero.
    lu_subset = unified_data.select(lu_bands)
    any_lu_present = lu_subset.reduce(ee.Reducer.anyNonZero())

    return unified_data.updateMask(any_lu_present)

# Grid-based sampling, to try and reduce some level of spatial autocorrelation:
def sample_by_grid(unified_data):
    """Keep at most one sampled feature per cell of a grid covering the image.

    Args:
        unified_data: image with a ``biome`` band, which is used as the class
            band of the stratified sample.

    Returns:
        The covering grid mapped to one sampled feature per cell; cells for
        which the mapped function yields null are dropped.
    """
    footprint = unified_data.geometry()
    cells = footprint.coveringGrid(footprint.projection(), 1000)

    def first_sample_in_cell(cell):
        # Stratified sample restricted to this cell (one point per biome
        # class); only the first resulting feature is kept.
        # NOTE(review): the sampling scale (50000) is far larger than the grid
        # size passed to coveringGrid (1000) — confirm this is intended.
        samples_in_cell = unified_data.stratifiedSample(
            numPoints = 1,
            classBand = 'biome',
            region = cell.geometry(),
            scale = 50000,
        )
        return samples_in_cell.first()

    return cells.map(first_sample_in_cell, dropNulls = True)



In [3]:
# SoilGrids (ISRIC) mean soil-property images. Every layer is clipped to the
# region of interest so that the bands stacked further down share one footprint.
nitro = ee.Image("projects/soilgrids-isric/nitrogen_mean").clip(roi)
cec = ee.Image("projects/soilgrids-isric/cec_mean").clip(roi) # cation exchange capacity
clay = ee.Image("projects/soilgrids-isric/clay_mean").clip(roi)
sand = ee.Image("projects/soilgrids-isric/sand_mean").clip(roi)
soc = ee.Image("projects/soilgrids-isric/soc_mean").clip(roi) # soil organic carbon
phh2o = ee.Image("projects/soilgrids-isric/phh2o_mean").clip(roi)
ocd = ee.Image("projects/soilgrids-isric/ocd_mean").clip(roi) # organic carbon density

# Collapse the three shallowest depth layers of a soil image into one band
def sum_specific_depths(image, name):
    """Sum the 0-5cm, 5-15cm and 15-30cm ``*_mean`` bands of ``image``.

    Args:
        image: image whose band names end in ``_<depth>_mean``.
        name: name given to the single output band.

    Returns:
        A one-band image holding the per-pixel sum of the matched bands.
    """
    depth_patterns = []
    for depth in ('0-5cm', '15-30cm', '5-15cm'):
        # Regex band selector: any prefix, then this depth label.
        depth_patterns.append(f'.*_{depth}_mean')

    return image.select(depth_patterns).reduce(ee.Reducer.sum()).rename(name)

# Reduce every soil layer to a single depth-summed band with a short name.
# The inner tuple is built from the current (multi-band) images before any
# of the names on the left is rebound.
nitro, cec, clay, sand, soc, phh2o, ocd = (
    sum_specific_depths(layer, band_name)
    for layer, band_name in (
        (nitro, "nitro"),
        (cec, "cat_exch_cap"),
        (clay, "clay"),
        (sand, "sand"),
        (soc, "soc"),
        (phh2o, "phh2o"),
        (ocd, "org_c_dens"),
    )
)

# Stack the seven single-band images into one image, nitro first.
soil_properties = nitro.addBands([cec, clay, sand, soc, phh2o, ocd])

# export_image(soil_properties, "soilgrids")

In [5]:

def export_csv(method, year = None):
    """Build the unified image for ``method``, sample it and export the sample as CSV.

    The image stacks the secondary-forest ``age``/``agbd`` bands with the
    nearest-mature-forest, surrounding-cover, CWD, categorical, soil,
    topography and climate layers, masked to pixels that have a nearest-mature
    value, pass the distance-to-border mask and have ``biome`` < 5.
    For ``method == "mapbiomas"`` the land-use and pre-regrowth fire bands are
    added and pixels that burned after regrowth, or fail the one-hectare mask,
    are masked out.

    Reads the module-level names ``data_folder`` and (for "mapbiomas")
    ``aggregate_LU``.

    Args:
        method: sub-folder of ``data_folder`` to read the method-specific
            assets from. ``"mapbiomas"`` enables the land-use/fire branch; any
            other value is exported under the description ``"eu"``.
        year: only used for "mapbiomas". A non-zero value selects the
            ``land_use_<...>_<year>yr`` asset; ``None`` or ``0`` selects the
            ``land_use_<...>_all`` asset.

    Returns:
        The masked multi-band image the sample was drawn from. As a side
        effect a Drive CSV export task is started.
    """
    secondary = ee.Image(f"{data_folder}/{method}/secondary")
    agbd = secondary.select('agbd').toInt16()
    secondary = secondary.addBands(agbd, overwrite = True)
    secondary = secondary.select(['age', 'agbd'])

    nearest_mature = ee.Image(f"{data_folder}/{method}/nearest_mature_biomass_image").rename("nearest_mature_biomass")
    distance_to_nearest_mature = ee.Image(f"{data_folder}/{method}/distance_to_nearest_mature")

    cwd = ee.Image(f"{data_folder}/raw/cwd_chave").int16()
    sur_cover = ee.Image(f"{data_folder}/{method}/sur_cover").float()
    categorical = ee.Image(f"{data_folder}/categorical").select(["biome", "ecoreg", "protec", "indig"])
    topography = ee.Image("CSP/ERGo/1_0/Global/ALOS_landforms").rename("topography")
    soil = ee.Image(f"{data_folder}/soilgrids").select(['nitro', 'sand', 'phh2o']) # drop multicollinear bands
    # climate = ee.Image(f"{data_folder}/ERA5_L").select('.*mean.*')
    climate = ee.Image(f"{data_folder}/yearly_terraclim").select('.*mean.*')

    # Keep pixels that have a nearest-mature value, pass the border-distance
    # mask and belong to a biome with code below 5.
    distance_to_border_mask = ee.Image(f"{data_folder}/distance_to_border_mask").byte()
    combined_mask = nearest_mature.mask().And(distance_to_border_mask)\
        .And(categorical.select("biome").lt(5))

    unified_data = (
        secondary.addBands(
            [nearest_mature, sur_cover, cwd, #yearly_mean_prec, yearly_SI,
            distance_to_nearest_mature, categorical,
            soil, topography, climate
        ]).updateMask(combined_mask)
    )

    if method == "mapbiomas":
        suffix = "aggregated" if aggregate_LU else "non_aggregated"

        # Truthiness covers both the default (None) and an explicit 0, so an
        # omitted year selects the "_all" asset instead of building "_Noneyr".
        if year:
            suffix += f"_{year}yr"
        else:
            suffix += "_all"

        one_hectare_mask = ee.Image(f"{data_folder}/mapbiomas/one_hectare_mask").byte()
        land_use_years = ee.Image(f"{data_folder}/{method}/land_use_{suffix}")
        num_fires = ee.Image(f"{data_folder}/{method}/num_fires")
        unburned_mask = num_fires.select("num_fires_after_regrowth").eq(0)

        # Drop the two excluded land-use bands; 'item' addresses the elements
        # of the server-side list of band names.
        excluded_bands = ['lulc_sum_62', 'lulc_sum_47']
        kept_bands = land_use_years.bandNames().filter(
            ee.Filter.Not(ee.Filter.inList('item', excluded_bands))
        )
        land_use_years = land_use_years.select(kept_bands)

        unified_data = unified_data.addBands(
            [land_use_years, num_fires.select("num_fires_before_regrowth")]
        ).updateMask(unburned_mask.And(one_hectare_mask))
    else:
        suffix = "eu"

    # unified_data = keep_rare_lu_types(unified_data)

    unified_data_sampled = unified_data.stratifiedSample(
        numPoints = 40000, classBand = 'biome', geometries = False
    )

    # Export one column per band (client-side list, hence getInfo), skipping
    # the feature bookkeeping names should they ever appear.
    to_remove = ['.geo', 'system:index']
    all_properties = unified_data.bandNames().getInfo()
    properties_to_export = [prop for prop in all_properties if prop not in to_remove]

    task = ee.batch.Export.table.toDrive(
        collection = unified_data_sampled,
        description = suffix,
        fileFormat = 'CSV',
        selectors = properties_to_export
    )
    task.start()

    # task = ee.batch.Export.table.toAsset(
    #     collection = unified_data_sampled, description = f"{suffix}_toAsset", \
    #     assetId = f"{data_folder}/{method}/{suffix}"
    # )
    # task.start()

    return unified_data


# Module-level switch read inside export_csv: False selects the
# "non_aggregated" land-use assets and export names.
aggregate_LU = False
# Start one MapBiomas export per year value (suffixes "_5yr", "_10yr", "_15yr").
export_csv("mapbiomas", 5)
export_csv("mapbiomas", 10)
export_csv("mapbiomas", 15)
# export_csv("eu")

In [None]:
# Export a sample that pairs the "mapbiomas" and "eu" secondary-forest ages on
# the pixels where both products report an age.
secondary_mapbiomas = ee.Image(f"{data_folder}/mapbiomas/secondary")
secondary_eu = ee.Image(f"{data_folder}/eu/secondary")

# Mask each product by the other's age band (zero or masked age => masked),
# so both end up restricted to their common pixels.
secondary_mapbiomas = secondary_mapbiomas.updateMask(secondary_eu.select("age"))
secondary_eu = secondary_eu.updateMask(secondary_mapbiomas.select("age"))
secondary_mapbiomas = secondary_mapbiomas.updateMask(secondary_eu.select("age"))

categorical = ee.Image(f"{data_folder}/categorical")
# Per-pixel membership test for biomes 1, 4 and 6: remap maps the listed
# values to 1 and everything else to 0 (ee.Image has no inList method).
biome_mask = categorical.select('biome').remap([1, 4, 6], [1, 1, 1], 0)
categorical = categorical.updateMask(biome_mask)
cwd = ee.Image(f"{data_folder}/raw/cwd_chave")

# "age" is the mapbiomas age; "age_eu" must come from the eu product so the
# two estimates can be compared side by side in the export.
unified_data = secondary_mapbiomas.addBands([categorical, cwd, \
                                             secondary_eu.select("age").rename("age_eu")])\
                                                .updateMask(secondary_mapbiomas.select("age"))

unified_data_sampled = unified_data.stratifiedSample(
    numPoints = 10000, classBand = 'biome', region = roi
)

task = ee.batch.Export.table.toDrive(
    collection = unified_data_sampled, description = "mapbiomas_eu", fileFormat = 'CSV'
)
task.start()
