<!-- # Processing the final dataset
 -->


# Export data as CSV

In [2]:
import ee
import geemap
from gee_0_utils import *

# initialize() and ProjectConfig come from gee_0_utils (star import above);
# presumably initialize() sets up the Earth Engine session -- confirm there.
initialize()
config = ProjectConfig()

# Shared settings reused by the cells below.
roi, data_folder, last_year = config.roi, config.data_folder, config.last_year

## Data for all biomass and age data comparisons

In [None]:
# Build one multi-band image holding every age and biomass product to be
# compared, then export a biome-stratified sample of it as CSV.
biome = ee.Image(f"{data_folder}/categorical").select("biome")

# Ages
mapbiomas = ee.Image(f"{data_folder}/mapbiomas_{last_year}").rename(["age_mapbiomas"])
tmf = ee.Image(f"{data_folder}/tmf_{last_year}").rename(["age_tmf"])
# NOTE(review): this band (and the Heinrich one below) is pinned to 2020 rather
# than following `last_year` -- confirm that is intended.
silva = (
    ee.Image("projects/ee-regrowth/assets/MB_secondary_forest_Silva_Junior_v2/sforestAge_brazil_V8")
    .select("classification_2020")
    .rename("age_silva")
)

# Response variables
GEDI_L2A = ee.Image(f"{data_folder}/GEDI_L2A_{last_year}")
GEDI_L4A = ee.Image(f"{data_folder}/GEDI_L4A_{last_year}")
ESA_CCI = (
    ee.Image(f"projects/sat-io/open-datasets/ESA/ESA_CCI_AGB/CCI_BIOMASS_100m_AGB_{last_year}_v51")
    .select("AGB")
    .rename(f"ESA_CCI_{last_year}")
)
heinrich = (
    ee.Image("projects/ee-regrowth/assets/Heinrich_etal_2021_updates/sforestAGC_climate_only_v1_1")
    .select("classification_2020")
    .rename("heinrich_biomass_2020")
)
# NOTE(review): lang_height is defined here but is not added to `comparisons`.
lang_height = ee.Image('users/nlang/ETH_GlobalCanopyHeight_2020_10m_v1').clip(roi).rename('lang_height')

# Combine all images into a single multi-band image
comparisons = mapbiomas.addBands([tmf, silva, ESA_CCI, GEDI_L2A, GEDI_L4A, heinrich, biome])

# Keep only pixels where every band is non-zero, so each sampled row carries a
# value from all products being compared.
mask = comparisons.reduce(ee.Reducer.allNonZero())
masked_image = comparisons.updateMask(mask)

# Sample the masked image. Previously the mask above was computed but the
# unmasked `comparisons` image was sampled, so the mask had no effect.
comparisons_sampled = masked_image.stratifiedSample(numPoints=10000, classBand='biome')

export_csv(comparisons_sampled, "comparisons_sampled")

## Mature forest biomass comparisons

In [None]:
# Attach the biome band to the mature-forest biomass image so the sample can
# be stratified by biome, then export the sampled table.
mature_biomass = ee.Image(f"{data_folder}/mature_biomass").addBands(biome)
mature_biomass_sampled = mature_biomass.stratifiedSample(
    classBand='biome',
    numPoints=10000,
)
export_csv(mature_biomass_sampled, "mature_biomass_sampled")

## Field Data

In [None]:
# Field plots, from https://github.com/forc-db/GROA/tree/master/data
field_data = ee.FeatureCollection(f"{data_folder}/field_biomass")
# Biome polygons, reduced to the CD_Bioma attribute only.
biomes = ee.FeatureCollection(f"{data_folder}/raw/biomes_br").select('CD_Bioma')


def determine_biome(feature):
    """Return `feature` with a 'biome' property holding a CD_Bioma value.

    The value is read from the first element of `biomes` whose bounds
    intersect the feature's geometry.
    """
    # NOTE(review): if no biome polygon intersects the plot, first() has
    # nothing to return and the .get() below would presumably fail server-side
    # -- confirm every plot lies inside the biome layer.
    intersecting = biomes.filterBounds(feature.geometry())
    biome_code = intersecting.first().get('CD_Bioma')
    return feature.set('biome', biome_code)


field_biomass = field_data.map(determine_biome)

export_csv(field_biomass, "field_biomass")

## Main Model Dataset

### Age, Biomass

In [None]:
# Age and biomass
age = ee.Image(f"{data_folder}/mapbiomas_{last_year}")
biomass = ee.Image(f"{data_folder}/ESA_CCI_{last_year}")

# Fire: number of years (over config.range_1985_2020) with burned coverage > 0.
annual_burned = (
    ee.Image("projects/mapbiomas-public/assets/brazil/fire/collection3/mapbiomas_fire_collection3_annual_burned_coverage_v1")
    .select([f"burned_coverage_{year}" for year in config.range_1985_2020])
    .byte()
    .rename([str(year) for year in config.range_1985_2020])
)
burned_flags = annual_burned.gt(0)  # 1 where a year burned, 0 otherwise
# Pixels masked after the sum are filled with 0 fires.
fire = burned_flags.reduce('sum').rename("num_fires").unmask(0)

# Land cover: 1 where the `last_year` classification equals class code 6
# (per the variable name, the floodable-forest class).
lulc_last_year = (
    ee.Image("projects/mapbiomas-public/assets/brazil/lulc/collection9/mapbiomas_collection90_integration_v1")
    .select(f"classification_{last_year}")
)
floodable_forests = lulc_last_year.eq(6)

# Land-use history variants; each suffix names one exported asset.
suffixes = [
    "aggregated_all",
    "non_aggregated_all",
    "non_aggregated_15yr",
    "non_aggregated_10yr",
    "non_aggregated_5yr",
]

# One image per suffix, in the same order as `suffixes`.
land_use_list = [ee.Image(f"{data_folder}/land_use_{suffix}") for suffix in suffixes]

### Surrounding Landscape

In [None]:
# Landscape-context layers stored in the project data folder.
sur_cover = ee.Image(f"{data_folder}/sur_cover")
distance_to_forest_edge = ee.Image(f"{data_folder}/distance_to_forest_edge")

# This layer lives under a different asset root than `data_folder`.
quarters_ecoreg_biomass = ee.Image("projects/amazon-forest-regrowth/assets/quarters_ecoreg_biomass")

# Disabled alternative layer, kept for reference:
# nearest_mature = ee.Image(f"{data_folder}/mapbiomas/nearest_mature_biomass_image_neighborhood").rename("nearest_mature_biomass")



### Environmental

In [None]:
# Environmental predictor layers.
categorical = ee.Image(f"{data_folder}/categorical")

# Cast to 16-bit integers.
cwd = ee.Image(f"{data_folder}/raw/cwd_chave").int16()

topography = ee.Image("CSP/ERGo/1_0/Global/ALOS_landforms").rename("topography")  # 90m resolution

soil = ee.Image(f"{data_folder}/soilgrids")

# Bands kept from the yearly TerraClimate summary image.
climate_bands = [
    'mean_pr',
    'mean_srad',
    'mean_temp',
    'mean_vpd',
    'mean_soil',
    'mean_aet',
    'mean_si',
]
climate = ee.Image(f"{data_folder}/yearly_terraclim").select(climate_bands)  # 10,000m resolution


### Export Sampled

In [None]:
# Stratified samples, one per land-use variant, in the order of `land_use_list`.
sampled_data_list = []

for land_use in land_use_list:
    # Predictor bands stacked onto the age image for this variant.
    predictor_bands = [
        biomass, fire, floodable_forests, land_use,
        quarters_ecoreg_biomass, distance_to_forest_edge, sur_cover,
        cwd, categorical, topography, soil, climate,
    ]
    # Keep only pixels where both age and the variant's "last_lu" band are non-zero.
    valid_pixels = age.And(land_use.select("last_lu"))
    unified_data = age.addBands(predictor_bands).updateMask(valid_pixels)

    # Sample stratified by the 'biome' band, without point geometries.
    unified_data_sampled = unified_data.stratifiedSample(
        numPoints=15000,
        classBand='biome',
        geometries=False,
    )
    sampled_data_list.append(unified_data_sampled)

# Export each sample under the name of the variant it came from.
for suffix, sampled_data in zip(suffixes, sampled_data_list):
    export_csv(sampled_data, f"sampled_{suffix}")

### Export Tilewise

In [None]:
# Export every pixel of the unified image, tile by tile, as CSV files on
# Google Drive (folder "tiled_data").

# The tiled export covers a single land-use variant. Select it explicitly
# instead of relying on the `land_use` / `suffix` loop variables left behind by
# the "Export Sampled" cell; the last entries are the values those leftovers
# hold when the notebook is run top to bottom.
tile_land_use = land_use_list[-1]
tile_suffix = suffixes[-1]

unified_data = age.addBands([
    age.pixelLonLat().float().rename(['lon', 'lat']),
    biomass, fire, floodable_forests, tile_land_use,
    quarters_ecoreg_biomass, distance_to_forest_edge, sur_cover,
    cwd, categorical, topography, soil, climate,
]).updateMask(age)

# Columns to export: every band name, minus the bookkeeping names below
# should they ever appear among them.
to_remove = ['.geo', 'system:index']
all_properties = unified_data.bandNames().getInfo()
properties_to_export = [p for p in all_properties if p not in to_remove]

# Create a grid over the region of interest and fetch the tile IDs client-side
# so that one export task can be started per tile.
grid = roi.coveringGrid("EPSG:4326", 1000000)
tile_ids = grid.aggregate_array('system:index').getInfo()

# Tasks are numbered from 1, in the order the tile IDs were returned.
for count, feature_id in enumerate(tile_ids, start=1):
    feat = grid.filter(ee.Filter.eq('system:index', feature_id))

    unified_data_sampled = unified_data.sample(
        region=feat.geometry(),
        scale=100,
        geometries=False,
    )

    # Export task to Google Drive
    task = ee.batch.Export.table.toDrive(
        collection=unified_data_sampled,
        description=f'{tile_suffix}_{count}',
        fileFormat='CSV',
        selectors=properties_to_export,
        folder="tiled_data",
    )
    task.start()
    print(f'Started export task {count}')

In [None]:
# Spot check: sample the age image over the first grid tile, keeping point
# geometries so the result can be drawn on a map.
age = ee.Image(f"{data_folder}/mapbiomas_{last_year}")

# Create a grid over the region of interest (ROI)
grid = roi.coveringGrid("EPSG:4326", 1000000)
tile_ids = grid.aggregate_array('system:index').getInfo()
count = 0

feat = grid.first()

unified_data_sampled = age.sample(
    region=feat.geometry(),
    scale=100,
    geometries=True,
)

# Optional visual inspection (disabled):
# map = geemap.Map()
# map.addLayer(unified_data_sampled, {}, 'sampled')
# map.addLayer(age, {'min': 0, 'max': 34, 'palette': 'gray'}, 'age')
# map

Map(center=[0, 0], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(childr…

In [None]:



def keep_rare_lu_types(unified_data):
    """Mask `unified_data` to pixels where any listed lulc_sum_* band is non-zero.

    `unified_data` must contain all of the bands listed below. The returned
    image is `unified_data` with its mask updated by that any-non-zero test.
    """
    rare_lu_bands = [
        'lulc_sum_20', 'lulc_sum_21', 'lulc_sum_35',
        'lulc_sum_39', 'lulc_sum_40', 'lulc_sum_41',
        'lulc_sum_46', 'lulc_sum_48', 'lulc_sum_9',
    ]

    # Single-band image: non-zero where at least one selected band is non-zero.
    has_rare_lu = unified_data.select(rare_lu_bands).reduce(ee.Reducer.anyNonZero())

    return unified_data.updateMask(has_rare_lu)


