<!-- # Processing the final dataset
 -->


# Export data as CSV

In [22]:
import ee
import geemap
from gee_0_utils import *

# `initialize` and `ProjectConfig` come from the star import of gee_0_utils;
# presumably `initialize()` sets up the Earth Engine session — TODO confirm.
initialize()

# Project-wide settings reused by every cell below.
config = ProjectConfig()
roi, data_folder, last_year = config.roi, config.data_folder, config.last_year


## Main Model Dataset

### Age, Biomass

In [23]:
# Age layer (asset "mapbiomas_<last_year>"); its projection is used further
# down as the target grid when other layers are resampled.
age = ee.Image(f"{data_folder}/mapbiomas_{last_year}")

# ESA CCI above-ground biomass for the same year, exposed as band "biomass".
ESA_CCI = (
    ee.Image(f"projects/sat-io/open-datasets/ESA/ESA_CCI_AGB/CCI_BIOMASS_100m_AGB_{last_year}_v51")
    .select("AGB")
    .rename("biomass")
)

# Number of years whose burned coverage is > 0; unobserved pixels count as 0.
fire = (
    ee.Image("projects/mapbiomas-public/assets/brazil/fire/collection3/mapbiomas_fire_collection3_annual_burned_coverage_v1")
    .select([f"burned_coverage_{year}" for year in config.range_1985_2020])
    .byte()
    .rename([str(year) for year in config.range_1985_2020])
    .gt(0)
    .reduce('sum')
    .rename("num_fires")
    .unmask(0)
)

# 1 where the land-cover class of `last_year` equals 6, else 0.
# NOTE(review): class 6 is presumably "floodable forest" in the MapBiomas legend — confirm.
lulc_last_year = ee.Image(
    "projects/mapbiomas-public/assets/brazil/lulc/collection9/mapbiomas_collection90_integration_v1"
).select(f"classification_{last_year}")
floodable_forests = lulc_last_year.eq(6).rename("floodable_forests")

# Pre-computed area layers stored in the project folder.
pasture_area = ee.Image(f"{data_folder}/pasture_area")
secondary_area = ee.Image(f"{data_folder}/secondary_area")


### Surrounding Landscape

In [24]:
# Layers stored under the shared project assets.
ecoreg = ee.Image("projects/amazon-forest-regrowth/assets/ecoreg")
quarters_ecoreg_biomass = ee.Image("projects/amazon-forest-regrowth/assets/quarters_ecoreg_biomass")

# Layers stored in this run's data folder.
sur_cover = ee.Image(f"{data_folder}/sur_cover")
nearest_mature = ee.Image(f"{data_folder}/nearest_mature")
mature_biomass = ee.Image(f"{data_folder}/mature_biomass")
mature_biomass_10k = ee.Image(f"{data_folder}/mature_biomass_10k")

# Distance layer, exported under the band name "dist".
distance_deep_forest = ee.Image(f"{data_folder}/distance_deep_forest").rename("dist")
# Self-masked image that keeps only pixels whose distance value exceeds 1000.
distance_gt_1000 = distance_deep_forest.gt(1000).selfMask()

### Environmental

In [25]:
# Categorical predictors stored in the project folder (the notebook later
# selects a "biome" band from this same asset).
categorical = ee.Image(f"{data_folder}/categorical")

# ALOS landforms, exposed under the band name "topography".
topography = ee.Image("CSP/ERGo/1_0/Global/ALOS_landforms").rename("topography") # 90m resolution

# Soil predictors stored in the project folder.
soil = ee.Image(f"{data_folder}/soilgrids")

# Climate predictors stored in the project folder; the "mean_*" bands are
# selected from it when the predictor stack is built.
terraclim = ee.Image(f"{data_folder}/terraclim_1958_2019")

### Export CSV


In [12]:
def reduce_reproject(image, reducer):
    """Aggregate `image` with `reducer` and place it on the grid of `age`.

    Args:
        image: ee.Image to resample.
        reducer: ee.Reducer handed to `reduceResolution` (the notebook uses
            mean for continuous layers and first for categorical ones).

    Returns:
        The reduced image, reprojected to the CRS and nominal scale of the
        notebook-level `age` image.
    """
    target_projection = age.projection()
    aggregated = image.reduceResolution(reducer=reducer)
    return aggregated.reproject(
        crs=target_projection,
        scale=target_projection.nominalScale()
    )


# Continuous layers: aggregated with the mean reducer when resampled.
continuous_vars = [
    ESA_CCI.rename("biomass"),  # double
    nearest_mature.rename("nearest_mature"),  # double
    soil,  # float
    terraclim.select(["mean_soil", "mean_vpd", "mean_temp", "mean_def",
                      "mean_srad", "mean_pr", "mean_aet", "mean_pdsi"]),  # float and int16
]

# Categorical layers: resampled with the "first" reducer so class codes are
# never averaged.
categorical_vars = [
    fire,         # int64
    ecoreg,       # int16
    categorical,  # int8
    topography,   # int8
]

continuous_resampled = [
    reduce_reproject(layer, ee.Reducer.mean()) for layer in continuous_vars
]
categorical_resampled = [
    reduce_reproject(layer, ee.Reducer.first()) for layer in categorical_vars
]

# Bands shared by the pasture and the secondary-forest datasets.
unified_base = ee.Image.cat(
    [
        age,                   # int8
        floodable_forests,
        distance_deep_forest,  # int16
        sur_cover,             # float
    ]
    + continuous_resampled
    + categorical_resampled
    + [ee.Image.pixelLonLat().rename(['lon', 'lat'])]
)

# Pasture dataset: shared bands plus the pasture area band.
unified_data_pasture = ee.Image.cat([
    unified_base,
    pasture_area.rename('pasture_area')
])

# Main dataset: shared bands plus secondary area and the quarter/ecoregion
# biomass bands.
unified_data = ee.Image.cat([
    unified_base,
    secondary_area,  # double
    quarters_ecoreg_biomass
])

In [27]:
# Predictor stack pinned to the grid of the `age` layer.
# NOTE(review): despite the name, this uses age's projection, not a UTM CRS.
image_utm = unified_data.reproject(
    crs=age.projection(),
    scale=age.projection().nominalScale()
)

# Mean-aggregated biomass on the age grid, built with the notebook's own
# resampling helper instead of a hand-copied reduceResolution/reproject chain.
averaged_biomass = reduce_reproject(ESA_CCI, ee.Reducer.mean()).rename("biomass")

grid_10k_secondary = ee.FeatureCollection(f"{data_folder}/grid_10k_secondary")

# Amazon biome geometry. It must be defined before the UTM filter below:
# previously this cell relied on `amazon` leaking in from a cell further down,
# which raises NameError on Restart & Run All.
amazon = ee.FeatureCollection(
    "projects/amazon-forest-regrowth/assets/raw/biomes_br"
).filter(ee.Filter.eq('CD_Bioma', 1)).geometry()

# UTM zones grid, restricted to zones that intersect the biome
utm_zones = ee.FeatureCollection(
    "projects/amazon-forest-regrowth/assets/raw/UTM_grid"
).filterBounds(amazon)

# Named `layer_map` rather than `map` so the Python builtin is not shadowed
# for the rest of the kernel session.
layer_map = geemap.Map()
# layer_map.addLayer(unified_data.select("biomass"), {'min':30, 'max':400, 'palette': ['blue', 'red']}, "Unified Data")
# layer_map.addLayer(image_utm.select("biomass"), {'min':30, 'max':400, 'palette': ['blue', 'red']}, "Unified Data UTM")
# layer_map.addLayer(averaged_biomass, {'min':30, 'max':400, 'palette': ['blue', 'red']}, "Averaged Biomass")
layer_map.addLayer(grid_10k_secondary, {}, "Grid 10k Secondary")
layer_map.addLayer(utm_zones, {}, "UTM Zones")
layer_map.addLayer(age, {}, "Age")
layer_map

Map(center=[0, 0], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(childr…

In [None]:
# ----------------
# INPUTS
# ----------------

# Amazon biome: the features of the biomes asset whose CD_Bioma equals 1,
# collapsed into a single geometry.
biomes_br = ee.FeatureCollection(
    "projects/amazon-forest-regrowth/assets/raw/biomes_br"
)
amazon = biomes_br.filter(ee.Filter.eq('CD_Bioma', 1)).geometry()

# UTM zones grid, limited to the zones that intersect the biome geometry.
utm_grid = ee.FeatureCollection(
    "projects/amazon-forest-regrowth/assets/raw/UTM_grid"
)
utm_zones = utm_grid.filterBounds(amazon)


In [15]:
# Grid of cells covering the Amazon geometry in age's projection, with
# scale 100000 (presumably metres, i.e. 100 km cells — confirm against the CRS units).
grid = amazon.coveringGrid(proj=age.projection(), scale = 100000)

In [16]:
# ----------------
# ADD EPSG CODE TO ZONES
# ----------------
def add_epsg_code(feature):
    """Return `feature` with an 'EPSG' property derived from its UTM zone.

    Reads the 'ZONE' (number) and 'ROW_' (letter) properties. Rows N-X give
    32600 + zone; every other row gives 32700 + zone.
    """
    northern_rows = ee.List(['N','P','Q','R','S','T','U','V','W','X'])
    zone_number = ee.Number(feature.get('ZONE'))
    row = ee.String(feature.get('ROW_'))

    # Server-side branch: a Python `if` cannot inspect an ee value.
    north_code = ee.Number(32600).add(zone_number)  # North
    south_code = ee.Number(32700).add(zone_number)  # South
    epsg = ee.Number(
        ee.Algorithms.If(northern_rows.contains(row), north_code, south_code)
    )
    return feature.set('EPSG', epsg)

# Attach the 'EPSG' property to every UTM zone feature.
utm_with_epsg = utm_zones.map(add_epsg_code)

In [17]:
def sample_pixels_per_cell(image, grid):
    """Run a stratified sample of `image` inside every cell of `grid`.

    Args:
        image: ee.Image to sample; it must contain a 'biome' band, which is
            used as the stratification class band.
        grid: ee.FeatureCollection of cells; each cell's geometry is one
            sampling region.

    Returns:
        A flattened ee.FeatureCollection with up to 1000 points per class per
        cell, without geometries and without samples containing null bands.
    """
    pixel_scale = image.projection().nominalScale()

    def sample_cell(cell):
        return image.stratifiedSample(
            numPoints=1000,
            classBand='biome',
            region=cell.geometry(),
            scale=pixel_scale,
            geometries=False,
            dropNulls=True,
            tileScale=2
        )

    per_cell_samples = grid.map(sample_cell)
    return per_cell_samples.flatten()

In [None]:
# ----------------
# FUNCTIONS
# ----------------

def create_grid_and_export(utm_zone, side_km = None):
    """Sample `unified_data` inside one UTM zone and export the samples as CSV.

    Args:
        utm_zone: ee.Feature with 'ROW_' and 'ZONE' properties; its geometry
            bounds the sampling.
        side_km: if None, one stratified sample is drawn over the whole zone.
            Otherwise the zone is tiled into cells of `side_km` * 1000 (scale
            units of the grid projection) and each cell that intersects the
            Amazon geometry is sampled separately.

    Side effects:
        Starts a Drive export task named "<ROW_><ZONE>_stratified" or
        "<ROW_><ZONE>_<side_km>k" and prints its description.
    """
    # Client-side round trips, needed to build the task description.
    # The zone's 'EPSG' property used to be fetched here as well but was never
    # used, so that extra blocking getInfo() call was dropped.
    row = utm_zone.get('ROW_').getInfo()
    zone = utm_zone.get('ZONE').getInfo()

    # Clip to the zone and pin the image to the grid of `age`.
    # NOTE(review): this reprojects to age's projection, not to the zone's
    # UTM CRS — confirm whether the zone's EPSG code was meant to be used here.
    image_clip = unified_data.clip(utm_zone.geometry())
    image_utm = image_clip.reproject(
        crs=age.projection(),
        scale=age.projection().nominalScale()
    )

    if side_km is None:
        sampled_fc = image_utm.stratifiedSample(
            numPoints=1000,
            classBand='biome',
            region=utm_zone.geometry(),
            scale=image_utm.projection().nominalScale(),
            geometries=False,
            dropNulls=True,
            tileScale=2
        )
        suffix = "stratified"
    else:
        # Tile the zone in the image's projection, keeping only Amazon cells.
        grid = utm_zone.geometry().coveringGrid(
            image_utm.projection(), scale = side_km * 1000
        ).filterBounds(amazon)

        sampled_fc = sample_pixels_per_cell(image_utm, grid)
        # No leading underscore: the separator is added once when `desc` is
        # built (the old suffix produced names like "M18__100k").
        suffix = f"{side_km}k"

    # Export one column per band; 'system:index' and '.geo' are excluded.
    property_names = image_utm.bandNames().getInfo()
    filtered_properties = [p for p in property_names if p not in ['system:index', '.geo']]

    desc = f"{row}{zone}_{suffix}"
    task = ee.batch.Export.table.toDrive(
        collection=sampled_fc,
        description=desc,
        fileFormat="CSV",
        selectors=filtered_properties
    )
    task.start()

    print(f"Export started: {desc}")

# ----------------
# LOOP OVER ZONES
# ----------------
# One client-side count drives both the server-side list and the Python loop.
n_zones = utm_with_epsg.size().getInfo()
zones_list = utm_with_epsg.toList(n_zones)

for i in range(n_zones):
    zone = ee.Feature(zones_list.get(i))
    create_grid_and_export(zone, side_km = 100)


['age', 'floodable_forests', 'dist', 'sur_cover', 'biomass', 'nearest_mature', 'cec', 'clay', 'cfvo', 'nitro', 'ocd', 'ocs', 'phh2o', 'sand', 'soc', 'mean_soil', 'mean_vpd', 'mean_temp', 'mean_def', 'mean_srad', 'mean_pr', 'mean_aet', 'mean_pdsi', 'num_fires', 'ecoreg', 'biome', 'indig', 'protec', 'topography', 'lon', 'lat', 'secondary_area', 'quarter_biomass', 'quarter', 'ecoreg_biomass']
['age', 'floodable_forests', 'dist', 'sur_cover', 'biomass', 'nearest_mature', 'cec', 'clay', 'cfvo', 'nitro', 'ocd', 'ocs', 'phh2o', 'sand', 'soc', 'mean_soil', 'mean_vpd', 'mean_temp', 'mean_def', 'mean_srad', 'mean_pr', 'mean_aet', 'mean_pdsi', 'num_fires', 'ecoreg', 'biome', 'indig', 'protec', 'topography', 'lon', 'lat', 'secondary_area', 'quarter_biomass', 'quarter', 'ecoreg_biomass']
Export started: M18__100k
['age', 'floodable_forests', 'dist', 'sur_cover', 'biomass', 'nearest_mature', 'cec', 'clay', 'cfvo', 'nitro', 'ocd', 'ocs', 'phh2o', 'sand', 'soc', 'mean_soil', 'mean_vpd', 'mean_temp', '

Extracting every point in a single request for the entire Amazon at 1 km resolution is too heavy, so we export the data in chunks:

In [7]:
def chunk_offsets(total_features, n_chunks):
    """Split `total_features` items into at most `n_chunks` contiguous chunks.

    Args:
        total_features: number of items to cover (int >= 0).
        n_chunks: maximum number of chunks (int >= 1).

    Returns:
        List of (start, size) tuples that together cover every item exactly
        once. Empty when there is nothing to cover.

    Raises:
        ValueError: if `n_chunks` is smaller than 1.
    """
    if n_chunks < 1:
        raise ValueError("n_chunks must be at least 1")
    if total_features <= 0:
        return []
    # Ceiling division: rounding down would leave the remainder unexported
    # (e.g. 100 features in 60 chunks would cover only 60 features).
    chunk_size = -(-total_features // n_chunks)
    return [
        (start, min(chunk_size, total_features - start))
        for start in range(0, total_features, chunk_size)
    ]


def export_csv(name, image, grid_size, n_chunks = 1, lu_name = None):
    """Extract `image` at the features of a stored grid and export CSV chunks.

    Args:
        name: grid name; the grid asset read is
            "<data_folder>/grid_<grid_size>k_amazon_<name>".
        image: ee.Image whose band values are extracted at every grid feature.
        grid_size: grid size used in the asset name and task description.
        n_chunks: maximum number of export tasks to split the grid into.
        lu_name: optional label appended to `name` in the task description
            (it does not change which grid asset is read).

    Side effects:
        Starts one Drive export task per chunk, named
        "grid_<grid_size>k_<name>_<chunk_index>".
    """
    properties_to_export = image.bandNames().getInfo()
    selectors = [p for p in properties_to_export if p not in ['system:index', '.geo']]

    # The grid is resolved before `name` is extended with the land-use label.
    grid = ee.FeatureCollection(f"{data_folder}/grid_{grid_size}k_amazon_{name}")
    total_features = grid.size().getInfo()

    if lu_name:
        name = f"{name}_{lu_name}"

    for chunk_index, (start, size) in enumerate(chunk_offsets(total_features, n_chunks)):
        selected_pixels = ee.FeatureCollection(grid.toList(size, start))

        unified_fc = image.reduceRegions(selected_pixels, ee.Reducer.first(), 30)

        task = ee.batch.Export.table.toDrive(
            collection = unified_fc,
            description = f"grid_{grid_size}k_{name}_{chunk_index}",
            fileFormat = "CSV",
            selectors = selectors
        )
        task.start()

# Pasture predictors on the 1k grid, split into 60 export tasks.
export_csv(
    "pastureland",
    unified_data_pasture,
    grid_size=1,
    n_chunks=60,
)
# export_csv("secondary", unified_data_secondary, 1, n_chunks = 30)
# export_csv("secondary", unified_data_secondary, 10, n_chunks = 30)


In [None]:
# Land-use aggregation variants; each has a "land_use_<suffix>" asset.
suffixes = [
    "aggregated_all",
    "non_aggregated_all",
    "non_aggregated_10yr",
    "non_aggregated_5yr",
]

# The export call below is commented out, so only the image built for the
# last suffix survives this loop.
for suffix in suffixes:
    land_use = ee.Image(f"{data_folder}/land_use_{suffix}")
    combined_image = ee.Image.cat([unified_data, land_use])

# export_csv("secondary", combined_image, 10, n_chunks=30, lu_name=suffix)


## Different land use aggregations


## Field Data

In [None]:
# Field plots at which the predictors are extracted.
field_data = ee.FeatureCollection(f"{data_folder}/field")

# Continuous predictors for the field data (ESA_CCI is left out on purpose).
continuous_vars = [
    nearest_mature.rename("nearest_mature"),
    soil,
    terraclim.select(["mean_soil", "mean_vpd", "mean_temp", "mean_def",
                      "mean_srad", "mean_pr", "mean_aet", "mean_pdsi"]),
]

# `categorical_vars` comes from the main-dataset cell above.
unified_data_field = ee.Image.cat([
    floodable_forests,
    distance_deep_forest,
    sur_cover,
    continuous_vars,  # Exclude ESA_CCI from field data
    categorical_vars,
    ee.Image.pixelLonLat().rename(['lon', 'lat'])
])

# Take the first pixel value of every band at each field feature (scale 30).
unified_fc = unified_data_field.reduceRegions(field_data, ee.Reducer.first(), 30)

# Exported columns: every band plus four properties selected by name, which
# are presumably carried by the field features themselves — TODO confirm.
properties_to_export = (
    unified_data_field.bandNames().getInfo()
    + ["age", "site_id", "plot_id", "field_biom"]
)

# Export task to Google Drive
task = ee.batch.Export.table.toDrive(
    collection = unified_fc,
    description = 'field_predictors',
    fileFormat = "CSV",
    selectors = [p for p in properties_to_export if p not in ['system:index', '.geo']]
)
task.start()

In [None]:
# Mature-forest biomass layers stored in the project folder.
mature_biomass = ee.Image(f"{data_folder}/mature_biomass")
mature_biomass_area_2020 = ee.Image(f"{data_folder}/mature_biomass_area_2020")


# Sum every pixel of `mature_biomass` over the image's own footprint.
# NOTE(review): bestEffort=True lets Earth Engine fall back to a coarser scale
# when the region holds more than maxPixels pixels; a plain sum of pixel values
# then depends on the scale actually used — confirm the printed total is the
# intended quantity.
total_biomass = mature_biomass.reduceRegion(
    reducer=ee.Reducer.sum(),
    geometry=mature_biomass.geometry(),
    scale=mature_biomass.projection().nominalScale(),
    maxPixels=1e9,
    bestEffort=True
).getInfo()

print(f"Total biomass: {total_biomass}")

Total biomass: {'mature_biomass': 19499281737}


In [16]:
# Biome mask: 1 where the 'biome' band of the categorical asset equals 1.
biomes = ee.Image(f"{data_folder}/categorical").select("biome")
biomes_mask = biomes.eq(1).rename("biome_mask")

# Yearly land-cover classification, one byte band per year named "<year>".
classification_bands = [f"classification_{year}" for year in config.range_1985_2020]
year_band_names = [str(year) for year in config.range_1985_2020]
lulc = (
    ee.Image("projects/mapbiomas-public/assets/brazil/lulc/collection9/mapbiomas_collection90_integration_v1")
    .select(classification_bands)
    .byte()
    .rename(year_band_names)
)

# Pixels classified as class 3 in every year, restricted to the biome mask.
# NOTE(review): class 3 is presumably "forest formation" in the MapBiomas legend — confirm.
mature_mask = (
    lulc.eq(3)
    .reduce(ee.Reducer.allNonZero())
    .unmask(0)
    .updateMask(biomes_mask)
)

# Sum the area of the selected pixels over the footprint of `mature_biomass`
# (defined in the previous cell).
pixel_area = ee.Image.pixelArea()
valid_pixels = mature_mask
area_image = pixel_area.updateMask(valid_pixels)
covered_area = area_image.reduceRegion(
    reducer=ee.Reducer.sum(),
    geometry=mature_biomass.geometry(),
    scale=mature_biomass.projection().nominalScale(),
    maxPixels=1e9,
    bestEffort=True
).getInfo()

# m² -> million hectares: 1 ha = 1e4 m², so 1 million ha = 1e10 m².
area_million_hectares = covered_area['area'] / 1e10

print(f"Covered area: {area_million_hectares:.2f} million hectares")

Covered area: 422.36 million hectares


In [17]:
# Biome mask: 1 where the 'biome' band of the categorical asset equals 1.
biomes = ee.Image(f"{data_folder}/categorical").select("biome")
biomes_mask = biomes.eq(1).rename("biome_mask")

# Yearly land-cover classification, one byte band per year named "<year>".
lulc_all_years = ee.Image(
    "projects/mapbiomas-public/assets/brazil/lulc/collection9/mapbiomas_collection90_integration_v1"
)
lulc = (
    lulc_all_years
    .select([f"classification_{year}" for year in config.range_1985_2020])
    .byte()
    .rename([str(year) for year in config.range_1985_2020])
)

# Pixels classified as class 15 in every year, restricted to the biome mask.
# NOTE(review): the variable is still called `mature_mask`, but class 15 is
# presumably "pasture" in the MapBiomas legend — confirm and consider renaming.
always_class_15 = lulc.eq(15).reduce(ee.Reducer.allNonZero())
mature_mask = always_class_15.unmask(0).updateMask(biomes_mask)

# Sum the area of the selected pixels over the footprint of `mature_biomass`
# (defined in an earlier cell).
pixel_area = ee.Image.pixelArea()
valid_pixels = mature_mask
covered_area = (
    pixel_area
    .updateMask(valid_pixels)
    .reduceRegion(
        reducer=ee.Reducer.sum(),
        geometry=mature_biomass.geometry(),
        scale=mature_biomass.projection().nominalScale(),
        maxPixels=1e9,
        bestEffort=True
    )
    .getInfo()
)

# m² -> million hectares: 1 ha = 1e4 m², so 1 million ha = 1e10 m².
area_million_hectares = covered_area['area'] / 1e10

print(f"Covered area: {area_million_hectares:.2f} million hectares")

KeyboardInterrupt: 