In [None]:
import geopandas as gpd
import rasterio
from rasterio.features import rasterize
import numpy as np
from typing import Tuple, Dict
import os
from tqdm import tqdm
import pandas as pd
from utils import greco_regions_fr_en 

def create_value_maps(species: gpd.GeoDataFrame,
                      columns: Tuple[str, ...] = ('specie_en', 'genus_en', 'phen_en', 'year', 'source')
                      ) -> Dict[str, Dict[str, int]]:
    """
    Builds, for every requested column, a mapping from its distinct values to integer codes.

    Codes start at 1 and follow the order in which values first appear in the column
    (the order returned by `Series.unique()`), which leaves 0 available as a background value.
    Missing values, if the column contains any, are returned by `unique()` and therefore
    receive a code like any other value.

    Args:
    - species: Table holding the categorical columns to encode.
    - columns: Names of the columns to encode. Defaults to the five attribute columns
      that were previously hard-coded in this function.

    Returns:
    - A dictionary {column name: {value: integer code}}, in the order of `columns`.

    Raises:
    - KeyError: if one or more of `columns` is absent from `species`.
    """
    # Report every missing column at once instead of failing on the first lookup.
    missing = [column for column in columns if column not in species.columns]
    if missing:
        raise KeyError(f"Columns missing from the input table: {missing}")

    return {
        column: {value: code for code, value in enumerate(species[column].unique(), start=1)}
        for column in columns
    }

def _drop_unusable_geometries(frame):
    """Returns the rows of `frame` whose geometry is neither missing nor empty."""
    geometry = frame.geometry
    return frame[~(geometry.is_empty | geometry.isna())]


def process_tile(tile: gpd.GeoDataFrame, 
                 dict_species: Dict[str, gpd.GeoDataFrame], 
                 value_maps: Dict[str, Dict[str, int]], 
                 transform: rasterio.Affine, 
                 shape: Tuple[int, int], 
                 crs: str, 
                 perc: float, 
                 dict_bdforet: Dict[str, gpd.GeoDataFrame],
                 perc_threshold: float = 10,
                 bdforet_buffer: float = -100) -> np.ndarray:
    """
    Clips the species layer to one tile and rasterizes one band per column of `value_maps`.

    When `perc` is below `perc_threshold`, or when no species polygon falls inside the tile,
    the `bdforet` layer is clipped as well, buffered by `bdforet_buffer`, and the parts that do
    not overlap the species polygons are appended to the species rows before rasterizing.

    Args:
    - tile: GeoDataFrame whose geometry is used as the clipping mask.
    - dict_species: Species GeoDataFrames keyed by `str(crs)`.
    - value_maps: {column name: {value: integer code}}; one output band is produced per entry,
      in the dictionary's iteration order.
    - transform: The affine transform for rasterization.
    - shape: The (rows, cols) shape of each output band.
    - crs: CRS of the target raster; `str(crs)` selects the entry used in both dictionaries.
    - perc: A percentage value determining whether `bdforet` is processed.
    - dict_bdforet: `bdforet` GeoDataFrames keyed by `str(crs)`.
    - perc_threshold: `bdforet` is added when `perc` is strictly below this value.
    - bdforet_buffer: Distance passed to `buffer()` on the clipped `bdforet` polygons, in CRS
      units (negative values shrink). The default was described as 100 m in the original
      code, which assumes a metre-based CRS — TODO confirm for every raster CRS in use.

    Returns:
    - A uint8 array of shape (len(value_maps), *shape); cells not covered by any polygon are 0.

    Raises:
    - KeyError: if `str(crs)` is missing from one of the dictionaries, or if a column value
      has no code in its value map.
    """
    crs_key = str(crs)
    rasters = np.zeros((len(value_maps), *shape), dtype=np.uint8)

    clipped_species = gpd.clip(dict_species[crs_key], tile.geometry)

    if perc < perc_threshold or clipped_species.empty:
        clipped_bdforet = gpd.clip(dict_bdforet[crs_key], tile.geometry).copy()
        # Shrink the polygons to account for inaccuracies on their edges.
        clipped_bdforet.geometry = clipped_bdforet.geometry.buffer(bdforet_buffer)
        # A negative buffer erases polygons narrower than twice the distance; drop them.
        clipped_bdforet = _drop_unusable_geometries(clipped_bdforet)

        if not clipped_bdforet.empty:
            if clipped_species.empty:
                # Nothing to subtract: every remaining bdforet polygon is kept as is.
                diff_bdforet = clipped_bdforet
            else:
                # Keep only the bdforet area that does not overlap the species polygons.
                diff_bdforet = gpd.overlay(clipped_bdforet, clipped_species, how='difference')
            clipped_species = pd.concat([clipped_species, diff_bdforet], ignore_index=True)

    # rasterize() raises when it receives no valid geometry at all, so filter first and
    # return the all-zero raster when nothing is left to burn.
    drawable = _drop_unusable_geometries(clipped_species)
    if drawable.empty:
        return rasters

    for band, (column, value_map) in enumerate(value_maps.items()):
        shapes = ((geom, value_map[val]) for geom, val in zip(drawable.geometry, drawable[column]))
        rasters[band] = rasterize(
            shapes,
            out_shape=shape,
            transform=transform,
            fill=0,
            all_touched=True,
            dtype=np.uint8
        )

    return rasters


def _index_tile_folders(tiles_root: str) -> Dict[str, str]:
    """Maps each tile number (as text) to its `tile_<number>[_...]` folder name inside `tiles_root`."""
    folders = {}
    for entry in sorted(os.listdir(tiles_root)):
        parts = entry.split('_')
        if len(parts) < 2 or parts[0] != 'tile':
            continue
        if os.path.isdir(os.path.join(tiles_root, entry)):
            # Exact match on the number: a prefix test would let tile 1 pick up tile_10.
            # If several folders share a number, the alphabetically first one is kept.
            folders.setdefault(parts[1], entry)
    return folders


def main(gdf: gpd.GeoDataFrame,
         dict_species: Dict[str, gpd.GeoDataFrame],
         dict_bdforet: Dict[str, gpd.GeoDataFrame],
         output_dir: str,
         name: str = 'tiles_2_5_km',
         value_maps: Dict[str, Dict[str, int]] = None,
         already_computed=None) -> None:
    """
    Main function to process all tiles, clip species data, rasterize and save the results.

    For every row of `gdf` that has a folder `tile_<index>[_...]` under `output_dir/name`
    containing an `rgb` sub-folder with at least one `.tif`, the first listed raster provides
    the grid (transform, shape, CRS). The reference layers are rasterized on that grid and
    written to `<tile folder>/reference_species/`. Finally the value mappings are written to
    `output_dir/name/value_mappings.txt`.

    Args:
    - gdf: Tiles to process; the index is the tile number and the columns `NomSER`,
      `perc_sarah` and the geometry are read.
    - dict_species: Species GeoDataFrames keyed by CRS string.
    - dict_bdforet: `bdforet` GeoDataFrames keyed by CRS string.
    - output_dir: Root directory; created if it does not exist.
    - name: Sub-folder of `output_dir` that holds the tile folders.
    - value_maps: {column name: {value: integer code}}. Required despite the `None` default.
    - already_computed: Tile numbers to skip (compared as text). `None` means none.

    Raises:
    - ValueError: if `value_maps` is not provided.
    - FileNotFoundError: if `output_dir/name` does not exist.
    """
    if value_maps is None:
        raise ValueError("value_maps is required; build it with create_value_maps().")

    os.makedirs(output_dir, exist_ok=True)
    tiles_root = os.path.join(output_dir, name)

    # Set membership is cheap and also lets callers pass integers.
    done = {str(number) for number in (already_computed or ())}
    # List the tile folders once instead of once per tile.
    tile_folders = _index_tile_folders(tiles_root)

    for index, tile_row in tqdm(gdf.iterrows(), total=gdf.shape[0]):
        if str(index) in done:
            continue

        tile_folder = tile_folders.get(str(index))
        if tile_folder is None:
            continue

        rgb_folder = os.path.join(tiles_root, tile_folder, 'rgb')
        if not os.path.isdir(rgb_folder):
            continue

        raster_files = [f for f in os.listdir(rgb_folder) if f.endswith('.tif')]
        if not raster_files:
            continue

        ser = greco_regions_fr_en.get(tile_row['NomSER'], tile_row['NomSER']).replace(' ', '_')
        perc = tile_row['perc_sarah']

        # Only the grid definition is needed from the RGB raster, so close it right away.
        with rasterio.open(os.path.join(rgb_folder, raster_files[0])) as src:
            transform = src.transform
            shape = src.shape
            crs = src.crs
            out_meta = src.meta.copy()

        # Convert the tile to the same CRS as the raster
        tile = gpd.GeoDataFrame(geometry=[tile_row.geometry], crs=gdf.crs).to_crs(crs)

        rasters = process_tile(tile, dict_species, value_maps, transform, shape, crs, perc, dict_bdforet)

        out_meta.update({
            'count': rasters.shape[0],
            'dtype': 'uint8'
        })
        # A nodata value inherited from the RGB raster may not be representable as uint8,
        # in which case the output file cannot be created with it; drop it then.
        nodata = out_meta.get('nodata')
        if nodata is not None and not (0 <= nodata <= 255):
            out_meta['nodata'] = None

        output_folder = os.path.join(tiles_root, tile_folder, 'reference_species')
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, f"tile_{index}_{ser}_ps{perc:.0f}.tif")
        with rasterio.open(output_path, 'w', **out_meta) as dest:
            dest.write(rasters)

    # Write the mapping information to a single text file
    txt_output_path = os.path.join(tiles_root, 'value_mappings.txt')
    with open(txt_output_path, 'w', encoding='utf-8') as f:
        for column, value_map in value_maps.items():
            f.write(f"Column '{column}' index mapping:\n")
            for value, idx in value_map.items():
                f.write(f"  {idx}: {value}\n")
            f.write("\n")


# Legacy example invocation, kept for reference only. It is stale: main() takes
# per-CRS dictionaries (dict_species, dict_bdforet) and value_maps, not a single
# species GeoDataFrame.
# output_dir = "/Users/arthurcalvi/Data/species/validation"
# result_gdf = gpd.read_parquet("/Users/arthurcalvi/Data/species/validation/tiles_2_5_km_final.parquet")
# species = gpd.read_file('/Users/arthurcalvi/Data/species/france_species.shp')
# bdforet = gpd.read_parquet('/Users/arthurcalvi/Data/Disturbances_maps/BDForet/bdforet_10_FF1_FF2_EN_year.parquet')

# main(result_gdf, species, output_dir, name='tiles_2_5_km')


In [None]:
# Inputs: the tile table, the species polygons and the bdforet polygons.
output_dir = "/Users/arthurcalvi/Data/species/validation"
result_gdf = gpd.read_parquet("/Users/arthurcalvi/Data/species/validation/tiles_2_5_km_final.parquet")
species = gpd.read_file('/Users/arthurcalvi/Data/species/france_species.shp')

# bdforet: keep only rows with a year, store the year as text (float -> int -> str)
# and tag every row with its source.
bdforet = (
    gpd.read_parquet('/Users/arthurcalvi/Data/Disturbances_maps/BDForet/bdforet_10_FF1_FF2_EN_year.parquet')
    .dropna(subset=['year'])
    .assign(
        year=lambda frame: frame['year'].astype(int).astype(str),
        source='bdforet',
    )
)

import os
import rasterio
from collections import Counter
from tqdm import tqdm 

def get_most_common_crs(directory: str) -> Counter:
    """
    Counts the CRS of the rasters stored in the `rgb` folders below `directory`.

    Only the first `.tif` listed in each `rgb` folder is opened, so each folder contributes
    at most one count. A per-CRS summary and the most common CRS are printed.

    Args:
    - directory: Root of the folder tree to walk.

    Returns:
    - A Counter {CRS string: number of `rgb` folders}. Despite the function name, the whole
      counter is returned, not just the most common CRS. It is empty when no raster is found.
    """
    crs_counter = Counter()

    # Traverse the directory structure
    for root, _dirs, files in tqdm(os.walk(directory)):
        # Compare the folder name itself: a substring test on the whole path would also
        # match any unrelated path component that merely contains "rgb".
        if os.path.basename(os.path.normpath(root)) != 'rgb':
            continue

        first_tif = next((file for file in files if file.endswith('.tif')), None)
        if first_tif is None:
            continue

        file_path = os.path.join(root, first_tif)
        with rasterio.open(file_path) as src:
            if src.crs is None:
                # No CRS to count; say so instead of failing on None.to_string().
                print(f"Skipping {file_path}: raster has no CRS")
                continue
            crs_counter[src.crs.to_string()] += 1

    # Print the value counts of each CRS
    for crs, count in crs_counter.items():
        print(f"CRS: {crs}, Count: {count}")

    # Determine the most common CRS (most_common() is empty when nothing was found)
    if crs_counter:
        most_common_crs, occurrences = crs_counter.most_common(1)[0]
        print(f"\nMost common CRS: {most_common_crs} with {occurrences} occurrences")
    else:
        print(f"\nNo georeferenced .tif found in any 'rgb' folder under {directory}")

    return crs_counter

# Count the raster CRSs found below the tile directory.
crs_counter = get_most_common_crs('/Users/arthurcalvi/Data/species/validation/tiles_2_5_km')

# One reprojected copy of each layer per raster CRS, keyed by the CRS string.
dict_species = {crs_key: species.to_crs(crs_key) for crs_key in crs_counter}
dict_bdforet = {crs_key: bdforet.to_crs(crs_key) for crs_key in crs_counter}

In [None]:
# Encode the values of both layers together so species and bdforet share one code table.
# NOTE(review): bdforet['year'] is cast to str when it is loaded; confirm species['year']
# is text as well, otherwise the same year would receive two different codes.
value_maps = create_value_maps(
    species=pd.concat([species, bdforet]),
)

In [None]:
# Columns for which an integer code table was built.
value_maps.keys()

In [None]:
# Inspect the integer codes assigned to each data source.
value_maps['source']

In [None]:
# Collect the numbers of the tiles that already have a rasterized reference,
# so that main() can skip them.
path_tiles = os.path.join(output_dir, 'tiles_2_5_km')
tile_numbers = []

for subfolder in tqdm(os.listdir(path_tiles)):
    name_parts = subfolder.split('_')
    # Only `tile_<number>[_...]` entries are tile folders; other names either have no
    # second component to read or would be mistaken for a tile number.
    if len(name_parts) < 2 or name_parts[0] != 'tile':
        continue

    # isdir() is False when the path is missing or when `subfolder` is a plain file.
    reference_species_path = os.path.join(path_tiles, subfolder, 'reference_species')
    if not os.path.isdir(reference_species_path):
        continue

    # A tile counts as done as soon as its reference_species folder holds a .tif
    if any(file.endswith('.tif') for file in os.listdir(reference_species_path)):
        tile_numbers.append(name_parts[1])

tile_numbers 

In [None]:
# Rasterize the reference layers for every tile that has not been processed yet.
main(
    gdf=result_gdf,
    dict_species=dict_species,
    dict_bdforet=dict_bdforet,
    output_dir=output_dir,
    name='tiles_2_5_km',
    value_maps=value_maps,
    already_computed=tile_numbers,
)