# Intersection with 500 samples

In [None]:
# Bounding boxes used to validate algorithm 1 and to create the dataset.
import geopandas as gpd

gdf_sampled_bbox = gpd.read_parquet('~/repo/Disturbance-Attribution-Dataset-Joining/data/results/sampling/sampled_bboxes.parquet').to_crs('EPSG:32631')
gdf_features = gpd.read_parquet('~/repo/Disturbance-Attribution-Dataset-Joining/data/results/sampling/sampled_features.parquet').to_crs('EPSG:32631')

from tqdm import tqdm

# Give every bounding box the earliest start and the latest end of its features.
# The previous loop compared gdf_features['bbox'] with the 0..n-1 position of each
# box and wrote through .loc, i.e. it assumed a default integer index; here the
# 'bbox' value is matched against the index label directly, and one groupby
# replaces the per-box scan of the whole feature table.
feature_date_ranges = gdf_features.groupby('bbox').agg(
    start_date=('start_date', 'min'),
    end_date=('end_date', 'max'),
)
for date_column in ('start_date', 'end_date'):
    # Boxes without any feature get a missing date, as min()/max() of an empty
    # selection did before.
    gdf_sampled_bbox[date_column] = (
        feature_date_ranges[date_column].reindex(gdf_sampled_bbox.index).to_numpy()
    )

# Add tzinfo (UTC) to start_date and end_date.
from datetime import timezone
gdf_sampled_bbox.start_date = gdf_sampled_bbox.start_date.dt.tz_localize(timezone.utc)
gdf_sampled_bbox.end_date = gdf_sampled_bbox.end_date.dt.tz_localize(timezone.utc)

In [None]:
# Tree-species reference polygons, reprojected to the same CRS as the sampled
# bounding boxes (EPSG:32631).
species = gpd.read_file('/Users/arthurcalvi/Data/species/france_species.shp').to_crs('EPSG:32631')

In [None]:
# Display the species table for a quick visual check.
species

In [None]:
import geopandas as gpd
import pandas as pd

def calculate_intersection_percentage(sentinel_gdf: gpd.GeoDataFrame, tree_species_gdf: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Measure how much tree species reference data falls inside each Sentinel-2 bounding box.

    For every bounding box, the species polygons whose 'year' lies between the
    years of the box's start and end dates (both inclusive) are intersected
    with the box. Despite the function name, the result is an area expressed
    in pixels of 100 square units (10 m pixels when the CRS is metric), not a
    percentage. Overlapping species polygons are counted once each.

    Parameters:
    sentinel_gdf (gpd.GeoDataFrame): Bounding boxes with datetime 'start_date' and 'end_date' columns.
    tree_species_gdf (gpd.GeoDataFrame): Tree species polygons with a datetime 'year' column.

    Returns:
    gpd.GeoDataFrame: One row per bounding box with the columns 'sentinel_id'
        (index label of the box), 'year_range', 'intersection_pixels' and
        'geometry' (union of the intersecting, unclipped species polygons),
        in the CRS of sentinel_gdf.
    """
    output_columns = ['sentinel_id', 'year_range', 'intersection_pixels', 'geometry']

    # The year of every species polygon does not depend on the bounding box.
    species_years = tree_species_gdf['year'].dt.year

    results = []
    for sentinel_row in sentinel_gdf.itertuples():
        start_year = sentinel_row.start_date.year
        end_year = sentinel_row.end_date.year
        bbox_geometry = sentinel_row.geometry

        # Keep the species polygons dated within the box's year range
        tree_species_year_gdf = tree_species_gdf[species_years.between(start_year, end_year)]

        # Find intersections
        intersections = tree_species_year_gdf[tree_species_year_gdf.intersects(bbox_geometry)]

        # Calculate intersection area
        total_intersection_area = intersections.geometry.intersection(bbox_geometry).area.sum()
        num_pixels = total_intersection_area / 100  # since 1 pixel = 100 square meters

        results.append({
            'sentinel_id': sentinel_row.Index,  # index label of the bounding box
            'year_range': f"{start_year}-{end_year}",
            'intersection_pixels': num_pixels,
            'geometry': intersections.geometry.unary_union
        })

    # Naming the columns and the geometry explicitly keeps the constructor
    # working when sentinel_gdf has no rows.
    return gpd.GeoDataFrame(results, columns=output_columns, geometry='geometry', crs=sentinel_gdf.crs)

# Reuse the frames loaded in the previous cells. These are aliases, not copies:
# the column conversions below also modify gdf_sampled_bbox and species.
sentinel_gdf = gdf_sampled_bbox
tree_species_gdf = species # same object as `species`

# Convert date columns to datetime
sentinel_gdf['start_date'] = pd.to_datetime(sentinel_gdf['start_date'])
sentinel_gdf['end_date'] = pd.to_datetime(sentinel_gdf['end_date'])
# NOTE(review): if 'year' is stored as an integer, pd.to_datetime reads it as
# nanoseconds since the epoch (every value then falls in 1970) — confirm the
# column holds strings or dates.
tree_species_gdf['year'] = pd.to_datetime(tree_species_gdf['year'])

# Compute the intersection (in pixels) of every bounding box with the species data
intersection_df = calculate_intersection_percentage(sentinel_gdf, tree_species_gdf)

# Display the results
from IPython.display import display
display(intersection_df)




In [None]:
# Total number of reference pixels over all sampled bounding boxes.
intersection_df.intersection_pixels.sum() 

# Select new AOIs 

## Merge BDFORET > 2010 and Sarah compilation of datasets

In [None]:
# Load Sarah's compilation of species datasets (reprojected to EPSG:32631)
import geopandas as gpd
species = gpd.read_file('/Users/arthurcalvi/Data/species/france_species.shp').to_crs('EPSG:32631')
# Polygons coming from these sources are discarded.
removed_sources = ['FrenchNFI', 'DSF']
species = species[~species.source.isin(removed_sources)]

In [None]:
# Load BD Forêt (kept in the CRS stored in the parquet file; no reprojection here)
bdforet = gpd.read_parquet('/Users/arthurcalvi/Repo/Disturbance-Attribution-Dataset-Joining/data/processed_datasets/BDFORET_EPSG2154_FR_simplified10.parquet')
from thefuzz import process

# Phenology class of each species group label. Every label is listed exactly
# once: the earlier literal repeated ten keys, each with the same value.
tree_phenology = {
    'Pin maritime': 'Evergreen',
    'NC': 'Unknown',
    'Mixte': 'Mixed',
    'Feuillus': 'Deciduous',
    'Pins mélangés': 'Mixed',
    'NR': 'Unknown',
    'Conifères': 'Evergreen',
    'Sapin, épicéa': 'Evergreen',
    'Peuplier': 'Deciduous',
    'Douglas': 'Evergreen',
    'Pin sylvestre': 'Evergreen',
    'Châtaignier': 'Deciduous',
    'Chênes décidus': 'Deciduous',
    'Pin laricio, pin noir': 'Evergreen',
    'Mélèze': 'Deciduous',
    'Pin autre': 'Evergreen',
    'Hêtre': 'Deciduous',
    'Robinier': 'Deciduous',
    'Chênes sempervirents': 'Evergreen',
    'Pin d\'Alep': 'Evergreen',
    'Pin à crochets, pin cembro': 'Evergreen',
}

# Function to map species to phenology using fuzzy matching
def get_phenology(specie: str, min_score: int = 50) -> str:
    """
    Return the lower-case phenology class of a species label.

    The label is fuzzy-matched (thefuzz `ratio` scorer) against the keys of
    `tree_phenology`; the best key is accepted only when its score is strictly
    greater than `min_score`.

    Args:
        specie: Species label to classify.
        min_score: Score the best match has to exceed (scores range from 0 to 100).

    Returns:
        The matched class in lower case ('evergreen', 'deciduous', 'mixed' or
        'unknown'), or 'unknown' when no key matches well enough. The fallback
        uses the same lower-case spelling as the matched values so that a
        single comparison with 'unknown' catches both cases.
    """
    match = process.extractOne(specie, tree_phenology.keys(), scorer=process.fuzz.ratio)
    if match and match[1] > min_score:
        return tree_phenology[match[0]].lower()

    print(f'No match found for {specie}')
    return 'unknown'

# Create the new phenology column.
# Fuzzy matching is costly, so every distinct ESSENCE label is classified once
# and the result is mapped back onto the rows.
phenology_by_essence = {essence: get_phenology(essence) for essence in bdforet['ESSENCE'].unique()}
bdforet['phen_en'] = bdforet['ESSENCE'].map(phenology_by_essence)

# Keep only polygons with a single, known phenology. The filter reads the
# 'phen_en' column created above (the table has no 'phenology' column at this
# point) and compares case-insensitively so that 'Unknown' is removed too.
bdforet = bdforet[~bdforet['phen_en'].str.lower().isin(['unknown', 'mixed'])]

In [None]:
# Keep only the departments whose BD Forêt coverage was created after 2010.
import pandas as pd
# Department outlines; their 'code' column is filtered against num_dep below.
dep = gpd.read_file('/Users/arthurcalvi/Data/Disturbances_maps/BDForet/contour-des-departements.geojson')

year = 2010

# Codes of the departments to keep. They are strings, which preserves the
# leading zeros and the non-numeric codes '2A' and '2B'.
num_dep = [
    '02', '09', '11', '15', '16', '17', '19', '2A', '2B', '21', '22', '23',
    '24', '25', '28', '30', '31', '32', '34', '35', '37', '39', '41', '43',
    '45', '46', '48', '50', '55', '60', '61', '66', '70', '71', '75', '76',
    '77', '78', '80', '82', '87', '88', '90', '91', '92', '93', '94', '95',
]
# Selected departments, in the CRS of BD Forêt, with outlines simplified using
# a tolerance of 1000 CRS units.
dep = dep[ dep['code'].isin(num_dep) ].to_crs(bdforet.crs)
dep.geometry = dep.geometry.simplify(1000)

# Restrict BD Forêt to the selected departments, then shrink every polygon by
# 100 CRS units (negative buffer).
# NOTE(review): polygons narrower than twice that distance become empty here.
bdforet_ = bdforet.clip(dep.geometry)
bdforet_.geometry = bdforet_.geometry.buffer(-100)
# Remove from BD Forêt the areas already covered by the species dataset.
bdforet_cleaned = gpd.overlay(bdforet_, species.to_crs(bdforet_.crs), how='difference')

In [None]:
# Add the source name and the reference year of each BD Forêt polygon.
bdforet_cleaned['source'] = 'bdforet'
bdforet_cleaned['year'] = 2010  # placeholder; the per-department year is attached below

# Department codes are identifiers, not numbers: read them as text so that a
# code such as '02' is never turned into the integer 2 before the merge.
bdforet_year = pd.read_csv('/Users/arthurcalvi/Data/Disturbances_maps/BDForet/Année_reference_BDForet.csv', sep=';', dtype={'N° Dep': str})

# Ensure the same CRS for spatial joins
if bdforet_cleaned.crs != dep.crs:
    dep = dep.to_crs(bdforet_cleaned.crs)

# Step 1: Spatial join bdforet_cleaned with dep to get the department code.
# `predicate` replaces the `op` keyword, which was deprecated in geopandas 0.10
# and later removed.
# NOTE(review): a polygon intersecting several departments yields one row per
# department, so its geometry is duplicated in the result.
bdforet_with_code = gpd.sjoin(bdforet_cleaned[['source', 'phen_en', 'geometry']], dep[['code', 'geometry']], how='left', predicate='intersects')

# Step 2: Join bdforet_with_code with bdforet_year to get the year
bdforet_with_year = bdforet_with_code.merge(bdforet_year, left_on='code', right_on='N° Dep', how='left')

# Step 3: Add the 'year' column from the joined dataframe
bdforet_with_year['year'] = bdforet_with_year['Année de référence (PVA)']

# Step 4: Drop unnecessary columns and keep only relevant ones
bdforet_cleaned_final = bdforet_with_year[['source', 'phen_en', 'geometry', 'year']]


In [None]:
# Write the merged dataset: BD Forêt polygons followed by the species polygons,
# both reduced to the same four columns and to the BD Forêt CRS.
species_ = species[['source', 'phen_en', 'geometry', 'year']].to_crs(bdforet_cleaned_final.crs)
merge_dataset = gpd.GeoDataFrame(pd.concat([bdforet_cleaned_final, species_], ignore_index=True), crs=bdforet_cleaned_final.crs)
# NOTE(review): astype(int) raises if any 'year' is missing (e.g. a polygon
# whose department has no reference year) — confirm every row has one.
merge_dataset['year'] = merge_dataset['year'].astype(int)
merge_dataset.to_parquet('/Users/arthurcalvi/Data/species/dataset_merge_SarahBrood_BDForet-sup2010.parquet')

## Create the 2.5km grid

In [None]:
# Spot check: validity of the first geometry of the merged dataset.
merge_dataset.geometry.iloc[0].is_valid

In [None]:
# Identify and handle problematic geometries
from shapely.ops import unary_union

# Repaired geometries collected for the union below.
valid_geometries = []
for geom in merge_dataset.geometry:
    if geom is None:
        # A missing geometry can neither be repaired nor unioned.
        continue
    try:
        # Try buffering the geometry to fix potential issues
        buffered_geom = geom.buffer(0)
    except Exception as e:
        # Best effort: report the offending geometry and carry on with the rest.
        print(f"Problematic geometry found: {geom.wkt} with error {e}")
        continue
    valid_geometries.append(buffered_geom)

# Attempt the unary union with the valid geometries
try:
    boundary_species = unary_union(valid_geometries)
except Exception as e:
    print(f"Error during unary union: {e}")

In [None]:
import numpy as np 
import geopandas as gpd
from shapely.geometry import box 
from shapely.ops import unary_union
from tqdm import tqdm
import matplotlib.pyplot as plt
# Calculate the different grid sizes
from shapely.prepared import prep

initial_grid_size = 100e3
aoi_size = 2.5e3

# Coarse-to-fine schedule: halve the cell size until it is at most three times
# the target AOI size, then finish with the target size itself.
sizes = [initial_grid_size]
while sizes[-1] > 3*aoi_size:
    a = sizes[-1] // 2
    sizes.append(max(a, aoi_size))

sizes.append(aoi_size)
print(sizes)
minx, miny, maxx, maxy = merge_dataset.envelope.total_bounds

# Step 1: Drop null and empty geometries. Nulls have to go before the validity
# check of step 2, which calls a method on every geometry. The copy makes the
# assignment of step 2 act on an independent frame instead of a filtered view.
has_geometry = merge_dataset['geometry'].notnull() & ~merge_dataset.geometry.is_empty
merge_dataset = merge_dataset[has_geometry].copy()

# Step 2: Ensure all geometries are valid (buffer(0) rebuilds invalid ones)
merge_dataset['geometry'] = merge_dataset['geometry'].apply(lambda geom: geom if geom.is_valid else geom.buffer(0))

# Step 3: Simplify the geometries
simplified_geometries = merge_dataset.geometry.simplify(2500, preserve_topology=True)

# Step 4: Create unary union
try:
    boundary_species = simplified_geometries.unary_union
except Exception as e:
    # If the union fails, a boundary_species computed by an earlier cell (if
    # any) stays in use.
    print(f"Error during unary union: {e}")

# Start from the bounding box of the whole dataset
new_boundaries = gpd.GeoDataFrame(geometry=[box(minx, miny, maxx, maxy)], crs=merge_dataset.crs)

# The species footprint is tested against every cell of every grid level:
# preparing it once makes those repeated intersects() calls much cheaper
# without changing their result.
prepared_species = prep(boundary_species)

for current_grid_size in sizes:
    print(f"Processing grid size: {current_grid_size}")

    # Generate a grid of AOIs over the full extent of the dataset
    x_coords = np.arange(minx, maxx, current_grid_size)
    y_coords = np.arange(miny, maxy, current_grid_size)

    # Create polygons for the AOIs
    aoi_polygons = []
    for x in x_coords:
        for y in y_coords:
            aoi = box(x, y, x + current_grid_size, y + current_grid_size)
            aoi_polygons.append(aoi)

    # Create a GeoDataFrame from the AOIs
    aois = gpd.GeoDataFrame(geometry=aoi_polygons, crs=merge_dataset.crs)

    # Keep the cells that intersect both the cells kept at the previous
    # (coarser) level and the species footprint
    bounds = new_boundaries.unary_union
    prepared_bounds = prep(bounds)
    selected_aois = [
        candidate
        for candidate in tqdm(aoi_polygons)
        if prepared_bounds.intersects(candidate) and prepared_species.intersects(candidate)
    ]

    print(F'Conversion : {len(selected_aois)/aois.shape[0] :.0%}')

    # Update new_boundaries for the next iteration
    new_boundaries = gpd.GeoDataFrame(geometry=selected_aois, crs=merge_dataset.crs)
    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    new_boundaries.plot(ax=ax, edgecolor='k')
    ax.set_axis_off()
    plt.show()
    

In [None]:
# Percentage of each selected grid cell covered by the species footprint.
perc = []
geom = []
for row in tqdm(new_boundaries.itertuples(), total=len(new_boundaries)):
    cell = row.geometry
    intersection = boundary_species.intersection(cell)
    intersection_area = intersection.area
    # Divide by the area of this very cell rather than by a variable left over
    # from the grid-building loop.
    aoi_area = cell.area
    intersection_percent = (intersection_area / aoi_area) * 100
    perc.append(intersection_percent)
    geom.append(cell)

# The cells were built in the CRS of new_boundaries; labelling them with the
# CRS of `species` would mis-georeference them whenever the two differ.
gdf_intersection = gpd.GeoDataFrame(data=perc, geometry=geom, columns=['perc'], crs=new_boundaries.crs)
gdf_intersection.to_parquet('/Users/arthurcalvi/Data/species/2c5km_cell_percentage.parquet')
    

In [None]:
import geopandas as gpd
# Reload the per-cell coverage written by the previous cell, in EPSG:2154.
gdf_intersection = gpd.read_parquet('/Users/arthurcalvi/Data/species/2c5km_cell_percentage.parquet').to_crs('epsg:2154')
gdf_intersection

In [None]:
# Eco-region polygons. The 'greco' key is the first character of 'codeser';
# regions sharing a key are merged, keeping the attributes of the first one.
greco = gpd.read_file('/Users/arthurcalvi/Data/eco-regions/France/ser_l93_new/ser_l93_new.dbf')
greco['greco'] = greco.codeser.apply(lambda x:x[0])
greco = greco.dissolve(by='greco', aggfunc='first')
# NOTE(review): iloc[1:] drops the first merged group — presumably a
# placeholder code; confirm against the source file.
greco = greco.reset_index().iloc[1:].to_crs('EPSG:2154')
greco.plot(column='codeser')

In [None]:
# Map of the grid cells, coloured by coverage percentage, over the eco-regions.
gdf_intersection_2154 = gdf_intersection.to_crs('EPSG:2154')
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 10))
greco.plot(ax=ax, column='codeser', edgecolor='black')
gdf_intersection_2154.plot(column='perc', ax=ax, legend=True, legend_kwds={'label': "Intersection percentage"})

In [None]:
import geopandas as gpd
import pandas as pd
from tqdm import tqdm

def sample_gdf_intersection(gdf_intersection: gpd.GeoDataFrame, greco: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Tag every grid cell with the greco region(s) it intersects.

    All intersecting cells of a region are kept (there is no top-N cut),
    ordered by decreasing 'perc' within the region. A cell that intersects
    several regions appears once per region.

    Args:
        gdf_intersection: Grid cells with a 'perc' column.
        greco: Regions with a 'NomSER' column.

    Returns:
        GeoDataFrame in EPSG:2154 with the columns 'perc', 'NomSER' and
        'geometry'; empty when no cell intersects any region.
    """
    target_crs = 'epsg:2154'
    output_columns = ['perc', 'NomSER', 'geometry']

    # Reproject into local frames so that the caller's objects stay untouched.
    cells = gdf_intersection.to_crs(target_crs)
    regions = greco.to_crs(target_crs)

    result = []
    for region in tqdm(regions.itertuples(), total=len(regions)):
        # Find intersections with the current greco region
        intersecting = cells[cells.intersects(region.geometry)]
        if intersecting.empty:
            continue

        # Best-covered cells first, labelled with the name of the region
        result.append(
            intersecting.sort_values(by='perc', ascending=False).assign(NomSER=region.NomSER)
        )

    if not result:
        # pd.concat refuses an empty list: return an empty frame instead.
        return gpd.GeoDataFrame(columns=output_columns, geometry='geometry', crs=target_crs)

    # Combine all results into a single GeoDataFrame
    result_gdf = gpd.GeoDataFrame(pd.concat(result, ignore_index=True), crs=cells.crs)
    return result_gdf[output_columns]

# Every grid cell, labelled with the region(s) it intersects.
result_gdf_all = sample_gdf_intersection(gdf_intersection, greco)


# import geopandas as gpd
# import pandas as pd
# from shapely.geometry import Polygon
# import numpy as np
# from tqdm import tqdm

# def sample_gdf_intersection(gdf_intersection: gpd.GeoDataFrame, greco: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
#     """
#     Sample from gdf_intersection based on greco regions, create training and validation sets.
#     """
#     result = []

#     for region in tqdm(greco.itertuples()):
#         # Clip gdf_intersection with the current greco region
#         clipped = gpd.clip(gdf_intersection, region.geometry)
        
#         if not clipped.empty:
#             # Sort by 'perc' and keep the 8 highest rows
#             clipped_sorted = clipped.sort_values(by='perc', ascending=False).head(32)
            
#             # Randomly split into training and validation sets
#             clipped_sorted['set'] = np.where(np.random.rand(len(clipped_sorted)) < 0.5, 'training', 'validation')
            
#             # Ensure 4 for training and 4 for validation
#             training_set = clipped_sorted[clipped_sorted['set'] == 'training'].head(16)
#             validation_set = clipped_sorted[clipped_sorted['set'] == 'validation'].head(16)
            
#             if len(training_set) < 4:
#                 additional_training = clipped_sorted[clipped_sorted['set'] == 'validation'].tail(4 - len(training_set))
#                 training_set = pd.concat([training_set, additional_training])
            
#             if len(validation_set) < 4:
#                 additional_validation = clipped_sorted[clipped_sorted['set'] == 'training'].tail(4 - len(validation_set))
#                 validation_set = pd.concat([validation_set, additional_validation])
            
#             # Append the training and validation sets to the result
#             for set_type, data in zip(['training', 'validation'], [training_set, validation_set]):
#                 data['set'] = set_type
#                 data['NomSER'] = region.NomSER
#                 result.append(data)
    
#     # Combine all results into a single GeoDataFrame
#     result_gdf = gpd.GeoDataFrame(pd.concat(result, ignore_index=True), crs=gdf_intersection.crs)
#     return result_gdf[['perc', 'set', 'NomSER', 'geometry']]

# result_gdf = sample_gdf_intersection(gdf_intersection, greco)




In [None]:
# Persist the labelled 2.5 km cells.
result_gdf_all.to_parquet("/Users/arthurcalvi/Data/species/validation/2c5km_val_train_tiles_all.parquet")

In [None]:
import geopandas as gpd
# NOTE(review): this reads val_train_tiles.parquet, not the file written by the
# previous cell — presumably the tiles of an earlier run; confirm.
result_gdf = gpd.read_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles.parquet")
print(result_gdf.crs)

In [None]:
# Display the species table.
species

In [None]:
import matplotlib.pyplot as plt
import contextily as ctx
import geopandas as gpd
import os 

# Map of the tiles over the eco-regions, coloured by coverage percentage.
# gdf4 = gpd.read_file("path_to_your_shapefile.shp")
# Bring both layers to the same CRS
greco = greco.to_crs('epsg:2154')
result_gdf = result_gdf.to_crs('epsg:2154')
# Create the figure and axis
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

# Colouring by set is disabled; the tiles are coloured by 'perc' instead
# colormap = {'validation': 'yellow', 'training': 'blue'}
# result_gdf['color'] = result_gdf['set'].map(colormap)

# Eco-regions as a semi-transparent background
greco.plot(ax=ax, column='NomSER', edgecolor='k', alpha=0.25, cmap='tab20')
# Tiles coloured with viridis according to their coverage percentage,
# with a horizontal colour bar
result_gdf.plot(ax=ax, cmap='viridis', column='perc', legend=True, legend_kwds={'label': "Intersection percentage", 'shrink': 0.5, 'orientation': 'horizontal'})

# Remove the axis (no basemap is added)
ax.set_axis_off()

# Legend of the sets (disabled together with the set colouring)
# handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=colormap[key], markersize=10, label=key) for key in colormap]
# ax.legend(handles=handles, title='Set', loc='upper right')

# Display the plot
plt.tight_layout()
plt.show()

# Save the figure.
# NOTE(review): outside a notebook, saving after plt.show() can produce an
# empty image — confirm the backend in use.
os.makedirs('images', exist_ok=True)
fig.savefig('images/validation_data.png', dpi=300)


# Modify result_gdf for chronos


In [None]:
import geopandas as gpd
import numpy as np
from typing import Tuple, Dict
import os
from tqdm import tqdm

def create_value_maps(species: gpd.GeoDataFrame) -> Dict[str, Dict[str, int]]:
    """
    Build, for each categorical column, a lookup from value to integer code.

    Codes start at 1 and follow the order in which the values first appear in
    the column, so 0 is never assigned.
    """
    encoded_columns = ('specie_en', 'genus_en', 'phen_en', 'year', 'source')
    return {
        column: {
            value: code
            for code, value in enumerate(species[column].unique(), start=1)
        }
        for column in encoded_columns
    }

def calculate_percentages(clipped: gpd.GeoDataFrame, value_maps: Dict[str, Dict[str, int]]) -> Tuple[int, float, float, float]:
    """
    Summarise the reference polygons that fall inside one tile.

    All shares are fractions between 0 and 1 of the number of polygons (rows),
    not of their area.

    Args:
        clipped: Species polygons clipped to the tile, with 'year' and
            'phen_en' columns.
        value_maps: Value-to-code lookups with at least the 'year' and
            'phen_en' entries; values missing from a lookup are not counted.

    Returns:
        (most_frequent_year, perc_year, perc_deciduous, perc_evergreen).
        most_frequent_year is the year itself, not its integer code, and is
        np.nan when no year can be counted; every share is 0.0 when there is
        nothing to count.
    """
    if clipped.empty:
        return np.nan, 0.0, 0.0, 0.0

    # Year calculations
    year_codes = value_maps['year']
    year_counts = clipped['year'].map(year_codes).value_counts()
    if year_counts.empty:
        most_frequent_year, perc_year = np.nan, 0.0
    else:
        # The counts are keyed by integer code: translate the winning code
        # back to the year it stands for.
        code_to_year = {code: year for year, code in year_codes.items()}
        most_frequent_year = code_to_year[year_counts.idxmax()]
        perc_year = year_counts.max() / year_counts.sum()

    # Phenology calculations
    phen_codes = value_maps['phen_en']
    phen_counts = clipped['phen_en'].map(phen_codes).value_counts()
    total_count = phen_counts.sum()
    if total_count == 0:
        return most_frequent_year, perc_year, 0.0, 0.0

    # A class absent from the lookup falls back to code 0, which is looked up
    # with a default count of 0.
    deciduous_count = phen_counts.get(phen_codes.get('deciduous', 0), 0)
    evergreen_count = phen_counts.get(phen_codes.get('evergreen', 0), 0)

    perc_deciduous = deciduous_count / total_count
    perc_evergreen = evergreen_count / total_count

    return most_frequent_year, perc_year, perc_deciduous, perc_evergreen

def main(gdf: gpd.GeoDataFrame, species: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
    """
    Attach per-tile reference statistics to `gdf`.

    Each tile is reprojected to the CRS of `species`, the species polygons are
    clipped to it and summarised with calculate_percentages. The four values
    are stored in the columns 'year', 'perc_year', 'perc_deciduous' and
    'perc_evergreen' of `gdf`, which is modified in place and returned.
    """
    value_maps = create_value_maps(species)

    # One tuple of four statistics per tile, in row order.
    tile_stats = []
    for tile_row in tqdm(gdf.itertuples()):
        footprint = gpd.GeoDataFrame(geometry=[tile_row.geometry], crs=gdf.crs).to_crs(species.crs)
        species_in_tile = gpd.clip(species, footprint.geometry)
        tile_stats.append(calculate_percentages(species_in_tile, value_maps))

    # Spread the tuples over the four new columns.
    new_columns = ('year', 'perc_year', 'perc_deciduous', 'perc_evergreen')
    for position, column in enumerate(new_columns):
        gdf[column] = [stats[position] for stats in tile_stats]

    return gdf

# Earlier run on the sampled subset of tiles, kept for reference
 # (reads the subset written by a previous version of the notebook)
# result_gdf = gpd.read_parquet("/Users/arthurcalvi/Data/species/validation/2c5km_val_train_tiles.parquet")

# updated_gdf = main(result_gdf.to_crs(species.crs), species)
# updated_gdf.to_crs('epsg:2154').to_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles_2_5_km.parquet")


# Statistics for all 2.5 km tiles; computed in the CRS of `species`, written in EPSG:2154.
updated_gdf_all = main(result_gdf_all.to_crs(species.crs), species)
updated_gdf_all.to_crs('epsg:2154').to_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles_2_5_km_all.parquet")


In [None]:
# Region name carried by the tiles ('NomSER') -> name of the region it stands
# for. Each key is listed once: the earlier literal repeated two of them with
# the same value.
mapping_real_greco = {
    'Côtes_et_plateaux_de_la_Manche': 'Centre Nord semi-océanique',
    'Ardenne_primaire': 'Grand Est semi-continental',
    'Préalpes_du_Nord': 'Alpes',
    'Garrigues': 'Méditerranée',
    'Massif_vosgien_central': 'Vosges',
    'Premier_plateau_du_Jura': 'Jura',
    'Piémont_pyrénéen': 'Pyrénées',
    'Terres_rouges': 'Sud-Ouest océanique',
    'Corse_occidentale': 'Corse',
    "Châtaigneraie_du_Centre_et_de_l'Ouest": 'Massif central',
    'Ouest-Bretagne_et_Nord-Cotentin': 'Grand Ouest cristallin et océanique',
    'Total': 'Total',
}

# The keys above are written with underscores; the lookup uses spaces.
mapping_real_greco = {k.replace('_', ' '): v for k, v in mapping_real_greco.items()}

# updated_gdf['NomSER'] = updated_gdf['NomSER'].map(mapping_real_greco)
# updated_gdf.to_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles_2_5_km.parquet")
# Replace the region names of the tiles. Names absent from the mapping become
# NaN, so running this cell a second time turns every name into NaN.
updated_gdf_all['NomSER'] = updated_gdf_all['NomSER'].map(mapping_real_greco)
# Overwrites the file written after the statistics were computed; the frame is
# saved here without the reprojection to EPSG:2154 applied there.
updated_gdf_all.to_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles_2_5_km_all.parquet")

In [None]:
import geopandas as gpd

# Load the 5 km wide tiles (result_gdf5)
result_gdf5 = gpd.read_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles.parquet")

# Load the 2.5 km wide tiles with their statistics (result_gdf)
result_gdf = gpd.read_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles_2_5_km_all.parquet")

def select_non_intersecting_tiles(gdf2_5: gpd.GeoDataFrame, gdf5: gpd.GeoDataFrame, min_fraction: float = 0.25) -> list:
    """
    Pick, for each 'NomSER' value, the first 2.5 km tile that mixes both
    phenologies and stays clear of the 5 km tiles.

    A tile qualifies when both 'perc_deciduous' and 'perc_evergreen' are
    strictly greater than `min_fraction` and it intersects no tile of gdf5.
    Tiles are examined in row order and the search stops at the first
    qualifying tile of each 'NomSER' value.

    Args:
        gdf2_5 (gpd.GeoDataFrame): 2.5 km wide tiles with 'NomSER',
            'perc_deciduous' and 'perc_evergreen' columns.
        gdf5 (gpd.GeoDataFrame): 5 km wide tiles that must not be intersected.
        min_fraction (float): Share both phenologies have to exceed.

    Returns:
        list: Index labels of the selected rows of gdf2_5, at most one per
        'NomSER' value; values without a qualifying tile contribute nothing.
    """
    selected_indices = []

    # The phenology thresholds do not depend on gdf5: apply them once up front.
    balanced = gdf2_5[
        (gdf2_5['perc_deciduous'] > min_fraction) & (gdf2_5['perc_evergreen'] > min_fraction)
    ]

    for value in gdf2_5['NomSER'].unique():
        candidates = balanced[balanced['NomSER'] == value]
        for idx, tile_geometry in zip(candidates.index, candidates.geometry):
            if not gdf5.intersects(tile_geometry).any():
                selected_indices.append(idx)
                break

    return selected_indices

# Ensure both GeoDataFrames have the same CRS
result_gdf5 = result_gdf5.to_crs(epsg=2154)
result_gdf = result_gdf.to_crs(epsg=2154)

# Index labels of the 2.5 km tiles chosen for validation (at most one per region)
selected_indices = select_non_intersecting_tiles(result_gdf, result_gdf5)

# Add 'set' column with default value None
result_gdf['set'] = None

# Update the 'set' column for the selected tiles; every other tile keeps None
result_gdf.loc[selected_indices, 'set'] = 'validation'

# Save the updated result_gdf
result_gdf.to_parquet("/Users/arthurcalvi/Data/species/validation/val_train_tiles_2_5_km.parquet")


In [None]:
# Display the tiles flagged for validation.
result_gdf[result_gdf['set'] == 'validation']

# Write validation data

In [None]:
import geopandas as gpd
import rasterio
from rasterio.features import rasterize
import numpy as np
from typing import Tuple, Dict
import os
from tqdm import tqdm

def create_value_maps(species: gpd.GeoDataFrame) -> Dict[str, Dict[str, int]]:
    """
    Number the distinct values of each categorical column, starting at 1.

    The numbering follows the order of first appearance of the values in the
    column. The result maps each column name to its value-to-code lookup.
    """
    encoded_columns = ['specie_en', 'genus_en', 'phen_en', 'year', 'source']

    value_maps = {}
    for column in encoded_columns:
        distinct_values = species[column].unique()
        codes = range(1, len(distinct_values) + 1)
        value_maps[column] = dict(zip(distinct_values, codes))

    return value_maps

def process_tile(tile: gpd.GeoDataFrame, species: gpd.GeoDataFrame, value_maps: Dict[str, Dict[str, int]], transform: rasterio.Affine, shape: Tuple[int, int], crs: str) -> np.ndarray:
    """
    Rasterize the species attributes that fall inside one tile.

    Args:
        tile: Single-tile GeoDataFrame, already expressed in `crs`.
        species: Species polygons; reprojected to `crs` when needed.
        value_maps: Value-to-code lookup per column; one band is produced per
            entry, in the iteration order of the dict.
        transform: Affine transform of the target grid.
        shape: (rows, columns) of the target grid.
        crs: CRS of the target grid, in any form accepted by `to_crs`.

    Returns:
        int32 array of shape (len(value_maps), rows, columns). Pixels not
        covered by any polygon are 0; polygons whose attribute value is absent
        from the lookup are skipped.
    """
    rasters = np.zeros((len(value_maps), *shape), dtype=np.int32)

    # Reprojecting the whole species table is costly: skip it when the table is
    # already in the CRS of the raster.
    if species.crs != crs:
        species = species.to_crs(crs)

    clipped = gpd.clip(species, tile.geometry)
    if clipped.empty:
        return rasters

    for band, (column, value_map) in enumerate(value_maps.items()):
        shapes = [
            (geom, value_map[val])
            for geom, val in zip(clipped.geometry, clipped[column])
            if val in value_map
        ]
        if not shapes:
            # Nothing to burn for this column: the band stays at 0.
            continue
        rasters[band] = rasterize(
            shapes,
            out_shape=shape,
            transform=transform,
            fill=0,
            all_touched=True,
            dtype=np.int32
        )

    return rasters

def main(gdf: gpd.GeoDataFrame, species: gpd.GeoDataFrame, output_dir: str, resolution: int = 10, name: str = 'tiles_2_5_km'):
    """
    Rasterize the species reference data of every tile onto the tile's own grid.

    For each row of `gdf`, the folder of `<output_dir>/<name>` whose name is
    'tile_<index>' (optionally followed by '_...') is looked up. The first
    '.tif' found in its 'rgb' sub-folder supplies the grid (transform, shape,
    CRS). The rasterized attributes are written to
    '<folder>/reference_species/tile_<index>_<NomSER>_<perc>.tif', one band per
    encoded column. Tiles without a folder, an 'rgb' sub-folder or a raster are
    skipped. The value-to-code mappings are finally written to
    '<output_dir>/tiles/value_mappings.txt'.

    Args:
        gdf: Tiles with 'NomSER' and 'perc' columns; the index label identifies
            the tile folder.
        species: Species polygons to rasterize.
        output_dir: Root directory of the tile folders.
        resolution: Unused; the grid comes from the existing rasters.
        name: Sub-directory of `output_dir` holding the tile folders.
    """
    tiles_dir = os.path.join(output_dir, name)
    os.makedirs(tiles_dir, exist_ok=True)

    # Create value maps for categorical columns
    value_maps = create_value_maps(species)

    # The tile folders are not created or renamed by this loop: list them once.
    tile_folders = [f for f in os.listdir(tiles_dir) if f.startswith('tile_')]

    for index, tile_row in tqdm(gdf.iterrows(), total=gdf.shape[0]):
        ser = tile_row['NomSER'].replace(' ', '_')
        perc = tile_row['perc']

        # Compare the tile number itself: a plain prefix test on 'tile_1'
        # would also accept 'tile_10', 'tile_11', ...
        folder_match = [f for f in tile_folders if f.split('_')[1] == str(index)]
        if not folder_match:
            continue
        folder_match = folder_match[0]

        rgb_folder = os.path.join(tiles_dir, folder_match, 'rgb')
        if not os.path.isdir(rgb_folder):
            continue

        # Any raster of the 'rgb' folder can serve as grid template
        raster_files = [f for f in os.listdir(rgb_folder) if f.endswith('.tif')]
        if not raster_files:
            continue

        with rasterio.open(os.path.join(rgb_folder, raster_files[0])) as src:
            transform = src.transform
            shape = src.shape
            crs = src.crs
            out_meta = src.meta.copy()

        # Convert the tile to the same CRS as the raster
        tile = gpd.GeoDataFrame(geometry=[tile_row.geometry], crs=gdf.crs).to_crs(crs)

        rasters = process_tile(tile, species, value_maps, transform, shape, crs)

        out_meta.update({
            'count': rasters.shape[0],
            'dtype': 'int32'
        })

        output_folder = os.path.join(tiles_dir, folder_match, 'reference_species')
        os.makedirs(output_folder, exist_ok=True)
        output_path = os.path.join(output_folder, f"tile_{index}_{ser}_{perc :.0f}.tif")
        with rasterio.open(output_path, 'w', **out_meta) as dest:
            dest.write(rasters)

    # Write the mapping information to a single text file.
    # NOTE(review): this path uses the fixed folder 'tiles' rather than `name`
    # — confirm where readers of the mapping expect it. The folder is created
    # so that a long run cannot fail on this last step.
    txt_output_path = f"{output_dir}/tiles/value_mappings.txt"
    os.makedirs(os.path.dirname(txt_output_path), exist_ok=True)
    with open(txt_output_path, 'w', encoding='utf-8') as f:
        for column, value_map in value_maps.items():
            f.write(f"Column '{column}' index mapping:\n")
            for value, idx in value_map.items():
                f.write(f"  {idx}: {value}\n")
            f.write("\n")


# Root directory of the validation data; the tile folders are in its 'tiles_2_5_km' sub-directory.
output_dir = "/Users/arthurcalvi/Data/species/validation"

# Write one reference raster per 2.5 km tile
main(result_gdf, species, output_dir, name='tiles_2_5_km')


# Figure and table for article

In [None]:
# Index labels of the tiles left out of the figure and the table.
exclude = [51, 61, 36, 52, 59, 62, 63, 69, 70, 77]
# Drop those index labels from result_gdf

safe_result_gdf = result_gdf[~result_gdf.index.isin(exclude)]

In [None]:
import matplotlib.pyplot as plt
import contextily as ctx
import geopandas as gpd
import os 

# Map of the retained tiles over the eco-regions, coloured by set.

# Create the figure and axis
fig, ax = plt.subplots(1, 1, figsize=(7, 7))

# Colour of each set
colormap = {'validation': 'yellow', 'training': 'blue'}
# The tiles carry no 'color' column, so the colours are derived from 'set'
# here. Tiles whose set is missing or not listed in the colormap are drawn in
# grey.
tile_colors = safe_result_gdf['set'].map(colormap).fillna('grey').tolist()

greco.plot(ax=ax, column='NomSER', edgecolor='k', alpha=0.25, cmap='tab20', legend=True)
# Plot the tiles with the colour of their set
safe_result_gdf.to_crs(greco.crs).plot(ax=ax, color=tile_colors)

# Remove the axis (no basemap is added)
ax.set_axis_off()

# Legend entries of the sets (the legend itself is currently disabled)
handles = [plt.Line2D([0], [0], marker='o', color='w', markerfacecolor=colormap[key], markersize=10, label=key) for key in colormap]
# ax.legend(handles=handles, title='Set', loc='upper right')

# Display the plot
plt.tight_layout()
plt.show()

# os.makedirs('images', exist_ok=True)
# fig.savefig('images/validation_data.png', dpi=300)


In [None]:
import geopandas as gpd
import pandas as pd
from tqdm import tqdm

def calculate_total_pixels(perc: float, resolution: int = 10, tile_size: int = 5000) -> int:
    """
    Convert a coverage percentage of a square tile into a number of pixels.

    `perc` is a percentage (0-100) of a tile whose side is `tile_size`;
    `resolution` is the side of one pixel in the same unit. The result is
    truncated to an integer.
    """
    covered_area = (tile_size * tile_size) * (perc / 100)
    return int(covered_area / (resolution * resolution))

def intersect_and_summarize(gdf: gpd.GeoDataFrame, species: gpd.GeoDataFrame, pixel_area: float = 100.0) -> pd.DataFrame:
    """
    Measure, tile by tile, the area of each phenology class in pixels.

    Every tile is intersected with the species polygons in the CRS of
    `species`; the intersected area is summed per 'phen_en' value and divided
    by `pixel_area`. Overlapping species polygons are counted once each.

    Args:
        gdf: Tiles with 'set' and 'NomSER' columns.
        species: Species polygons with a 'phen_en' column.
        pixel_area: Area of one pixel in squared CRS units (100 for 10 m
            pixels in a metric CRS).

    Returns:
        DataFrame with one row per (tile, phenology class) and the columns
        'set', 'NomSER', 'phen_en' and 'total_pixels'. A tile without any
        intersection contributes a single row with 'phen_en' set to the string
        'None' and 0 pixels.
    """
    gdf = gdf.to_crs(species.crs)
    results = []

    for tile in tqdm(gdf.itertuples(), total=len(gdf)):
        tile_set = tile.set
        tile_nomser = tile.NomSER

        # Perform intersection
        tile_frame = gpd.GeoDataFrame(geometry=[tile.geometry], crs=species.crs)
        intersected = gpd.overlay(species, tile_frame, how='intersection')

        if intersected.empty:
            results.append({
                'set': tile_set,
                'NomSER': tile_nomser,
                'phen_en': 'None',
                'total_pixels': 0
            })
            continue

        # Intersected area per phenology class, converted to pixels
        phen_area = intersected.geometry.area.groupby(intersected['phen_en']).sum()
        phen_total_pixels = phen_area / pixel_area

        for phen, pixels in phen_total_pixels.items():
            results.append({
                'set': tile_set,
                'NomSER': tile_nomser,
                'phen_en': phen,
                'total_pixels': pixels
            })

    return pd.DataFrame(results)

def summarize_to_table(summary_df: pd.DataFrame) -> pd.DataFrame:
    """
    Total the pixel counts per region, set and phenology class.

    Returns one row per ('NomSER', 'set', 'phen_en') combination with the sum
    of 'total_pixels'.
    NOTE(review): with the default groupby settings, rows whose 'NomSER',
    'set' or 'phen_en' is missing are left out — confirm this is intended for
    tiles without a set.
    """
    group_keys = ['NomSER', 'set', 'phen_en']
    pixel_totals = summary_df.groupby(group_keys)['total_pixels'].sum()
    return pixel_totals.reset_index()

def main(gdf, species):
    """
    Build the per-region pixel table of the tiles, save it and return it.

    The table is written to 'summary_table.csv' in the current working
    directory, without the index.
    """
    summary_table = summarize_to_table(intersect_and_summarize(gdf, species))

    # Keep a copy on disk next to the notebook
    summary_table.to_csv("summary_table.csv", index=False)

    return summary_table

# Pixel table of the retained tiles (safe_result_gdf and species come from the cells above)
summary_table = main(safe_result_gdf, species)
print(summary_table)
