# Loading the 5 km and 2.5 km grid cells

In [None]:
import geopandas as gpd 
# Read the two tile grids from parquet (the second file name marks it as the 2.5 km grid).
tiles_5_path = '/Users/arthurcalvi/Data/species/validation/val_train_tiles.parquet'
tiles_2c5_path = '/Users/arthurcalvi/Data/species/validation/val_train_tiles_2_5_km.parquet'

tiles_5 = gpd.read_parquet(tiles_5_path)
tiles_2c5 = gpd.read_parquet(tiles_2c5_path)

In [None]:
import matplotlib.pyplot as plt 
# Side-by-side maps of the 'perc' column for both grids. Each panel is titled
# so the two resolutions can be told apart when the figure is read on its own.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 10))
tiles_5.plot(ax=ax1, column='perc', cmap='viridis', legend=True)
ax1.set_title("5 km tiles - 'perc'")
tiles_2c5.plot(ax=ax2, column='perc', cmap='viridis', legend=True)
ax2.set_title("2.5 km tiles - 'perc'");

# Sampling 2.5km gridcells

In [None]:
import geopandas as gpd
import pandas as pd
from tqdm import tqdm 
from shapely.geometry import box
import numpy as np

# Work on a copy so that adding 'effective_pixels' does not mutate `tiles_2c5`
# behind the back of the cells that loaded/plotted it.
gdf = tiles_2c5.copy()

# Selection thresholds (previously inlined as magic numbers).
PIXELS_PER_CELL = 250 * 250      # pixel count used for one 2.5 x 2.5 km cell
TARGET_TOTAL_PIXELS = 2_000_000  # the greedy pass stops once this total is reached
MIN_PERC = 10                    # the greedy pass only takes cells with perc above this
MIN_REGION_PIXELS = 50_000       # per eco-region ('NomSER') minimum of effective pixels
MIN_REGION_CELLS = 10            # per eco-region minimum number of cells

# 'perc' is treated as a percentage (0-100) of the cell's pixels.
gdf['effective_pixels'] = (gdf['perc'] / 100) * PIXELS_PER_CELL

# Richest cells first.
gdf_sorted = gdf.sort_values(by='effective_pixels', ascending=False)

# Selection is tracked by *position* in `gdf_sorted`. The previous version
# rebuilt a frame from index-less tuples (fresh RangeIndex) and then compared
# that RangeIndex with the original labels, so already-selected cells were not
# excluded from the top-up step and could be appended a second time.
is_selected = np.zeros(len(gdf_sorted), dtype=bool)

# Greedy pass: take cells in descending order until the global target is met.
cumulative_pixels = 0
perc_and_pixels = zip(gdf_sorted['perc'], gdf_sorted['effective_pixels'])
for pos, (perc, pixels) in enumerate(tqdm(perc_and_pixels, total=len(gdf_sorted))):
    if cumulative_pixels >= TARGET_TOTAL_PIXELS:
        break
    if perc > MIN_PERC:
        is_selected[pos] = True
        cumulative_pixels += pixels

# Top-up pass: every eco-region should end with at least MIN_REGION_CELLS cells
# and MIN_REGION_PIXELS effective pixels. Cells are added (best first, without
# the MIN_PERC filter, as before) until both minima hold or the region has no
# useful cell left. The previous loop stopped on the cell count alone (and only
# at 11 cells), so the pixel minimum was never actually enforced.
pixels_sorted = gdf_sorted['effective_pixels'].to_numpy()
for region in gdf['NomSER'].unique():
    in_region = (gdf_sorted['NomSER'] == region).to_numpy(dtype=bool, na_value=False)
    chosen_here = in_region & is_selected
    c = int(chosen_here.sum())
    region_pixels = pixels_sorted[chosen_here].sum()

    for pos in np.flatnonzero(in_region & ~is_selected):
        if c >= MIN_REGION_CELLS and region_pixels >= MIN_REGION_PIXELS:
            break
        # Candidates are sorted by pixels: once the cell count is satisfied and
        # the next cell brings no pixels, no later cell can help either.
        if c >= MIN_REGION_CELLS and not pixels_sorted[pos] > 0:
            break
        is_selected[pos] = True
        c += 1
        region_pixels += pixels_sorted[pos]

# Slicing the sorted GeoDataFrame keeps geometry and CRS; every cell appears at
# most once. The index is reset to a plain RangeIndex, as in the previous version.
selected_gdf = gdf_sorted[is_selected].reset_index(drop=True)

# Optional thinning step: drop cells lying too close to a cell that was already
# kept, so that the retained cells are spread out in space.
def spatial_filter(df, min_distance=100):  # Adjust distance threshold as necessary
    """Greedily keep rows whose geometry is far enough from all kept rows.

    Rows are visited in the order of ``df``. A row is kept only when the
    distance between its geometry and the geometry of every previously kept
    row is strictly greater than ``min_distance`` (same units as the values
    returned by ``geometry.distance`` - presumably CRS units; confirm).

    Returns ``df.loc[...]`` restricted to the kept index labels.
    """
    kept_labels = []
    for candidate in df.itertuples():
        far_from_all = True
        for label in kept_labels:
            gap = candidate.geometry.distance(df.loc[label].geometry)
            if not gap > min_distance:
                far_from_all = False
                break
        if far_from_all:
            kept_labels.append(candidate.Index)
    return df.loc[kept_labels]

# The spatial thinning step is currently disabled:
# selected_gdf = spatial_filter(selected_gdf)

# Pixel totals over the selection, overall and weighted by the two class columns.
# NOTE(review): if 'perc_deciduous' / 'perc_evergreen' are percentages (0-100)
# rather than fractions, the two class totals are 100x too large - confirm units.
cell_pixels = selected_gdf['effective_pixels']
total_pixels_deicudous = selected_gdf['perc_deciduous'].mul(cell_pixels).sum()
total_pixels_evergreen = selected_gdf['perc_evergreen'].mul(cell_pixels).sum()
total_pixels = cell_pixels.sum()

print(f"Total pixels: {total_pixels :.0f}")
print(f"Total deciduous pixels: {total_pixels_deicudous :.0f}")
print(f"Total evergreen pixels: {total_pixels_evergreen :.0f}")

selected_gdf


In [None]:
import matplotlib.pyplot as plt
# Department outlines, reprojected to the CRS of the selection, serve as a
# backdrop for the selected tiles coloured by 'perc'.
dep = gpd.read_file('/Users/arthurcalvi/Data/Disturbances_maps/BDForet/contour-des-departements.geojson')
dep_outline = dep.to_crs(selected_gdf.crs).boundary

fig, ax = plt.subplots(figsize=(10, 10))
selected_gdf.plot(ax=ax, column='perc', cmap='viridis', legend=True)
dep_outline.plot(ax=ax, color='black', linewidth=0.05)

In [None]:
# Effective pixels currently selected in each eco-region ('NomSER').
greco_groups = selected_gdf.groupby('NomSER')
region_pixel_totals = greco_groups['effective_pixels'].sum()

# Eco-regions present in `gdf` but absent from the selection get a total of 0,
# appended after the regions that do have selected tiles.
absent_regions = [
    region for region in gdf['NomSER'].unique()
    if region not in region_pixel_totals.index
]
region_pixel_totals = region_pixel_totals.reindex(
    list(region_pixel_totals.index) + absent_regions, fill_value=0
)
greco_low = region_pixel_totals[region_pixel_totals < 50_000].index.values

# Translate the low-coverage 'NomSER' values back to the keys of
# `mapping_real_greco`, with underscores turned into spaces.
from utils import mapping_real_greco
inv_map = {value: key for key, value in mapping_real_greco.items()}
greco_low_names = [inv_map[value].replace('_', ' ') for value in greco_low]

# Eco-region polygons, dissolved on the first character of 'codeser'.
# NOTE(review): `.iloc[1:]` discards the first dissolved group - confirm which
# group that is and why it must be excluded.
greco = gpd.read_file('/Users/arthurcalvi/Data/eco-regions/France/ser_l93_new/ser_l93_new.dbf')
greco['greco'] = greco['codeser'].map(lambda code: code[0])
greco = (
    greco.dissolve(by='greco', aggfunc='first')
    .reset_index()
    .iloc[1:]
    .to_crs('EPSG:2154')
)
greco_under_50k = greco[greco['NomSER'].isin(greco_low_names)]

print('Eco-regions with less than 50k pixels:')
greco_under_50k


# Adding BDForet

In [None]:
import pandas as pd
# BDForet reference-year table (semicolon-separated CSV) and department outlines.
bdforet_year_csv = '/Users/arthurcalvi/Data/Disturbances_maps/BDForet/Année_reference_BDForet.csv'
dep_geojson = '/Users/arthurcalvi/Data/Disturbances_maps/BDForet/contour-des-departements.geojson'

bdforet_year = pd.read_csv(bdforet_year_csv, sep=';')
dep = gpd.read_file(dep_geojson)

In [None]:
# Departments whose BDForet reference year is `year` or later.
year = 2010
num_dep = bdforet_year[bdforet_year['Année de référence (PVA)'] >= year]['N° Dep'].to_list()

print('Visualisation of the regions with less than 50k pixels and the departments with BDForet after {}.'.format(year))

# One colour shared by the map fill and the legend swatch. The legend used to be
# hard-coded to 'blue' while the polygons were drawn with the default colour,
# so the swatch did not match the map.
bdforet_color = 'C0'

fig, ax = plt.subplots(figsize=(7, 7))
# NOTE(review): assumes dep['code'] and the 'N° Dep' column use the same type and
# formatting (e.g. '01' vs 1); otherwise isin() silently matches nothing - confirm.
dep[ dep['code'].isin(num_dep) ].plot(ax=ax, color=bdforet_color)
# Faint fill over every department so the ones without recent BDForet stay visible.
dep.plot(alpha=0.1, ax=ax, color=bdforet_color)
greco_under_50k.to_crs('EPSG:4326').plot(ax=ax, edgecolor='black', facecolor='none')
ax.set_axis_off()
# Legend: filled patch for departments with BDForet from `year` onwards,
# black line for the eco-regions under 50k pixels.
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
legend_elements = [Patch(facecolor=bdforet_color, edgecolor='white', label=f'Département avec BDForet après {year}'),
                   Line2D([0], [0], color='black', lw=2, label='Eco-région avec moins de 50k pixels')]
ax.legend(handles=legend_elements, loc='upper left', fontsize=8)
plt.show()


Conclusion: we could use the BDForet data to better balance the dataset.

In [None]:
# Display the current tile selection.
selected_gdf

In [None]:
# Count the number of selected rows for each 'NomSER' value.
selected_gdf['NomSER'].value_counts()


To improve the spatial distribution of your sampled tiles, you can add new tiles by selecting those that are closest to underrepresented areas in France. Here’s a strategy to do this:

1. **Compute the coverage density:** calculate the spatial density (coverage) of the already selected tiles across the entire grid. This helps identify areas with a low tile density.
2. **Identify underrepresented areas:** find the centroids of the grid cells that have less coverage, i.e. that are farthest from the already selected tiles.
3. **Select additional tiles:** add the tiles closest to these underrepresented areas until the desired number of tiles has been added.

In [None]:
import geopandas as gpd
from shapely.geometry import Point
import pandas as pd

def add_tiles_to_improve_coverage(gdf: gpd.GeoDataFrame, selected_gdf: gpd.GeoDataFrame, x: int, min_distance: float = 25000) -> gpd.GeoDataFrame:
    """
    Add up to X tiles to the selection to improve spatial coverage.

    Tiles are picked one at a time: at each step the candidate whose centroid is
    farthest from every tile chosen so far (the initial selection plus the tiles
    added in previous steps) is taken. Candidates whose centroid is within
    ``min_distance`` of a chosen tile are never eligible.

    Neither ``gdf`` nor ``selected_gdf`` is modified; the function works on copies.

    Args:
    gdf (gpd.GeoDataFrame): GeoDataFrame containing all available tiles.
    selected_gdf (gpd.GeoDataFrame): GeoDataFrame containing already selected tiles.
    x (int): Maximum number of additional tiles to select. Fewer tiles are added
        when no eligible candidate is left.
    min_distance (float): Minimum distance, in the units of the geometries' CRS,
        between a candidate centroid and every chosen tile.

    Returns:
    gpd.GeoDataFrame: ``selected_gdf`` followed by the added tiles, with a fresh
        index. The result carries a 'centroid' column and, for the added tiles,
        'min_dist_to_selected' (distance to the nearest chosen tile at the moment
        the tile was picked; missing for the initially selected rows).
    """
    # Private copies; the reset index gives every candidate a unique label.
    candidates = gdf.reset_index(drop=True)
    candidates['centroid'] = candidates.geometry.centroid
    selected = selected_gdf.copy()
    selected['centroid'] = selected.geometry.centroid

    # Distance from each candidate centroid to the nearest already selected tile.
    # With an empty selection every distance is NaN, so no candidate is eligible.
    selected_geoms = selected.geometry
    min_dist = pd.Series(
        [selected_geoms.distance(centroid).min() for centroid in candidates['centroid']],
        index=candidates.index,
        dtype=float,
    )
    min_dist = min_dist[min_dist > min_distance]

    chosen_labels = []
    chosen_dists = []
    for _ in tqdm(range(x)):
        if min_dist.empty:
            # Fewer than x eligible tiles: stop instead of failing on idxmax().
            break

        # Pick the candidate that is farthest from everything chosen so far.
        label = min_dist.idxmax()
        chosen_labels.append(label)
        chosen_dists.append(min_dist.loc[label])
        min_dist = min_dist.drop(index=label)

        # Running minimum: the new tile can only bring candidates closer. The
        # previous version overwrote the stored value with the distance to the
        # added tiles only, forgetting the initially selected tiles.
        dist_to_new = candidates.loc[min_dist.index, 'centroid'].distance(candidates.geometry.loc[label])
        min_dist = min_dist.clip(upper=dist_to_new)
        min_dist = min_dist[min_dist > min_distance]

    # Build the added tiles in one go rather than concatenating row by row.
    additional_tiles = candidates.loc[chosen_labels].assign(min_dist_to_selected=chosen_dists)

    # Combine the already selected tiles with the new tiles
    final_selected_gdf = pd.concat([selected, additional_tiles], ignore_index=True)

    return final_selected_gdf

# Usage: request 50 additional tiles on top of the current selection
# (min_distance keeps its default value).
final_selected_gdf = add_tiles_to_improve_coverage(gdf, selected_gdf, x=50)


In [None]:
# Final selection coloured by 'perc', drawn over the department outlines
# (reprojected to the CRS of the selection).
dep_boundaries = dep.to_crs(selected_gdf.crs).boundary

fig, ax = plt.subplots(figsize=(7, 7))
final_selected_gdf.plot(
    ax=ax,
    column='perc',
    cmap='viridis',
    legend=False,
    edgecolor='black',
    linewidth=0.5,
)
dep_boundaries.plot(ax=ax, color='black', linewidth=0.05)
ax.set_axis_off()

In [None]:
# Pixel count for a fraction of 0.1 (10%) of a 250 x 250 pixel cell.
0.1 * 250 * 250

In [None]:
# Number of tiles in the final selection whose 'perc' is below 10.
(final_selected_gdf.perc < 10).sum()

In [None]:
# The helper columns are kept in the saved file; the drop below stays disabled.
# final_selected_gdf.drop(columns=['centroid', 'min_dist_to_selected'], inplace=True)

# Store 'perc' under the name 'perc_sarah', then write the final tile set.
final_tiles_path = '/Users/arthurcalvi/Data/species/validation/tiles_2_5_km_final.parquet'
final_selected_gdf = final_selected_gdf.rename(columns={'perc': 'perc_sarah'})
final_selected_gdf.to_parquet(final_tiles_path)