In [None]:
import os

import geopandas as gpd

# Location of the simplified BDFORET layer. The original machine-specific path
# stays the default; set the BDFORET_PATH environment variable to run the
# notebook elsewhere without editing this cell.
BDFORET_PATH = os.environ.get(
    'BDFORET_PATH',
    '/Users/arthurcalvi/Repo/Disturbance-Attribution-Dataset-Joining/data/processed_datasets/BDFORET_EPSG2154_FR_simplified10.parquet',
)

bdforet = gpd.read_parquet(BDFORET_PATH)

In [None]:
from thefuzz import process

# Lookup from BDFORET species label to phenology class.
# Each label appears exactly once (the original literal repeated ten keys with
# identical values). Key order is kept as-is because the fuzzy matcher that
# consumes these keys may break score ties by choice order.
tree_phenology = {
    'Pin maritime': 'Evergreen',
    'NC': 'Unknown',
    'Mixte': 'Mixed',
    'Feuillus': 'Deciduous',
    'Pins mélangés': 'Mixed',
    'NR': 'Unknown',
    'Conifères': 'Evergreen',
    'Sapin, épicéa': 'Evergreen',
    'Peuplier': 'Deciduous',
    'Douglas': 'Evergreen',
    'Pin sylvestre': 'Evergreen',
    'Châtaignier': 'Deciduous',
    'Chênes décidus': 'Deciduous',
    'Pin laricio, pin noir': 'Evergreen',
    'Mélèze': 'Deciduous',
    'Pin autre': 'Evergreen',
    'Hêtre': 'Deciduous',
    'Robinier': 'Deciduous',
    'Chênes sempervirents': 'Evergreen',
    'Pin d\'Alep': 'Evergreen',
    'Pin à crochets, pin cembro': 'Evergreen',
}

# Function to map species to phenology using fuzzy matching
def get_phenology(specie: str, min_score: int = 50) -> str:
    """Map a species label to a lower-case phenology class via fuzzy matching.

    The label is compared against the keys of ``tree_phenology`` with
    ``fuzz.ratio``; the best-scoring key is accepted only if its score is
    strictly greater than ``min_score``.

    Args:
        specie: Species label to classify (e.g. a BDFORET ``ESSENCE`` value).
        min_score: Exclusive lower bound on the match score. Defaults to 50,
            the threshold previously hard-coded here.

    Returns:
        The matched phenology in lower case, or ``'unknown'`` when the input is
        not a non-empty string or no key scores above ``min_score``.
    """
    # Missing values (None / NaN) and blank labels cannot be matched.
    if not isinstance(specie, str) or not specie.strip():
        print(f'No match found for {specie}')
        return 'unknown'

    match = process.extractOne(specie, tree_phenology.keys(), scorer=process.fuzz.ratio)
    if match and match[1] > min_score:
        return tree_phenology[match[0]].lower()

    print(f'No match found for {specie}')
    # Lower case like every other return value, so that comparisons against
    # 'unknown' downstream also catch unmatched species.
    return 'unknown'

# Create the new phenology column.
# Memoise the matcher so each distinct ESSENCE label is fuzzy-matched once
# instead of once per row; this also prints "No match found" once per distinct
# label rather than once per row.
from functools import lru_cache

_cached_get_phenology = lru_cache(maxsize=None)(get_phenology)
bdforet['phenology'] = bdforet['ESSENCE'].apply(_cached_get_phenology)

In [None]:
# Drop rows whose phenology is unknown or mixed.
# The comparison is case-insensitive because the fuzzy-matching fallback can
# return 'Unknown' (capitalised), which an exact `!= 'unknown'` test lets through.
bdforet = bdforet[~bdforet['phenology'].str.lower().isin(['unknown', 'mixed'])]

In [None]:
import os
import warnings

import geopandas as gpd
import numpy as np
import pandas as pd
import rasterio
import rasterio.features  # imported explicitly instead of relying on another import to load the submodule
from rasterio.mask import mask
from shapely.geometry import box
from sklearn.metrics import confusion_matrix, accuracy_score, cohen_kappa_score, precision_score, recall_score, f1_score
from tqdm import tqdm

warnings.filterwarnings('ignore')

# Directory holding one sub-directory per validation tile. The original
# machine-specific path stays the default; override with VALIDATION_TILES_DIR.
dir_ = os.environ.get('VALIDATION_TILES_DIR', '/Users/arthurcalvi/Data/species/validation/tiles')
model_name = 'XGBoost'
config = "no_resample_cloud_disturbance_weights_3Y"
extra = config + '_Group'

# Class codes burned into the BDFORET raster; they follow the classification
# raster's convention (0 no forest, 1 deciduous, 2 evergreen).
PHENOLOGY_CODES = {'deciduous': 1, 'evergreen': 2}

# Cache of BDFORET reprojected per raster CRS, keyed by CRS string.
# It must exist before the loop: previously it was only defined in a
# commented-out line, so every tile failed with a swallowed NameError.
bdforets = {}


def _bdforet_in_crs(crs_key):
    """Return BDFORET in the CRS named by `crs_key`, reprojecting at most once per CRS."""
    if crs_key not in bdforets:
        print(f"Reprojecting BDFORET to {crs_key}")
        bdforets[crs_key] = bdforet.to_crs(crs_key)
    return bdforets[crs_key]


def _tile_agreement(tile_dir, classification_map_path):
    """Compare one tile's classification raster with rasterized BDFORET.

    Args:
        tile_dir: Tile directory; must contain a ``reference_species`` folder
            with a file whose name starts with ``tile``.
        classification_map_path: Path of the classification GeoTIFF
            (band 1: 0 no forest, 1 deciduous, 2 evergreen).

    Returns:
        ``(agreement_percentage, support)`` where ``support`` is the number of
        compared pixels and ``agreement_percentage`` is NaN when it is zero.

    Raises:
        FileNotFoundError: No reference file starting with ``tile`` exists.
        ValueError: Reference and classification rasters differ in shape, or no
            deciduous/evergreen BDFORET polygon intersects the tile.
    """
    reference_dir = os.path.join(tile_dir, 'reference_species')
    # sorted() makes the choice deterministic when several files match.
    reference_name = next(
        (name for name in sorted(os.listdir(reference_dir)) if name.startswith('tile')),
        None,
    )
    if reference_name is None:
        raise FileNotFoundError(f"no reference file starting with 'tile' in {reference_dir}")

    # Context manager so the reference dataset is closed after reading.
    with rasterio.open(os.path.join(reference_dir, reference_name)) as ref_src:
        ref_band = ref_src.read(3)
    # Keep only pixels whose reference band-3 value is neither 1 nor 2.
    # NOTE(review): presumably those values mark pixels already covered by the
    # reference species data - confirm the band's meaning.
    keep_ref = (ref_band != 1) & (ref_band != 2)

    with rasterio.open(classification_map_path) as src:
        print(f"CRS: {src.crs}")
        raster = src.read(1)  # 0 no forest, 1 deciduous, 2 evergreen
        raster_bounds = src.bounds
        raster_transform = src.transform
        # 'EPSG:<code>' for EPSG-coded CRSs; unlike crs.data['init'] it also
        # works when the CRS has no EPSG code.
        crs_key = src.crs.to_string()

    if keep_ref.shape != raster.shape:
        raise ValueError(
            f"reference raster shape {keep_ref.shape} does not match "
            f"classification raster shape {raster.shape}"
        )

    # Restrict BDFORET (in the raster's CRS) to polygons touching the tile extent.
    bdforet_tile = _bdforet_in_crs(crs_key)
    bdforet_clipped = bdforet_tile[bdforet_tile.geometry.intersects(box(*raster_bounds))]

    # Only burn polygons with a known class code, so an unexpected label is
    # skipped instead of being silently counted as evergreen.
    shapes = [
        (geometry, PHENOLOGY_CODES[phenology])
        for geometry, phenology in zip(bdforet_clipped.geometry, bdforet_clipped['phenology'])
        if phenology in PHENOLOGY_CODES
    ]
    if not shapes:
        raise ValueError("no deciduous/evergreen BDFORET polygon intersects the tile")

    bdforet_rasterized = rasterio.features.rasterize(
        shapes,
        out_shape=raster.shape,
        transform=raster_transform,
        fill=0,
        all_touched=True,
        dtype=rasterio.uint8,
    )

    y_true = bdforet_rasterized.ravel()
    y_pred = raster.ravel()

    # Compare only pixels covered by BDFORET and kept by the reference mask.
    # Named `valid` so the imported rasterio.mask.mask is not shadowed.
    valid = (y_true > 0) & keep_ref.ravel()
    y_true = y_true[valid]
    y_pred = y_pred[valid]

    support = len(y_true)
    if support == 0:
        return float('nan'), support
    agreement_percentage = np.count_nonzero(y_true == y_pred) / support * 100
    return agreement_percentage, support


# One record per successfully processed tile; the DataFrame is built once at
# the end instead of being grown with pd.concat inside the loop.
metrics_records = []

# sorted() gives a reproducible processing (and CSV row) order.
for filename in sorted(os.listdir(dir_)):
    path = os.path.join(dir_, filename)
    if not os.path.isdir(path):
        continue

    print(path)
    classification_map_path = os.path.join(path, 'results', f'{model_name}_{extra}.tif')
    try:
        tile_id = filename.split('_')[1]
        agreement_percentage, support = _tile_agreement(path, classification_map_path)
    except Exception as e:
        # Best effort: report the failing tile and carry on with the others.
        print(f"Error for {classification_map_path}: {e}")
        continue

    # Print metrics
    print(f"Tile ID: {tile_id}")
    print(f"Agreement: {agreement_percentage:.2f}%")
    print(f"Support: {support}")

    metrics_records.append({
        'tile_id': tile_id,
        'agreement_percentage': agreement_percentage,
        'support': support,
    })

# Only the columns that are actually computed; the earlier accuracy/kappa/
# precision/recall/f1 columns were never filled and came out empty in the CSV.
metrics_df = pd.DataFrame(metrics_records, columns=['tile_id', 'agreement_percentage', 'support'])

# Save metrics DataFrame to a CSV file
os.makedirs('results', exist_ok=True)
metrics_df.to_csv('results/metrics_bdforet.csv', index=False)

print("Metrics saved to results/metrics_bdforet.csv")
