In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, roc_curve
from utils import compute_metrics

# Read the training and validation tables from CSV files in the working directory.
train_data, val_data = (
    pd.read_csv(csv_name) for csv_name in ('train_data.csv', 'val_data.csv')
)

# Predictor columns: amplitude / phase / offset for the red, crswir and rcc
# series, plus 'elevation' and 'aspect'.
features = [
    'amplitude_red', 'phase_red', 'offset_red',
    'amplitude_crswir', 'phase_crswir', 'offset_crswir',
    'amplitude_rcc', 'phase_rcc', 'offset_rcc',
    'elevation', 'aspect',
]

# Split each table into its predictor frame (X_*) and its 'phen' label column (y_*).
X_train, y_train = train_data[features], train_data['phen']
X_val, y_val = val_data[features], val_data['phen']



# Score every candidate feature on its own: the raw feature values are used
# directly as the ranking score passed to roc_auc_score against the training labels.
list_of_roc_auc = [roc_auc_score(y_train, train_data[name]) for name in features]

# Keep the first feature whose score is strictly greater than everything seen
# so far; if no score exceeds 0, best_feature stays None.
best_feature, best_roc_auc = None, 0
for name, score in zip(features, list_of_roc_auc):
    if score > best_roc_auc:
        best_feature, best_roc_auc = name, score

print(f"Best feature for distinguishing evergreen and deciduous: {best_feature} with ROC-AUC score: {best_roc_auc}")

# Derive the decision threshold for the selected feature from the training ROC
# curve (class 2 is the positive label): keep the threshold at which
# TPR - FPR is largest.
fpr, tpr, thresholds = roc_curve(y_train, train_data[best_feature], pos_label=2)
tpr_minus_fpr = tpr - fpr
optimal_idx = tpr_minus_fpr.argmax()
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal threshold for {best_feature}: {optimal_threshold}")

# Classify the validation rows: values at or above the threshold become
# class 2, everything else class 1.
at_or_above_threshold = val_data[best_feature] >= optimal_threshold
val_data['predicted_phen'] = np.where(at_or_above_threshold, 2, 1)

# Score the predictions against the validation labels.
metrics = compute_metrics(y_val, val_data['predicted_phen'])
print("Validation metrics:", metrics, sep="\n")

import os


def _metrics_by_group(frame, group_column, label):
    """Compute validation metrics separately for every value of ``group_column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table holding the true labels in 'phen', the predictions in
        'predicted_phen', and the grouping column.
    group_column : str
        Column whose distinct values define the groups.
    label : str
        Key under which the group value is stored in each output record.

    Returns
    -------
    list of dict
        One record per group, in order of first appearance: the group value
        under ``label`` followed by the entries returned by ``compute_metrics``.
    """
    records = []
    # A single groupby pass replaces re-filtering the whole frame once per
    # value. sort=False keeps the groups in order of first appearance; rows
    # whose group value is missing are left out (groupby default).
    for group_value, group_rows in frame.groupby(group_column, sort=False):
        # Kept in a local name so the overall `metrics` computed earlier in
        # the notebook is not overwritten.
        group_metrics = compute_metrics(group_rows['phen'], group_rows['predicted_phen'])
        records.append({label: group_value, **group_metrics})
    return records


os.makedirs('results', exist_ok=True)

# Breakdown per GRECO region, printed and saved to CSV.
metrics_per_greco = _metrics_by_group(val_data, 'greco_region', 'GRECO Region')
metrics_df = pd.DataFrame(metrics_per_greco)
print(metrics_df)
metrics_df.to_csv('results/exp1_validation_metrics_per_greco_region.csv', index=False)

print("Validation metrics breakdown per GRECO region saved.")

# Breakdown per tile, printed and saved to CSV.
metrics_per_tile = _metrics_by_group(val_data, 'tile_id', 'Tile')
metrics_df = pd.DataFrame(metrics_per_tile)
print(metrics_df)
metrics_df.to_csv('results/exp1_validation_metrics_per_tile.csv', index=False)

print("Validation metrics breakdown per tile saved.")


# Plotting: ROC curve of the selected feature (left) and the AUC-ROC score of
# every candidate feature (right).
import os
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))

# Left panel: ROC curve with the selected operating point highlighted.
roc_color = 'tab:blue'
ax1.set_xlabel('False Positive Rate')
ax1.set_ylabel('True Positive Rate', color=roc_color)
ax1.plot(fpr, tpr, color=roc_color, label=f'ROC curve (area = {best_roc_auc:.2f})')
ax1.scatter(fpr[optimal_idx], tpr[optimal_idx], marker='o', color='red', label=f'Optimal Threshold = {optimal_threshold:.2f}')
ax1.tick_params(axis='y', labelcolor=roc_color)
ax1.legend(loc='lower right')
ax1.grid(True, linestyle='--', linewidth=0.5)
ax1.set_title(f'ROC Curve and AUC-ROC Scores for {best_feature}')

# Right panel: bar chart of the per-feature AUC-ROC scores.
# Bars are drawn at explicit integer positions and the ticks are pinned to
# those positions before labelling, so each label is tied to its bar
# (calling set_xticklabels without fixed tick locations is fragile).
bar_color = 'tab:orange'
bar_positions = np.arange(len(features))
ax2.set_ylabel('AUC-ROC Score', color=bar_color)
ax2.bar(bar_positions, list_of_roc_auc, color=bar_color, alpha=0.6)
ax2.tick_params(axis='y', labelcolor=bar_color)
ax2.set_ylim(0, 1)
ax2.set_yticks(np.arange(0, 1.1, 0.1))
ax2.set_xticks(bar_positions)
ax2.set_xticklabels(features, rotation=45, ha='right')
ax2.set_title('AUC-ROC Score per Feature')

fig.tight_layout()

# Save before showing: some backends release the figure once it has been shown.
os.makedirs('images', exist_ok=True)
fig.savefig('images/exp1_roc_auc_scores_and_features.png', dpi=300)

plt.show()


# Writing validation tiles

In [None]:
# numpy is used further down in this cell (np.array / np.where); importing it
# here lets the cell run on a fresh kernel without relying on an earlier cell.
import numpy as np
import pandas as pd
from utils import mapping_real_greco

# Reload the validation table and translate its 'greco_region' values through
# the project-provided mapping.
val_data = pd.read_csv('val_data.csv')
val_data['greco_region'] = val_data['greco_region'].map(mapping_real_greco)

# Feature and threshold used to classify every pixel of the validation tiles.
# NOTE(review): these look like values copied from the output of the
# threshold-selection step — confirm they still match a fresh run.
best_feature = 'phase_crswir'
optimal_threshold = 0.8757011273591082
# NOTE(review): machine-specific absolute path; adjust for your environment.
directory = '/Users/arthurcalvi/Data/species/validation/tiles'

# Load the data and preprocess it
from tqdm import tqdm 
from datetime import datetime 
from utils import load_folder, calculate_slope_with_dates
import rasterio 
import os
def _read_raster(raster_path: str, band=None):
    """Read a raster into memory and close the dataset handle afterwards.

    Reads every band when ``band`` is None, otherwise only the given band index.
    """
    with rasterio.open(raster_path) as src:
        return src.read() if band is None else src.read(band)


def load_data_from_tile_inf(path: str) -> tuple:
    """Load the per-pixel features and the forest mask of one tile folder.

    Only the rasters needed to build the returned values are read:
    ``tree_map/CHM2020.tif`` and the four files under ``features/``.

    Parameters
    ----------
    path : str
        Tile folder. The tile id is taken as the second '_'-separated token
        of the folder's base name.

    Returns
    -------
    tuple
        ``(features_, forest_mask, shape)`` where

        * ``features_`` is a dict mapping each feature name to a flattened
          array, plus a 'tile_id' entry repeating the tile id (as a string)
          once per pixel;
        * ``forest_mask`` is a boolean array, True where the first CHM band
          is greater than 250;
        * ``shape`` is the 2-D shape of the crswir amplitude band, to be used
          for reshaping flattened results back into a map.
    """
    tile_id = os.path.basename(path).split('_')[1]

    chm = _read_raster(os.path.join(path, 'tree_map', 'CHM2020.tif'), band=1)
    # NOTE(review): 250 is presumably a canopy-height cut-off in the CHM's
    # own units — confirm against the CHM product specification.
    forest_mask = chm > 250

    path_features = os.path.join(path, 'features')
    # Each *_APO raster is read as bands 0, 1, 2 = amplitude, phase, offset.
    amplitude_map_r, phase_map_r, offset_map_r = _read_raster(
        os.path.join(path_features, 'r_APO.tif'))[:3]
    amplitude_map_crswir, phase_map_crswir, offset_map_crswir = _read_raster(
        os.path.join(path_features, 'crswir_APO.tif'))[:3]
    amplitude_map_rcc, phase_map_rcc, offset_map_rcc = _read_raster(
        os.path.join(path_features, 'rcc_APO.tif'))[:3]
    # The terrain raster is read as band 0 = elevation, band 1 = aspect.
    elevation, aspect = _read_raster(
        os.path.join(path_features, 'elevation_aspect.tif'))[:2]

    features_ = {
        'amplitude_red': amplitude_map_r.ravel(),
        'phase_red': phase_map_r.ravel(),
        'offset_red': offset_map_r.ravel(),
        'amplitude_crswir': amplitude_map_crswir.ravel(),
        'phase_crswir': phase_map_crswir.ravel(),
        'offset_crswir': offset_map_crswir.ravel(),
        'amplitude_rcc': amplitude_map_rcc.ravel(),
        'phase_rcc': phase_map_rcc.ravel(),
        'offset_rcc': offset_map_rcc.ravel(),
        'elevation': elevation.ravel(),
        'aspect': aspect.ravel(),
        'tile_id': np.array([tile_id] * aspect.size)  # Add tile_id to the features
    }

    return features_, forest_mask, amplitude_map_crswir.shape

# Classify every validation tile with the single-feature threshold and write
# the result next to the tile as a one-band GeoTIFF
# (0 = outside the forest mask, 1 = below threshold, 2 = at or above threshold).

# Computed once instead of on every iteration.
valid_tile_ids = set(val_data['tile_id'].unique().tolist())

for folder in tqdm(os.listdir(directory)):
    path = os.path.join(directory, folder)

    if '.DS_Store' in folder or '.txt' in folder:
        continue

    # A folder whose name does not follow '<prefix>_<int>...' is skipped
    # rather than aborting the whole run.
    try:
        tile_id = int(folder.split('_')[1])
    except (IndexError, ValueError):
        print(f"Skipping {folder}: cannot parse a tile id from the folder name")
        continue

    if tile_id not in valid_tile_ids:
        continue

    try:
        print(f"Processing {folder}")
        # Named tile_features so the notebook-level `features` list is not shadowed.
        tile_features, forest_mask, shape = load_data_from_tile_inf(path)

        # Cast explicitly to uint8 so the array matches the dtype declared in
        # the output profile below.
        results = np.where(tile_features[best_feature] >= optimal_threshold, 2, 1).astype(np.uint8)
        results[~forest_mask.ravel()] = 0

        # Reuse the georeferencing profile of the tile's CHM raster; the
        # context manager closes the handle once the profile has been read.
        with rasterio.open(os.path.join(path, 'tree_map', 'CHM2020.tif')) as ref:
            profile = ref.profile

        # Save the results to a GeoTIFF file
        path_results = os.path.join(directory, folder, 'results')
        os.makedirs(path_results, exist_ok=True)
        path_file = os.path.join(path_results, f'exp1_feature-{best_feature}_threshold-{optimal_threshold}.tif')
        profile.update(dtype=rasterio.uint8, count=1, compress='lzw', nodata=0)
        with rasterio.open(path_file, 'w', **profile) as dst:
            dst.write(results.reshape(shape), 1)

    except Exception as e:
        # Best effort: report the failing tile and carry on with the others.
        print(f"Error processing {folder}: {e}")
        continue
