In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from utils import compute_metrics
import matplotlib.pyplot as plt
import joblib 
import os

# Load the training and validation datasets
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')

# Define features and target
features = [
    'amplitude_red', 'phase_red', 'offset_red',
    'amplitude_crswir', 'phase_crswir', 'offset_crswir',
    'amplitude_rcc', 'phase_rcc', 'offset_rcc',
    'elevation', 'aspect',
]
y_train = train_data['phen']
y_val = val_data['phen']

# Replace infinity values with NaNs.
# replace() without inplace=True returns a new frame, so the column subsets are
# never mutated in place (in-place edits on a frame derived from another frame
# are what trigger pandas' SettingWithCopyWarning) and re-running the cell
# always starts from the untouched CSV contents.
X_train = train_data[features].replace([np.inf, -np.inf], np.nan)
X_val = val_data[features].replace([np.inf, -np.inf], np.nan)

# Replace NaNs with the mean of each column.
# The fill values are computed on the TRAINING split only and reused for the
# validation split, so the model is evaluated on data preprocessed exactly as
# it was fitted and no validation statistics enter the preprocessing.
train_feature_means = X_train.mean()
X_train = X_train.fillna(train_feature_means)
X_val = X_val.fillna(train_feature_means)

# Train Random Forest classifier (fixed seed for reproducibility, all cores)
rf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, max_depth=None)
rf.fit(X_train, y_train)

# Save model
os.makedirs('models', exist_ok=True)
joblib.dump(rf, 'models/exp2_rf_model_features-all.pkl')
print("Model saved.")

# Predict on validation set
val_data['predicted_phen'] = rf.predict(X_val)

# Compute overall metrics (kept in `metrics`; the per-group breakdowns below
# no longer overwrite it)
metrics = compute_metrics(val_data['phen'], val_data['predicted_phen'])
print(metrics)


def _metrics_by_group(frame, group_col, label):
    """Compute validation metrics separately for every value of ``group_col``.

    Parameters
    ----------
    frame : pd.DataFrame
        Must contain the columns 'phen', 'predicted_phen' and ``group_col``.
    group_col : str
        Column whose distinct values define the groups.
    label : str
        Key under which the group value is stored in each output record.

    Returns
    -------
    list of dict
        One record per group: ``{label: group_value, **compute_metrics(...)}``.
        Groups come out in order of first appearance (``sort=False``). Rows
        whose group key is missing are left out by ``groupby`` instead of
        producing an empty subset.
    """
    return [
        {label: key, **compute_metrics(group['phen'], group['predicted_phen'])}
        for key, group in frame.groupby(group_col, sort=False)
    ]


os.makedirs('results', exist_ok=True)

# Compute breakdown per GRECO region
metrics_per_greco = _metrics_by_group(val_data, 'greco_region', 'GRECO Region')
metrics_greco_df = pd.DataFrame(metrics_per_greco)
print(metrics_greco_df)

# Save metrics breakdown to CSV
metrics_greco_df.to_csv('results/exp2_validation_metrics_per_greco_region.csv', index=False)

print("Validation metrics breakdown per GRECO region saved.")

# Compute breakdown per tile
metrics_per_tile = _metrics_by_group(val_data, 'tile_id', 'Tile')
metrics_tile_df = pd.DataFrame(metrics_per_tile)
print(metrics_tile_df)

# Save metrics breakdown to CSV
metrics_tile_df.to_csv('results/exp2_validation_metrics_per_tile.csv', index=False)

print("Validation metrics breakdown per tile saved.")

# `metrics_df` used to be rebound to whichever table was built last; keep that
# name pointing at the per-tile table so later cells relying on it still work.
metrics_df = metrics_tile_df

# Rank the features by the importance stored on the fitted forest
feature_importances = rf.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': features, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart; the y axis is inverted so the first (largest) row of
# the sorted table ends up at the top
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(importance_df['Feature'], importance_df['Importance'], color='skyblue')
ax.set_xlabel('Feature Importance')
ax.set_title('Feature Importance from Random Forest Classifier')
ax.invert_yaxis()
ax.grid(True, linestyle='--', linewidth=0.5)

# Write the figure to disk, then display it
os.makedirs('images', exist_ok=True)
fig.savefig('images/exp2_feature_importance.png', dpi=300)
plt.show()


In [None]:
# NOTE(review): this expression only references the `decision_path` attribute
# of `rf`; it is not called, so the cell just displays the attribute's repr and
# computes nothing. Looks like leftover exploration — TODO confirm whether this
# cell is still needed.
rf.decision_path

In [None]:
# Imports for the tile-inference cell.
# This cell reloads the validation table and the saved model, so it is meant to
# be runnable without re-training; every module it uses is therefore imported
# here (previously `joblib` and `np` were only available if the training cell
# had been executed in the same kernel session).
import os
from datetime import datetime

import joblib
import numpy as np
import pandas as pd
import rasterio
from tqdm import tqdm

from utils import calculate_slope_with_dates, load_folder, mapping_real_greco

# Reload the validation table and pass its region column through
# `mapping_real_greco` (presumably region codes -> real GRECO names; values
# missing from the mapping may become NaN — TODO confirm against utils).
val_data = pd.read_csv('val_data.csv')
val_data['greco_region'] = val_data['greco_region'].map(mapping_real_greco)

# Load the model written by the training cell
rf = joblib.load('models/exp2_rf_model_features-all.pkl')

# Feature columns, in the order the model was trained with
features = ['amplitude_red', 'phase_red', 'offset_red', 'amplitude_crswir', 'phase_crswir', 'offset_crswir', 'amplitude_rcc', 'phase_rcc', 'offset_rcc', 'elevation', 'aspect']

# NOTE(review): machine-specific absolute path; adjust before running elsewhere.
directory = '/Users/arthurcalvi/Data/species/validation/tiles'
def load_data_from_tile_inf(path: str) -> tuple:
    """Read the per-pixel model inputs for one tile directory.

    Parameters
    ----------
    path : str
        Tile directory. Its basename must contain at least one underscore; the
        second ``_``-separated token is used as the tile id. The directory must
        contain ``tree_map/CHM2020.tif`` and a ``features`` folder holding
        ``r_APO.tif``, ``crswir_APO.tif``, ``rcc_APO.tif`` (bands 0, 1, 2 are
        read as amplitude, phase, offset) and ``elevation_aspect.tif`` (bands
        0, 1 are read as elevation, aspect).

    Returns
    -------
    features_ : dict
        Flattened 1-D arrays keyed by feature name, plus a ``'tile_id'`` array
        repeating the tile id (as a string) once per pixel.
    forest_mask : numpy.ndarray of bool
        True where band 1 of ``CHM2020.tif`` is greater than 250.
    shape : tuple
        2-D shape of the crswir amplitude band, used by the caller to reshape
        flat predictions back into a raster.

    Raises
    ------
    IndexError
        If the directory name has no ``_``-separated second token.
    rasterio.errors.RasterioIOError
        If one of the expected rasters cannot be opened.

    Notes
    -----
    Every raster is opened in a ``with`` block so its file handle is released
    immediately. The RGB stack and slope-based weights that used to be computed
    here are no longer loaded: nothing returned by this function used them.
    """
    tile_id = os.path.basename(path).split('_')[1]

    def _read_raster(*relative_parts, band=None):
        """Read one band (all bands when ``band`` is None), then close the file."""
        with rasterio.open(os.path.join(path, *relative_parts)) as src:
            return src.read() if band is None else src.read(band)

    # The comparison already yields a boolean array; no extra cast is needed.
    chm = _read_raster('tree_map', 'CHM2020.tif', band=1)
    forest_mask = chm > 250

    r_APO = _read_raster('features', 'r_APO.tif')
    crswir_APO = _read_raster('features', 'crswir_APO.tif')
    rcc_APO = _read_raster('features', 'rcc_APO.tif')
    dem = _read_raster('features', 'elevation_aspect.tif')
    elevation, aspect = dem[0], dem[1]

    features_ = {
        'amplitude_red': r_APO[0].ravel(),
        'phase_red': r_APO[1].ravel(),
        'offset_red': r_APO[2].ravel(),
        'amplitude_crswir': crswir_APO[0].ravel(),
        'phase_crswir': crswir_APO[1].ravel(),
        'offset_crswir': crswir_APO[2].ravel(),
        'amplitude_rcc': rcc_APO[0].ravel(),
        'phase_rcc': rcc_APO[1].ravel(),
        'offset_rcc': rcc_APO[2].ravel(),
        'elevation': elevation.ravel(),
        'aspect': aspect.ravel(),
        # One copy of the tile id per pixel so rows stay traceable to their tile
        'tile_id': np.full(aspect.size, tile_id),
    }

    return features_, forest_mask, crswir_APO[0].shape

# Tile ids present in the validation table. Built once: the membership test in
# the loop no longer rescans the whole column for every folder.
validation_tile_ids = set(val_data['tile_id'].unique().tolist())

for folder in tqdm(os.listdir(directory)):
    path = os.path.join(directory, folder)

    # Skip Finder metadata and stray text files
    if '.DS_Store' in folder or '.txt' in folder:
        continue

    # The tile id is the second '_'-separated token of the folder name. A folder
    # that does not follow that pattern is skipped instead of aborting the run.
    try:
        tile_id = int(folder.split('_')[1])
    except (IndexError, ValueError):
        print(f"Skipping {folder}: cannot parse a tile id from the folder name")
        continue

    if tile_id not in validation_tile_ids:
        continue

    try:
        print(f"Processing {folder}")
        features_, forest_mask, shape = load_data_from_tile_inf(path)

        # Preprocess: keep the model's feature columns, turn +/-inf into NaN and
        # fill NaNs with the per-tile column mean. Non-inplace calls avoid
        # mutating a frame derived from another one.
        # NOTE(review): a column that is entirely NaN stays NaN after this step.
        features_ = pd.DataFrame(features_)[features].replace([np.inf, -np.inf], np.nan)
        features_ = features_.fillna(features_.mean())

        results = rf.predict(features_)
        # Pixels outside the forest mask are written as 0, which is also the
        # nodata value declared below.
        # NOTE(review): assumes 0 is not a real class label — confirm.
        results[~forest_mask.ravel()] = 0

        # Borrow the raster profile from the CHM raster; the context manager
        # closes the reference file once the profile has been copied.
        with rasterio.open(os.path.join(path, 'tree_map', 'CHM2020.tif')) as ref:
            profile = ref.profile

        # Save the results to a single-band, LZW-compressed GeoTIFF
        path_results = os.path.join(path, 'results')
        os.makedirs(path_results, exist_ok=True)
        path_file = os.path.join(path_results, 'exp2_rf_features-all.tif')
        profile.update(dtype=rasterio.uint8, count=1, compress='lzw', nodata=0)
        with rasterio.open(path_file, 'w', **profile) as dst:
            dst.write(results.reshape(shape), 1)

    except Exception as e:
        # Best effort: report the failing tile and carry on with the others.
        print(f"Error processing {folder}: {e}")
        continue
