# Test of 3 configurations with RF and CV

In this experiment, we evaluated the effectiveness of our feature selection process by testing different feature configurations for classifying Sentinel-2 pixels as deciduous or evergreen with a Random Forest classifier. Specifically, we sought to determine whether the selected features capture the information needed for accurate classification. We tested three configurations: (1) CRSWIR features combined with elevation, (2) CRSWIR and red channel features combined with elevation, and (3) CRSWIR, red channel features, an interaction term between offset_red and offset_crswir, and elevation. Each configuration was standardized and evaluated using 5-fold stratified cross-validation with a Random Forest model set to 30 estimators to handle non-linear relationships. The weighted F1-score was used to measure performance, and the per-fold scores were averaged to determine the most effective feature set. The highest F1-score was obtained with the CRSWIR + Red + Elevation configuration, suggesting that including both CRSWIR and red channel features without the interaction term provides the most relevant information for classification. Despite their high variance inflation factor (VIF) values, keeping both offset_red and offset_crswir is recommended because they improve model performance. The findings were saved to a CSV file for further analysis. Note that the code cells below build their folds with GroupKFold grouped by tile_id, scale features with RobustScaler, and evaluate additional configurations (blue and green channel features and a green-CRSWIR interaction term) beyond the three described here.

In [None]:
import os
from utils import load_and_preprocess_table_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd

# --- Data ---------------------------------------------------------------
# Load the preprocessed table for the selected dataset configuration.
config = "no_resample_cloud_disturbance_weights_3Y"
data = load_and_preprocess_table_data(config)

# Interaction columns: a band offset multiplied by the CRSWIR offset
# (gc = green x CRSWIR, rc = red x CRSWIR).
for _suffix, _band in (('gc', 'green'), ('rc', 'red')):
    data[f'offset_interaction_{_suffix}'] = data[f'offset_{_band}'] * data['offset_crswir']

# Column used as the grouping key when building the cross-validation folds.
groups = data['tile_id']

# Target column, re-indexed from 0 so it can be sliced positionally.
target = 'phen'
y = data[target].reset_index(drop=True)

# --- Feature configurations ---------------------------------------------
# Shared pieces of the feature lists. The order inside every list is kept
# exactly as-is because it fixes the column order handed to the model.
_crswir_core = ['amplitude_crswir', 'cos_phase_crswir', 'sin_phase_crswir']
_crswir_full = [*_crswir_core, 'offset_crswir']
_red_core = ['amplitude_red', 'cos_phase_red']

configurations = {
    "CRSWIR + Elevation": [*_crswir_full, 'elevation'],
    "CRSWIR + Amp,Cos Red + Cos Phase Blue + Offset Green + Elevation": [
        *_crswir_full,
        'cos_phase_blue',
        *_red_core,
        'offset_green',
        'elevation',
    ],
    "CRSWIR + Amp,Cos,Offset Red + Cos Phase Blue + Elevation": [
        *_crswir_full,
        'cos_phase_blue',
        *_red_core, 'offset_red',
        'elevation',
    ],
    "CRSWIR + Amp,Cos Red + Cos Phase Blue + Interaction (Green, CRSWIR) + Elevation": [
        *_crswir_core,
        *_red_core,
        'cos_phase_blue',
        'offset_interaction_gc', 'elevation',
    ],
    "CRSWIR + Amp,Cos Red + Cos Phase Blue + Interaction (Red, CRSWIR) + Elevation": [
        *_crswir_core,
        *_red_core,
        'cos_phase_blue',
        'offset_interaction_rc', 'elevation',
    ],
}

# --- Evaluation setup ---------------------------------------------------
# Feature scaler (RobustScaler), shared by all configurations.
scaler = RobustScaler()

# Group-aware K-fold splitter. Note: this is GroupKFold, not a stratified
# splitter; folds are formed from the `groups` key.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# Random Forest with 30 trees and a fixed seed.
model = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=30)

# Maps configuration name -> dict of metrics.
results = dict()

# Cross-validated weighted F1-score of a model
def evaluate_model(model, X, y, groups):
    """Return the mean weighted F1-score of ``model`` across the folds of ``gkf``.

    The folds come from the module-level splitter ``gkf``; the same ``model``
    object is re-fitted on every fold.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix that supports indexing with an array of row
            positions (e.g. a NumPy array).
        y: Labels exposing ``.iloc`` for positional selection.
        groups: Group labels forwarded to ``gkf.split``.

    Returns:
        The mean of the per-fold weighted F1-scores.
    """
    def _score_fold(fit_rows, eval_rows):
        # Fit on the training rows, then score predictions on the held-out rows.
        model.fit(X[fit_rows], y.iloc[fit_rows])
        predicted = model.predict(X[eval_rows])
        return f1_score(y.iloc[eval_rows], predicted, average='weighted')

    fold_scores = [
        _score_fold(fit_rows, eval_rows)
        for fit_rows, eval_rows in gkf.split(X, y, groups=groups)
    ]
    return np.mean(fold_scores)

# Perform evaluation for each configuration
# Imported locally so this block is self-contained.
from sklearn.pipeline import make_pipeline

# Chain the scaler in front of the classifier so that evaluate_model re-fits
# the scaler on the training rows of every fold. Calling
# scaler.fit_transform on the full table before splitting lets the held-out
# rows influence the scaling statistics (data leakage).
scaled_model = make_pipeline(scaler, model)

for config_name, features in configurations.items():
    print(f"Processing configuration: {config_name}")
    X = data[features].reset_index(drop=True)

    # evaluate_model selects rows with positional integer arrays, so hand it
    # a plain NumPy array instead of the DataFrame.
    metrics = evaluate_model(scaled_model, X.to_numpy(), y, groups)
    results[config_name] = {
        'f1_score': metrics
    }
    print(f"Evaluation metrics for {config_name}: {metrics}")

# Collect results into a table with one row per configuration
results_df = pd.DataFrame(results).T

# Save results to CSV
results_df.to_csv("random_forest_evaluation_results.csv")

print("Random Forest evaluation results saved to random_forest_evaluation_results.csv")


In [None]:
import os
from utils import load_and_preprocess_table_data
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, GroupKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import RobustScaler
import numpy as np
import pandas as pd

# --- Data ---------------------------------------------------------------
# Load the preprocessed table for the selected dataset configuration.
config = "no_resample_cloud_disturbance_weights_3Y"
data = load_and_preprocess_table_data(config)

# Interaction columns: a band offset multiplied by the CRSWIR offset.
_interaction_sources = (
    ('offset_interaction_gc', 'offset_green'),
    ('offset_interaction_rc', 'offset_red'),
)
for _new_column, _band_column in _interaction_sources:
    data[_new_column] = data[_band_column] * data['offset_crswir']

# Column used as the grouping key when building the cross-validation folds.
groups = data['tile_id']

# Target column, re-indexed from 0 so it can be sliced positionally.
target = 'phen'
y = data[target].reset_index(drop=True)

# --- Feature configuration ----------------------------------------------
# Single additional configuration; the list order fixes the column order
# handed to the model.
configurations = {
    "CRSWIR + Amp,Cos,Offset Red + Cos Phase Blue + Offset Green + Elevation": [
        'amplitude_crswir', 'cos_phase_crswir', 'sin_phase_crswir', 'offset_crswir',
        'cos_phase_blue',
        'offset_green',
        'amplitude_red', 'cos_phase_red', 'offset_red',
        'elevation',
    ],
}

# --- Evaluation setup ---------------------------------------------------
# Feature scaler (RobustScaler).
scaler = RobustScaler()

# Group-aware K-fold splitter. Note: this is GroupKFold, not a stratified
# splitter; folds are formed from the `groups` key.
n_splits = 5
gkf = GroupKFold(n_splits=n_splits)

# Random Forest with 30 trees and a fixed seed.
model = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=30)

# Maps configuration name -> dict of metrics.
results = dict()

# Cross-validated weighted F1-score of a model
def evaluate_model(model, X, y, groups):
    """Return the mean weighted F1-score of ``model`` across the folds of ``gkf``.

    The folds come from the module-level splitter ``gkf``; the same ``model``
    object is re-fitted on every fold.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X: Feature matrix that supports indexing with an array of row
            positions (e.g. a NumPy array).
        y: Labels exposing ``.iloc`` for positional selection.
        groups: Group labels forwarded to ``gkf.split``.

    Returns:
        The mean of the per-fold weighted F1-scores.
    """
    def _score_fold(fit_rows, eval_rows):
        # Fit on the training rows, then score predictions on the held-out rows.
        model.fit(X[fit_rows], y.iloc[fit_rows])
        predicted = model.predict(X[eval_rows])
        return f1_score(y.iloc[eval_rows], predicted, average='weighted')

    fold_scores = [
        _score_fold(fit_rows, eval_rows)
        for fit_rows, eval_rows in gkf.split(X, y, groups=groups)
    ]
    return np.mean(fold_scores)

# Perform evaluation for each configuration
# Imported locally so this block is self-contained.
from sklearn.pipeline import make_pipeline

# Chain the scaler in front of the classifier so that evaluate_model re-fits
# the scaler on the training rows of every fold. Calling
# scaler.fit_transform on the full table before splitting lets the held-out
# rows influence the scaling statistics (data leakage).
scaled_model = make_pipeline(scaler, model)

for config_name, features in configurations.items():
    print(f"Processing configuration: {config_name}")
    X = data[features].reset_index(drop=True)

    # evaluate_model selects rows with positional integer arrays, so hand it
    # a plain NumPy array instead of the DataFrame.
    metrics = evaluate_model(scaled_model, X.to_numpy(), y, groups)
    results[config_name] = {
        'f1_score': metrics
    }
    print(f"Evaluation metrics for {config_name}: {metrics}")

# Merge the new scores into the previously saved results table
results_path = "random_forest_evaluation_results.csv"
new_results_df = pd.DataFrame(results).T

if os.path.exists(results_path):
    previous_results_df = pd.read_csv(results_path, index_col=0)
    results_df = pd.concat([previous_results_df, new_results_df])
    # Re-running this cell would otherwise append the same configuration
    # again; keep only the most recent row for each configuration name.
    results_df = results_df[~results_df.index.duplicated(keep='last')]
else:
    # No earlier results on disk: start the table from this run alone
    # instead of failing with FileNotFoundError.
    print(f"{results_path} not found; saving only the results from this run")
    results_df = new_results_df

# Save results to CSV
results_df.to_csv(results_path)

print("Random Forest evaluation results saved to random_forest_evaluation_results.csv")
