In [1]:
# ============================================================
# Spatial Grid-based Blocking (single file) + RF + GridSearchCV
# 5-fold CV within the 80% training grids; test on the 20% held-out grids
# Outputs: performance metrics, confusion matrices, feature importances,
#          best model (.joblib), best hyperparameters (.json)
# ============================================================
import os, warnings, json, joblib
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.base import clone

# -------------------------------
# Config
# -------------------------------
# Predictor columns read from the CSV and the column holding the class label.
feature_cols = [
    'B', 'G', 'R', 'RE', 'NIR',
    'NDVI', 'NDRE', 'SAVI', 'VARI', 'ExG',
]
target_col = 'class'
csv_path = "split_grid_best_cnn_fixed_cz.csv"

# Root directory for everything this script writes; created if missing.
out_dir = "./rf_grid_blocking_gridsearch_outputs"
os.makedirs(out_dir, exist_ok=True)

# Random-forest search space (3 x 3 x 3 x 2 = 54 candidates).
# The "rf__" prefix routes each setting to the pipeline step named "rf".
param_grid = dict(
    rf__n_estimators=[400],
    rf__max_depth=[None, 20, 30],
    rf__min_samples_split=[2, 5, 10],
    rf__min_samples_leaf=[1, 2, 4],
    rf__max_features=["sqrt", 0.5],
    rf__bootstrap=[True],
)

# Seeded, shuffled, class-stratified 5-fold splitter used for the inner CV.
# NOTE(review): folds are drawn per sample, not per spatial grid -- confirm
# this is intended for the inner tuning loop of a spatially blocked design.
cv5 = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# -------------------------------
# Helpers
# -------------------------------
def plot_cm(cm, title, path_png, labels, normalize=False):
    """Draw a confusion matrix as a blue heat map and save it to a PNG file.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion-matrix counts.
    title : str
        Title placed above the plot.
    path_png : str
        Destination path of the image (written at 300 dpi).
    labels : sequence
        Tick labels passed to ``ConfusionMatrixDisplay``.
    normalize : bool, default False
        If True, every row is divided by its row sum and cells are shown
        with two decimals; otherwise the raw values are shown.
    """
    cm = np.asarray(cm)
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Rows that sum to zero would give 0/0 = NaN cells; leave them at 0.0
        # instead, matching how cm_to_metrics treats zero denominators.
        cm_disp = np.divide(
            cm,
            row_totals,
            out=np.zeros(cm.shape, dtype=float),
            where=row_totals != 0,
        )
        fmt = ".2f"
    else:
        cm_disp, fmt = cm, None
    disp = ConfusionMatrixDisplay(confusion_matrix=cm_disp, display_labels=labels)
    disp.plot(cmap=plt.cm.Blues, values_format=fmt)
    plt.title(title)
    plt.yticks(rotation="vertical")
    plt.tight_layout()
    plt.savefig(path_png, dpi=300)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()

def cm_to_metrics(cm, labels):
    """Compute one-vs-rest metrics per class and overall accuracy.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix with true classes on the rows and predicted classes
        on the columns. Any square size works, not only 2x2; nested lists
        are accepted as well as NumPy arrays.
    labels : sequence
        Class names; ``labels[i]`` names row/column ``i`` of ``cm``.

    Returns
    -------
    metrics : dict
        ``{label: {"precision", "sensitivity", "specificity", "f1"}}`` with
        plain Python floats. A ratio whose denominator is zero is 0.0.
    OA : float
        Overall accuracy (trace / total), or 0.0 for an all-zero matrix.
    """
    cm = np.asarray(cm)

    def _safe_ratio(numerator, denominator):
        # Zero denominators (e.g. a class that never occurs) map to 0.0.
        return float(numerator) / float(denominator) if denominator > 0 else 0.0

    metrics = {}
    total = cm.sum()
    for i, cls in enumerate(labels):
        TP = cm[i, i]
        FN = cm[i, :].sum() - TP   # true class i, predicted as something else
        FP = cm[:, i].sum() - TP   # predicted class i, truly something else
        TN = total - TP - FN - FP  # everything not involving class i
        prec = _safe_ratio(TP, TP + FP)
        rec = _safe_ratio(TP, TP + FN)
        spec = _safe_ratio(TN, TN + FP)
        f1 = (2 * prec * rec) / (prec + rec) if (prec + rec) > 0 else 0.0
        metrics[cls] = dict(precision=prec, sensitivity=rec, specificity=spec, f1=f1)
    OA = _safe_ratio(np.trace(cm), total)
    return metrics, OA

# -------------------------------
# Run experiment
# -------------------------------
# All artefacts of this run go into a sub-folder named after the input CSV.
tag = os.path.splitext(os.path.basename(csv_path))[0]
run_dir = os.path.join(out_dir, tag)
os.makedirs(run_dir, exist_ok=True)

df = pd.read_csv(csv_path)

# Normalize label text (case / surrounding whitespace) before matching classes.
df[target_col] = df[target_col].astype(str).str.lower().str.strip()
classes_order = ["grass", "legume"]

# astype(str) turns a missing label into the literal string "nan", which would
# otherwise be trained on as an extra class and then silently left out of the
# confusion matrix. Keep only rows whose label is one of the expected classes.
known_label = df[target_col].isin(classes_order)
n_dropped = int((~known_label).sum())
if n_dropped:
    print(f"Dropping {n_dropped} row(s) whose '{target_col}' is not in {classes_order}")
    df = df[known_label]

# The 'folder' column carries the pre-computed train / test assignment.
train_df = df[df['folder'] == 'training'].copy()
test_df  = df[df['folder'] == 'testing' ].copy()
if train_df.empty or test_df.empty:
    raise ValueError(
        f"{csv_path}: need rows with folder == 'training' and folder == 'testing' "
        f"(found {len(train_df)} training, {len(test_df)} testing)"
    )

X_tr, y_tr = train_df[feature_cols], train_df[target_col]
X_te, y_te = test_df[feature_cols],  test_df[target_col]

# Median imputation followed by a class-weighted random forest.
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("rf", RandomForestClassifier(random_state=42, n_jobs=-1, class_weight="balanced"))
])

print("Running 5-fold inner CV for hyperparameter tuning...")
grid = GridSearchCV(pipe, param_grid=param_grid, scoring="f1_weighted",
                    cv=cv5, n_jobs=-1, verbose=1)
grid.fit(X_tr, y_tr)

# Save best hyperparameters to JSON. These are the values selected by the
# search, so n_estimators here is the searched value, not the 1000 used below.
best_params = grid.best_params_.copy()
with open(os.path.join(run_dir, f"{tag}_best_params.json"), "w") as f:
    json.dump(best_params, f, indent=4)

# Refit best model with more trees (variance reduction)
best_params["rf__n_estimators"] = 1000
final_model = clone(grid.best_estimator_)
final_model.set_params(**best_params)
final_model.fit(X_tr, y_tr)

# Save model to .joblib
model_path = os.path.join(run_dir, f"{tag}_best_model.joblib")
joblib.dump(final_model, model_path)
print(f"Best model saved → {model_path}")

# Test predictions
y_pred = final_model.predict(X_te)

# Confusion matrices (raw counts and row-normalized)
cm = confusion_matrix(y_te, y_pred, labels=classes_order)
plot_cm(cm, "Prediction number",
        os.path.join(run_dir, f"{tag}_cm_counts.png"), classes_order, normalize=False)
plot_cm(cm, "Prediction percentage",
        os.path.join(run_dir, f"{tag}_cm_norm.png"), classes_order, normalize=True)

# Metrics
per_class, OA = cm_to_metrics(cm, classes_order)

# Per-class rows (fractions in [0, 1], one row per class)
per_rows = [
    {"Dataset": tag, "Class": cls,
     "Precision": per_class[cls]["precision"],
     "Sensitivity": per_class[cls]["sensitivity"],
     "Specificity": per_class[cls]["specificity"],
     "F1": per_class[cls]["f1"]}
    for cls in classes_order
]
pd.DataFrame(per_rows).to_csv(os.path.join(run_dir, f"{tag}_per_class_metrics.csv"), index=False)

# Final table (%, one column per class | Average), driven by classes_order so
# the column set always matches the classes that were evaluated.
metric_keys = ["precision", "sensitivity", "specificity", "f1"]
table = pd.DataFrame({
    "Metric": ["Precision", "Sensitivity", "Specificity", "F1-score"],
    **{cls: [per_class[cls][key] for key in metric_keys] for cls in classes_order},
})
table["Average"] = table[classes_order].mean(axis=1)
final_table = (table.set_index("Metric") * 100).round(2)
# Overall accuracy has no per-class value, so only the Average column is set.
oa_row = pd.DataFrame({**{cls: [np.nan] for cls in classes_order},
                       "Average": [round(100*OA, 2)]},
                      index=["Overall Accuracy"])
final_table = pd.concat([final_table, oa_row], axis=0)[classes_order + ["Average"]]
print("\n=== Grid-based Spatial Hold-Out – Macro Metrics (%) ===")
print(final_table)
final_table.to_csv(os.path.join(run_dir, f"{tag}_overall_metrics_table.csv"))

# Feature importances (from the forest inside the refit pipeline)
rf = final_model.named_steps["rf"]
fi = pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)
fi.to_csv(os.path.join(run_dir, f"{tag}_feature_importances.csv"), header=["importance"])
plt.figure(figsize=(9,4.5), dpi=300)
fi.plot(kind='bar')
plt.ylabel('Importance')
plt.title('Feature Importances')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig(os.path.join(run_dir, f"{tag}_feature_importances.png"), dpi=300)
plt.close()

# Save run summary. The CV score belongs to the searched configuration
# (before the refit with 1000 trees); the accuracy is from the test split.
summary = {
    "Dataset": tag,
    "Best_CV_F1_weighted": grid.best_score_,
    "Overall_Accuracy": OA,
    "Model_File": os.path.basename(model_path)
}
pd.DataFrame([summary]).to_csv(os.path.join(run_dir, f"{tag}_run_summary.csv"), index=False)

print(f"\nAll outputs saved to: {os.path.abspath(run_dir)}")


Running 5-fold inner CV for hyperparameter tuning...
Fitting 5 folds for each of 54 candidates, totalling 270 fits
Best model saved → ./rf_grid_blocking_gridsearch_outputs\split_grid_best_cnn_fixed_cz\split_grid_best_cnn_fixed_cz_best_model.joblib

=== Grid-based Spatial Hold-Out – Macro Metrics (%) ===
                  grass  legume  Average
Precision         83.75   96.77    90.26
Sensitivity       98.53   69.77    84.15
Specificity       69.77   98.53    84.15
F1-score          90.54   81.08    85.81
Overall Accuracy    NaN     NaN    87.39

All outputs saved to: C:\Users\changzhao\UFL Dropbox\Chang Zhao\Research\Botanical_Composition\Spatial_Partition\Grid_Blocking\rf_grid_blocking_gridsearch_outputs\split_grid_best_cnn_fixed_cz
