**Notebook Description:**

This notebook is designed to test and optimize a series of machine learning algorithms for classifying pixels as deciduous or evergreen. The selected algorithms range from simple to complex, ensuring a comprehensive evaluation of their performance. The notebook is structured to include data preprocessing, model training with hyperparameter optimization, and evaluation of various metrics. Additionally, the notebook includes a checkpointing mechanism, allowing the process to be stopped and resumed, which is particularly useful given limited computational resources.

**Steps in the Notebook:**

1. **Data Loading and Preprocessing:**
   - Data is loaded using a predefined function.
   - Features and target variables are defined.
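   - Robust scaling (`RobustScaler`) is applied selectively based on algorithm requirements: only Logistic Regression, K-Nearest Neighbors, and the MLP Classifier are trained on the scaled features; the other models use the unscaled features.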

2. **Model Definition:**
   - A set of machine learning algorithms is defined, including Logistic Regression, Linear Discriminant Analysis, Quadratic Discriminant Analysis, K-Nearest Neighbors, Random Forest, XGBoost, MLP Classifier, and a BernoulliRBM + Logistic Regression pipeline. (A Support Vector Classifier is imported but is not currently part of the model set.)

3. **Parameter Grids:**
   - Parameter grids for each algorithm are specified, with values adjusted to balance resource constraints and optimization needs.

4. **Checkpointing Mechanism:**
   - Intermediate results and the current state of hyperparameter search are saved to enable stopping and resuming the process.
   - Functions `save_checkpoint` and `load_checkpoint` are implemented to handle checkpointing.

5. **Model Training and Evaluation:**
   - HalvingGridSearchCV is used for efficient hyperparameter optimization.
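   - Models are evaluated using 5-fold cross-validation: group k-fold on `tile_id` when `CV = 'Group'` (the current setting), otherwise shuffled stratified k-fold.
   - Evaluation metrics are weighted precision, recall, and F1-score, with the weighted F1-score used for ranking (accuracy is imported but not currently computed).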

6. **Regional Metrics Breakdown:**
   - Evaluation metrics can be further broken down by region using the 'greco_region' column, providing insights into geographical variations in model performance. This step is currently commented out in the code.

7. **Results Display:**
   - The final evaluation results and regional metrics breakdown are displayed for easy interpretation.

This structured approach ensures a thorough comparison of models while maintaining flexibility and efficiency in handling computational constraints.

In [None]:
import os
import joblib
from utils import load_and_preprocess_table_data
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, StratifiedGroupKFold, StratifiedKFold, GroupKFold
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd

# --- Experiment configuration and data loading ---
Y = 3
config = f"no_resample_cloud_disturbance_weights_{Y}Y"
CV = 'Group'
# Suffix used in checkpoint and result file names
extra = f"{config}_{CV}"
data = load_and_preprocess_table_data(config)

# --- Feature matrix and target vector ---
features = [
    'amplitude_red', 'cos_phase_red', 'offset_red',
    'cos_phase_blue',
    'amplitude_crswir', 'cos_phase_crswir', 'sin_phase_crswir', 'offset_crswir',
    'elevation',
]
target = 'phen'

X = data[features]
# Shift the labels down by one so they start from 0
# (presumably the raw 'phen' labels start at 1 -- confirm against the loader)
y = data[target] - 1

# Keep only the rows whose index is present in both X and y
X, y = X.align(y, join='inner', axis=0)

# --- Robust-scaled copy of the features, kept as a DataFrame ---
scaler = RobustScaler()
X_scaled = pd.DataFrame(scaler.fit_transform(X), index=X.index, columns=X.columns)

# --- Cross-validation splitter: grouped by tile_id, or shuffled stratified ---
n_splits = 5
groups = data['tile_id']
gkf = (
    GroupKFold(n_splits=n_splits)
    if CV == 'Group'
    else StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
)

# Define models (insertion order is the order in which they are searched)
models = {
    "Logistic Regression": LogisticRegression(),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "MLP": MLPClassifier(),
    # NOTE(review): BernoulliRBM is documented for binary / [0, 1] inputs, but this
    # pipeline is fitted on the unscaled features -- confirm this is intended.
    "BernoulliRBM": Pipeline([
        ("rbm", BernoulliRBM(random_state=42)),
        ("logistic", LogisticRegression())
    ])
}

# Define parameter grids (keys must match the keys of `models`)
param_grids = {
    "Logistic Regression": {"C": [0.1, 1, 10], "penalty": ["l2"], "solver": ["liblinear"]},
    "LDA": {"solver": ["svd", "lsqr"]},
    "QDA": {"reg_param": [0.0, 0.1, 0.5]},
    "KNN": {"n_neighbors": [3, 5, 7], "metric": ["euclidean", "manhattan"]},
    # "auto" was only an alias of "sqrt" for classifiers and is rejected by
    # scikit-learn >= 1.3, so the grid keeps the single equivalent value.
    "Random Forest": {"n_estimators": [30, 50], "max_depth": [None, 10], "min_samples_split": [2, 5], "max_features": ["sqrt"]},
    "XGBoost": {"n_estimators": [30, 50], "learning_rate": [0.01, 0.1], "max_depth": [3, 6], "colsample_bytree": [0.8, 1.0]},
    "MLP": {"hidden_layer_sizes": [(10,), (50,), (10, 10), (50, 50)], "activation": ["relu"], "solver": ["adam"], "alpha": [0.0001, 0.001], "learning_rate": ["constant"]},
    # Pipeline parameters are addressed as <step name>__<parameter>
    "BernoulliRBM": {
        "rbm__n_components": [64, 128],
        "rbm__learning_rate": [0.01, 0.1],
        "logistic__C": [0.1, 1, 10]
    }
}

# Checkpoint directory (created if missing)
checkpoint_dir = "checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

# Results dictionary: model name -> metrics dict
results = {}
metrics_file = f"results/algorithm_search_model_evaluation_results_{extra}.csv"
# The results directory must exist before any metrics are written with to_csv
os.makedirs(os.path.dirname(metrics_file), exist_ok=True)
# regional_metrics_file = "results/algorithm_search_regional_metrics_breakdown_{Y}Y.csv"

# Load existing metrics if they exist, so already-evaluated models can be skipped
if os.path.exists(metrics_file):
    results_df = pd.read_csv(metrics_file, index_col=0)
    results = results_df.to_dict(orient="index")
else:
    results_df = pd.DataFrame()

# if os.path.exists(regional_metrics_file):
#     regional_metrics_df = pd.read_csv(regional_metrics_file, index_col=0)
#     regional_metrics = regional_metrics_df.to_dict(orient="index")
# else:
#     regional_metrics_df = pd.DataFrame()
#     regional_metrics = {region: {} for region in data['greco_region'].unique()}

# Function to evaluate model using multiple metrics
def evaluate_model(model, X, y, cv, groups=None):
    """Cross-validate ``model`` and return its mean weighted metrics.

    For every fold produced by ``cv``, an unfitted clone of ``model`` is fitted
    on the training rows and scored on the held-out rows, so the estimator
    passed in is left untouched.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target labels, positionally aligned with ``X``.
        cv: Splitter exposing ``split(X, y, groups)``. The splitter passed here
            is the one used.
        groups: Group labels forwarded to ``cv.split``; splitters that do not
            use groups (e.g. ``StratifiedKFold``) ignore them.

    Returns:
        dict: Keys ``'f1_score'``, ``'precision'`` and ``'recall'``, each the
        mean over folds of the weighted-average score.
    """
    # Local import keeps this block self-contained without touching file imports
    from sklearn.base import clone

    f1_scores = []
    precision_scores = []
    recall_scores = []
    # A single loop serves every splitter type: scikit-learn splitters share
    # the split(X, y, groups) signature.
    for train_index, test_index in cv.split(X, y, groups):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        fold_model = clone(model)
        fold_model.fit(X_train, y_train)
        y_pred = fold_model.predict(X_test)
        f1_scores.append(f1_score(y_test, y_pred, average='weighted'))
        precision_scores.append(precision_score(y_test, y_pred, average='weighted'))
        recall_scores.append(recall_score(y_test, y_pred, average='weighted'))
    return {
        'f1_score': np.mean(f1_scores),
        'precision': np.mean(precision_scores),
        'recall': np.mean(recall_scores),
    }

# Function to save checkpoint
def save_checkpoint(name, search, extra=''):
    """Persist ``search`` to ``checkpoint_dir`` as ``{name}_{extra}_checkpoint.pkl``.

    The object is first dumped to a temporary sibling file and then moved into
    place, so an interrupted run cannot leave a truncated checkpoint behind
    under the final name.

    Args:
        name: Model name used as the file-name prefix.
        search: Object to serialize with joblib (here, the search object).
        extra: Optional suffix distinguishing configurations.
    """
    filename = os.path.join(checkpoint_dir, f"{name}_{extra}_checkpoint.pkl")
    tmp_filename = f"{filename}.tmp"
    joblib.dump(search, tmp_filename)
    # os.replace overwrites any existing checkpoint in a single step
    os.replace(tmp_filename, filename)

# Function to load checkpoint
def load_checkpoint(name, extra=''):
    """Return the object stored in the checkpoint for ``name``/``extra``, or None.

    The file is looked up in ``checkpoint_dir`` under the name
    ``{name}_{extra}_checkpoint.pkl``; a missing file yields ``None``.
    """
    checkpoint_path = os.path.join(checkpoint_dir, f"{name}_{extra}_checkpoint.pkl")
    if not os.path.exists(checkpoint_path):
        return None
    return joblib.load(checkpoint_path)

# Perform hyperparameter optimization and evaluation for each model
def perform_hyperparameter_search(name, model, force=False):
    """Tune ``model`` with a halving grid search, evaluate it and store the metrics.

    A model whose metrics are already in ``results`` is skipped unless ``force``
    is set. Otherwise the search is restored from its checkpoint when one
    exists (and ``force`` is not set) or run from scratch. The best estimator
    is then cross-validated with ``evaluate_model`` and the accumulated
    ``results`` are written to ``metrics_file``.

    Args:
        name: Key into ``models`` / ``param_grids``; also names the checkpoint.
        model: Estimator (or pipeline) to tune.
        force: When True, ignore existing metrics and checkpoints and redo the
            search.
    """
    if name in results and not force:
        print(f"Metrics for model {name} already exist, skipping evaluation.")
        return

    print(f"Processing model: {name}")
    param_grid = param_grids[name]
    # Only these models are trained on the robust-scaled features
    X_used = X_scaled if name in ["Logistic Regression", "KNN", "MLP"] else X

    # With force=True the checkpoint would be discarded anyway, so skip loading it
    search = None if force else load_checkpoint(name, extra)
    if search is None:
        search = HalvingGridSearchCV(model, param_grid, cv=gkf, scoring='f1_weighted', n_jobs=-1, verbose=1)
        search.fit(X_used, y, groups=groups)
        save_checkpoint(name, search, extra)
    else:
        print(f"Resuming from checkpoint for {name}")
        if not hasattr(search, 'best_estimator_'):
            # The checkpoint held an unfitted search: fit it and persist the
            # result so the next resume does not repeat the work.
            search.fit(X_used, y, groups=groups)
            save_checkpoint(name, search, extra)

    best_model = search.best_estimator_
    metrics = evaluate_model(best_model, X_used, y, gkf, groups)
    results[name] = metrics

    # Make sure the output directory exists before writing the metrics table
    results_dir = os.path.dirname(metrics_file)
    if results_dir:
        os.makedirs(results_dir, exist_ok=True)
    pd.DataFrame(results).T.to_csv(metrics_file)

    print(f"Best parameters for {name}: {search.best_params_}")
    print(f"Evaluation metrics for {name}: {metrics}")

# Perform the hyperparameter search
# NOTE: force=True redoes every search and ignores stored metrics and
# checkpoints; set it to False to resume an interrupted run.
force = True
for name, model in models.items():
    perform_hyperparameter_search(name, model, force=force)

# Report the file the metrics were actually written to
print(f"Model evaluation results saved to {metrics_file}")

# # Breakdown of metrics per eco region
# for region in regional_metrics:
#     region_indices = data[data['greco_region'] == region].index
#     X_region = X_scaled.loc[region_indices]
#     y_region = y.loc[region_indices]
#     region_groups = data.loc[region_indices, 'tile_id']
#     for name, model in models.items():
#         if region in regional_metrics and name in regional_metrics[region] and not force:
#             print(f"Metrics for model {name} in region {region} already exist, skipping evaluation.")
#             continue

#         best_model = load_checkpoint(name).best_estimator_
#         metrics = evaluate_model(best_model, X_region, y_region, region_groups)
#         regional_metrics[region][name] = metrics
#         regional_metrics_df = pd.DataFrame(regional_metrics).T
#         regional_metrics_df.to_csv(regional_metrics_file)

# print("Regional metrics breakdown saved to regional_metrics_breakdown.csv")


# Inference time 

In [None]:
import os
import time
import joblib
from utils import load_and_preprocess_table_data
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier, BernoulliRBM
from sklearn.pipeline import Pipeline
import numpy as np
import pandas as pd
from sklearn.preprocessing import RobustScaler

# --- Experiment configuration (same naming scheme as the checkpoint files) ---
Y = 3
config = f"no_resample_cloud_disturbance_weights_{Y}Y"
CV = 'Group'
extra = f"{config}_{CV}"
data = load_and_preprocess_table_data(config)

# --- Features and target ---
features = [
    'amplitude_red', 'cos_phase_red', 'offset_red',
    'cos_phase_blue',
    'amplitude_crswir', 'cos_phase_crswir', 'sin_phase_crswir', 'offset_crswir',
    'elevation',
]
target = 'phen'

X = data[features]
# Shift the labels down by one so they start from 0
y = data[target] - 1

# Keep only the rows whose index is present in both X and y
X, y = X.align(y, join='inner', axis=0)

# --- Robust-scaled copy of the features, kept as a DataFrame ---
scaler = RobustScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X),
    index=X.index,
    columns=X.columns,
)

# --- Models to time; the keys select which checkpoints are loaded ---
models = {
    "Logistic Regression": LogisticRegression(),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "MLP": MLPClassifier(),
}

# Directory holding the saved search checkpoints
checkpoint_dir = "checkpoints"

# Function to load checkpoint
def load_checkpoint(name, extra=''):
    """Load the joblib checkpoint saved for ``name``/``extra``.

    Returns the deserialized object, or ``None`` when
    ``{name}_{extra}_checkpoint.pkl`` does not exist in ``checkpoint_dir``.
    """
    candidate = os.path.join(checkpoint_dir, f"{name}_{extra}_checkpoint.pkl")
    return joblib.load(candidate) if os.path.exists(candidate) else None

# Timing results: a list collecting one {"model", "training_time", "inference_time"} record per timed model
time_results = []

# Measure training and inference time
def measure_time(model_name: str, model, X, y):
    """Time one ``fit`` and one ``predict`` call of ``model``.

    ``time.perf_counter`` is used instead of ``time.time`` because it is a
    monotonic, high-resolution clock intended for measuring short durations.

    Args:
        model_name: Label of the model; not used in the measurement itself.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``.
        X: Features used both for fitting and for the timed prediction.
        y: Targets used for fitting.

    Returns:
        tuple[float, float]: ``(training_time, inference_time)`` in seconds.
    """
    start_time = time.perf_counter()
    model.fit(X, y)
    training_time = time.perf_counter() - start_time

    # Prediction is timed on the same rows the model was just fitted on
    start_time = time.perf_counter()
    model.predict(X)
    inference_time = time.perf_counter() - start_time

    return training_time, inference_time

# Evaluate each model: time a refit and a prediction of its tuned estimator
for name, model in models.items():
    print(f"Evaluating model: {name}")
    checkpoint = load_checkpoint(name, extra)
    if checkpoint is None:
        # No saved search for this configuration, so there is nothing to time
        print(f"No checkpoint found for {name}, skipping.")
        continue
    best_model = checkpoint.best_estimator_
    # Only these models are timed on the robust-scaled features
    X_used = X_scaled if name in ["Logistic Regression", "KNN", "MLP"] else X
    train_time, infer_time = measure_time(name, best_model, X_used, y)
    time_results.append({
        "model": name,
        "training_time": train_time,
        "inference_time": infer_time
    })
    print(f"{name}: Training time = {train_time:.4f}s, Inference time = {infer_time:.4f}s")

# Save time results
time_results_file = f"results/training_inference_times_{extra}.csv"
# The results directory must exist before to_csv can write into it
os.makedirs(os.path.dirname(time_results_file), exist_ok=True)
time_results_df = pd.DataFrame(time_results)
time_results_df.to_csv(time_results_file, index=False)
# f-string so the message shows the real file name instead of a literal "{extra}"
print(f"Training and inference times saved to {time_results_file}")
