In [None]:
# -------------------------------------------
# 1. IMPORT, DEFINE PATHS, AND CREATE DEFAULT MODEL DICTIONARY
# -------------------------------------------

# Stage 04b: Baseline Model Evaluation (Before Hyperparameter Tuning)

# The objective of Stage 04b is to measure the accuracy and runtime of each model
# using only default parameters (i.e., no hyperparameter tuning). This provides the BEFORE
# measurements of each model's accuracy and runtime.

# This stage creates default model instances with no hyperparameter tuning and performs exhaustive
# feature subset analysis on a selected model. It generates all possible feature
# combinations from min_features (e.g., 14) to total_features (16) and evaluates baseline model performance
# on each combination. This stage creates and saves results to six different output files:
#
# PER-MODEL FILES (created for each individual model tested):
#    1. Main results file: baseline_{model}_results_{min}_{max}.xlsx 
#       - Shows accuracy and runtime for each feature combination (137 rows)
#    2. Per-class accuracy: baseline_{model}_per_class_{min}_{max}.xlsx
#       - Shows how well the model predicted each bean variety (1 row)
#    3. Validation report: baseline_{model}_validation_{min}_{max}.txt
#       - Detailed confusion matrix and accuracy analysis for complete feature set
#    4. F1-scores: baseline_{model}_f1_scores_{min}_{max}.xlsx
#       - F1-macro, F1-weighted, and overall accuracy metrics (1 row)
#
# CONSOLIDATED FILES (created/updated after each model run):
#    5. Confusion matrices: baseline_confusion_matrices_all_models_{min}_{max}.xlsx
#       - Each model's confusion matrix as a separate sheet
#    6. All F1-scores: baseline_f1_scores_all_models_{min}_{max}.xlsx
#       - F1-scores from all baseline models consolidated into one file

# Import libraries
import pandas as pd
import itertools
import joblib
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm
import time
import gc
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

# Import all model classes for default_model_dict
# Individual model imports are used only in this 04b module. Dynamic imports are used
# in downstream modules. (This module could be changed in a future version to dynamically import.)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier, SGDClassifier, Perceptron
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier

# Define this project's file locations.
# This notebook uses a centralized config.py file for all path management.

# Import config paths
import sys
sys.path.append('..')
from config import TUNED_MODELS_DIR, CURATED_DATA_DIR, RESULTS_DIR, BASELINE_RESULTS_DIR

# Local aliases for the directories supplied by config.py.
curated_data_dir, results_dir, tuned_models_dir = (
    CURATED_DATA_DIR,
    BASELINE_RESULTS_DIR,   # dedicated baseline results directory
    TUNED_MODELS_DIR,       # NOTE: not used in this module, kept for consistency
)
results_dir.mkdir(parents=True, exist_ok=True)

# Input dataset written by Stage 04
data_path = curated_data_dir / "DryBean_curated.parquet"

# Baseline estimators, listed in the same 16-model order established in Stage 05.
# Every class is instantiated with no arguments, i.e. library-default parameters
# (no hyperparameter tuning).
_baseline_model_classes = (
    DecisionTreeClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    XGBClassifier,
    LogisticRegression,
    RidgeClassifier,
    SGDClassifier,
    Perceptron,
    SVC,
    KNeighborsClassifier,
    GaussianNB,
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
    MLPClassifier,
)

# Keyed by class name (e.g. 'SVC'); dict insertion order preserves the tuple order above.
default_model_dict = {model_cls.__name__: model_cls() for model_cls in _baseline_model_classes}

print(
    f"Created {len(default_model_dict)} baseline models with default parameters",
    f"Data source: {data_path}",
    f"Results will be saved to: {results_dir}",
    sep="\n",
)



Created 16 baseline models with default parameters
Data source: C:\Misc\ml_benchmark\outputs\curated_data\DryBean_curated.parquet
Results will be saved to: C:\Misc\ml_benchmark\outputs\baseline_results


In [77]:
# -------------------------------------------
# 2. SELECT MODEL AND FEATURE SETS
# -------------------------------------------
#
# Lists the available baseline models, then records the two user choices that
# drive the rest of the notebook:
#   - selected_model_name: key into default_model_dict of the model to benchmark
#   - min_features:        smallest feature-subset size to test

# Display available baseline models in the order defined in default_model_dict
print("\n Available baseline models (with default parameters):")
for i, model_name in enumerate(default_model_dict.keys(), 1):
    print(f"   {i:2d}. {model_name}")


# ------------- SELECT MODEL TO TEST HERE -------------
# Change this variable to select the model to test
selected_model_name = "GradientBoostingClassifier"  # Change this to test different models
# -----------------------------------------------------

# ------ ENTER THE MINIMUM FEATURE COUNT HERE -------
min_features = 14
# ---------------------------------------------------

# Fail fast on a bad selection. Only printing the error would let "Run All"
# continue with `model` / `min_features` left over from a previous run (or
# undefined on a fresh kernel), producing results for the wrong configuration.
if selected_model_name not in default_model_dict:
    print(f"❌ Error: {selected_model_name} not found in default_model_dict")
    print(f"Available models: {list(default_model_dict.keys())}")
    raise ValueError(f"Unknown baseline model: {selected_model_name!r}")

# A subset size below 1 would ask the benchmark to fit a model on zero columns.
if not isinstance(min_features, int) or min_features < 1:
    raise ValueError(f"min_features must be a positive integer, got {min_features!r}")

model = default_model_dict[selected_model_name]
print(f"\n✅ Selected baseline model: {selected_model_name}")
print(f"🔍 Testing feature set combinations from {min_features} to all features")

# Display the model's parameters, five per line to keep the output readable
params = model.get_params()
param_items = list(params.items())
print(f"Model parameters:")
for i in range(0, len(param_items), 5):
    chunk = param_items[i:i+5]
    param_str = ", ".join([f"{k}={v}" for k, v in chunk])
    print(f"   {param_str}")


 Available baseline models (with default parameters):
    1. DecisionTreeClassifier
    2. RandomForestClassifier
    3. ExtraTreesClassifier
    4. GradientBoostingClassifier
    5. AdaBoostClassifier
    6. XGBClassifier
    7. LogisticRegression
    8. RidgeClassifier
    9. SGDClassifier
   10. Perceptron
   11. SVC
   12. KNeighborsClassifier
   13. GaussianNB
   14. LinearDiscriminantAnalysis
   15. QuadraticDiscriminantAnalysis
   16. MLPClassifier

✅ Selected baseline model: GradientBoostingClassifier
🔍 Testing feature set combinations from 14 to all features
Model parameters:
   ccp_alpha=0.0, criterion=friedman_mse, init=None, learning_rate=0.1, loss=log_loss
   max_depth=3, max_features=None, max_leaf_nodes=None, min_impurity_decrease=0.0, min_samples_leaf=1
   min_samples_split=2, min_weight_fraction_leaf=0.0, n_estimators=100, n_iter_no_change=None, random_state=None
   subsample=1.0, tol=0.0001, validation_fraction=0.1, verbose=0, warm_start=False


In [78]:
# -------------------------------------------
# 3. LOAD PREPROCESSED DATA
# -------------------------------------------

# Read the curated dataset produced by Stage 04. Per the Stage 04 pipeline the
# features are already scaled/encoded and the target is already label-encoded,
# so no further preprocessing is applied here.
df = pd.read_parquet(data_path)
y = df["label"]                # Target column
X = df.drop(columns="label")   # Every remaining column is a feature

feature_names = list(X.columns)
total_features = len(feature_names)
unique_classes = y.unique()

# Confirm key dataset information. The "Target classes" and "Data types" lines
# confirm that:
#   1. Target classes are numeric (some models require numeric labels), and
#   2. No classes are missing.
dataset_summary = [
    f"\n Dataset Information:",
    f"   Shape: {X.shape}",
    f"   Total features: {total_features}",
    f"   Feature names: {feature_names}",
    f"   Target classes: {sorted(unique_classes)}",
    f"   Data types: Features={X.dtypes.iloc[0]}, Target={y.dtype}",
]
print("\n".join(dataset_summary))


 Dataset Information:
   Shape: (13611, 16)
   Total features: 16
   Feature names: ['A', 'P', 'L', 'l', 'K', 'Ec', 'C', 'Ed', 'Ex', 'S', 'R', 'CO', 'SF1', 'SF2', 'SF3', 'SF4']
   Target classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]
   Data types: Features=float64, Target=int64


In [None]:
# -------------------------------------------
# 4. MEASURE AND RECORD A MODEL'S ACCURACY AND RUNTIME (AVERAGED OVER num_runs)
# -------------------------------------------

# This code block creates all possible feature sets of size min_features..total_features,
# then tests the selected model on every feature set created. Each feature set is examined
# num_runs times (1, 2, 3, ...). The purpose of examining a feature set more than once is
# to average the runtime over the repeats, possibly offsetting runtime outliers caused by
# OS background processes and RAM management interfering with the model's runtime.
#
# For each feature set, the accuracy and runtime averaged over the num_runs repeats are
# recorded. Additionally, we measure how well the model predicts each class (bean variety).
#
# Outputs consumed by later cells:
#   results        - one row per feature set: feature names, blank padding, accuracy, runtime (ms)
#   per_class_data - one dict per feature set with overall and per-class accuracy

import math

import numpy as np
from sklearn.base import clone
from sklearn.model_selection import cross_val_score, StratifiedKFold

results = []
per_class_data = []  # Each feature set's per-class (bean variety) accuracy is stored here.

# Number of repeats per feature set. Used to possibly offset runtime outliers caused
# by OS background processes and RAM management interfering with the model's runtime.
num_runs = 1

# Get unique class names for consistent reporting
class_names = sorted(y.unique())
print(f"Bean varieties to track: {class_names}")

# An out-of-range minimum would silently test zero feature sets and only fail later,
# when the (empty) results are summarised.
if not 1 <= min_features <= total_features:
    raise ValueError(
        f"min_features must be between 1 and {total_features}, got {min_features!r}"
    )

# Calculate expected combinations for progress tracking. C(n, r) gives the count
# directly, without materialising every combination just to measure its length.
total_combinations = sum(math.comb(total_features, r)
                         for r in range(min_features, total_features + 1))
print(f"\nStarting exhaustive feature subset analysis...")
print(f"   Total combinations to test: {total_combinations:,}")
print(f"   Feature range: {min_features} to {total_features} features")

# Conduct benchmark: repeat each feature set num_runs times, then average accuracy and runtime
for r in range(min_features, len(feature_names) + 1):
    combos = list(itertools.combinations(feature_names, r))
    print(f"\nTesting {len(combos):,} combinations of {r} features...")

    for combo in tqdm(combos, desc=f"Feature combinations ({r} features)"):
        accs = []
        runtimes = []

        # Track per-class accuracy across all runs
        class_accuracy_runs = []

        for run in range(num_runs):
            # clone() returns a new, unfitted estimator with the same parameters, so no
            # fitted state is shared between runs or with the instance in default_model_dict.
            model = clone(default_model_dict[selected_model_name])
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

            # NOTE: everything between the two perf_counter() calls is part of the reported
            # runtime, including the column slicing and list bookkeeping. It is deliberately
            # left as-is so runtimes stay comparable with results already recorded.
            start_time = time.perf_counter()

            # Collect predictions for per-class analysis
            y_true_all = []
            y_pred_all = []

            # Manual cross-validation to capture individual predictions
            for train_idx, test_idx in cv.split(X[list(combo)], y):
                X_train, X_test = X[list(combo)].iloc[train_idx], X[list(combo)].iloc[test_idx]
                y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

                # Train on the training folds, predict on the held-out fold
                model.fit(X_train, y_train)
                y_pred = model.predict(X_test)

                # Store for per-class analysis
                y_true_all.extend(y_test.tolist())
                y_pred_all.extend(y_pred.tolist())

            end_time = time.perf_counter()

            # Scoring happens outside the timed region, so it is vectorised freely.
            y_true_arr = np.asarray(y_true_all)
            y_pred_arr = np.asarray(y_pred_all)
            is_correct = y_true_arr == y_pred_arr

            # Calculate overall accuracy
            overall_accuracy = int(is_correct.sum()) / len(y_true_all)
            accs.append(overall_accuracy)
            runtimes.append((end_time - start_time) * 1000)  # ms

            # Per-class accuracy for this run: correct predictions among the samples whose
            # true label is the class (None when the class has no samples).
            class_accuracies = {}
            for class_name in class_names:
                class_mask = y_true_arr == class_name
                class_total = int(class_mask.sum())
                if class_total:
                    class_accuracies[class_name] = int(is_correct[class_mask].sum()) / class_total
                else:
                    class_accuracies[class_name] = None  # No samples of this class

            class_accuracy_runs.append(class_accuracies)

        # Average over num_runs runs
        avg_acc = round(sum(accs) / num_runs, 4)
        avg_rt = round(sum(runtimes) / num_runs, 3)

        # Average per-class accuracies over the runs that had samples of the class
        avg_class_accuracies = {}
        for class_name in class_names:
            class_accs = [run_accs[class_name] for run_accs in class_accuracy_runs
                          if run_accs[class_name] is not None]
            if class_accs:
                avg_class_accuracies[class_name] = round(sum(class_accs) / len(class_accs), 4)
            else:
                avg_class_accuracies[class_name] = None

        # Build output row: feature columns + blank pads + avg metrics
        row = list(combo) + [""] * (total_features - len(combo)) + [avg_acc, avg_rt]
        results.append(row)

        # Store per-class data for this feature combination
        per_class_data.append({
            'Feature_Combination': combo,
            'Feature_Count': len(combo),
            'Overall_Accuracy': avg_acc,
            'Runtime_ms': avg_rt,
            **{class_name: avg_class_accuracies[class_name] for class_name in class_names}
        })

print(f"\n✅ Completed testing {len(results):,} feature combinations")
print(f"Per-class data collected for {len(per_class_data)} feature combinations")

Bean varieties to track: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]

🔄 Starting exhaustive feature subset analysis...
   Total combinations to test: 137
   Feature range: 14 to 16 features

🧪 Testing 120 combinations of 14 features...


Feature combinations (14 features): 100%|██████████| 120/120 [4:40:45<00:00, 140.38s/it] 



🧪 Testing 16 combinations of 15 features...


Feature combinations (15 features): 100%|██████████| 16/16 [40:06<00:00, 150.44s/it]



🧪 Testing 1 combinations of 16 features...


Feature combinations (16 features): 100%|██████████| 1/1 [02:39<00:00, 159.81s/it]


✅ Completed testing 137 feature combinations
Per-class data collected for 137 feature combinations





In [None]:
# -------------------------------------------
# 5. ASSEMBLE AND STORE RESULTS IN TWO EXCEL FILES
# -------------------------------------------

# Results table layout: Feature_1 ... Feature_N, Accuracy, Runtime (ms)
columns = [f"Feature_{pos}" for pos in range(1, total_features + 1)]
columns += ["Accuracy", "Runtime (ms)"]
results_df = pd.DataFrame(results, columns=columns)

# Best-performing feature combinations first
results_df_sorted = results_df.sort_values(by="Accuracy", ascending=False)


# FIRST EXCEL FILE: accuracy and runtime for each feature set.
# Naming convention: baseline_{model_name}_results_{min_features}_{max_features}.xlsx
model_name_clean = selected_model_name.lower().replace("classifier", "")
output_filename = f"baseline_{model_name_clean}_results_{min_features}_{total_features}.xlsx"
output_path = results_dir / output_filename
results_df_sorted.to_excel(output_path, index=False)


# SECOND EXCEL FILE: per-class accuracy summary for this model
# (how well the model predicted each value of the label).
class_names = sorted(y.unique())

# One column per class, holding that class's accuracy averaged across all feature
# combinations; combinations with no value for the class are left out of the mean.
model_per_class_summary = {'Model': selected_model_name}
for class_name in class_names:
    observed = [entry[class_name] for entry in per_class_data if entry[class_name] is not None]
    model_per_class_summary[f'Accuracy_{class_name}'] = (
        round(sum(observed) / len(observed), 4) if observed else None
    )

# Single-row DataFrame, saved next to the main results file
per_class_df = pd.DataFrame([model_per_class_summary])
per_class_filename = f"baseline_{model_name_clean}_per_class_{min_features}_{total_features}.xlsx"
per_class_path = results_dir / per_class_filename
per_class_df.to_excel(per_class_path, index=False)

# Summary statistics
best_accuracy = results_df_sorted['Accuracy'].iloc[0]
total_runtime_sec = results_df['Runtime (ms)'].sum() / 1000
avg_runtime_ms = results_df['Runtime (ms)'].mean()

print("\n".join([
    f"\nAnalysis Summary:",
    f"   Model tested: {selected_model_name}",
    f"   Feature combinations: {len(results):,}",
    f"   Best accuracy: {best_accuracy:.4f}",
    f"   Total runtime: {total_runtime_sec:.1f} seconds",
    f"   Average runtime per combination: {avg_runtime_ms:.1f} ms",
    f"\nResults saved to: {output_path}",
    f"Per-class results saved to: {per_class_path}",
]))

# Top 5 feature combinations; blank padding cells are skipped
print(f"\nTop 5 Feature Combinations:")
feature_columns = columns[:-2]
for _, top_row in results_df_sorted.head(5).iterrows():
    feature_list = [top_row[col] for col in feature_columns if top_row[col] != ""]
    print(f"   {top_row['Accuracy']:.4f} | {len(feature_list)} features | {feature_list}")

# Per-class accuracy summary
print(f"\nPer-Class (bean type) Accuracy Summary for {selected_model_name}:")
for class_name in class_names:
    class_acc = model_per_class_summary[f'Accuracy_{class_name}']
    if class_acc is None:
        print(f"   {class_name}: No data")
        continue
    print(f"   {class_name}: {class_acc:.4f} ({class_acc*100:.2f}%)")


📊 Analysis Summary:
   Model tested: GradientBoostingClassifier
   Feature combinations: 137
   Best accuracy: 0.9268
   Total runtime: 19411.2 seconds
   Average runtime per combination: 141687.6 ms

💾 Results saved to: C:\Misc\ml_benchmark\outputs\baseline_results\baseline_gradientboosting_results_14_16.xlsx
💾 Per-class results saved to: C:\Misc\ml_benchmark\outputs\baseline_results\baseline_gradientboosting_per_class_14_16.xlsx

Top 5 Feature Combinations:
   0.9268 | 14 features | ['A', 'P', 'L', 'l', 'Ec', 'C', 'Ex', 'S', 'R', 'CO', 'SF1', 'SF2', 'SF3', 'SF4']
   0.9265 | 14 features | ['A', 'L', 'l', 'K', 'Ec', 'C', 'Ed', 'Ex', 'S', 'R', 'CO', 'SF2', 'SF3', 'SF4']
   0.9265 | 15 features | ['P', 'L', 'l', 'K', 'Ec', 'C', 'Ed', 'Ex', 'S', 'R', 'CO', 'SF1', 'SF2', 'SF3', 'SF4']
   0.9265 | 16 features | ['A', 'P', 'L', 'l', 'K', 'Ec', 'C', 'Ed', 'Ex', 'S', 'R', 'CO', 'SF1', 'SF2', 'SF3', 'SF4']
   0.9265 | 14 features | ['P', 'L', 'l', 'K', 'Ec', 'C', 'Ed', 'Ex', 'S', 'R', 'SF1', 

In [81]:
# -------------------------------------------
# 6. VALIDATE RESULTS WITH A CONFUSION MATRIX FOR THE SELECTED MODEL
# -------------------------------------------

# VALIDATION PURPOSE:
# This validation section creates a confusion matrix and detailed accuracy report to:
#   1. Validate the per-class accuracy calculations from the main benchmark (prior Section 4)
#   2. Provide detailed insight into which bean varieties the model predicts well/poorly
#   3. Confirm the overall accuracy methodology using only the complete, original feature set
#   4. Generate a comprehensive text report saved alongside the Excel benchmark files
#
# VALIDATION APPROACH:
# Uses the same cross-validation setup as the main benchmark, but focuses on:
#   - The complete feature set (all features) rather than iterating through feature subsets
#   - Detailed confusion matrix showing prediction patterns for each bean class
#   - Per-class accuracy breakdown to identify model strengths/weaknesses
#   - Comparison context for interpreting the averaged results in the Excel files
#
# OUTPUTS:
#   1. A detailed .txt validation report in the same directory as the Excel results
#   2. The model's confusion matrix, added as a sheet to the workbook holding all models' matrices
#   3. A single-row Excel file with this model's F1-macro, F1-weighted and overall accuracy

import math

from sklearn.base import clone
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import numpy as np
from datetime import datetime

print(f"\nVALIDATION: Creating confusion matrix for {selected_model_name}")
print(f"Using all {total_features} features on {len(X)} records")
print(f"This validates the per-class accuracy calculations")

# Class labels in sorted order; this order defines the confusion-matrix rows/columns.
class_labels = sorted(y.unique())

# Use the same cross-validation setup as the main analysis
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Collect all out-of-fold predictions
y_true_all = []
y_pred_all = []

print(f"\nRunning cross-validation...")

# Manual cross-validation (same splitting as the main benchmark)
for train_idx, test_idx in cv.split(X, y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

    # clone() gives a new, unfitted estimator with the same parameters, so each fold
    # really does train a fresh model instead of re-fitting the shared instance.
    validation_model = clone(default_model_dict[selected_model_name])
    validation_model.fit(X_train, y_train)
    y_pred = validation_model.predict(X_test)

    # Store all predictions
    y_true_all.extend(y_test.tolist())
    y_pred_all.extend(y_pred.tolist())

# Create confusion matrix. Passing labels explicitly pins the row/column order to
# class_labels, which is the order used for the DataFrame labels below.
cm = confusion_matrix(y_true_all, y_pred_all, labels=class_labels)

# Create readable confusion matrix with class labels
cm_df = pd.DataFrame(cm,
                     index=[f"True_{cls}" for cls in class_labels],
                     columns=[f"Pred_{cls}" for cls in class_labels])

# Calculate per-class accuracy from confusion matrix
validation_per_class = {}
for i, class_name in enumerate(class_labels):
    class_total = cm[i, :].sum()  # Total actual samples of this class
    class_correct = cm[i, i]      # Correctly predicted samples
    class_accuracy = class_correct / class_total if class_total > 0 else 0
    validation_per_class[class_name] = class_accuracy

# Calculate overall accuracy (the counts are reused in the report below)
n_predictions = len(y_true_all)
n_correct = sum(yt == yp for yt, yp in zip(y_true_all, y_pred_all))
overall_accuracy = n_correct / n_predictions

# Identify perfect and poor performing classes
perfect_classes = [cls for cls, acc in validation_per_class.items() if acc == 1.0]
poor_classes = [cls for cls, acc in validation_per_class.items() if acc < 0.8]

# Number of feature sets the main benchmark tests for the current settings; derived
# here so the report text stays correct when min_features or the dataset changes.
n_combinations = sum(math.comb(total_features, r)
                     for r in range(min_features, total_features + 1))

# Generate validation report filename
model_name_clean = selected_model_name.lower().replace("classifier", "")
validation_filename = f"baseline_{model_name_clean}_validation_{min_features}_{total_features}.txt"
validation_path = results_dir / validation_filename

# Create comprehensive text report
report_content = []
report_content.append("=" * 80)
report_content.append(f"VALIDATION REPORT: {selected_model_name}")
report_content.append("=" * 80)
report_content.append(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
report_content.append(f"Dataset: {len(X):,} records, {total_features} features")
report_content.append(f"Validation uses all {total_features} features (complete feature set)")
report_content.append("")

# Confusion Matrix Section
report_content.append("CONFUSION MATRIX")
report_content.append("-" * 40)
report_content.append("Rows = True Bean Classes, Columns = Predicted Bean Classes")
report_content.append(f"Bean Classes: {class_labels}")
report_content.append("")
report_content.append(cm_df.to_string())
report_content.append("")

# Per-Class Accuracy Section
report_content.append("PER-CLASS ACCURACY (from confusion matrix)")
report_content.append("-" * 50)
for i, class_name in enumerate(class_labels):
    class_total = cm[i, :].sum()
    class_correct = cm[i, i]
    class_accuracy = validation_per_class[class_name]
    report_content.append(f"   Bean Class {class_name}: {class_accuracy:.4f} ({class_accuracy*100:.2f}%) - {class_correct}/{class_total} correct")

report_content.append("")

# Overall Accuracy Section
report_content.append("OVERALL PERFORMANCE")
report_content.append("-" * 30)
report_content.append(f"Overall Accuracy: {overall_accuracy:.4f} ({overall_accuracy*100:.2f}%)")
report_content.append("")

# Performance Classification
if perfect_classes:
    report_content.append(f"Perfect Classification (100%): Bean classes {perfect_classes}")
if poor_classes:
    report_content.append(f"Challenging Classification (<80%): Bean classes {poor_classes}")
if not perfect_classes and not poor_classes:
    report_content.append("All classes achieved 80%+ accuracy, none achieved 100%")

report_content.append("")

# Comparison to Main Results
report_content.append("COMPARISON TO MAIN BENCHMARK RESULTS")
report_content.append("-" * 45)
report_content.append(f"   • These per-class accuracies are for the complete {total_features}-feature set only")
report_content.append(f"   • Your main benchmark Excel file shows averages across all {n_combinations} feature combinations")
report_content.append("   • Expected: These validation results may differ slightly from main benchmark averages")
report_content.append("   • This validation confirms the accuracy calculation methodology")
report_content.append("")

# Additional Statistics (fold count and seed are read back from the splitter used above)
report_content.append("DETAILED STATISTICS")
report_content.append("-" * 25)
report_content.append(f"Cross-validation folds: {cv.n_splits}")
report_content.append(f"Random state: {cv.random_state} (reproducible results)")
report_content.append(f"Total predictions made: {n_predictions:,}")
report_content.append(f"Correct predictions: {n_correct:,}")
report_content.append(f"Incorrect predictions: {n_predictions - n_correct:,}")

# Class distribution
report_content.append("")
report_content.append("CLASS DISTRIBUTION IN VALIDATION")
report_content.append("-" * 35)
for class_name in class_labels:
    class_count = y_true_all.count(class_name)
    class_percentage = (class_count / n_predictions) * 100
    report_content.append(f"   Bean Class {class_name}: {class_count:,} samples ({class_percentage:.1f}%)")

report_content.append("")
report_content.append("=" * 80)
report_content.append("END OF VALIDATION REPORT")
report_content.append("=" * 80)

# Write to file. The report contains non-ASCII bullets, so the encoding is fixed to
# UTF-8 instead of depending on the platform's default text encoding.
with open(validation_path, 'w', encoding='utf-8') as f:
    f.write('\n'.join(report_content))

# Display summary to console
print(f"\nVALIDATION SUMMARY:")
print(f"   Model: {selected_model_name}")
print(f"   Overall Accuracy: {overall_accuracy:.4f} ({overall_accuracy*100:.2f}%)")
if perfect_classes:
    print(f"   Perfect Classes: {perfect_classes}")
if poor_classes:
    print(f"   ⚠️  Challenging Classes: {poor_classes}")

print(f"\nValidation report saved to: {validation_path}")
print(f"Report includes confusion matrix, per-class accuracy, and detailed statistics")
print(f"\nValidation complete! All results saved to text file.")


# Save confusion matrix to Excel format for consistency analysis
cm_excel_filename = f"baseline_confusion_matrices_all_models_{min_features}_{total_features}.xlsx"
cm_excel_path = results_dir / cm_excel_filename

# Add model name column to confusion matrix
cm_df_excel = cm_df.copy()
cm_df_excel.insert(0, 'Model', selected_model_name)

# Check if Excel file already exists
if cm_excel_path.exists():
    # File exists - add this model's sheet, replacing any sheet left by an earlier run
    with pd.ExcelWriter(cm_excel_path, mode='a', engine='openpyxl', if_sheet_exists='replace') as writer:
        cm_df_excel.to_excel(writer, sheet_name=selected_model_name, index=True)
    print(f"Added {selected_model_name} sheet to existing Excel file: {cm_excel_path}")
else:
    # File doesn't exist - create new workbook with first sheet
    cm_df_excel.to_excel(cm_excel_path, sheet_name=selected_model_name, index=True)
    print(f"Created new Excel file with {selected_model_name} sheet: {cm_excel_path}")


# Calculate and save F1-scores for this model,
# using the out-of-fold predictions collected during validation
f1_macro = f1_score(y_true_all, y_pred_all, average='macro')
f1_weighted = f1_score(y_true_all, y_pred_all, average='weighted')

# Create F1-scores summary (single row)
f1_summary = {
    'Model': selected_model_name,
    'F1_Macro': round(f1_macro, 4),
    'F1_Weighted': round(f1_weighted, 4),
    'Overall_Accuracy': round(overall_accuracy, 4)
}

# Create F1-scores DataFrame and save to Excel
f1_df = pd.DataFrame([f1_summary])
f1_filename = f"baseline_{model_name_clean}_f1_scores_{min_features}_{total_features}.xlsx"
f1_path = results_dir / f1_filename
f1_df.to_excel(f1_path, index=False)

# Confirmation print with full path
print(f"F1-scores Excel file created and saved: {f1_path}")
print(f"   F1-Macro: {f1_macro:.4f}, F1-Weighted: {f1_weighted:.4f}")


VALIDATION: Creating confusion matrix for GradientBoostingClassifier
Using all 16 features on 13611 records
This validates the per-class accuracy calculations

Running cross-validation...

VALIDATION SUMMARY:
   Model: GradientBoostingClassifier
   Overall Accuracy: 0.9259 (92.59%)

Validation report saved to: C:\Misc\ml_benchmark\outputs\baseline_results\baseline_gradientboosting_validation_14_16.txt
Report includes confusion matrix, per-class accuracy, and detailed statistics

Validation complete! All results saved to text file.
Added GradientBoostingClassifier sheet to existing Excel file: C:\Misc\ml_benchmark\outputs\baseline_results\baseline_confusion_matrices_all_models_14_16.xlsx
F1-scores Excel file created and saved: C:\Misc\ml_benchmark\outputs\baseline_results\baseline_gradientboosting_f1_scores_14_16.xlsx
   F1-Macro: 0.9375, F1-Weighted: 0.9259


In [None]:
# -------------------------------------------
# 7. CONSOLIDATE F1-SCORES FROM ALL MODELS INTO SINGLE EXCEL FILE
# -------------------------------------------

# Run this module only after all ML models have been tested and the individual F1-scores
# Excel files have been created in the prior code cells. It collects the data from
# each model's F1-scores Excel file and consolidates them into a single Excel workbook,
# one sheet per model.

import pandas as pd
from pathlib import Path

# Define search pattern for the per-model F1-score Excel files
f1_pattern = f"baseline_*_f1_scores_{min_features}_{total_features}.xlsx"
# Path.glob() yields matches in filesystem order, which differs between platforms;
# sorting keeps the sheet order of the consolidated workbook deterministic.
f1_files = sorted(results_dir.glob(f1_pattern))

# Create consolidated F1-scores Excel file
consolidated_f1_filename = f"baseline_f1_scores_all_models_{min_features}_{total_features}.xlsx"
consolidated_f1_path = results_dir / consolidated_f1_filename

if f1_files:
    print(f"\nConsolidating F1-scores from {len(f1_files)} models...")

    # Read every input BEFORE opening the output workbook, so a failed read cannot
    # replace a previously good consolidated file with a partial one.
    f1_sheets = []
    for f1_file in f1_files:
        # Extract model name from filename: baseline_{model}_f1_scores_{min}_{max}
        model_name = f1_file.stem.replace("baseline_", "").replace(f"_f1_scores_{min_features}_{total_features}", "")
        f1_data = pd.read_excel(f1_file)
        f1_sheets.append((model_name, f1_data))

    with pd.ExcelWriter(consolidated_f1_path, engine='openpyxl') as writer:
        for model_name, f1_data in f1_sheets:
            # Write to sheet named after model
            f1_data.to_excel(writer, sheet_name=model_name, index=False)

            print(f"   Added {model_name} F1-scores to consolidated file")

    print(f"\n✅ F1-scores consolidated successfully: {consolidated_f1_path}")
    print(f"   Total sheets: {len(f1_files)}")

else:
    print(f"\n⚠️  No F1-score files found matching pattern: {f1_pattern}")


Consolidating F1-scores from 16 models...
   Added adaboost F1-scores to consolidated file
   Added decisiontree F1-scores to consolidated file
   Added extratrees F1-scores to consolidated file
   Added gaussiannb F1-scores to consolidated file
   Added gradientboosting F1-scores to consolidated file
   Added kneighbors F1-scores to consolidated file
   Added lineardiscriminantanalysis F1-scores to consolidated file
   Added logisticregression F1-scores to consolidated file
   Added mlp F1-scores to consolidated file
   Added perceptron F1-scores to consolidated file
   Added quadraticdiscriminantanalysis F1-scores to consolidated file
   Added randomforest F1-scores to consolidated file
   Added ridge F1-scores to consolidated file
   Added sgd F1-scores to consolidated file
   Added svc F1-scores to consolidated file
   Added xgb F1-scores to consolidated file

✅ F1-scores consolidated successfully: C:\Misc\ml_benchmark\outputs\baseline_results\baseline_f1_scores_all_models_14_16.x