In [None]:
# -------------------------------------------
# 1. LOAD TUNED PARAMETERS
# -------------------------------------------

# Module 06_build_model_dict.ipynb:
#   Load best_params.json (output from module 05) 
#   Dynamically import model classes using AVAILABLE_MODELS for class/module mapping
#   Instantiate all 16 models with: 
#   -- best parameters (from JSON)
#   -- default parameters if model was not tuned (e.g., GaussianNB)


import json
import importlib
from pathlib import Path
from sklearn.base import BaseEstimator
import warnings
import joblib
warnings.filterwarnings("ignore")  # Ignore all warnings for cleaner output. Hypertuning has many warnings.

# Define this project's file locations.
# This notebook uses a centralized config.py file for all path management.

# Import config paths
import sys
sys.path.append('..')
from config import TUNED_MODELS_DIR, MODEL_DICT_PATH

# Define this project's paths to retrieve and save files.
# NOTE: model_dict is intentionally NOT loaded from MODEL_DICT_PATH here. That file is
# this notebook's own output (written by dump() in section 3), so loading it first made
# a fresh "Restart Kernel -> Run All" fail whenever the file did not exist yet, and the
# loaded value was discarded anyway when section 3 rebuilt model_dict from scratch.
tuned_models_dir = TUNED_MODELS_DIR

# Load tuning results (the winners from GridSearchCV runs)
best_params_path = Path(tuned_models_dir) / "best_params.json"
if not best_params_path.exists():
    # Same exception type open() would raise, but with guidance on how to fix it.
    raise FileNotFoundError(
        f"{best_params_path} not found. Run module 05 (hyperparameter tuning) "
        "to generate best_params.json before running this notebook."
    )
with open(best_params_path, "r", encoding="utf-8") as f:
    best_params = json.load(f)

# Confirm the full list of models was loaded
print(f"Loaded best_params.json with {len(best_params)} models:")
for tuned_name in best_params:
    print(f"   • {tuned_name}")


Loaded best_params.json with 16 models:
   • DecisionTreeClassifier
   • RandomForestClassifier
   • ExtraTreesClassifier
   • GradientBoostingClassifier
   • AdaBoostClassifier
   • XGBClassifier
   • LogisticRegression
   • RidgeClassifier
   • SGDClassifier
   • Perceptron
   • KNeighborsClassifier
   • SVC
   • GaussianNB
   • LinearDiscriminantAnalysis
   • QuadraticDiscriminantAnalysis
   • MLPClassifier


In [2]:
# -------------------------------------------
# 2. DEFINE AVAILABLE_MODELS METADATA
# -------------------------------------------


#  The .json file contains only the winning parameter values, not the metadata needed to instantiate models.
#  Therefore, this cell provides the metadata needed to build (instantiate) each model.

#  AVAILABLE_MODELS defines the metadata. It documents how to find and instantiate each model, with:
#  -- "class":  the model class name to use
#  -- "module":  the module path to import from
#  Must use the same AVAILABLE_MODELS structure as in module 05_tune_hyperparameters.ipynb for 
#  the instantiation of each model from .json best parameters file to work correctly in this module.
#  AVAILABLE_MODELS provides the "how to build" instructions for each model.


# Every entry follows the same schema:
#   "class" / "module"   -> read in this notebook to import and instantiate the estimator
#   "default_params"     -> constructor kwargs; tuned values from best_params.json override them
#   "requires_numeric_labels", "search_type", "scoring", "param_grid"
#                        -> not read in this notebook; presumably consumed by the tuning
#                           module (05) -- kept so both notebooks share one structure.
# Constructor arguments belong ONLY inside "default_params"; keys placed at the entry's
# top level are never passed to the estimator.
AVAILABLE_MODELS = {
    "DecisionTreeClassifier": {
        "class": "DecisionTreeClassifier",
        "module": "sklearn.tree",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42},
        "param_grid": {
            "criterion": ["gini", "entropy"],
            "max_depth": [None, 5, 10, 20],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 3]
        }
    },
    "RandomForestClassifier": {
        "class": "RandomForestClassifier",
        "module": "sklearn.ensemble",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42, "n_jobs": -1},
        "param_grid": {
            "n_estimators": [50, 100],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5],
            "min_samples_leaf": [1, 3]
        }
    },
    "ExtraTreesClassifier": {
        "class": "ExtraTreesClassifier",
        "module": "sklearn.ensemble",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42, "n_jobs": -1},
        "param_grid": {
            "n_estimators": [50, 100],
            "max_depth": [None, 10, 20],
            "min_samples_split": [2, 5]
        }
    },
    "GradientBoostingClassifier": {
        "class": "GradientBoostingClassifier",
        "module": "sklearn.ensemble",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {
            "random_state": 42,
            "subsample": 0.8,
            "max_depth": 5,
            "max_features": "sqrt"
        },
        "param_grid": {
            "n_estimators": [50, 100],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5]
        }
    },
    "AdaBoostClassifier": {
        "class": "AdaBoostClassifier",
        "module": "sklearn.ensemble",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42},
        "param_grid": {
            "n_estimators": [50, 100],
            "learning_rate": [0.5, 1.0]
        }
    },
    "XGBClassifier": {
        "class": "XGBClassifier",
        "module": "xgboost",
        "requires_numeric_labels": True,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {
            "random_state": 42,
            # NOTE(review): use_label_encoder is deprecated in recent xgboost releases;
            # kept so the model is built exactly as it was tuned -- confirm against the
            # installed xgboost version before removing.
            "use_label_encoder": False,
            "eval_metric": "mlogloss",
            "n_jobs": -1
        },
        "param_grid": {
            "n_estimators": [50, 100],
            "max_depth": [3, 6],
            "learning_rate": [0.1, 0.2],
            "subsample": [0.8, 1.0]
        }
    },
    "LogisticRegression": {
        "class": "LogisticRegression",
        "module": "sklearn.linear_model",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42, "n_jobs": -1, "max_iter": 1000},
        "param_grid": {
            "C": [0.1, 1, 10],
            "penalty": ["l2"],
            "solver": ["lbfgs", "liblinear"]
        }
    },
    "RidgeClassifier": {
        "class": "RidgeClassifier",
        "module": "sklearn.linear_model",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {},
        "param_grid": {
            "alpha": [0.1, 1.0, 10.0],
            "solver": ["auto", "sparse_cg"]
        }
    },
    "SGDClassifier": {
        "class": "SGDClassifier",
        "module": "sklearn.linear_model",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42, "n_jobs": -1},
        "param_grid": {
            "loss": ["hinge", "log_loss"],
            "alpha": [0.0001, 0.001],
            "penalty": ["l2", "l1"]
        }
    },
    "Perceptron": {
        "class": "Perceptron",
        "module": "sklearn.linear_model",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42, "max_iter": 1000, "n_jobs": -1},
        "param_grid": {
            "penalty": ["l2", "elasticnet", None],
            "alpha": [0.0001, 0.001]
        }
    },
    "KNeighborsClassifier": {
        "class": "KNeighborsClassifier",
        "module": "sklearn.neighbors",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"n_jobs": -1},
        "param_grid": {
            "n_neighbors": [3, 5, 7],
            "weights": ["uniform", "distance"],
            "metric": ["euclidean", "manhattan"]
        }
    },
    "SVC": {
        "class": "SVC",
        "module": "sklearn.svm",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42, "probability": True},
        "param_grid": {
            "C": [0.1, 1, 10],
            "kernel": ["linear", "rbf"],
            "gamma": ["scale", "auto"]
        }
    },
    "GaussianNB": {
        "class": "GaussianNB",
        "module": "sklearn.naive_bayes",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {},
        # Empty grid: nothing is tuned for this model.
        "param_grid": {}
    },
    "LinearDiscriminantAnalysis": {
        "class": "LinearDiscriminantAnalysis",
        "module": "sklearn.discriminant_analysis",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {},
        "param_grid": {
            "solver": ["svd", "lsqr"],
            "shrinkage": [None, "auto"]
        }
    },
    "QuadraticDiscriminantAnalysis": {
        "class": "QuadraticDiscriminantAnalysis",
        "module": "sklearn.discriminant_analysis",
        "requires_numeric_labels": False,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {},
        "param_grid": {
            "reg_param": [0.0, 0.1, 0.5]
        }
    },
    "MLPClassifier": {
        "class": "MLPClassifier",
        "module": "sklearn.neural_network",
        "requires_numeric_labels": True,
        "search_type": "grid",
        "scoring": "accuracy",
        "default_params": {"random_state": 42, "max_iter": 1000},
        "param_grid": {
            "hidden_layer_sizes": [(50,), (100,), (50, 50)],
            "activation": ["relu", "tanh"],
            "solver": ["adam", "lbfgs"],
            "alpha": [0.0001, 0.001, 0.01]
        }
    }
}


In [3]:
# -------------------------------------------
# 3. INSTANTIATE MODELS FROM TUNING RESULTS
# -------------------------------------------

# This code transforms the abstract tuning results (JSON parameter values) into
# ready-to-use model objects. The results are the actual models ready for benchmark comparison.
 
# Instantiate all 16 classification models, using either their tuned
# hyperparameters (from best_params.json) created in the prior module,
# or their default settings (e.g., GaussianNB). 
# Store the instantiated models in model_dict for each model's evaluation in the next module.


from joblib import dump
from pprint import pprint


def _instantiate_model(model_name, meta, tuning_results):
    """Build one estimator from its metadata and (optional) tuning results.

    Parameters
    ----------
    model_name : str
        Key used to look the model up in ``tuning_results``.
    meta : dict
        Metadata entry providing "class", "module" and, optionally, "default_params".
    tuning_results : dict
        Parsed best_params.json; the tuned values are read from
        ``tuning_results[model_name]["best_params"]`` when present.

    Returns
    -------
    tuple
        ``(estimator, was_tuned)`` where ``was_tuned`` is True when at least one
        tuned parameter was applied.

    Raises
    ------
    ImportError, AttributeError
        If the module or the class named in ``meta`` cannot be resolved.
    """
    # Dynamically import the model class.
    # Eliminates the need for 16 separate hardcoded imports and instantiation blocks.
    module = importlib.import_module(meta["module"])
    model_class = getattr(module, meta["class"])

    # `or {}` guards against JSON nulls (a null entry or a null "best_params"),
    # which would otherwise make the ** unpacking below raise TypeError.
    default_params = meta.get("default_params") or {}
    tuned_params = (tuning_results.get(model_name) or {}).get("best_params") or {}

    # Tuned values win over defaults when both define the same parameter.
    params = {**default_params, **tuned_params}
    return model_class(**params), bool(tuned_params)


model_dict = {}

print("Building model_dict from best_params.json...\n")

for model_name, meta in AVAILABLE_MODELS.items():
    # Use tuned parameters if they were generated, else use defaults only.
    model, was_tuned = _instantiate_model(model_name, meta, best_params)
    model_dict[model_name] = model

    comment = (
        "✅ Tuned + default params used"
        if was_tuned else
        "⚠️ Only default parameters used (no tuning found)"
    )
    print(f"{model_name.ljust(30)} | {comment}")

# Surface tuning results that have no metadata entry instead of dropping them silently.
unmatched_names = sorted(set(best_params) - set(AVAILABLE_MODELS))
if unmatched_names:
    print(
        "\n⚠️ Entries in best_params.json with no AVAILABLE_MODELS metadata (skipped): "
        + ", ".join(unmatched_names)
    )

# Print its keys to confirm that the dictionary, model_dict, is populated
print("\nmodel_dict created with the following keys:")
model_keys = list(model_dict.keys())

for i in range(0, len(model_keys), 4):
    print("   " + " | ".join(model_keys[i:i+4]))

# Display the complete parameter configuration for each instantiated model
# using scikit-learn's .get_params() method.
# Shows ALL parameters the model will use during training/testing - not just the ones
# explicitly set, but also all the scikit-learn defaults. Helps traceability and
# provides documentation of the benchmarking setup.

print("\nConfirm full constructor parameters for each model in model_dict:\n")

for model_name in sorted(model_dict.keys()):
    print(f"🧠 {model_name}")
    model = model_dict[model_name]
    pprint(model.get_params())
    print("-" * 60)

# Save the complete dictionary of instantiated, configured models to disk
# using joblib. The model_dict.joblib file becomes the primary input for the next step in
# this benchmarking project. The file provides a clean separation between hyperparameter
# optimization and the subsequent model benchmarking script.

# Make sure the target directory exists so dump() cannot fail on a fresh checkout.
Path(MODEL_DICT_PATH).parent.mkdir(parents=True, exist_ok=True)
dump(model_dict, MODEL_DICT_PATH)
print(f"\nSaved model_dict to: {MODEL_DICT_PATH}")

Building model_dict from best_params.json...

DecisionTreeClassifier         | ✅ Tuned + default params used
RandomForestClassifier         | ✅ Tuned + default params used
ExtraTreesClassifier           | ✅ Tuned + default params used
GradientBoostingClassifier     | ✅ Tuned + default params used
AdaBoostClassifier             | ✅ Tuned + default params used
XGBClassifier                  | ✅ Tuned + default params used
LogisticRegression             | ✅ Tuned + default params used
RidgeClassifier                | ✅ Tuned + default params used
SGDClassifier                  | ✅ Tuned + default params used
Perceptron                     | ✅ Tuned + default params used
KNeighborsClassifier           | ✅ Tuned + default params used
SVC                            | ✅ Tuned + default params used
GaussianNB                     | ⚠️ Only default parameters used (no tuning found)
LinearDiscriminantAnalysis     | ✅ Tuned + default params used
QuadraticDiscriminantAnalysis  | ✅ Tuned + default p

In [4]:
# -------------------------------------------
# 4. DISPLAY EACH MODEL'S HYPERPARAMETERS
# -------------------------------------------

# Show each estimator's repr, in alphabetical order of its model_dict key.
# The repr lists only the parameters that differ from the estimator's defaults.
print("\nModel instantiations that will be used (showing only scikt-learn's subset, not all actual params):\n")

for model_label, estimator in sorted(model_dict.items(), key=lambda entry: entry[0]):
    print(f'"{model_label}": {estimator}')





Model instantiations that will be used (showing only scikt-learn's subset, not all actual params):

"AdaBoostClassifier": AdaBoostClassifier(n_estimators=100, random_state=42)
"DecisionTreeClassifier": DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=42)
"ExtraTreesClassifier": ExtraTreesClassifier(min_samples_split=5, n_jobs=-1, random_state=42)
"GaussianNB": GaussianNB()
"GradientBoostingClassifier": GradientBoostingClassifier(max_features='sqrt', random_state=42, subsample=0.8)
"KNeighborsClassifier": KNeighborsClassifier(metric='euclidean', n_jobs=-1, n_neighbors=7,
                     weights='distance')
"LinearDiscriminantAnalysis": LinearDiscriminantAnalysis()
"LogisticRegression": LogisticRegression(C=10, max_iter=1000, n_jobs=-1, random_state=42)
"MLPClassifier": MLPClassifier(alpha=0.01, hidden_layer_sizes=[50], max_iter=1000,
              random_state=42)
"Perceptron": Perceptron(n_jobs=-1, random_state=42)
"QuadraticDiscriminantAnalysis": QuadraticD