# Assignment 5.3 – Hyperparameter Tuning (STEDI)

This notebook:
1. Loads the saved feature pipeline + transformed train/test datasets from the Workspace repo path (NOT DBFS).
2. Converts loaded data to numeric float matrices using `to_float_matrix`.
3. Runs **reasonable** `GridSearchCV` for:
   - Logistic Regression
   - Random Forest
4. Compares tuned models and selects the best model.
5. Saves **ONLY** the final best model back to the Workspace repo path.

**Important constraints:**
- Do NOT use `/dbfs`, `FileStore`, or `dbutils.fs`.
- Always verify files with `os.listdir`.
- Always print shapes after loading.


In [0]:
import os
import pickle
import joblib
import numpy as np

from scipy import sparse
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Start from the notebook's working directory when looking for repo files.
BASE_PATH = os.getcwd()

# Show the location first, then its contents, so relative paths used later
# can be checked by eye.
print("Current working directory:", BASE_PATH, sep="\n")
print("\nFiles in BASE_PATH:", os.listdir(BASE_PATH), sep="\n")


In [0]:
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
import os
import warnings
# Suppress every warning category from this point on (applies to later cells too).
warnings.filterwarnings("ignore")


# Artifacts are read from the "etl_pipeline" folder that sits next to the
# current working directory.
base_path = Path.cwd().joinpath("..", "etl_pipeline").resolve()

pipeline_path = base_path.joinpath("stedi_feature_pipeline.pkl")
X_train_path = base_path.joinpath("X_train_transformed.pkl")
X_test_path = base_path.joinpath("X_test_transformed.pkl")
y_train_path = base_path.joinpath("y_train.pkl")
y_test_path = base_path.joinpath("y_test.pkl")

print("BASE_PATH:", base_path)
print("Files:", [entry.name for entry in base_path.iterdir()])

# One True/False line per expected artifact.
print("\nChecking if files exist...")
print(
    f"Pipeline: {pipeline_path.exists()}",
    f"X_train: {X_train_path.exists()}",
    f"X_test: {X_test_path.exists()}",
    f"y_train: {y_train_path.exists()}",
    f"y_test: {y_test_path.exists()}",
    sep="\n",
)

print("\nLoading saved pipeline and datasets...")
try:
    feature_pipeline = joblib.load(pipeline_path)
    X_train_transformed = joblib.load(X_train_path)
    X_test_transformed = joblib.load(X_test_path)
    y_train = joblib.load(y_train_path)
    y_test = joblib.load(y_test_path)
    print("✓ Loading complete!")
except Exception as exc:
    # Failures are reported but not re-raised, so execution continues past
    # this cell even when nothing was loaded.
    print(f"✗ Error loading files: {type(exc).__name__}: {exc}")


In [0]:
import os
import pickle
import joblib
import numpy as np
from scipy import sparse

# Re-point BASE_PATH at the sibling "etl_pipeline" folder so this cell does
# not depend on the value set by an earlier cell.
BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "etl_pipeline"))
print(f"BASE_PATH: {BASE_PATH}")
print("Files:", os.listdir(BASE_PATH))

def load_pkl(path):
    """
    Load a serialized object from ``path``, trying joblib before plain pickle.

    ``joblib.load`` is attempted first. If it fails for any reason other than
    the file being absent, the error is printed and the file is read again
    with ``pickle.load``.

    Parameters
    ----------
    path : str or path-like
        Location of the ``.pkl`` file.

    Returns
    -------
    object
        The object stored in the file.

    Raises
    ------
    FileNotFoundError
        If ``path`` does not exist.
    Exception
        Whatever ``pickle.load`` raises when the fallback fails as well.
    """
    try:
        return joblib.load(path)
    except FileNotFoundError:
        # A missing file cannot be rescued by the pickle fallback, so fail
        # immediately instead of printing a misleading "joblib failed" line
        # and then failing a second time in open().
        raise
    except Exception as e:
        print(f"joblib failed on {path}: {e}")
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(path, "rb") as f:
            return pickle.load(f)

# Full paths of the five artifacts expected inside BASE_PATH.
pipeline_path, xtr_path, xte_path, ytr_path, yte_path = (
    os.path.join(BASE_PATH, filename)
    for filename in (
        "stedi_feature_pipeline.pkl",
        "X_train_transformed.pkl",
        "X_test_transformed.pkl",
        "y_train.pkl",
        "y_test.pkl",
    )
)

# Stop here, before any load is attempted, if something is absent.
required = [pipeline_path, xtr_path, xte_path, ytr_path, yte_path]
missing = list(filter(lambda candidate: not os.path.exists(candidate), required))
print("Missing files:", missing)

assert not missing, "One or more required .pkl files are missing."

# Read each artifact through the joblib-then-pickle loader.
feature_pipeline = load_pkl(pipeline_path)
X_train_transformed = load_pkl(xtr_path)
X_test_transformed = load_pkl(xte_path)
y_train = load_pkl(ytr_path)
y_test = load_pkl(yte_path)

# Report the Python type of every loaded object, one per line.
print("\nType checks:")
print(
    *map(
        type,
        (feature_pipeline, X_train_transformed, X_test_transformed, y_train, y_test),
    ),
    sep="\n",
)

# Shape checks
def get_shape(x):
    """
    Return the shape of ``x`` as a tuple.

    Parameters
    ----------
    x : scipy sparse matrix or array-like
        Sparse matrices report their own ``shape``; anything else is
        measured the way numpy would see it.

    Returns
    -------
    tuple
        Dimensions of ``x``.
    """
    if sparse.issparse(x):
        return x.shape
    # np.shape yields the same tuple as np.array(x).shape but does not copy
    # the data when x already exposes a shape.
    return np.shape(x)

print("\nShape checks:")
# Feature matrices may be sparse, so they go through get_shape; the targets
# are measured after conversion to a numpy array.
print(f"X_train_transformed: {get_shape(X_train_transformed)}")
print(f"X_test_transformed: {get_shape(X_test_transformed)}")
print(f"y_train: {np.array(y_train).shape}")
print(f"y_test: {np.array(y_test).shape}")


In [0]:
import os

# Diagnostic: where the kernel is running and what that folder contains.
print("Current working directory:", os.getcwd(), sep="\n")
print("\nFiles in current directory:", os.listdir(os.curdir), sep="\n")


In [0]:
import os

print("Current directory:")
print(os.getcwd())

# The folder below belongs to one specific workspace account. Guard it so
# that running the notebook elsewhere lists the parent of the working
# directory instead of aborting with FileNotFoundError.
user_workspace_dir = "/Workspace/Users/dec816@ensign.edu"
if not os.path.isdir(user_workspace_dir):
    user_workspace_dir = os.path.dirname(os.getcwd())

print("\nListing parent directories:")
print(os.listdir(user_workspace_dir))


## Step 2 — Convert to Numeric Matrices (`to_float_matrix`)

Some ML models require numeric float matrices.
We convert:
- `X_train_transformed` → `X_train_float`
- `X_test_transformed` → `X_test_float`

We also ensure `y_train` and `y_test` are 1D arrays.


In [0]:
import os

# The notebook executes from the "notebooks" folder, while the .pkl files
# are kept in the neighbouring "etl_pipeline" folder.
BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), os.pardir, "etl_pipeline"))

print(f"CWD: {os.getcwd()}")
print(f"BASE_PATH: {BASE_PATH}")
print(f"BASE_PATH exists: {os.path.exists(BASE_PATH)}")
print("Files in BASE_PATH:")
print(os.listdir(BASE_PATH))


In [0]:
def to_float_matrix(X):
    """
    Return ``X`` as a float64 matrix.

    Scipy sparse input is cast directly and stays sparse. Any other input
    (numpy array, list, ...) is first turned into a numpy array and then
    cast.
    """
    matrix = X if sparse.issparse(X) else np.array(X)
    return matrix.astype(np.float64)

# Feature matrices cast to float64.
X_train_float = to_float_matrix(X_train_transformed)
X_test_float = to_float_matrix(X_test_transformed)

# Targets flattened to one dimension.
y_train_arr = np.ravel(np.array(y_train))
y_test_arr = np.ravel(np.array(y_test))

print("After conversion:")
print(f"X_train_float type: {type(X_train_float)} shape: {get_shape(X_train_float)}")
print(f"X_test_float type: {type(X_test_float)} shape: {get_shape(X_test_float)}")
print(f"y_train_arr shape: {y_train_arr.shape} dtype: {y_train_arr.dtype}")
print(f"y_test_arr shape: {y_test_arr.shape} dtype: {y_test_arr.dtype}")

# For sparse training data only: share of cells that hold a stored value.
if sparse.issparse(X_train_float):
    nnz = X_train_float.nnz
    total = X_train_float.shape[0] * X_train_float.shape[1]
    print(f"Sparse matrix nnz={nnz:,} out of {total:,} ({nnz/total:.6f} density)")


## Step 3 — Hyperparameter Tuning: Logistic Regression (GridSearchCV)

We run a **small, reasonable** grid to avoid long runtimes.
We use 3-fold CV and accuracy scoring.


In [0]:
# Seeded because the liblinear and saga solvers shuffle the data; without a
# fixed random_state the search is not reproducible across Restart & Run All.
# 42 matches the seed used for the Random Forest in this notebook.
log_reg = LogisticRegression(random_state=42)

# Reasonable grid (fast but still meaningful):
# 3 C values x 2 solvers x 1 penalty x 2 max_iter = 12 candidates.
log_reg_param_grid = {
    "C": [0.1, 1.0, 10.0],
    "solver": ["liblinear", "saga"],
    "penalty": ["l2"],
    "max_iter": [200, 500]
}

# 3-fold cross-validation scored on accuracy, using all available cores.
log_reg_grid = GridSearchCV(
    estimator=log_reg,
    param_grid=log_reg_param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1
)

log_reg_grid.fit(X_train_float, y_train_arr)

print("Best Logistic Regression Params:", log_reg_grid.best_params_)
print("Best CV Accuracy (LogReg):", log_reg_grid.best_score_)


In [0]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import ConvergenceWarning
import warnings

# Hide solver convergence warnings emitted during the search.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# Scale features before the classifier. with_mean=False skips centering, so
# the scaler also accepts sparse input.
# The classifier is seeded because liblinear shuffles the data; without a
# fixed random_state the search is not reproducible across Restart & Run All.
pipe = Pipeline([
    ("scaler", StandardScaler(with_mean=False)),
    ("clf", LogisticRegression(random_state=42))
])

# MUCH smaller grid (only liblinear, one max_iter): 3 candidates.
# The "clf__" prefix routes each parameter to the pipeline's classifier step.
param_grid = {
    "clf__C": [0.1, 1.0, 10.0],
    "clf__solver": ["liblinear"],
    "clf__penalty": ["l2"],
    "clf__max_iter": [2000]
}

# 3-fold cross-validation scored on accuracy, using all available cores.
grid = GridSearchCV(
    pipe,
    param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
    verbose=1
)

grid.fit(X_train_float, y_train_arr)

print("Best Params:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)


## Step 3b — Evaluate Tuned Logistic Regression on Test Set


In [0]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Evaluate the search fitted in the preceding cell (`grid`, the scaled
# Logistic Regression pipeline). This cell previously read `grid_lr`, which
# is only created much further down the notebook, so a fresh
# Restart & Run All stopped here with NameError.
best_lr = grid.best_estimator_

# Score the tuned model on the held-out test split.
y_pred_lr = best_lr.predict(X_test_float)
lr_test_acc = accuracy_score(y_test_arr, y_pred_lr)

print("Tuned Logistic Regression Test Accuracy:", lr_test_acc)
print("\nConfusion Matrix:\n", confusion_matrix(y_test_arr, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test_arr, y_pred_lr))


## Step 4 — Hyperparameter Tuning: Random Forest (GridSearchCV)

Random Forest can get slow if the grid is too large.
This is a **small grid** designed to stay within a reasonable runtime.


In [0]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Fixed seed so the forest is built the same way on every run.
rf = RandomForestClassifier(random_state=42)

# 2 x 2 x 2 = 8 candidate settings.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20],
    min_samples_split=[2, 5],
)

# Accuracy-scored 3-fold search across all cores, with progress output.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=3,
    scoring="accuracy",
    verbose=1,
    n_jobs=-1,
)
grid_rf.fit(X_train_float, y_train_arr)

print("Best RF Params:", grid_rf.best_params_)
print("Best RF CV Accuracy:", grid_rf.best_score_)


## Step 4b — Evaluate Tuned Random Forest on Test Set


In [0]:
# Best estimator found by the Random Forest grid search.
best_rf = grid_rf.best_estimator_

# Predict on the held-out split and summarise the result.
y_pred_rf = best_rf.predict(X_test_float)
rf_test_acc = accuracy_score(y_true=y_test_arr, y_pred=y_pred_rf)

print("Tuned Random Forest Test Accuracy:", rf_test_acc)
print("\nConfusion Matrix:\n", confusion_matrix(y_true=y_test_arr, y_pred=y_pred_rf))
print("\nClassification Report:\n", classification_report(y_true=y_test_arr, y_pred=y_pred_rf))


## Step 5 — Compare Tuned Models and Select Best

We compare **test accuracy** (not just CV accuracy) and select the best model.


In [0]:
print("LR Test Accuracy:", lr_test_acc)
print("RF Test Accuracy:", rf_test_acc)

# Random Forest wins only when it is strictly more accurate on the test
# split; on a tie Logistic Regression is kept.
best_model_name, best_model, best_acc = (
    ("RandomForestClassifier (Tuned)", best_rf, rf_test_acc)
    if rf_test_acc > lr_test_acc
    else ("LogisticRegression (Tuned)", best_lr, lr_test_acc)
)

print("\n✅ Selected Best Model:", best_model_name)
print("Best Model Test Accuracy:", best_acc)


## Step 6 — Save ONLY the Final Best Model (Workspace Repo Path)

We save the final selected model to:
- `best_model_final.pkl`

We do NOT save to DBFS/FileStore.
We verify the file exists by listing the directory after saving.

If an old version exists, we delete it first (explicitly).


In [0]:
import os
import joblib

final_model_path = os.path.join(BASE_PATH, "best_model_final.pkl")

# Remove any earlier artifact first, so the file listed below can only be
# the one written by this run.
if os.path.exists(final_model_path):
    print("Deleting old model:", final_model_path)
    os.remove(final_model_path)

joblib.dump(best_model, final_model_path)
print("✅ Saved best model to:", final_model_path)

# Verify by listing the target folder and checking the file is present.
print("\nFiles now in BASE_PATH:", os.listdir(BASE_PATH), sep="\n")

assert os.path.exists(final_model_path), "Save failed: best_model_final.pkl not found."


## Hyperparameter Tuning — Logistic Regression

In [0]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Seeded because the liblinear solver shuffles the data; without a fixed
# random_state the search is not reproducible across Restart & Run All.
# 42 matches the seed used for the Random Forest in this notebook.
log_reg = LogisticRegression(random_state=42)

# Three regularisation strengths with a single solver and iteration cap.
param_grid_lr = {
    "C": [0.1, 1.0, 10.0],
    "solver": ["liblinear"],
    "max_iter": [200]
}

# 3-fold cross-validation scored on accuracy, using all available cores.
grid_lr = GridSearchCV(
    log_reg,
    param_grid_lr,
    scoring="accuracy",
    cv=3,
    n_jobs=-1
)

grid_lr.fit(X_train_float, y_train_arr)

print("Best LR Params:", grid_lr.best_params_)
print("Best LR CV Accuracy:", grid_lr.best_score_)


## Hyperparameter Tuning — Random Forest


In [0]:
from sklearn.ensemble import RandomForestClassifier

# Fixed seed so the forest is built the same way on every run.
rf = RandomForestClassifier(random_state=42)

# 2 x 2 = 4 candidate settings.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 20],
)

# Accuracy-scored 3-fold search across all cores.
grid_rf = GridSearchCV(
    estimator=rf,
    param_grid=param_grid_rf,
    cv=3,
    scoring="accuracy",
    n_jobs=-1,
)
grid_rf.fit(X_train_float, y_train_arr)

print("Best RF Params:", grid_rf.best_params_)
print("Best RF CV Accuracy:", grid_rf.best_score_)


## Compare Tuned Models on Test Set


In [0]:
from sklearn.metrics import accuracy_score

# Winning estimator from each grid search.
best_lr, best_rf = grid_lr.best_estimator_, grid_rf.best_estimator_

# Accuracy of each winner on the held-out test split.
lr_test_acc = accuracy_score(y_true=y_test_arr, y_pred=best_lr.predict(X_test_float))
rf_test_acc = accuracy_score(y_true=y_test_arr, y_pred=best_rf.predict(X_test_float))

print("Tuned LR Test Accuracy:", lr_test_acc)
print("Tuned RF Test Accuracy:", rf_test_acc)


## Select Best Model and Save


In [0]:
import joblib
import os


# Random Forest is chosen only when it is strictly more accurate on the test
# split; on a tie Logistic Regression is kept.
best_model = best_rf if rf_test_acc > lr_test_acc else best_lr

save_path = os.path.join(BASE_PATH, "best_model_final.pkl")
joblib.dump(best_model, save_path)

print("Best model saved to:", save_path)

# Follow the notebook's own rule of verifying saved files with os.listdir,
# and fail loudly if the artifact is not where it should be.
print("\nFiles now in BASE_PATH:")
print(os.listdir(BASE_PATH))

assert os.path.exists(save_path), "Save failed: best_model_final.pkl not found."


## Model Performance Summary

Two models were tuned using GridSearchCV:

• Logistic Regression  
• Random Forest Classifier  

After hyperparameter tuning and evaluation on the test dataset:

- Tuned Logistic Regression Test Accuracy: ~0.951
- Tuned Random Forest Test Accuracy: ~0.951

Both models achieved nearly identical performance on unseen data.

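Because the models performed similarly, Logistic Regression was selected as the final model due to its simplicity, interpretability, and lower computational cost. This matches the selection code above, which keeps Logistic Regression unless Random Forest achieves a strictly higher test accuracy.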


## Bias and Fairness Considerations

Machine learning models can reflect biases present in the training data.

Potential risks include:

• Demographic bias  
• Underrepresentation of certain groups  
• Historical bias embedded in the dataset  

To mitigate these risks, it is important to:

• Evaluate model performance across different subgroups  
• Monitor for disparate impact  
• Continuously retrain with updated and diverse data
