## Databricks Notebook: Model Training with Hyperparameter Tuning
Overview


This notebook performs hyperparameter tuning for two machine learning models (Logistic Regression and Random Forest) using previously transformed features from the STEDI project. The notebook loads saved pipeline artifacts, converts features to appropriate formats, performs grid search, compares models, and saves the best performing model.

In [0]:
# Import necessary libraries
import joblib
import numpy as np
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Set up CORRECT Databricks paths
# In Databricks, use paths without /dbfs/ prefix for joblib
# NOTE(review): confirm this matches the location the previous notebook
# actually wrote to; the existence checks below will show False otherwise.
base_path = "/etl_pipeline"
pipeline_path = f"{base_path}/stedi_feature_pipeline.pkl"
X_train_path = f"{base_path}/X_train_transformed.pkl"
X_test_path = f"{base_path}/X_test_transformed.pkl"
y_train_path = f"{base_path}/y_train.pkl"
y_test_path = f"{base_path}/y_test.pkl"

# Verify files exist before loading (purely informational; loading below is
# what actually fails if something is missing).
import os
print("Checking if files exist...")
artifact_paths = {
    "Pipeline": pipeline_path,
    "X_train": X_train_path,
    "X_test": X_test_path,
    "y_train": y_train_path,
    "y_test": y_test_path,
}
for artifact_label, artifact_path in artifact_paths.items():
    print(f"{artifact_label} file exists: {os.path.exists(artifact_path)}")

# Load the saved pipeline and datasets.
# SECURITY: joblib.load unpickles the file contents, which can execute
# arbitrary code. Only load artifacts produced by a trusted pipeline.
print("\nLoading saved pipeline and datasets...")
try:
    feature_pipeline = joblib.load(pipeline_path)
    X_train_transformed = joblib.load(X_train_path)
    X_test_transformed = joblib.load(X_test_path)
    y_train = joblib.load(y_train_path)
    y_test = joblib.load(y_test_path)
    print("✓ Loading complete!")
except Exception as e:
    print(f"✗ Error loading files: {e}")
    print("\nTroubleshooting tips:")
    print("1. Check if the files were created in the previous assignment")
    print(f"2. Verify the path: {base_path}/")
    print("3. List files in the directory:")
    try:
        print(os.listdir(base_path))
    except OSError:
        # Only filesystem errors are expected here (missing directory,
        # permissions); anything else should surface normally.
        print("Could not list directory contents")
    # Stop the notebook here: every later cell needs the loaded objects, and
    # continuing would only fail later with a less helpful NameError.
    raise

2. Feature Conversion

Convert transformed feature arrays into clean numeric 2-D NumPy arrays. This step is essential because the transformed arrays may be sparse, object-typed, or oddly shaped.

In [0]:
def convert_to_numeric_2d(array_like):
    """
    Convert transformed feature arrays into clean numeric 2-D NumPy arrays.

    Handles sparse matrices, DataFrames, and other possible outputs from the
    pipeline.

    Parameters
    ----------
    array_like : object
        Anything ``np.array`` can consume, or an object exposing ``toarray()``
        (e.g. a sparse matrix) or ``values`` (e.g. a pandas object).

    Returns
    -------
    numpy.ndarray
        A 2-D array. 1-D input becomes a single column. Object- and
        string-typed data is cast to float; if that cast fails the data is
        one-hot encoded instead. When any NaN is present the array is passed
        through ``np.nan_to_num``.

    Notes
    -----
    The one-hot fallback fits a brand-new encoder on every call, so two
    arrays converted separately (e.g. train and test) can end up with
    different column counts. Callers should verify the shapes agree.
    """
    # Convert to dense array if sparse
    if hasattr(array_like, 'toarray'):
        array_like = array_like.toarray()

    # Convert to numpy array if DataFrame
    if hasattr(array_like, 'values'):
        array_like = array_like.values

    # Ensure it's a numpy array
    array_like = np.array(array_like)

    # Handle 1-D arrays by reshaping to 2-D
    if array_like.ndim == 1:
        array_like = array_like.reshape(-1, 1)

    # Object arrays AND fixed-width string/bytes arrays need an explicit cast:
    # np.isnan below raises TypeError on any of these dtypes.
    if array_like.dtype.kind in ('O', 'U', 'S'):
        try:
            array_like = array_like.astype(float)
        except ValueError:
            # If conversion fails, use one-hot encoding for categorical columns
            from sklearn.preprocessing import OneHotEncoder
            try:
                # Current scikit-learn spelling of the dense-output switch.
                encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
            except TypeError:
                # Older scikit-learn releases only know the `sparse` keyword.
                encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
            array_like = encoder.fit_transform(array_like)

    # Final check for NaN values
    if np.isnan(array_like).any():
        array_like = np.nan_to_num(array_like)

    return array_like

# Apply conversion to transformed datasets
print("Converting transformed features to numeric 2-D arrays...")
X_train = convert_to_numeric_2d(X_train_transformed)
X_test = convert_to_numeric_2d(X_test_transformed)

# Fail fast on inconsistent shapes. Train and test are converted by separate
# calls, so nothing above guarantees they end up with the same number of
# feature columns, and a mismatch would otherwise only surface at predict time.
assert X_train.shape[1] == X_test.shape[1], (
    f"Feature count mismatch: X_train has {X_train.shape[1]} columns, "
    f"X_test has {X_test.shape[1]}"
)
assert X_train.shape[0] == len(y_train), (
    f"Row count mismatch: X_train has {X_train.shape[0]} rows, "
    f"y_train has {len(y_train)}"
)
assert X_test.shape[0] == len(y_test), (
    f"Row count mismatch: X_test has {X_test.shape[0]} rows, "
    f"y_test has {len(y_test)}"
)

# Verify shapes
# NOTE(review): y_train / y_test are assumed to expose .shape and .dtype
# (NumPy array or pandas Series) — confirm against the notebook that saved them.
print("\n=== Dataset Shapes ===")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

# Verify data types
print("\n=== Data Types ===")
print(f"X_train dtype: {X_train.dtype}")
print(f"X_test dtype: {X_test.dtype}")
print(f"y_train dtype: {y_train.dtype}")
print(f"y_test dtype: {y_test.dtype}")

# Sample preview
print("\n=== Sample Data (first 3 rows) ===")
print("X_train sample:")
print(X_train[:3])
print("\ny_train sample:")
print(y_train[:3])

3. Logistic Regression Tuning



Perform hyperparameter tuning for Logistic Regression using GridSearchCV with 5-fold cross-validation.

In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
import time

print("Starting Logistic Regression Hyperparameter Tuning...", "=" * 50, sep="\n")

# Search space: 4 * 2 * 2 * 3 * 2 = 96 candidates, i.e. 480 fits with 5 folds.
param_grid_lr = dict(
    C=[0.01, 0.1, 1.0, 10.0],          # regularization strength
    penalty=['l1', 'l2'],              # regularization type
    solver=['liblinear', 'saga'],      # solvers that support L1
    max_iter=[100, 500, 1000],
    class_weight=[None, 'balanced'],
)

# Exhaustive search scored on accuracy with 5-fold CV, using every core.
lr_model = LogisticRegression(random_state=42)
grid_search_lr = GridSearchCV(
    lr_model,
    param_grid_lr,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
    verbose=1,
)

# Wall-clock timing of the whole search (including the final refit).
start_time = time.time()
print("Performing grid search for Logistic Regression...")
grid_search_lr.fit(X_train, y_train)
training_time = time.time() - start_time

# Summary of the winning configuration.
print(
    "\n=== Logistic Regression Tuning Results ===",
    f"Best parameters: {grid_search_lr.best_params_}",
    f"Best cross-validation score: {grid_search_lr.best_score_:.4f}",
    f"Training time: {training_time:.2f} seconds",
    sep="\n",
)

# Hold-out performance of the refit best estimator.
y_pred_lr = grid_search_lr.predict(X_test)
test_accuracy_lr = accuracy_score(y_test, y_pred_lr)
print(f"Test set accuracy: {test_accuracy_lr:.4f}")

# Keep a direct handle on the tuned model for the comparison step.
best_lr_model = grid_search_lr.best_estimator_

4. Random Forest Tuning


Perform hyperparameter tuning for Random Forest using GridSearchCV with 5-fold cross-validation.

In [0]:
from sklearn.ensemble import RandomForestClassifier

# This cell reuses GridSearchCV, accuracy_score and time, which are imported
# by the Logistic Regression tuning cell.

print("\nStarting Random Forest Hyperparameter Tuning...")
print("=" * 50)

# Define the parameter grid for Random Forest.
# 3 * 4 * 3 * 3 * 2 * 3 = 648 candidates, i.e. 3240 fits with 5 folds.
param_grid_rf = {
    'n_estimators': [50, 100, 200],  # Number of trees
    'max_depth': [10, 20, 30, None],  # Maximum depth of trees
    'min_samples_split': [2, 5, 10],  # Minimum samples to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features for best split
    'class_weight': [None, 'balanced', 'balanced_subsample']
}

# Parallelize at ONE level only. Requesting all cores on both the forest and
# the grid search makes every search worker spawn its own full set of
# tree-building threads, oversubscribing the CPUs. With thousands of
# independent fits, the grid-search level is the one worth parallelizing.
rf_model = RandomForestClassifier(random_state=42)
grid_search_rf = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid_rf,
    cv=5,  # 5-fold cross-validation
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

# Record start time
start_time = time.time()

# Fit GridSearchCV
print("Performing grid search for Random Forest...")
grid_search_rf.fit(X_train, y_train)

# Record end time
training_time = time.time() - start_time

# The search is finished, so the refit model can safely use all cores again;
# this keeps the stored estimator configured as before (n_jobs=-1).
grid_search_rf.best_estimator_.set_params(n_jobs=-1)

# Display results
print("\n=== Random Forest Tuning Results ===")
print(f"Best parameters: {grid_search_rf.best_params_}")
print(f"Best cross-validation score: {grid_search_rf.best_score_:.4f}")
print(f"Training time: {training_time:.2f} seconds")

# Evaluate on test set
y_pred_rf = grid_search_rf.predict(X_test)
test_accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f"Test set accuracy: {test_accuracy_rf:.4f}")

# Store best model
best_rf_model = grid_search_rf.best_estimator_


5. Model Comparison


Compare the tuned models and select the best one based on cross-validation performance.

In [0]:
print("\n=== Model Comparison ===")
print("=" * 50)

# Create comparison dictionary as required
model_comparison = {
    'LogisticRegression': {
        'best_params': grid_search_lr.best_params_,
        'best_cv_score': grid_search_lr.best_score_,
        'test_accuracy': test_accuracy_lr,
        'model': best_lr_model
    },
    'RandomForest': {
        'best_params': grid_search_rf.best_params_,
        'best_cv_score': grid_search_rf.best_score_,
        'test_accuracy': test_accuracy_rf,
        'model': best_rf_model
    }
}

# Select best model based on cross-validation score. This happens BEFORE the
# table is printed so the "Selected" column can actually be filled in.
# Logistic Regression wins only on a strictly higher score; a tie goes to
# Random Forest.
lr_cv_score = model_comparison['LogisticRegression']['best_cv_score']
rf_cv_score = model_comparison['RandomForest']['best_cv_score']
best_model_name = 'LogisticRegression' if lr_cv_score > rf_cv_score else 'RandomForest'
best_model = model_comparison[best_model_name]['model']

# Display comparison table
print("\nModel Performance Comparison:")
print("-" * 80)
print(f"{'Model':<20} {'CV Score':<15} {'Test Accuracy':<15} {'Selected':<10}")
print("-" * 80)

for model_name, results in model_comparison.items():
    selected_marker = '✓' if model_name == best_model_name else ''
    print(
        f"{model_name:<20} {results['best_cv_score']:<15.4f} "
        f"{results['test_accuracy']:<15.4f} {selected_marker:<10}"
    )

print(f"\n✓ Selected best model: {best_model_name} (higher CV score)")

# Print detailed comparison
print("\n" + "=" * 80)
print("DETAILED COMPARISON")
print("=" * 80)

for model_name, results in model_comparison.items():
    print(f"\n{model_name}:")
    print(f"  Best CV Score: {results['best_cv_score']:.4f}")
    print(f"  Test Accuracy: {results['test_accuracy']:.4f}")
    print(f"  Parameters: {results['best_params']}")

6. Model Saving

Save the selected best model and comparison results for future use.

In [0]:
import os
from datetime import datetime

print("\n=== Saving Selected Model ===", "=" * 50, sep="\n")

# Target directory for all artifacts; created on first use.
models_dir = "/models"
os.makedirs(models_dir, exist_ok=True)

# Second-resolution timestamp that versions this run's files.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# The model is written twice: a timestamped copy for history and a fixed
# "latest" name for easy reference.
model_filename = f"{models_dir}/stedi_best_model_{timestamp}.pkl"
latest_model_filename = f"{models_dir}/stedi_best_model_latest.pkl"
for target_path in (model_filename, latest_model_filename):
    joblib.dump(best_model, target_path)

print(
    f"✓ Best model ({best_model_name}) saved to:",
    f"  - Versioned: {model_filename}",
    f"  - Latest: {latest_model_filename}",
    sep="\n",
)

# Persist the comparison dictionary alongside the model.
comparison_filename = f"{models_dir}/model_comparison_{timestamp}.pkl"
joblib.dump(model_comparison, comparison_filename)
print(f"✓ Model comparison saved to: {comparison_filename}")

# Round-trip check: read the "latest" file back and report its type.
print("\nVerifying saved model can be loaded...")
loaded_model = joblib.load(latest_model_filename)
print(f"✓ Model loaded successfully. Type: {type(loaded_model)}")

7. Evaluation & Ethics Reflection

Provide comprehensive evaluation metrics and ethical considerations for the STEDI project.

In [0]:
print("\n=== Evaluation & Ethics Reflection ===")
print("=" * 50)

# Import metrics for comprehensive evaluation
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

# Generate predictions from best model.
# The probability column [:, 1] is only meaningful for a two-class problem.
# NOTE(review): the target is assumed to be binary — confirm.
y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, 'predict_proba') else None

print("\n1. PERFORMANCE EVALUATION")
print("-" * 40)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

# Classification Report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# ROC-AUC if probabilities are available
if y_pred_proba is None:
    # The selected model exposes no predict_proba, so there is nothing to score.
    print("ROC-AUC calculation skipped (requires probability estimates)")
else:
    try:
        roc_auc = roc_auc_score(y_test, y_pred_proba)
        print(f"ROC-AUC Score: {roc_auc:.4f}")
    except ValueError as roc_error:
        # Report the real reason (e.g. unsupported target format) instead of
        # hiding every possible failure behind a generic message.
        print(f"ROC-AUC calculation skipped: {roc_error}")

# Section 2: static ethics write-up, printed verbatim.
print("\n2. ETHICAL CONSIDERATIONS", "-" * 40, sep="\n")

ethical_reflection = """
For the STEDI project, which involves human activity/fall detection:

a) **Fairness & Bias**: 
   - The model must perform equally well across different demographic groups
   - Training data should represent diverse age groups, mobility levels, and body types
   - Regular bias audits should be conducted

b) **Privacy**: 
   - Motion sensor data must be anonymized and secured
   - Inference should happen locally when possible to protect user data
   - Clear data retention policies are needed

c) **Safety**: 
   - False negatives (missed falls) are more dangerous than false positives
   - Model confidence thresholds should prioritize recall over precision
   - A human-in-the-loop system for critical decisions is recommended

d) **Transparency**: 
   - Users should understand how predictions are made
   - Model limitations should be clearly communicated
   - Fallback mechanisms for model uncertainty are essential

e) **Continual Monitoring**: 
   - Model performance should be monitored in production
   - Concept drift detection for changing user behaviors
   - Regular retraining with new, representative data
"""

print(ethical_reflection)

# Section 3: static deployment checklist, printed verbatim.
print("\n3. DEPLOYMENT RECOMMENDATIONS", "-" * 40, sep="\n")

recommendations = """
1. **Model Serving**: Deploy as a REST API using MLflow or Databricks Model Serving
2. **Monitoring**: Implement tracking of:
   - Prediction latency
   - Model accuracy drift
   - Feature distribution shifts
3. **Alerting**: Set up alerts for:
   - Sudden drop in performance
   - Increased false negative rate
   - System downtime
4. **Version Control**: Maintain model version history with:
   - Training data snapshots
   - Hyperparameter configurations
   - Performance metrics
5. **Compliance**: Ensure compliance with healthcare regulations if applicable
"""

print(recommendations)

# Section 4: two-panel figure summarizing the selected model and the comparison.
print("\n4. VISUAL SUMMARY", "-" * 40, sep="\n")

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Left panel: confusion matrix of the selected model as an annotated heatmap.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0])
axes[0].set(
    title=f'Confusion Matrix - {best_model_name}',
    xlabel='Predicted Label',
    ylabel='True Label',
)

# Right panel: grouped bars, CV score next to test accuracy for each model.
models = list(model_comparison)
cv_scores = [entry['best_cv_score'] for entry in model_comparison.values()]
test_scores = [entry['test_accuracy'] for entry in model_comparison.values()]

x = np.arange(len(models))
width = 0.35

# Each pair of bars is centered on its tick: one shifted left, one right.
axes[1].bar(x - width/2, cv_scores, width, label='CV Score', color='skyblue')
axes[1].bar(x + width/2, test_scores, width, label='Test Accuracy', color='lightcoral')
axes[1].set(
    title='Model Performance Comparison',
    xlabel='Model',
    ylabel='Score',
)
axes[1].set_xticks(x)
axes[1].set_xticklabels(models)
axes[1].legend()
axes[1].set_ylim([0, 1.0])

plt.tight_layout()
# `display` is supplied by the notebook environment rather than imported here.
display(fig)

print("\n✓ Notebook execution completed successfully!")
print("✓ Best model saved and ready for deployment.")