# 3.0 - Model Training - Team 34
Training and comparing multiple ML models

## Setup

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#from pathlib import Path
#import sys
#sys.path.append(str(Path('C:/my-ws/projects/MLOps/Fase2').resolve()))

from fase2.config import config
from fase2.core.data_processor import DataProcessor
from fase2.core.feature_engineer import FeatureEngineer
from fase2.pipeline_builder import PipelineBuilder
from fase2.plots import plot_confusion_matrix, plot_roc_curve, plot_model_comparison

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    confusion_matrix,
)

# Plotting defaults applied to every figure created later in this notebook.
sns.set_style(style="whitegrid")
plt.rcParams.update({"figure.figsize": (12, 6)})

# Reaching this line means all imports above resolved.
print("✓ Imports successful")

✓ Imports successful


## 1. Prepare Data

In [54]:
# Locations of the four processed files this notebook loads.
processed_dir = config.paths.processed_data_dir
X_train_path = processed_dir / "X_train.csv"
X_test_path = processed_dir / "X_test.csv"
y_train_path = processed_dir / "y_train.csv"
y_test_path = processed_dir / "y_test.csv"
required_paths = (X_train_path, X_test_path, y_train_path, y_test_path)

# Re-run the preparation unless *all* four files exist. Checking only
# X_train.csv would skip preparation after a partially completed earlier run
# and then fail in read_csv below on whichever file is missing.
if not all(path.exists() for path in required_paths):
    print("Processed data not found. Running data preparation...")

    # 1. Clean data
    processor = DataProcessor()
    df_clean = (
        processor.load_raw_data()
        .translate_columns()
        .clean_whitespace()
        .convert_to_numeric()
        .validate_target()
        .handle_missing_values()
        .validate_categorical_ranges()
        .remove_duplicates()
        .get_data()
    )

    # Save cleaned data (the interim directory is created if it is missing)
    output_path = config.paths.interim_data_dir / "german_credit_cleaned.csv"
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_clean.to_csv(output_path, index=False)

    # 2. Feature engineering
    # NOTE(review): assumes save_all() writes the four CSVs listed in
    # required_paths - confirm against FeatureEngineer.
    engineer = FeatureEngineer()
    paths = (
        engineer.load_data(output_path)
        .detect_outliers()
        .split_target()
        .train_test_split()
        .scale_features()
        .save_all()
    )

    print("✓ Data preparation complete")
else:
    print("✓ Using existing processed data")

# Load processed data; targets are flattened to 1-D arrays with ravel().
X_train = pd.read_csv(X_train_path)
X_test = pd.read_csv(X_test_path)
y_train = pd.read_csv(y_train_path).values.ravel()
y_test = pd.read_csv(y_test_path).values.ravel()

print("\nData loaded:")
print(f"  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")

✓ Using existing processed data

Data loaded:
  Train: (551, 20)
  Test: (138, 20)


## 2. Train Multiple Models

In [55]:
# Setup integration with MLFlow server
import os

import mlflow
import mlflow.sklearn

# The tracking server can be overridden with the MLFLOW_TRACKING_URI
# environment variable; the previously hard-coded address remains the default
# (an unset or empty variable falls back to it).
remote_server_uri = os.environ.get("MLFLOW_TRACKING_URI") or "http://34.67.152.248:5000/"
mlflow.set_tracking_uri(remote_server_uri)

# Last expression of the cell: the selected Experiment is shown as cell output.
mlflow.set_experiment("/south_german_credit_risk_experiment_001")

<Experiment: artifact_location='gs://tidy-discipline-476410-r5-mlflow-artifacts/mlflow-artifacts/2', creation_time=1761710172468, experiment_id='2', last_update_time=1761710172468, lifecycle_stage='active', name='/south_german_credit_risk_experiment_001', tags={}>

In [56]:
# Models to compare
models_to_train = ["random_forest", "logistic_regression", "decision_tree"]

# Per-model outputs, keyed by the names in models_to_train. Each entry holds
# the test metrics, the best grid-search parameters, the best CV score and the
# test-set predictions / positive-class probabilities.
results = {}

# The training frame is identical for every run, so the MLflow dataset object
# is built once here instead of once per model.
# NOTE(review): the dataset is registered under the name
# "german_credit_cleaned.csv" although X_train is the processed training
# split - confirm this label is intended.
mlflow_dataset = mlflow.data.from_pandas(df=X_train, name="german_credit_cleaned.csv")

for model_name in models_to_train:
    display_name = model_name.replace("_", " ").title()

    print(f"\n{'='*70}")
    print(f"Training: {display_name}")
    print("=" * 70)

    # Build pipeline with GridSearch
    builder = PipelineBuilder()
    grid_pipeline = builder.build_grid_search_pipeline(
        model_name=model_name, cv_folds=3  # Reduced for speed
    )

    # One MLflow run per model. The context manager ends the run when the
    # block exits, so no explicit mlflow.end_run() is needed afterwards.
    with mlflow.start_run(run_name=f"{display_name} Pipeline"):
        # Train
        grid_pipeline.fit(X_train, y_train)

        # Get best model
        best_pipeline = grid_pipeline.best_estimator_

        # Predictions; column 1 of predict_proba is the second class in
        # classes_, used as the positive-class score for AUC-ROC.
        y_pred = best_pipeline.predict(X_test)
        y_proba = best_pipeline.predict_proba(X_test)[:, 1]

        # Metrics on the held-out test split
        metrics = {
            "accuracy": accuracy_score(y_test, y_pred),
            "precision": precision_score(y_test, y_pred),
            "recall": recall_score(y_test, y_pred),
            "f1_score": f1_score(y_test, y_pred),
            "auc_roc": roc_auc_score(y_test, y_proba),
        }

        results[model_name] = {
            "metrics": metrics,
            "best_params": grid_pipeline.best_params_,
            "best_cv_score": grid_pipeline.best_score_,
            "y_pred": y_pred,
            "y_proba": y_proba,
        }

        print("\n✓ Results:")
        print(f"  Best CV Score: {grid_pipeline.best_score_:.4f}")
        print(f"  Test Accuracy: {metrics['accuracy']:.4f}")
        print(f"  Test AUC-ROC: {metrics['auc_roc']:.4f}")

        # Log parameters in a single batched call rather than one request
        # per hyper-parameter.
        mlflow.log_params({"Model": model_name, **grid_pipeline.best_params_})

        # Log metrics (same metric names as before, sent as one batch)
        mlflow.log_metrics(
            {
                "Accuracy": metrics["accuracy"],
                "Precision": metrics["precision"],
                "Recall": metrics["recall"],
                "F1-Score": metrics["f1_score"],
                "AUC-ROC": metrics["auc_roc"],
            }
        )

        # Attach the training dataset to the current MLflow run
        mlflow.log_input(mlflow_dataset, context="training")

        # Log pipeline
        # NOTE(review): the model artifact upload is currently disabled, so
        # only params, metrics and the dataset reference reach the server.
        #mlflow.sklearn.log_model(best_pipeline, "pipeline")
        print(f"✅ {model_name} pipeline logged to MLflow")

[32m2025-10-29 09:37:20.116[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m__init__[0m:[36m40[0m - [34m[1mPipelineBuilder initialized[0m
[32m2025-10-29 09:37:20.118[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_grid_search_pipeline[0m:[36m106[0m - [1mBuilding GridSearchCV pipeline for random_forest...[0m
[32m2025-10-29 09:37:20.118[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m63[0m - [1mBuilding sklearn pipeline for random_forest...[0m
[32m2025-10-29 09:37:20.120[0m | [34m[1mDEBUG   [0m | [36mfase2.core.model_factory[0m:[36mcreate_model[0m:[36m58[0m - [34m[1mCreating Random Forest model with random_state=42[0m
[32m2025-10-29 09:37:20.122[0m | [32m[1mSUCCESS [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m77[0m - [32m[1m✓ Pipeline built with 3 steps[0m
[32m2025-10-29 09:37:20.122[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m


Training: Random Forest
Fitting 3 folds for each of 81 candidates, totalling 243 fits

✓ Results:
  Best CV Score: 0.7949
  Test Accuracy: 0.7464
  Test AUC-ROC: 0.8196
✅ random_forest pipeline logged to MLflow
🏃 View run Random Forest Pipeline at: http://34.67.152.248:5000/#/experiments/2/runs/10fbae52443c46f489cade2a7c4abbb5
🧪 View experiment at: http://34.67.152.248:5000/#/experiments/2


[32m2025-10-29 09:37:31.160[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m__init__[0m:[36m40[0m - [34m[1mPipelineBuilder initialized[0m
[32m2025-10-29 09:37:31.160[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_grid_search_pipeline[0m:[36m106[0m - [1mBuilding GridSearchCV pipeline for logistic_regression...[0m
[32m2025-10-29 09:37:31.160[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m63[0m - [1mBuilding sklearn pipeline for logistic_regression...[0m
[32m2025-10-29 09:37:31.160[0m | [34m[1mDEBUG   [0m | [36mfase2.core.model_factory[0m:[36mcreate_model[0m:[36m64[0m - [34m[1mCreating Logistic Regression model with random_state=42[0m
[32m2025-10-29 09:37:31.160[0m | [32m[1mSUCCESS [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m77[0m - [32m[1m✓ Pipeline built with 3 steps[0m
[32m2025-10-29 09:37:31.160[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline


Training: Logistic Regression
Fitting 3 folds for each of 5 candidates, totalling 15 fits

✓ Results:
  Best CV Score: 0.7763
  Test Accuracy: 0.7971
  Test AUC-ROC: 0.8186
✅ logistic_regression pipeline logged to MLflow
🏃 View run Logistic Regression Pipeline at: http://34.67.152.248:5000/#/experiments/2/runs/282e38be1a104581813c032003e09ba3
🧪 View experiment at: http://34.67.152.248:5000/#/experiments/2


[32m2025-10-29 09:37:33.464[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m__init__[0m:[36m40[0m - [34m[1mPipelineBuilder initialized[0m
[32m2025-10-29 09:37:33.476[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_grid_search_pipeline[0m:[36m106[0m - [1mBuilding GridSearchCV pipeline for decision_tree...[0m
[32m2025-10-29 09:37:33.476[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m63[0m - [1mBuilding sklearn pipeline for decision_tree...[0m
[32m2025-10-29 09:37:33.478[0m | [34m[1mDEBUG   [0m | [36mfase2.core.model_factory[0m:[36mcreate_model[0m:[36m72[0m - [34m[1mCreating Decision Tree model with random_state=42[0m
[32m2025-10-29 09:37:33.478[0m | [32m[1mSUCCESS [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m77[0m - [32m[1m✓ Pipeline built with 3 steps[0m
[32m2025-10-29 09:37:33.478[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m


Training: Decision Tree
Fitting 3 folds for each of 45 candidates, totalling 135 fits

✓ Results:
  Best CV Score: 0.6646
  Test Accuracy: 0.7681
  Test AUC-ROC: 0.7680
✅ decision_tree pipeline logged to MLflow
🏃 View run Decision Tree Pipeline at: http://34.67.152.248:5000/#/experiments/2/runs/6afc64b718474f44aaff121e896bd373
🧪 View experiment at: http://34.67.152.248:5000/#/experiments/2


In [57]:
# NOTE(review): this cell repeats the grid search of the previous cell without
# MLflow logging and rebuilds `results` from scratch.

# Models to compare
models_to_train = ["random_forest", "logistic_regression", "decision_tree"]
results = {}

# Scorers that take (y_true, y_pred), listed in the order their keys appear
# in each model's metrics dict.
label_scorers = {
    "accuracy": accuracy_score,
    "precision": precision_score,
    "recall": recall_score,
    "f1_score": f1_score,
}

for model_name in models_to_train:
    banner = "=" * 70
    title = model_name.replace("_", " ").title()
    print("\n" + banner)
    print(f"Training: {title}")
    print(banner)

    # Grid-search pipeline for this model (3 folds - reduced for speed)
    builder = PipelineBuilder()
    grid_pipeline = builder.build_grid_search_pipeline(model_name=model_name, cv_folds=3)

    # Fit the search and keep the refitted best estimator
    grid_pipeline.fit(X_train, y_train)
    best_pipeline = grid_pipeline.best_estimator_

    # Test-set predictions and the probability column at index 1
    y_pred = best_pipeline.predict(X_test)
    class_probabilities = best_pipeline.predict_proba(X_test)
    y_proba = class_probabilities[:, 1]

    # Label-based metrics first, then the probability-based AUC-ROC
    metrics = {key: scorer(y_test, y_pred) for key, scorer in label_scorers.items()}
    metrics["auc_roc"] = roc_auc_score(y_test, y_proba)

    results[model_name] = dict(
        metrics=metrics,
        best_params=grid_pipeline.best_params_,
        best_cv_score=grid_pipeline.best_score_,
        y_pred=y_pred,
        y_proba=y_proba,
    )

    summary_lines = [
        "\n✓ Results:",
        f"  Best CV Score: {grid_pipeline.best_score_:.4f}",
        f"  Test Accuracy: {metrics['accuracy']:.4f}",
        f"  Test AUC-ROC: {metrics['auc_roc']:.4f}",
    ]
    print("\n".join(summary_lines))

[32m2025-10-29 09:37:55.253[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m__init__[0m:[36m40[0m - [34m[1mPipelineBuilder initialized[0m
[32m2025-10-29 09:37:55.253[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_grid_search_pipeline[0m:[36m106[0m - [1mBuilding GridSearchCV pipeline for random_forest...[0m
[32m2025-10-29 09:37:55.253[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m63[0m - [1mBuilding sklearn pipeline for random_forest...[0m
[32m2025-10-29 09:37:55.253[0m | [34m[1mDEBUG   [0m | [36mfase2.core.model_factory[0m:[36mcreate_model[0m:[36m58[0m - [34m[1mCreating Random Forest model with random_state=42[0m
[32m2025-10-29 09:37:55.253[0m | [32m[1mSUCCESS [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m77[0m - [32m[1m✓ Pipeline built with 3 steps[0m
[32m2025-10-29 09:37:55.253[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m


Training: Random Forest
Fitting 3 folds for each of 81 candidates, totalling 243 fits


[32m2025-10-29 09:38:03.330[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline_builder[0m:[36m__init__[0m:[36m40[0m - [34m[1mPipelineBuilder initialized[0m
[32m2025-10-29 09:38:03.330[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_grid_search_pipeline[0m:[36m106[0m - [1mBuilding GridSearchCV pipeline for logistic_regression...[0m
[32m2025-10-29 09:38:03.330[0m | [1mINFO    [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m63[0m - [1mBuilding sklearn pipeline for logistic_regression...[0m
[32m2025-10-29 09:38:03.330[0m | [34m[1mDEBUG   [0m | [36mfase2.core.model_factory[0m:[36mcreate_model[0m:[36m64[0m - [34m[1mCreating Logistic Regression model with random_state=42[0m
[32m2025-10-29 09:38:03.330[0m | [32m[1mSUCCESS [0m | [36mfase2.pipeline_builder[0m:[36mbuild_pipeline[0m:[36m77[0m - [32m[1m✓ Pipeline built with 3 steps[0m
[32m2025-10-29 09:38:03.330[0m | [34m[1mDEBUG   [0m | [36mfase2.pipeline


✓ Results:
  Best CV Score: 0.7949
  Test Accuracy: 0.7464
  Test AUC-ROC: 0.8196

Training: Logistic Regression
Fitting 3 folds for each of 5 candidates, totalling 15 fits

✓ Results:
  Best CV Score: 0.7763
  Test Accuracy: 0.7971
  Test AUC-ROC: 0.8186

Training: Decision Tree
Fitting 3 folds for each of 45 candidates, totalling 135 fits

✓ Results:
  Best CV Score: 0.6646
  Test Accuracy: 0.7681
  Test AUC-ROC: 0.7680


## 3. Model Comparison

In [58]:
# NOTE(review): these scorers are already imported in the setup cell and are
# not used in this cell.
from sklearn.metrics import precision_score, recall_score, f1_score

# Table column -> key under results[...]["metrics"], in display order.
metric_columns = {
    "Accuracy": "accuracy",
    "Precision": "precision",
    "Recall": "recall",
    "F1-Score": "f1_score",
    "AUC-ROC": "auc_roc",
}

# One row per trained model: display name, test metrics, best CV score.
comparison_data = [
    {
        "Model": model_name.replace("_", " ").title(),
        **{column: result["metrics"][key] for column, key in metric_columns.items()},
        "CV Score": result["best_cv_score"],
    }
    for model_name, result in results.items()
]

# Highest AUC-ROC first; later cells read the top row of this frame.
comparison_df = pd.DataFrame(comparison_data).sort_values("AUC-ROC", ascending=False)

print("\n📊 Model Comparison:")
print(comparison_df.to_string(index=False))


📊 Model Comparison:
              Model  Accuracy  Precision   Recall  F1-Score  AUC-ROC  CV Score
      Random Forest  0.746377   0.775000 0.920792  0.841629 0.819641  0.794892
Logistic Regression  0.797101   0.811966 0.940594  0.871560 0.818571  0.776250
      Decision Tree  0.768116   0.834951 0.851485  0.843137 0.767996  0.664550


## 4. Visualizations

In [59]:
# Plot model comparison
# `results` is the per-model dict built by the training cell. The call is the
# cell's last expression, so its return value is shown as the cell output.
plot_model_comparison(results)

  axes[1].set_xticklabels(model_names, rotation=45, ha="right")
[32m2025-10-29 09:38:11.649[0m | [1mINFO    [0m | [36mfase2.plots[0m:[36mplot_model_comparison[0m:[36m356[0m - [1m✓ Model comparison plot saved: C:\my-ws\projects\MLOps\Fase2\reports\figures\model_comparison.png[0m


WindowsPath('C:/my-ws/projects/MLOps/Fase2/reports/figures/model_comparison.png')

In [60]:
# Best model details
# The comparison table stores display names (e.g. "Random Forest"). Map them
# back to the original `results` keys explicitly instead of reversing the
# formatting with .lower().replace(" ", "_"), which raises KeyError for any
# model key that is not lowercase snake_case.
display_to_key = {name.replace("_", " ").title(): name for name in results}

# Row 0 is the top model because comparison_df is sorted by AUC-ROC descending.
best_model_name = display_to_key[comparison_df.iloc[0]["Model"]]
best_result = results[best_model_name]

print(f"\n🏆 Best Model: {best_model_name.replace('_', ' ').title()}")
print(f"  AUC-ROC: {best_result['metrics']['auc_roc']:.4f}")
print(f"  Best Parameters: {best_result['best_params']}")


🏆 Best Model: Random Forest
  AUC-ROC: 0.8196
  Best Parameters: {'model__max_depth': 10, 'model__min_samples_leaf': 1, 'model__min_samples_split': 5, 'model__n_estimators': 50}


In [61]:
# Confusion matrix for the best model, computed from its stored test predictions
best_model_title = best_model_name.replace("_", " ").title()
cm = confusion_matrix(y_test, best_result["y_pred"])
plot_confusion_matrix(cm, best_model_title)

[32m2025-10-29 09:38:15.659[0m | [1mINFO    [0m | [36mfase2.plots[0m:[36mplot_confusion_matrix[0m:[36m81[0m - [1m✓ Confusion matrix saved: C:\my-ws\projects\MLOps\Fase2\reports\figures\confusion_matrix_random_forest.png[0m


WindowsPath('C:/my-ws/projects/MLOps/Fase2/reports/figures/confusion_matrix_random_forest.png')

In [62]:
# ROC curve for the best model, drawn from its stored test-set probabilities
roc_title = best_model_name.replace("_", " ").title()
plot_roc_curve(y_test, best_result["y_proba"], roc_title)

[32m2025-10-29 09:38:17.651[0m | [1mINFO    [0m | [36mfase2.plots[0m:[36mplot_roc_curve[0m:[36m154[0m - [1m✓ ROC curve saved: C:\my-ws\projects\MLOps\Fase2\reports\figures\roc_curve_random_forest.png[0m


WindowsPath('C:/my-ws/projects/MLOps/Fase2/reports/figures/roc_curve_random_forest.png')

## 5. Classification Report

In [63]:
# Per-class precision / recall / F1 for the best model on the test split.
class_labels = ["Bad Credit (0)", "Good Credit (1)"]
report_title = best_model_name.replace("_", " ").title()

print(f"\nClassification Report - {report_title}:")
print("=" * 70)
report = classification_report(y_test, best_result["y_pred"], target_names=class_labels)
print(report)


Classification Report - Random Forest:
                 precision    recall  f1-score   support

 Bad Credit (0)       0.56      0.27      0.36        37
Good Credit (1)       0.78      0.92      0.84       101

       accuracy                           0.75       138
      macro avg       0.67      0.60      0.60       138
   weighted avg       0.72      0.75      0.71       138



## Summary

**Models Trained:**
- Random Forest
- Logistic Regression
- Decision Tree

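**Best Model:** Random Forest, selected by highest test AUC-ROC (0.8196). Logistic Regression is nearly tied on AUC-ROC (0.8186) and scores higher on accuracy (0.7971 vs. 0.7464) and F1 (0.8716 vs. 0.8416).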

**Key Findings:**
- All models trained successfully with GridSearchCV
- sklearn Pipeline ensures no data leakage
- Cross-validation provides robust performance estimates

**Next Steps:**
- Review sklearn Pipeline best practices (Notebook 4)
- Extend the MLflow experiment tracking started in this notebook, e.g. by logging the fitted pipeline as a model artifact (Stage 4)

In [64]:
# Final marker printed once execution reaches the last cell of the notebook.
print("\n✓ Model Training Complete!")


✓ Model Training Complete!
