In [1]:
# Notebook: 02_model_training.ipynb

import sys
import os
import pandas as pd
import joblib

import plotly.io as pio
# Select the Plotly renderer once. "notebook_connected" loads plotly.js from a
# CDN instead of embedding it in the notebook; alternatives are "notebook"
# (embeds plotly.js, works offline) or "inline".
pio.renderers.default = "notebook_connected"


# Add src/ to the import path so the project modules can be imported.
# Guarded so that re-running this cell does not keep appending duplicate
# entries to sys.path.
_src_dir = os.path.abspath("../src")
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

from data_processing import load_data, preprocess_data, get_train_test_data
from model import train_model, evaluate_model

# Load the raw data, obtain X / y together with the preprocessor object, and
# split into train and test partitions.
df = load_data("../data/employee_data.csv")
X, y, preprocessor = preprocess_data(df)
X_train, X_test, y_train, y_test = get_train_test_data(X, y)

# Fit the preprocessor on the training split only, then apply the fitted
# transform to the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# (display name, model_type passed to train_model), in reporting order.
MODEL_SPECS = [
    ("Logistic Regression", "logistic"),
    ("Random Forest", "random_forest"),
    ("XGBoost", "xgboost"),
]

# results: display name -> metrics returned by evaluate_model
# fitted_models: model_type -> trained estimator
results = {}
fitted_models = {}

for idx, (display_name, model_type) in enumerate(MODEL_SPECS):
    # Every banner after the first is preceded by a blank line.
    print(("\n" if idx else "") + f"=== {display_name} ===")
    fitted_models[model_type] = train_model(
        X_train_processed, y_train, model_type=model_type
    )
    results[display_name] = evaluate_model(
        fitted_models[model_type], X_test_processed, y_test
    )

# Keep the individual model names available; xgb_model is referenced again
# when the model is saved in a later cell.
log_model = fitted_models["logistic"]
rf_model = fitted_models["random_forest"]
xgb_model = fitted_models["xgboost"]

# NOTE(review): the recorded output of this cell shows ~1.00 on every metric
# for Random Forest and XGBoost. Scores that perfect often indicate target
# leakage in the features -- TODO confirm against preprocess_data.

# Display comparison (one row per model, one column per metric)
print("\n\n Model Comparison:")
pd.DataFrame(results).T

=== Logistic Regression ===

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.86      0.86       753
           1       0.92      0.91      0.91      1247

    accuracy                           0.89      2000
   macro avg       0.88      0.89      0.88      2000
weighted avg       0.89      0.89      0.89      2000


=== Random Forest ===

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       753
           1       1.00      1.00      1.00      1247

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


=== XGBoost ===



Parameters: { "use_label_encoder" } are not used.





Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       753
           1       1.00      1.00      1.00      1247

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000



 Model Comparison:


Unnamed: 0,Accuracy,Precision,Recall,F1 Score,ROC AUC
Logistic Regression,0.8915,0.917342,0.907779,0.912535,0.964996
Random Forest,0.9995,1.0,0.999198,0.999599,1.0
XGBoost,1.0,1.0,1.0,1.0,1.0


In [2]:
import mlflow
import mlflow.sklearn
import sys
import os
# Make src/ importable if this cell is run on its own. The first cell already
# appends the same directory, so skip it when it is present to avoid
# duplicate sys.path entries.
if os.path.abspath("../src") not in sys.path:
    sys.path.append(os.path.abspath("../src"))

from model import train_model, evaluate_model

# Track one XGBoost training run in MLflow: the model type as a parameter,
# each evaluation metric, and the fitted model as an artifact.
with mlflow.start_run():
    model = train_model(X_train_processed, y_train, model_type="xgboost")
    metrics = evaluate_model(model, X_test_processed, y_test)

    mlflow.log_param("model_type", "xgboost")

    # One log_metric call per entry returned by evaluate_model.
    for metric_name in metrics.keys():
        mlflow.log_metric(metric_name, metrics[metric_name])

    # Store the fitted model under the artifact path "model".
    mlflow.sklearn.log_model(model, "model")



# Save model and preprocessor.
# Persist `model` -- the estimator trained, evaluated and logged to MLflow in
# this cell -- so the pickle on disk is the same object as the tracked run
# (previously a separately trained `xgb_model` from the first cell was saved).
models_dir = "../models"
os.makedirs(models_dir, exist_ok=True)
joblib.dump(model, f"{models_dir}/model.pkl")
joblib.dump(preprocessor, f"{models_dir}/preprocessor.pkl")

# Report the directory actually written to (relative to the notebook).
print(f" Model and preprocessor saved to {models_dir}")

The git executable must be specified in one of the following ways:
    - be included in your $PATH
    - be set via $GIT_PYTHON_GIT_EXECUTABLE
    - explicitly set via git.refresh(<full-path-to-git-executable>)

All git commands will error until this is rectified.

This initial message can be silenced or aggravated in the future by setting the
$GIT_PYTHON_REFRESH environment variable. Use one of the following values:
    - quiet|q|silence|s|silent|none|n|0: for no message or exception
    - error|e|exception|raise|r|2: for a raised exception

Example:
    export GIT_PYTHON_REFRESH=quiet


Parameters: { "use_label_encoder" } are not used.





Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       753
           1       1.00      1.00      1.00      1247

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000





 Model and preprocessor saved to /models


In [3]:
import os
import numpy as np
import joblib
import plotly.graph_objects as go
import plotly.figure_factory as ff
from sklearn.metrics import (
    confusion_matrix,
    roc_curve,
    auc,
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
)

# === Ensure output folder exists ===
os.makedirs("../reports", exist_ok=True)

# === 1. Bar Chart of Evaluation Metrics ===
# Split the metrics mapping into parallel name / value lists for plotting.
metric_names = [name for name, _score in metrics.items()]
metric_values = [score for _name, score in metrics.items()]

metrics_bar = go.Bar(x=metric_names, y=metric_values, marker_color='indigo')
fig_metrics = go.Figure(data=[metrics_bar])

# Pin the y-axis to [0, 1] so the bar heights share one fixed scale.
fig_metrics.update_layout(
    title=" Model Evaluation Metrics",
    xaxis_title="Metric",
    yaxis_title="Score",
    yaxis_range=[0, 1],
    template="plotly_white",
)

# Render inline, then export both an interactive and a static copy.
fig_metrics.show()
fig_metrics.write_html("../reports/metric_chart.html")
fig_metrics.write_image("../reports/metric_chart.png")

# === 2. Confusion Matrix Heatmap ===
y_pred = model.predict(X_test_processed)
# sklearn convention: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred)

fig_cm = ff.create_annotated_heatmap(
    z=cm,
    x=["Not Enrolled (0)", "Enrolled (1)"],
    y=["Not Enrolled (0)", "Enrolled (1)"],
    colorscale='Purples',
    showscale=True
)
# Title both axes so the figure states which axis is true vs. predicted.
fig_cm.update_layout(
    title_text=" Confusion Matrix",
    xaxis_title="Predicted label",
    yaxis_title="True label",
    template="plotly_white",
)
# Heatmap rows are drawn bottom-to-top by default; reverse the y-axis so the
# first row of `cm` (true class 0) is at the top, as in the usual layout.
fig_cm.update_yaxes(autorange="reversed")

fig_cm.show()
fig_cm.write_html("../reports/confusion_matrix.html")
fig_cm.write_image("../reports/confusion_matrix.png")

# === 3. ROC Curve ===
# Column 1 of predict_proba is used as the score for class 1.
y_probs = model.predict_proba(X_test_processed)[:, 1]
fpr, tpr, _ = roc_curve(y_test, y_probs)
roc_auc = auc(fpr, tpr)

# Build the figure trace by trace: the model's ROC curve first, then the
# dashed diagonal reference line.
fig_roc = go.Figure()

roc_trace = go.Scatter(
    x=fpr,
    y=tpr,
    mode='lines',
    name=f"AUC = {roc_auc:.2f}",
    line=dict(color='darkorange'),
)
fig_roc.add_trace(roc_trace)

chance_trace = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode='lines',
    name='Random',
    line=dict(dash='dash'),
)
fig_roc.add_trace(chance_trace)

# Both rates live in [0, 1]; fix the axes to that range.
fig_roc.update_layout(
    title=" ROC Curve",
    xaxis_title="False Positive Rate",
    yaxis_title="True Positive Rate",
    xaxis_range=[0, 1],
    yaxis_range=[0, 1],
    template="plotly_white",
)

fig_roc.show()
fig_roc.write_html("../reports/roc_curve.html")
fig_roc.write_image("../reports/roc_curve.png")
