In [1]:
# ============================================================
# SPSV Project - Phase 4: Machine Learning (Predictive Insights)

# In step 3, I created an ML-ready dataset: "ml_ready_dataset.csv".
#
# In this phase, I will:
# 1) load the ML dataset,
# 2) prepare features (handle categorical variables),
# 3) split into train/test sets,
# 4) train simple, interpretable models,
# 5) evaluate performance using standard metrics,
# 6) save model predictions for dashboard integration later.
#
# Important to state that:
# The data is synthetic, so this ML section is presented as a demonstration
# of an analytics workflow rather than a claim about real SPSV outcomes.
# ============================================================

import pandas as pd
import numpy as np

# Machine learning tools
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Evaluation metrics
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)

# ------------------------------------------------------------
# 1) LOAD ML DATASET
# ------------------------------------------------------------

# I use the engineered dataset from Phase 3 so features are already aggregated
# to licence level and suitable for modelling.
ml = pd.read_csv("ml_ready_dataset.csv")

print("ML dataset shape:", ml.shape)
print("Target distribution (High_Risk):")
print(ml["High_Risk"].value_counts(dropna=False))

# ------------------------------------------------------------
# 2) BASIC DATA CHECKS
# ------------------------------------------------------------

# I check for missing values before modelling because models cannot handle NaN.
print("\nMissing values (top 15 columns):")
print(ml.isna().sum().sort_values(ascending=False).head(15))

# If there are any missing values, I will fill numeric with 0 and categorical with "UNKNOWN".
# This is consistent with earlier preprocessing (and keeps the dataset usable).
for col in ml.columns:
    if ml[col].dtype == "object":
        ml[col] = ml[col].fillna("UNKNOWN")
    else:
        ml[col] = ml[col].fillna(0)

# ------------------------------------------------------------
# 3) DEFINE TARGET (y) AND FEATURES (X)
# ------------------------------------------------------------

# The target variable is High_Risk, created in Phase 3 using an interpretable rule.
y = ml["High_Risk"]
X = ml.drop(columns=["High_Risk"])

# Optional:
# Drop ID columns if they exist (IDs do not usually help prediction and can cause leakage)
id_cols = [c for c in ["Licence_ID"] if c in X.columns]
if id_cols:
    X = X.drop(columns=id_cols)

# ------------------------------------------------------------
# 4) IDENTIFY NUMERIC VS CATEGORICAL FEATURES
# ------------------------------------------------------------

# Many features are numeric (counts), while others are categorical (County, Licence_Type, Status).
# Categorical variables must be encoded before modelling.
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()
numeric_features = X.select_dtypes(exclude=["object"]).columns.tolist()

print("\nCategorical features:", categorical_features)
print("Numeric features:", numeric_features)

# ------------------------------------------------------------
# 5) PREPROCESSING PIPELINE (One-Hot Encoding for categoricals)
# ------------------------------------------------------------

# One-hot encoding converts categories into numeric columns.
# I use handle_unknown="ignore" so unseen categories in test data do not crash the model.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)

# ------------------------------------------------------------
# 6) TRAIN / TEST SPLIT
# ------------------------------------------------------------

# I split the data into training and testing sets so I can evaluate generalisation.
# Stratify=y keeps the High_Risk class distribution similar in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\nTrain size:", X_train.shape, "Test size:", X_test.shape)

# ------------------------------------------------------------
# 7) MODEL 1: LOGISTIC REGRESSION (interpretable baseline)
# ------------------------------------------------------------

# Logistic Regression is a strong baseline for binary classification and is relatively interpretable.
log_reg_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LogisticRegression(max_iter=1000))
    ]
)

log_reg_model.fit(X_train, y_train)

# Predictions
y_pred_lr = log_reg_model.predict(X_test)

# Evaluation
print("\n=== Logistic Regression Results ===")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Precision:", precision_score(y_test, y_pred_lr, zero_division=0))
print("Recall:", recall_score(y_test, y_pred_lr, zero_division=0))
print("F1:", f1_score(y_test, y_pred_lr, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nClassification Report:\n", classification_report(y_test, y_pred_lr, zero_division=0))

# ------------------------------------------------------------
# 8) MODEL 2: DECISION TREE (simple rule-based comparison)
# ------------------------------------------------------------

# Decision Trees are easy to explain because they behave like human decision rules.
tree_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", DecisionTreeClassifier(max_depth=4, random_state=42))
    ]
)

tree_model.fit(X_train, y_train)
y_pred_tree = tree_model.predict(X_test)

print("\n=== Decision Tree Results (max_depth=4) ===")
print("Accuracy:", accuracy_score(y_test, y_pred_tree))
print("Precision:", precision_score(y_test, y_pred_tree, zero_division=0))
print("Recall:", recall_score(y_test, y_pred_tree, zero_division=0))
print("F1:", f1_score(y_test, y_pred_tree, zero_division=0))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_tree))
print("\nClassification Report:\n", classification_report(y_test, y_pred_tree, zero_division=0))

# ------------------------------------------------------------
# 9) SAVE PREDICTIONS FOR DASHBOARD INTEGRATION
# ------------------------------------------------------------

# The dashboard can show a predicted risk label per licence (e.g. "High Risk" vs "Low Risk").
# I save predictions back into a table that includes the original features.
pred_output = X_test.copy()
pred_output["Actual_High_Risk"] = y_test.values
pred_output["Predicted_High_Risk_LogReg"] = y_pred_lr
pred_output["Predicted_High_Risk_Tree"] = y_pred_tree

pred_output.to_csv("ml_predictions_for_dashboard.csv", index=False)

print("\nSaved: ml_predictions_for_dashboard.csv")
print("This file can be used later in the dashboard to display predicted risk flags.")


ML dataset shape: (5000, 25)
Target distribution (High_Risk):
High_Risk
1    4627
0     373
Name: count, dtype: int64

Missing values (top 15 columns):
Avg_Days_To_Resolution     239
Failure_Rate                92
Issue_Date                   0
Licence_Type                 0
Status                       0
County                       0
Wheelchair_Accessible        0
Expiry_Date                  0
Licence_ID                   0
Vehicle_Plate_Year           0
Vehicle_Age                  0
Driver_Experience_Years      0
Fleet_Segment                0
Licence_Duration_Days        0
Days_To_Expiry               0
dtype: int64

Categorical features: ['Licence_Type', 'Issue_Date', 'Expiry_Date', 'Status', 'County', 'Wheelchair_Accessible', 'Vehicle_Plate_Year', 'Driver_Experience_Years', 'Fleet_Segment', 'Vehicle_Age_Band']
Numeric features: ['Vehicle_Age', 'Is_Wheelchair_Accessible', 'Licence_Duration_Days', 'Days_To_Expiry', 'Is_Expiring_90_Days', 'Complaints', 'Escalations', 'Avg_Days_To_

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=1000).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
