In [1]:
# --------------------------------------------------
# Import required libraries for model training,
# evaluation, and saving trained models
# --------------------------------------------------
import pandas as pd
import os
import joblib

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score


In [2]:
# --------------------------------------------------
# Load the preprocessed train/test splits written by
# the preprocessing pipeline.
#
# Features are kept as DataFrames; labels are reduced
# to Series.
# --------------------------------------------------
X_train = pd.read_csv("../data/processed/X_train.csv")
X_test  = pd.read_csv("../data/processed/X_test.csv")

# squeeze("columns") turns a one-column frame into a Series. Unlike a bare
# squeeze(), it never collapses a single-row frame all the way to a scalar.
y_train = pd.read_csv("../data/processed/y_train.csv").squeeze("columns")
y_test  = pd.read_csv("../data/processed/y_test.csv").squeeze("columns")

# Fail fast on misaligned inputs instead of surfacing a confusing error
# (or silently wrong scores) later during fit/predict.
assert len(X_train) == len(y_train), "X_train and y_train have different row counts"
assert len(X_test) == len(y_test), "X_test and y_test have different row counts"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

# Last expression: shown as the cell output for a quick shape check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((316, 41), (79, 41), (316,), (79,))

In [3]:
# --------------------------------------------------
# Registry of candidate classifiers, keyed by the
# display name used in reports and output filenames
# --------------------------------------------------
# One seed shared by every estimator that accepts random_state.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    # probability=True is kept because it is required for SHAP & LIME.
    "Support Vector Machine": SVC(kernel="rbf", probability=True, random_state=RANDOM_STATE),
}


In [4]:
# --------------------------------------------------
# Fit every candidate on the training split and score
# it on the held-out test split (plain accuracy)
# --------------------------------------------------
trained_models = {}   # display name -> fitted estimator
results = []          # one {"Model", "Accuracy"} record per estimator

for name, estimator in models.items():
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    score = accuracy_score(y_test, predictions)

    # Record the fitted estimator and its score only after both steps succeed.
    trained_models[name] = estimator
    results.append({"Model": name, "Accuracy": score})

    print(f"{name} trained successfully | Accuracy: {score:.4f}")


Logistic Regression trained successfully | Accuracy: 0.8861
Decision Tree trained successfully | Accuracy: 0.8608
Random Forest trained successfully | Accuracy: 0.8861
Gradient Boosting trained successfully | Accuracy: 0.8861
Support Vector Machine trained successfully | Accuracy: 0.8861


In [5]:
# --------------------------------------------------
# Persist every fitted model to the models directory,
# one joblib pickle per model
# --------------------------------------------------
# Single source of truth for the output location, so the directory that is
# created and the directory that is written to cannot drift apart.
MODELS_DIR = "../models"
os.makedirs(MODELS_DIR, exist_ok=True)

for model_name, model in trained_models.items():
    # e.g. "Random Forest" -> "random_forest.pkl"
    filename = model_name.lower().replace(" ", "_") + ".pkl"
    joblib.dump(model, os.path.join(MODELS_DIR, filename))

print("All trained models saved successfully.")


All trained models saved successfully.


In [6]:
# --------------------------------------------------
# Tabulate the per-model accuracies, best score first
# --------------------------------------------------
# results_df itself stays in training order; only the displayed view is sorted.
results_df = pd.DataFrame(results)
results_df.sort_values(by=["Accuracy"], ascending=[False])


Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.886076
2,Random Forest,0.886076
3,Gradient Boosting,0.886076
4,Support Vector Machine,0.886076
1,Decision Tree,0.860759


In [7]:
# Closing banner: how many models were fitted in this notebook.
summary_lines = [
    "MODEL TRAINING COMPLETED",
    "-" * 50,
    f"Total models trained: {len(trained_models)}",
    "Models are ready for evaluation and explainability analysis.",
]
print("\n".join(summary_lines))


MODEL TRAINING COMPLETED
--------------------------------------------------
Total models trained: 5
Models are ready for evaluation and explainability analysis.
