In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from utils import ensure_dir, save_json, save_model
from data_loader import load_breast_cancer_data
from model_training import train_with_cv
from visualization import plot_confusion_matrix, plot_roc_curve, plot_learning_curve
from xai_explainer import shap_summary_plot

In [2]:
# Load the features and labels, then hold out 20% of the rows as a test set.
# stratify=y keeps the class proportions the same in both splits, and the
# fixed random_state makes the split reproducible across kernel restarts.
X, y = load_breast_cancer_data()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("✅ Data loaded successfully.")
print("Train shape:", X_train.shape, "Test shape:", X_test.shape)

✅ Data loaded successfully.
Train shape: (455, 30) Test shape: (114, 30)


In [3]:
# Candidate models and their hyper-parameter grids.
# Each estimator sits in a Pipeline under the step name "clf", so every grid
# addresses its classifier through the same "clf__<param>" prefix.

# Logistic Regression: inputs are standardised first; the grid sweeps C
# (inverse regularisation strength).
pipe_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(max_iter=1000)),
])
param_lr = {"clf__C": [0.1, 1, 10]}

# Random Forest: no scaler step; the grid covers 2 x 3 = 6 combinations of
# tree count and maximum depth (None means unlimited depth).
pipe_rf = Pipeline([
    ("clf", RandomForestClassifier(random_state=42)),
])
param_rf = {
    "clf__n_estimators": [200, 400],
    "clf__max_depth": [None, 8, 12],
}

# MLP: standardised inputs; the grid compares two single-hidden-layer widths
# against one two-layer architecture.
pipe_mlp = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", MLPClassifier(random_state=42, max_iter=1000)),
])
param_mlp = {"clf__hidden_layer_sizes": [(64,), (128,), (64, 32)]}

In [4]:
# Tune every candidate on the training split and keep each search result,
# paired with a short label, so the best one can be picked afterwards.
# NOTE(review): train_with_cv is assumed to return a fitted search object
# exposing best_score_ / best_estimator_ (GridSearchCV-like) — confirm in
# model_training.
grids = []
for name, pipe, params in [
    ("LR", pipe_lr, param_lr),
    ("RF", pipe_rf, param_rf),
    ("MLP", pipe_mlp, param_mlp),
]:
    grid = train_with_cv(pipe, params, X_train, y_train)
    grids.append((name, grid))
    print(f"✅ {name} best score: {grid.best_score_:.3f}")

Fitting 5 folds for each of 3 candidates, totalling 15 fits
✅ LR best score: 0.986
Fitting 5 folds for each of 6 candidates, totalling 30 fits
✅ RF best score: 0.970
Fitting 5 folds for each of 3 candidates, totalling 15 fits
✅ MLP best score: 0.981


In [5]:
# Project root: the parent of the current working directory.
working_dir = os.getcwd()
base_dir = os.path.abspath(os.path.join(working_dir, os.pardir))

In [6]:
# Output locations under the project root: figures/ receives the plots,
# artifacts/ receives the saved model and the metrics JSON.
fig_dir = os.path.join(base_dir, "figures")
art_dir = os.path.join(base_dir, "artifacts")

# Create both directories up front so the later save and plot calls do not
# depend on them already existing; exist_ok=True keeps this cell re-runnable.
for out_dir in (fig_dir, art_dir):
    os.makedirs(out_dir, exist_ok=True)

In [7]:
# Each entry of `grids` is a (label, search) pair; pick the search with the
# highest best_score_ (on a tie, max() keeps the first one encountered).
best = max(grids, key=lambda pair: pair[1].best_score_)[1]
best_model = best.best_estimator_
print("🏆 Best model:", best_model)

# Build the path with os.path.join rather than "/" concatenation, matching
# how the figure paths are built elsewhere in this notebook.
save_model(best_model, os.path.join(art_dir, "best_model.pkl"))
print("✅ Model saved to artifacts/best_model.pkl")

🏆 Best model: Pipeline(steps=[('scaler', StandardScaler()),
                ('clf', LogisticRegression(C=0.1, max_iter=1000))])
✅ Model saved to artifacts/best_model.pkl


In [8]:
# Score the selected model on the held-out test split.
y_pred = best_model.predict(X_test)

# output_dict=True returns the report as a nested dict instead of a formatted
# string, so it can be handed to save_json and tabulated below.
report = classification_report(y_test, y_pred, output_dict=True)
save_json(report, os.path.join(art_dir, "metrics.json"))

print("✅ Classification report saved.")
# Transpose so that each class / average is a row and each metric a column.
print(pd.DataFrame(report).T.head())

✅ Classification report saved.
              precision    recall  f1-score     support
0              0.975610  0.952381  0.963855   42.000000
1              0.972603  0.986111  0.979310   72.000000
accuracy       0.973684  0.973684  0.973684    0.973684
macro avg      0.974106  0.969246  0.971583  114.000000
weighted avg   0.973711  0.973684  0.973616  114.000000


In [9]:
# Resolve every output path first, then produce the diagnostics in order:
# confusion matrix, ROC curve, learning curve, SHAP summary.
confusion_png = os.path.join(fig_dir, "confusion_matrix.png")
roc_png = os.path.join(fig_dir, "roc_curve.png")
learning_png = os.path.join(fig_dir, "learning_curve.png")
shap_png = os.path.join(fig_dir, "shap_summary.png")

plot_confusion_matrix(y_test, y_pred, confusion_png)
plot_roc_curve(best_model, X_test, y_test, roc_png)
plot_learning_curve(best_model, X_train, y_train, learning_png)
shap_summary_plot(best_model, X_test, shap_png)

🔍 Starting SHAP analysis...
Model type: Pipeline
🧩 Detected Pipeline. Extracting final estimator...
➡️ Inner model: LogisticRegression
📈 Using LinearExplainer
📊 Generating SHAP summary plot...
✅ SHAP summary plot saved: /home/yakamoz/tabular_classification_demo/figures/shap_summary.png
