In [None]:
from ucimlrepo import fetch_ucirepo 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from collections import Counter
import seaborn as sns
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score
)

# Fetch the Breast Cancer Wisconsin (Diagnostic) dataset (UCI ML Repository, id=17).
# Copies are taken so the frames can be modified without touching the fetched object.
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17)
X = breast_cancer_wisconsin_diagnostic.data.features.copy()
y = breast_cancer_wisconsin_diagnostic.data.targets.copy()

# Show all columns when displaying DataFrames
pd.set_option('display.max_columns', None)

# Data exploration.
# Only the LAST expression of a notebook cell is rendered automatically, so every
# intermediate result is printed explicitly; a bare expression here would be
# evaluated and silently discarded.
X.info()         # dtypes and non-null counts (author's finding: all numeric, no missing values)
print(X.head())  # first 5 rows of the features

# Class counts of the label (author's finding: the classes are imbalanced)
print(y['Diagnosis'].value_counts())

# Encode labels: malignant -> 1, benign -> 0
y['Diagnosis'] = y['Diagnosis'].map({'M': 1, 'B': 0})
# .map() turns any label outside {'M', 'B'} into NaN without complaint; fail loudly instead
assert y['Diagnosis'].notna().all(), "unexpected Diagnosis label (expected only 'M' or 'B')"

# Train-test split (80/20, stratified on the label, seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y['Diagnosis'], test_size=0.2, stratify=y['Diagnosis'], random_state=42)

# Log transform skewed columns (SE features ending in '2') -- currently disabled
# se_cols = [col for col in X_train.columns if col.endswith('2')]
# X_train[se_cols] = X_train[se_cols].apply(np.log1p)
# X_test[se_cols] = X_test[se_cols].apply(np.log1p)

# Standardize features: statistics are fitted on the training split only and
# then re-used for the test split
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to training data only (the test split is never resampled)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Save preprocessed (scaled + resampled) training features and labels to CSV
preprocessed_df = pd.DataFrame(X_train_smote, columns=X_train.columns)
preprocessed_df["Diagnosis"] = y_train_smote.values  # Add target column
preprocessed_df.to_csv("preprocessed_dataset.csv", index=False)

# Class counts after resampling
print(y_train_smote.value_counts())

# Extract the 11th feature (0-indexed) from the unscaled training split
feature_index = 10
feature_name = X_train.columns[feature_index]
feature_data = X_train[feature_name]

# Plot histogram
plt.figure(figsize=(8, 5))
sns.histplot(feature_data, bins=30, kde=True, color='skyblue', edgecolor='black')
# plt.title(f"Distribution of Feature: {feature_name}", fontsize=14)
# NOTE(review): the axis label is hard-coded and assumes column index 10 is the
# radius standard error -- update it if feature_index changes.
plt.xlabel("Radius Standard Error")
plt.ylabel("Frequency")
plt.grid(True, linestyle='--', alpha=0.6)
plt.tight_layout()
plt.savefig("feature2hist.png", dpi=300)
plt.show()

In [None]:
# Sanity-check the pre-processed (scaled + SMOTE-resampled) training data
preprocessed_df.info()
# Printed explicitly: only a cell's last expression is rendered automatically,
# so a bare value_counts() in the middle of the cell would never be shown.
print(preprocessed_df['Diagnosis'].value_counts())
preprocessed_df.describe()

In [None]:

# Candidate classifiers. Apart from the arguments shown, every model keeps its
# library defaults; the stochastic ones are seeded with 42.
base_models = {}
base_models["Logistic Regression"] = LogisticRegression(max_iter=1000, random_state=42)
base_models["SVM"] = SVC(probability=True, random_state=42)
base_models["KNN"] = KNeighborsClassifier()
base_models["Random Forest"] = RandomForestClassifier(random_state=42)
base_models["XGBoost"] = XGBClassifier(eval_metric='logloss', random_state=42)
base_models["MLP"] = MLPClassifier(hidden_layer_sizes=(10,), alpha=0.01, max_iter=500, random_state=42)


def _fit_and_score(estimator):
    """Fit `estimator` on the SMOTE-resampled training data and score it.

    Returns a dict with the accuracy on the (resampled) training data, the
    accuracy on the test split, precision / recall / F1 of class 1 on the
    test split, and the test-split confusion matrix.
    """
    estimator.fit(X_train_smote, y_train_smote)
    test_pred = estimator.predict(X_test_scaled)
    # Per-class metrics for label 1 (report keys are the labels as strings)
    class_one = classification_report(y_test, test_pred, output_dict=True)['1']
    return {
        "training_accuracy": accuracy_score(y_train_smote, estimator.predict(X_train_smote)),
        "testing_accuracy": accuracy_score(y_test, test_pred),
        "precision": class_one['precision'],
        "recall": class_one['recall'],
        "f1_score": class_one['f1-score'],
        "confusion_matrix": confusion_matrix(y_test, test_pred),
    }


# Evaluate every candidate; one entry per model name
results = {}
for name, model in base_models.items():
    results[name] = _fit_and_score(model)

# One row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print(results_df)

In [None]:
# Shuffled, seeded 5-fold stratified splitter used for the stack's internal CV
cv_strategy = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Level-0 learners, taken from the base_models dictionary
stack_members = [
    (alias, base_models[model_key])
    for alias, model_key in (
        ('lr', 'Logistic Regression'),
        ('rf', 'Random Forest'),
        ('xgb', 'XGBoost'),
    )
]
# Level-1 (meta) learner
meta_learner = LogisticRegression(class_weight='balanced', random_state=42)

# passthrough=False: the author's choice to limit overfitting
stacking_clf = StackingClassifier(
    estimators=stack_members,
    final_estimator=meta_learner,
    passthrough=False,
    cv=cv_strategy,
)
stacking_clf.fit(X_train_smote, y_train_smote)

# Score on the test split, plus accuracy on the (resampled) training data
y_pred_stack = stacking_clf.predict(X_test_scaled)
acc_test_stack = accuracy_score(y_test, y_pred_stack)
acc_train_stack = accuracy_score(y_train_smote, stacking_clf.predict(X_train_smote))
cm_stack = confusion_matrix(y_test, y_pred_stack)
report_stack = classification_report(y_test, y_pred_stack, output_dict=True)
class_one_stack = report_stack['1']  # metrics for label 1

# Add (or overwrite) the stacked model's row next to the base-model results
results["Stacked Model"] = dict(
    training_accuracy=acc_train_stack,
    testing_accuracy=acc_test_stack,
    precision=class_one_stack['precision'],
    recall=class_one_stack['recall'],
    f1_score=class_one_stack['f1-score'],
    confusion_matrix=cm_stack,
)

# Show final results: one row per model, one column per metric
results_df = pd.DataFrame(results).transpose()
print(results_df)

In [None]:
# Hyper-parameter tuning and final evaluation of the MLP and the stacked model

# Search spaces, keyed by model name
param_grid = {
    "MLP": {
        "hidden_layer_sizes": [(10,), (20,)],
        "alpha": [0.0001, 0.01],
        "max_iter": [500]
    },
    "Stacked Model": {
        "final_estimator__C": [0.1, 1.0, 10.0]  # for LogisticRegression meta-classifier
    }
}

# Shuffled, seeded folds (same settings as the splitter used for the untuned stack).
# An integer cv gives unshuffled stratified folds; the resampled training set is
# presumably ordered (original rows first, synthetic rows after -- verify against
# imblearn), so unshuffled folds could validate mostly on synthetic positives.
tuning_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# NOTE(review): the searches run on data that was resampled BEFORE cross-validation,
# so validation folds contain synthetic samples and the CV scores are likely
# optimistic. Resampling inside each fold (imblearn Pipeline) would avoid this.

# Grid search for MLP
mlp = MLPClassifier(random_state=42)
grid_mlp = GridSearchCV(mlp, param_grid['MLP'], cv=tuning_cv, scoring='accuracy', n_jobs=-1)
grid_mlp.fit(X_train_smote, y_train_smote)
mlp_best = grid_mlp.best_estimator_

# Grid search for stacked model.
# Every estimator is seeded so that Restart & Run All reproduces the same numbers
# (an unseeded random forest gives different results on each run).
stacking_clf = StackingClassifier(
    estimators=[
        ('lr', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
        ('rf', RandomForestClassifier(class_weight='balanced', random_state=42)),
        ('xgb', XGBClassifier(eval_metric='logloss', random_state=42))
    ],
    final_estimator=LogisticRegression(class_weight='balanced', random_state=42),
    passthrough=False,
    cv=tuning_cv,
)

grid_stack = GridSearchCV(stacking_clf, param_grid['Stacked Model'], cv=tuning_cv, scoring='accuracy', n_jobs=-1)
grid_stack.fit(X_train_smote, y_train_smote)
stacked_best = grid_stack.best_estimator_

# Save the best models (disabled; joblib would have to be imported first)
# joblib.dump(mlp_best, 'best_mlp_model.pkl')
# joblib.dump(stacked_best, 'best_stacked_model.pkl')


def _plot_confusion_matrix(cm, cmap, filename):
    """Draw `cm` as an annotated heatmap, save it to `filename` at 300 dpi and show it."""
    plt.figure(figsize=(6, 5))
    sns.heatmap(cm, annot=True, fmt='d', cmap=cmap)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    plt.show()


def _metric_row(train_acc, y_pred, probs):
    """Build one row of the final table from test-split predictions.

    train_acc: accuracy already computed on the (resampled) training data.
    y_pred:    hard predictions for the test split.
    probs:     predicted probability of class 1 for the test split (used for AUC).
    """
    return {
        "Train Accuracy": train_acc,
        "Test Accuracy": accuracy_score(y_test, y_pred),
        "Precision": precision_score(y_test, y_pred),
        "Recall": recall_score(y_test, y_pred),
        "F1 Score": f1_score(y_test, y_pred),
        "AUC": roc_auc_score(y_test, probs)
    }


# Evaluate MLP on the test split
y_pred_mlp = mlp_best.predict(X_test_scaled)
probs_mlp = mlp_best.predict_proba(X_test_scaled)[:, 1]  # probability of class 1
fpr_mlp, tpr_mlp, _ = roc_curve(y_test, probs_mlp)
precision_mlp, recall_mlp, _ = precision_recall_curve(y_test, probs_mlp)
cm_mlp = confusion_matrix(y_test, y_pred_mlp)

# Evaluate Stacked model on the test split
y_pred_stack = stacked_best.predict(X_test_scaled)
probs_stack = stacked_best.predict_proba(X_test_scaled)[:, 1]  # probability of class 1
fpr_stack, tpr_stack, _ = roc_curve(y_test, probs_stack)
precision_stack, recall_stack, _ = precision_recall_curve(y_test, probs_stack)
cm_stack = confusion_matrix(y_test, y_pred_stack)

# Plot ROC and PR curves
# ROC Curve
plt.figure(figsize=(6, 5))
plt.plot(fpr_mlp, tpr_mlp, label='MLP')
plt.plot(fpr_stack, tpr_stack, label='Stacked')
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.tight_layout()
plt.savefig("ROC.png", dpi=300)
plt.show()

# Precision-Recall Curve
plt.figure(figsize=(6, 5))
plt.plot(recall_mlp, precision_mlp, label='MLP')
plt.plot(recall_stack, precision_stack, label='Stacked')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.tight_layout()
plt.savefig("P_R_curve.png", dpi=300)
plt.show()

# Confusion matrices
_plot_confusion_matrix(cm_mlp, 'Blues', "confusion_matrix_mlp.png")
_plot_confusion_matrix(cm_stack, 'Greens', "confusion_matrix_stacked.png")

# Training accuracy (measured on the SMOTE-resampled training data)
train_acc_mlp = accuracy_score(y_train_smote, mlp_best.predict(X_train_smote))
train_acc_stack = accuracy_score(y_train_smote, stacked_best.predict(X_train_smote))

# Testing metrics
mlp_metrics = _metric_row(train_acc_mlp, y_pred_mlp, probs_mlp)
stacked_metrics = _metric_row(train_acc_stack, y_pred_stack, probs_stack)

# Create DataFrame: one row per tuned model
final_metrics_df = pd.DataFrame(
    [mlp_metrics, stacked_metrics],
    index=["MLP", "Stacked Model"]
)

# Display the table
print(final_metrics_df.round(4))