**Final Python Notebook 3: Advanced Classification Modelling for Breast Cancer Mortality Prediction**

**Author:** Dinuka Induwara



---
This notebook extends the classification modelling for predicting Mortality Status (Alive: 0, Death: 1) using the preprocessed breast cancer dataset from Final Python Notebook 1. It builds on Final Python Notebook 2 by incorporating advanced techniques: decision trees, ensemble methods and, optionally, a multilayer perceptron classifier.


In [None]:


# Importing required libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import tree
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, RocCurveDisplay
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

# Configure how pandas renders frames: up to 200 rows/columns, a 200-character
# wide console, and floats formatted with two decimal places
_display_settings = {
    'display.max_rows': 200,
    'display.max_columns': 200,
    'display.width': 200,
    'display.float_format': '{:.2f}'.format,
}
for _option_name, _option_value in _display_settings.items():
    pd.set_option(_option_name, _option_value)

# Loading the prepared classification dataset
data_path = './data/processed/Prepared_Breast_Cancer_Classification.csv'
data = pd.read_csv(data_path)

# Display first five rows
print(data.head())

# Checking columns and target.
# The label modelled in this notebook is Mortality_Status, so that is the column
# inspected and plotted here. Fail early with an explicit message if it is absent,
# because every model below depends on it.
print('Columns:', list(data.columns))
target = 'Mortality_Status'
if target not in data.columns:
    raise KeyError(f"Required target column '{target}' not found in {data_path}")
print(f'Target ({target}) sample:')
print(data[target].head())

# Dataset structure and data types
data.info()

# Visualizing target distribution (class balance of the label)
fig = px.histogram(data, x=target, title=f'Distribution of {target}')
fig.show()

# Checking and handling missing values
print('Missing Values (%):')
print(data.isna().sum() / len(data) * 100)

# Define features and target for classification.
# Rows whose label is missing are dropped instead of imputed: a mean-imputed
# label would be a fraction between the class codes, i.e. not a valid class.
labelled = data.dropna(subset=['Mortality_Status'])
X = labelled.drop(['Mortality_Status'], axis=1)
# Labels as a 1-D float ndarray
y = labelled['Mortality_Status'].to_numpy(dtype=float)

# Split data into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute missing feature values with column means learned from the training
# split only, so that no test-set statistics leak into the preprocessing
imputer = SimpleImputer(strategy='mean')
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X.columns)
X_test = pd.DataFrame(imputer.transform(X_test), columns=X.columns)

# Report remaining gaps over both imputed splits together
_imputed_all = pd.concat([X_train, X_test], ignore_index=True)
print('Missing values in X after imputation (%):')
print(_imputed_all.isna().sum() / len(_imputed_all) * 100)
print('Missing values in y:', pd.Series(y).isna().sum())

print('Features:', list(X.columns))

print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

# Feature scaling (critical for MLP and Voting classifier).
# The scaler is fitted on the training split and only applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames so the column names are kept
X_train_scaled = pd.DataFrame(X_train_scaled, columns=X_train.columns)
X_test_scaled = pd.DataFrame(X_test_scaled, columns=X_test.columns)

# --- Fully Grown Decision Tree (DT-1) ---
# Make sure the output folder exists before any figure or CSV is written to it
Path('./results').mkdir(parents=True, exist_ok=True)

# No depth/leaf limits are set, so the tree grows until its default stopping rules
dt_full = DecisionTreeClassifier(random_state=42)
dt_full.fit(X_train, y_train)

y_pred_dt_full = dt_full.predict(X_test)

# Plot and save the fitted tree.
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(20, 10))
tree.plot_tree(dt_full, feature_names=list(X_train.columns), class_names=['Alive', 'Dead'], filled=True)
plt.title('Fully Grown Decision Tree')
plt.savefig('./results/fully_grown_decision_tree.png')
plt.show()

print('Fully Grown Decision Tree Depth:', dt_full.tree_.max_depth)

# Side-by-side actual vs. predicted labels for the test split
dt_full_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_dt_full})
dt_full_comparison.to_csv('./results/DT_Full_Predictions.csv', index=True)

print('DT-1 Prediction Comparison:')
print(dt_full_comparison.head())

# Confusion matrix, with rows/columns ordered by the classes the tree was fitted on
cm_dt_full = confusion_matrix(y_test, y_pred_dt_full, labels=dt_full.classes_)
disp_dt_full = ConfusionMatrixDisplay(cm_dt_full, display_labels=['Alive', 'Dead'])
disp_dt_full.plot()
plt.title('Fully Grown Decision Tree Confusion Matrix')
plt.show()

print('Fully Grown Decision Tree Classification Report:')
print(classification_report(y_test, y_pred_dt_full))

# ROC curve on the test split
dt_full_roc = RocCurveDisplay.from_estimator(dt_full, X_test, y_test)
plt.title('Fully Grown Decision Tree ROC Curve')
plt.show()

# --- Pruned Decision Tree (DT-2) with GridSearchCV ---
# Search over depth limit, split criterion and splitter strategy;
# candidates are compared by 5-fold cross-validated ROC AUC on the training split
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = {
    'max_depth': np.arange(1, 20),
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random']
}

dt_gscv = GridSearchCV(dt, param_grid_dt, cv=5, scoring='roc_auc')
dt_gscv.fit(X_train, y_train)

dt_pruned = dt_gscv.best_estimator_
print('Best Pruned Decision Tree Parameters:', dt_gscv.best_params_)

y_pred_dt_pruned = dt_pruned.predict(X_test)

# Plot and save the selected tree.
# feature_names is passed as a plain list: some scikit-learn releases reject a
# pandas Index for this parameter.
plt.figure(figsize=(20, 10))
tree.plot_tree(dt_pruned, feature_names=list(X_train.columns), class_names=['Alive', 'Dead'], filled=True)
plt.title('Pruned Decision Tree')
plt.savefig('./results/pruned_decision_tree.png')
plt.show()

# Side-by-side actual vs. predicted labels for the test split
dt_pruned_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_dt_pruned})
dt_pruned_comparison.to_csv('./results/DT_Pruned_Predictions.csv', index=True)

# Confusion matrix, with rows/columns ordered by the classes the tree was fitted on
cm_dt_pruned = confusion_matrix(y_test, y_pred_dt_pruned, labels=dt_pruned.classes_)
disp_dt_pruned = ConfusionMatrixDisplay(cm_dt_pruned, display_labels=['Alive', 'Dead'])
disp_dt_pruned.plot()
plt.title('Pruned Decision Tree Confusion Matrix')
plt.show()

print('Pruned Decision Tree Classification Report:')
print(classification_report(y_test, y_pred_dt_pruned))

# ROC curve on the test split
dt_pruned_roc = RocCurveDisplay.from_estimator(dt_pruned, X_test, y_test)
plt.title('Pruned Decision Tree ROC Curve')
plt.show()

# --- Regression Metrics for DT-1 and DT-2 ---
# NOTE(review): these are regression scores applied to predicted class labels.
# Assuming the labels are coded 0/1, MSE and MAE both reduce to the
# misclassification rate - confirm this table is what the report needs.
_regression_scorers = (mean_squared_error, mean_absolute_error, r2_score)

# Score each tree's test-set predictions with every scorer, in the order above
mse_dt1, mae_dt1, r2_dt1 = (scorer(y_test, y_pred_dt_full) for scorer in _regression_scorers)
mse_dt2, mae_dt2, r2_dt2 = (scorer(y_test, y_pred_dt_pruned) for scorer in _regression_scorers)

# Long-format table: one row per (metric, model) pair, grouped by metric
metrics_table = pd.DataFrame({
    'Model': ['DT-1 (Fully Grown)', 'DT-2 (Pruned)'] * 3,
    'Metric': [label for label in ('MSE', 'MAE', 'R-Square') for _ in range(2)],
    'Test Score': [
        mse_dt1, mse_dt2,
        mae_dt1, mae_dt2,
        r2_dt1, r2_dt2,
    ],
})

print('Model Evaluation Metrics:')
print(metrics_table)

metrics_table.to_csv('./results/Regression_Metrics.csv', index=False)

# --- Voting Classifier (Naive Bayes + Logistic Regression) ---
# Two base learners combined with soft voting, trained on the scaled features
nb = GaussianNB()
lr = LogisticRegression(max_iter=1000, random_state=42)
base_learners = [('NB', nb), ('LR', lr)]

voting_clf = VotingClassifier(estimators=base_learners, voting='soft')
voting_clf.fit(X_train_scaled, y_train)

# Predicted labels for the scaled test split
y_pred_voting = voting_clf.predict(X_test_scaled)

# Persist actual vs. predicted labels, then preview the first rows
voting_comparison = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred_voting))
voting_comparison.to_csv('./results/Voting_Predictions.csv', index=True)

print('Voting Classifier Prediction Comparison:')
print(voting_comparison.head())

# Confusion matrix ordered by the ensemble's fitted classes
cm_voting = confusion_matrix(y_true=y_test, y_pred=y_pred_voting, labels=voting_clf.classes_)
disp_voting = ConfusionMatrixDisplay(confusion_matrix=cm_voting, display_labels=['Alive', 'Dead'])
disp_voting.plot()
plt.title('Voting Classifier Confusion Matrix')
plt.show()

print('Voting Classifier Classification Report:')
print(classification_report(y_test, y_pred_voting))

# ROC curve on the scaled test split
voting_roc = RocCurveDisplay.from_estimator(voting_clf, X_test_scaled, y_test)
plt.title('Voting Classifier ROC Curve')
plt.show()

# --- Random Forest Classifier with GridSearchCV ---
# Candidate forest sizes and depth limits to search over
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
}
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search scored by ROC AUC, on the unscaled features
rf_gscv = GridSearchCV(estimator=rf, param_grid=param_grid_rf, scoring='roc_auc', cv=5)
rf_gscv.fit(X_train, y_train)

rf_best = rf_gscv.best_estimator_
print('Best Random Forest Parameters:', rf_gscv.best_params_)

# Predicted labels for the test split
y_pred_rf = rf_best.predict(X_test)

# Persist actual vs. predicted labels, then preview the first rows
rf_comparison = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred_rf))
rf_comparison.to_csv('./results/RF_Predictions.csv', index=True)

print('Random Forest Prediction Comparison:')
print(rf_comparison.head())

# Confusion matrix ordered by the forest's fitted classes
cm_rf = confusion_matrix(y_true=y_test, y_pred=y_pred_rf, labels=rf_best.classes_)
disp_rf = ConfusionMatrixDisplay(confusion_matrix=cm_rf, display_labels=['Alive', 'Dead'])
disp_rf.plot()
plt.title('Random Forest Confusion Matrix')
plt.show()

print('Random Forest Classification Report:')
print(classification_report(y_test, y_pred_rf))

# ROC curve on the test split
rf_roc = RocCurveDisplay.from_estimator(rf_best, X_test, y_test)
plt.title('Random Forest ROC Curve')
plt.show()

# --- AdaBoost Classifier with Decision Tree base learner ---
# Depth-1 trees serve as the boosted base estimator
base_model = DecisionTreeClassifier(random_state=42, max_depth=1)
adaboost = AdaBoostClassifier(estimator=base_model, random_state=42)

# Only the number of boosting rounds is tuned
param_grid_adaboost = {
    'n_estimators': [50, 100, 200],
}

# 5-fold cross-validated grid search scored by ROC AUC, on the unscaled features
adaboost_gscv = GridSearchCV(
    estimator=adaboost,
    param_grid=param_grid_adaboost,
    scoring='roc_auc',
    cv=5,
)
adaboost_gscv.fit(X_train, y_train)

adaboost_best = adaboost_gscv.best_estimator_
print('Best AdaBoost Parameters:', adaboost_gscv.best_params_)

# Predicted labels for the test split
y_pred_adaboost = adaboost_best.predict(X_test)

# Persist actual vs. predicted labels, then preview the first rows
adaboost_comparison = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred_adaboost))
adaboost_comparison.to_csv('./results/AdaBoost_Predictions.csv', index=True)

print('AdaBoost Prediction Comparison:')
print(adaboost_comparison.head())

# Confusion matrix ordered by the booster's fitted classes
cm_adaboost = confusion_matrix(y_true=y_test, y_pred=y_pred_adaboost, labels=adaboost_best.classes_)
disp_adaboost = ConfusionMatrixDisplay(confusion_matrix=cm_adaboost, display_labels=['Alive', 'Dead'])
disp_adaboost.plot()
plt.title('AdaBoost Confusion Matrix')
plt.show()

print('AdaBoost Classification Report:')
print(classification_report(y_test, y_pred_adaboost))

# ROC curve on the test split
adaboost_roc = RocCurveDisplay.from_estimator(adaboost_best, X_test, y_test)
plt.title('AdaBoost ROC Curve')
plt.show()

# --- Check for Regression Target and R² Computation ---
# Every numeric column except the class label is a candidate regression target
numeric_cols = data.select_dtypes(include=[np.number]).columns
potential_regression_targets = [col for col in numeric_cols if col != 'Mortality_Status']

if potential_regression_targets:
    print('Potential Regression Targets:', potential_regression_targets)

    # Prefer Survival_Months when it is available; otherwise fall back to the
    # first numeric column so the check still runs
    if 'Survival_Months' in potential_regression_targets:
        target_col = 'Survival_Months'
    else:
        target_col = potential_regression_targets[0]

    # Rows without a target value cannot be used for fitting or scoring
    df_clean = data.dropna(subset=[target_col])

    # Restrict predictors to the remaining numeric columns: median imputation
    # cannot be computed on non-numeric columns
    feature_cols = [col for col in potential_regression_targets if col != target_col]

    if not feature_cols:
        print(f'No numeric predictors available for {target_col}; skipping R² computation.')
    else:
        X_reg = df_clean[feature_cols]
        y_reg = df_clean[target_col]

        X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)

        # Medians are learned from the training split only and then applied to
        # the test split, so the test rows do not influence the preprocessing
        imputer_reg = SimpleImputer(strategy='median')
        X_train_reg = pd.DataFrame(imputer_reg.fit_transform(X_train_reg), columns=feature_cols)
        X_test_reg = pd.DataFrame(imputer_reg.transform(X_test_reg), columns=feature_cols)

        reg = DecisionTreeRegressor(random_state=42)
        reg.fit(X_train_reg, y_train_reg)

        y_pred_reg = reg.predict(X_test_reg)
        print(f'R² for {target_col} (Decision Tree Regressor):', r2_score(y_test_reg, y_pred_reg))
else:
    print('No continuous target found for R² computation. R² is inapplicable as Mortality_Status is categorical.')
