**Python Notebook 1: Data Understanding and Preprocessing**

**Author:** Dinuka Induwara

---
This notebook focuses on data understanding and preprocessing for the breast cancer dataset, as per the coursework requirements. It produces two separate prepared datasets: one for classification (cancer mortality status) and one for regression (cancer survival months).


In [None]:
# 1. IMPORT LIBRARIES
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    RocCurveDisplay,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Widen the pandas display so wide frames are shown without truncation
pd.options.display.max_columns = 100
pd.options.display.width = 180


In [None]:
# 2. LOAD RAW DATA
# The raw CSV is expected at this location relative to the notebook's
# working directory; pd.read_csv raises FileNotFoundError if it is absent.
data_path = '../data/raw/breast_cancer.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows
df.head(5)


In [None]:
# 3. INITIAL INSPECTION
# Column names, dtypes / non-null counts, and share of missing values per column
print("Columns:", list(df.columns))
print("\nData Info:")
df.info()

print("\nMissing Values (%):")
print(df.isna().sum() / len(df) * 100)

print("\nTarget Distribution:")
print(df['Mortality_Status'].value_counts())

# px.bar draws one mark per DataFrame row, so the frame is first aggregated
# to one row per class; the bar height is then the class count.
status_counts = (
    df['Mortality_Status']
    .value_counts()
    .rename_axis('Mortality_Status')
    .reset_index(name='Count')
)
fig = px.bar(status_counts, x='Mortality_Status', y='Count',
             title='Distribution of Mortality Status')
fig.show()


In [None]:
# 4. IMPUTATION
# The median is only defined for numeric data, so numeric columns are filled
# with their median and all remaining columns (e.g. the string-valued
# categorical columns encoded in the next step) with their most frequent value.
# NOTE(review): every column with gaps is imputed, including the target
# columns if they contain missing values - confirm this is intended.
df_imputed = df.copy()

# Only touch columns that have gaps AND at least one observed value:
# SimpleImputer drops all-missing columns by default, which would break the
# column alignment on assignment, and complete columns need no imputation.
needs_imputation = df_imputed.isna().any() & df_imputed.notna().any()
imputable_cols = df_imputed.columns[needs_imputation]
numeric_impute_cols = df_imputed[imputable_cols].select_dtypes(include='number').columns
other_impute_cols = imputable_cols.difference(numeric_impute_cols, sort=False)

imputer = SimpleImputer(strategy='median')
if len(numeric_impute_cols) > 0:
    df_imputed[numeric_impute_cols] = imputer.fit_transform(df_imputed[numeric_impute_cols])

mode_imputer = SimpleImputer(strategy='most_frequent')
if len(other_impute_cols) > 0:
    df_imputed[other_impute_cols] = mode_imputer.fit_transform(df_imputed[other_impute_cols])

print("Remaining Missing (%):")
print(df_imputed.isna().sum() / len(df_imputed) * 100)


In [None]:
# 5. ENCODING CATEGORICAL COLUMNS
# Each categorical column gets its own LabelEncoder, fitted on the string
# form of the column; the fitted encoders are kept so the integer codes can
# be mapped back to the original labels later.
categorical_cols = [
    'Sex', 'T_Stage', 'N_Stage', '6th_Stage', 'Differentiated', 'Grade',
    'A_Stage', 'Estrogen_Status', 'Progesterone_Status',
]

label_encoders = {
    col: LabelEncoder().fit(df_imputed[col].astype(str))
    for col in categorical_cols
}

# Replace every categorical column by its integer codes
for col, le in label_encoders.items():
    df_imputed[col] = le.transform(df_imputed[col].astype(str))


In [None]:
# 6. SCALING NUMERICAL FEATURES
# Standardise the continuous features; the fitted scaler is kept in `scaler`.
numerical_cols = [
    'Age',
    'Tumor_Size',
    'Regional_Node_Examined',
    'Regional_Node_Positive',
]
scaler = StandardScaler().fit(df_imputed[numerical_cols])
df_imputed[numerical_cols] = scaler.transform(df_imputed[numerical_cols])


In [None]:
# 7. SAVE CLASSIFICATION & REGRESSION DATASETS
classification_data_path = '../data/processed/Prepared_Breast_Cancer_Classification.csv'
regression_data_path = '../data/processed/Prepared_Breast_Cancer_Regression.csv'

# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
for output_path in (classification_data_path, regression_data_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save for classification (drop rows with missing target)
df_clf = df_imputed.dropna(subset=['Mortality_Status'])
df_clf.to_csv(classification_data_path, index=False)

# Save for regression only when the regression target is present in the data
# (drop rows with missing target)
if 'Survival_Months' in df_imputed.columns:
    df_reg = df_imputed.dropna(subset=['Survival_Months'])
    df_reg.to_csv(regression_data_path, index=False)


**Loading prepared dataset**

---


Load the preprocessed classification dataset from Final notebook 1.


In [None]:
# Load the prepared classification dataset
# NOTE(review): this reads from './data/processed/', whereas the preprocessing
# step writes to '../data/processed/' - confirm which working directory this
# part of the notebook is meant to run from.
prepared_dataset_path = './data/processed/Prepared_Breast_Cancer_Classification.csv'
data = pd.read_csv(prepared_dataset_path)

# Preview the first five rows
data.head(5)

**Understanding the dataset**

---


Check the dataset's structure and data types, and visualize the distribution of Mortality_Status.

In [None]:
# Displaying column names
print('Columns:', list(data.columns))

# Checking data types and summary
data.info()

# Visualizing Mortality_Status distribution.
# px.bar draws one mark per DataFrame row, so the frame is first aggregated
# to one row per class; the bar height is then the class count.
status_counts = (
    data['Mortality_Status']
    .value_counts()
    .rename_axis('Mortality_Status')
    .reset_index(name='Count')
)
fig = px.bar(status_counts, x='Mortality_Status', y='Count',
             title='Distribution of Mortality Status')
fig.show()


In [None]:
# Task 4: Define features and target
features = ['Age', 'Sex', 'T_Stage', 'N_Stage', '6th_Stage', 'Differentiated', 'Grade',
            'A_Stage', 'Tumor_Size', 'Estrogen_Status', 'Progesterone_Status',
            'Regional_Node_Examined', 'Regional_Node_Positive']
target = 'Mortality_Status'

X = data[features]
y = data[target]

# Figures and prediction tables are written to ./results; savefig/to_csv do
# not create missing directories, so create it up front.
os.makedirs('./results', exist_ok=True)

# Graphical Output: Feature Names and Data Shape
# Text-only figure (axes hidden) listing the selected features and the
# shapes of X and y, saved alongside the other result artefacts.
fig_features, ax_features = plt.subplots(figsize=(8, 7))
ax_features.text(0.05, 0.95, "Features for Classification Models:", fontsize=12, fontweight='bold')
ax_features.text(0.05, 0.85, "\n".join(features), fontsize=10, va='top')
ax_features.text(0.05, 0.25, f"Data Shape:\nFeatures (X): {X.shape}\nTarget (y): {y.shape}", fontsize=10)
ax_features.axis('off')
ax_features.set_title("Feature Names and Data Shape")
fig_features.savefig('./results/Feature_Names_Data_Shape.png', dpi=300, bbox_inches='tight')
plt.show()

# Stratified 80/20 train/test split with a fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Report the shape of every part of the split
for part_name, part in (('X_train', X_train), ('y_train', y_train),
                        ('X_test', X_test), ('y_test', y_test)):
    print(f'{part_name} shape:', part.shape)

# Class proportions per split (printed in value_counts order)
print("\nMortality_Status Distribution (0: Alive, 1: Dead):")
for split_label, split_target in (("Training Set:", y_train), ("Test Set:", y_test)):
    print(split_label)
    print(split_target.value_counts(normalize=True))

# Side-by-side bar charts of the class proportions, sorted by class label
train_dist = y_train.value_counts(normalize=True).sort_index()
test_dist = y_test.value_counts(normalize=True).sort_index()
class_labels = ['Alive (0)', 'Dead (1)']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5), sharey=True)
panels = (
    (ax1, train_dist, 'skyblue', 'Training Set Distribution'),
    (ax2, test_dist, 'lightgreen', 'Test Set Distribution'),
)
for panel_ax, panel_dist, panel_color, panel_title in panels:
    panel_ax.bar(class_labels, panel_dist, color=panel_color)
    panel_ax.set_title(panel_title)
ax1.set_ylabel('Proportion')
fig.suptitle('Mortality_Status Distribution (0: Alive, 1: Dead)')
fig.savefig('./results/Class_Distribution.png', dpi=300, bbox_inches='tight')
plt.show()

# Checking and imputing missing values
print('Missing Values (%):')
print(data.isna().sum() / len(data) * 100)

# Keep X restricted to the declared `features`, i.e. the same feature space
# the train/test split was built from. Taking every non-target column instead
# would pull in any other column of the prepared file (identifiers, other
# targets) and make X inconsistent with X_train / X_test.
X = data[features]
y = data['Mortality_Status']

# Mean-impute any remaining gaps in the feature matrix.
# NOTE(review): the train/test split was taken before this step, so
# X_train / X_test are not affected by this imputation.
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)
X = pd.DataFrame(X_imputed, columns=X.columns, index=X.index)

print('Missing Values in X after Imputation (%):')
print(X.isna().sum() / len(X) * 100)

print('Features:', list(X.columns))
print('Target sample:', y.head())

# Naive Bayes Model: fit on the training split, predict the test split
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# Persist actual vs. predicted labels (row index kept for traceability)
nb_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_nb})
nb_comparison.to_csv('./results/NB_Predictions.csv', index=True)

print('Naive Bayes Prediction Comparison:')
print(nb_comparison.head())

# Confusion matrix, with rows/columns ordered by the classes the model learned
cm_nb = confusion_matrix(y_test, y_pred_nb, labels=nb.classes_)
disp_nb = ConfusionMatrixDisplay(confusion_matrix=cm_nb, display_labels=nb.classes_)
disp_nb.plot()
disp_nb.ax_.set_title('Naive Bayes Confusion Matrix')
plt.show()

print('Naive Bayes Classification Report:')
print(classification_report(y_test, y_pred_nb))

# ROC curve on the test split
nb_roc = RocCurveDisplay.from_estimator(nb, X_test, y_test)
nb_roc.ax_.set_title('Naive Bayes ROC Curve')
plt.show()

# Logistic Regression Model: fit on the training split, predict the test split
# (max_iter raised to 1000 iterations for the solver)
lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Persist actual vs. predicted labels (row index kept for traceability)
lr_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_lr})
lr_comparison.to_csv('./results/LR_Predictions.csv', index=True)

print('Logistic Regression Prediction Comparison:')
print(lr_comparison.head())

# Confusion matrix, with rows/columns ordered by the classes the model learned
cm_lr = confusion_matrix(y_test, y_pred_lr, labels=lr.classes_)
disp_lr = ConfusionMatrixDisplay(confusion_matrix=cm_lr, display_labels=lr.classes_)
disp_lr.plot()
disp_lr.ax_.set_title('Logistic Regression Confusion Matrix')
plt.show()

print('Logistic Regression Classification Report:')
print(classification_report(y_test, y_pred_lr))

# ROC curve on the test split
lr_roc = RocCurveDisplay.from_estimator(lr, X_test, y_test)
lr_roc.ax_.set_title('Logistic Regression ROC Curve')
plt.show()

# Optimizing kNN: Finding Best k
# For every k in 1..39, fit a kNN classifier on the training split and record
# its misclassification rate on the test split.
# NOTE(review): k is chosen from test-set error, so the test metrics reported
# for this k are optimistic - consider selecting k on a validation split.
k_values = range(1, 40)
error = [
    np.mean(
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).predict(X_test) != y_test
    )
    for k in k_values
]

# Error-rate curve over k
fig_k, ax_k = plt.subplots(figsize=(12, 6))
ax_k.plot(k_values, error, color='red', linestyle='dashed', marker='o',
          markerfacecolor='blue', markersize=10)
ax_k.set_title('Error Rate vs. K Value')
ax_k.set_xlabel('K value')
ax_k.set_ylabel('Mean Error')
fig_k.savefig('./results/kNN_Error_Rate.png', dpi=300, bbox_inches='tight')
plt.show()

# `error` is indexed from 0 while k starts at 1, hence the offset
best_k = 1 + np.argmin(error)
print(f'Best k value: {best_k}')

# Building and evaluating kNN with the k found by the error-rate search
knn = KNeighborsClassifier(n_neighbors=best_k).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

# Persist actual vs. predicted labels (row index kept for traceability)
knn_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_knn})
knn_comparison.to_csv('./results/KNN_Predictions.csv', index=True)

print('kNN Prediction Comparison:')
print(knn_comparison.head())

# Confusion matrix, with rows/columns ordered by the classes the model learned
cm_knn = confusion_matrix(y_test, y_pred_knn, labels=knn.classes_)
disp_knn = ConfusionMatrixDisplay(confusion_matrix=cm_knn, display_labels=knn.classes_)
disp_knn.plot()
disp_knn.ax_.set_title(f'kNN Confusion Matrix (k={best_k})')
plt.show()

print('kNN Classification Report:')
print(classification_report(y_test, y_pred_knn))

# ROC curve on the test split
knn_roc = RocCurveDisplay.from_estimator(knn, X_test, y_test)
knn_roc.ax_.set_title(f'kNN ROC Curve (k={best_k})')
plt.show()

# Hyperparameter tuning for kNN
# 5-fold cross-validated grid search over k and the distance metric, scored
# by ROC AUC. The search is fitted on the TRAINING split only: fitting it on
# the full dataset would let the test rows influence the selected
# hyperparameters and make the test-set evaluation below optimistic.
knn = KNeighborsClassifier()
param_grid = {'n_neighbors': np.arange(1, 25), 'metric': ['euclidean', 'manhattan']}
knn_gscv = GridSearchCV(knn, param_grid, cv=5, scoring='roc_auc')
knn_gscv.fit(X_train, y_train)

print('Best kNN Parameters:', knn_gscv.best_params_)

# Refit a fresh kNN with the selected hyperparameters on the training split
knn_opt = KNeighborsClassifier(n_neighbors=knn_gscv.best_params_['n_neighbors'],
                               metric=knn_gscv.best_params_['metric'])
knn_opt.fit(X_train, y_train)
y_pred_knn_opt = knn_opt.predict(X_test)

# Persist actual vs. predicted labels (row index kept for traceability)
knn_opt_comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_knn_opt})
knn_opt_comparison.to_csv('./results/kNN_Optimized_Predictions.csv', index=True)

print('kNN Optimized Prediction Comparison:')
print(knn_opt_comparison.head())

# Confusion matrix, with rows/columns ordered by the classes the model learned
cm_knn_opt = confusion_matrix(y_test, y_pred_knn_opt, labels=knn_opt.classes_)
disp_knn_opt = ConfusionMatrixDisplay(cm_knn_opt, display_labels=knn_opt.classes_)
disp_knn_opt.plot()
plt.title('Optimized kNN Confusion Matrix')
plt.show()

print('Optimized kNN Classification Report:')
print(classification_report(y_test, y_pred_knn_opt))

# ROC curve on the held-out test split
knn_opt_roc = RocCurveDisplay.from_estimator(knn_opt, X_test, y_test)
plt.title('Optimized kNN ROC Curve')
plt.show()

# Model comparison and summary
# One table with the true label and every model's prediction per test row
prediction_columns = {
    'Actual': y_test,
    'NB_Predicted': y_pred_nb,
    'Logistic_Regression': y_pred_lr,
    'kNN_Predicted': y_pred_knn,
    'kNN_Optimized_Predicted': y_pred_knn_opt,
}
model_comparison = pd.DataFrame(prediction_columns)
model_comparison.to_csv('./results/Model_Comparison_Predictions.csv', index=True)

print('Model comparison:')
print(model_comparison.head())

# Pointers to where each model's metrics can be found above
summary_lines = (
    'Model performance summary:',
    '1. Naive Bayes: Check classification report and ROC curve for accuracy and AUC.',
    '2. Logistic Regression: Check classification report and ROC curve for accuracy and AUC.',
    f'3. kNN (k={best_k}): Check classification report and ROC curve for accuracy and AUC.',
    '4. Optimized kNN: Check classification report and ROC curve for accuracy and AUC.',
    'Compare models based on accuracy, precision, recall, F1-score, and AUC to select the best model.',
)
for summary_line in summary_lines:
    print(summary_line)