**Python Notebook 1: Data Understanding and Preprocessing**

**Author:** Dinuka Induwara

---
This notebook focuses on data understanding and preprocessing for the breast cancer dataset, as per the coursework requirements. It produces two separate prepared datasets: one for classification (cancer mortality status) and one for regression (cancer survival months).


In [None]:
# 1. IMPORT LIBRARIES
import pandas as pd
import numpy as np
import os

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import plotly.express as px

# Display settings applied through pd.set_option: column limit and output width.
for option_name, option_value in (
    ('display.max_columns', 100),
    ('display.width', 180),
):
    pd.set_option(option_name, option_value)


In [None]:
# 2. LOAD RAW DATA
# Relative location of the raw CSV; the file has to be present at this path.
data_path = '../data/raw/breast_cancer.csv'
df = pd.read_csv(filepath_or_buffer=data_path)

# Preview the first five rows as the cell output.
df.head(n=5)


In [None]:
# 3. INITIAL INSPECTION
# Column names first, then the DataFrame.info() summary.
column_names = list(df.columns)
print("Columns:", column_names)
print("\nData Info:")
df.info()

# Percentage of missing entries in each column.
n_rows = len(df)
missing_pct = df.isna().sum() / n_rows * 100
print("\nMissing Values (%):")
print(missing_pct)

# Target balance: printed as counts, then drawn as a bar chart.
target_counts = df['Mortality_Status'].value_counts()
print("\nTarget Distribution:")
print(target_counts)
fig = px.bar(df, x='Mortality_Status', title='Distribution of Mortality Status')
fig.show()


In [None]:
# 4. IMPUTATION
# SimpleImputer's 'median' strategy only accepts numeric data, and this frame
# also holds text columns (they are label-encoded in the next step). Fitting a
# median imputer on the whole frame therefore raises a ValueError, so numeric
# and non-numeric columns are imputed separately:
#   - numeric columns     -> median
#   - non-numeric columns -> most frequent value
# NOTE(review): as in the original cell, every column is imputed, including the
# target columns, and the imputers are fitted on the full dataset (before any
# train/test split) -- confirm this is acceptable for the modelling notebooks.
numeric_feature_cols = df.select_dtypes(include='number').columns
non_numeric_feature_cols = df.select_dtypes(exclude='number').columns

imputer = SimpleImputer(strategy='median')
categorical_imputer = SimpleImputer(strategy='most_frequent')

# Work on a copy so the raw frame `df` stays untouched and the cell can be re-run.
df_imputed = df.copy()
if len(numeric_feature_cols) > 0:
    df_imputed[numeric_feature_cols] = imputer.fit_transform(df[numeric_feature_cols])
if len(non_numeric_feature_cols) > 0:
    df_imputed[non_numeric_feature_cols] = categorical_imputer.fit_transform(
        df[non_numeric_feature_cols]
    )

print("Remaining Missing (%):")
print(df_imputed.isna().sum() / len(df_imputed) * 100)


In [None]:
# 5. ENCODING CATEGORICAL COLUMNS
categorical_cols = [
    'Sex',
    'T_Stage',
    'N_Stage',
    '6th_Stage',
    'Differentiated',
    'Grade',
    'A_Stage',
    'Estrogen_Status',
    'Progesterone_Status',
]

# One encoder per column, kept in a dict keyed by column name so each fitted
# encoder stays available after this cell.
label_encoders = {name: LabelEncoder() for name in categorical_cols}

# Cast each column to str, then replace it with its integer codes.
for name, encoder in label_encoders.items():
    column_as_text = df_imputed[name].astype(str)
    df_imputed[name] = encoder.fit_transform(column_as_text)


In [None]:
# 6. SCALING NUMERICAL FEATURES
numerical_cols = [
    'Age',
    'Tumor_Size',
    'Regional_Node_Examined',
    'Regional_Node_Positive',
]

# Fit the scaler on the listed columns, then overwrite them with the scaled values.
scaler = StandardScaler()
scaler.fit(df_imputed[numerical_cols])
df_imputed[numerical_cols] = scaler.transform(df_imputed[numerical_cols])


In [None]:
# 7. SAVE CLASSIFICATION & REGRESSION DATASETS
classification_data_path = '../data/processed/Prepared_Breast_Cancer_Classification.csv'
regression_data_path = '../data/processed/Prepared_Breast_Cancer_Regression.csv'

# DataFrame.to_csv does not create missing folders, so make sure the output
# directories exist before writing (otherwise a fresh checkout fails here).
for out_path in (classification_data_path, regression_data_path):
    out_dir = os.path.dirname(out_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

# Save for classification (drop rows with missing target).
# The drop is now actually applied so the code matches the stated intent.
df_clf = df_imputed.dropna(subset=['Mortality_Status'])
df_clf.to_csv(classification_data_path, index=False)

# Save for regression only when the regression target column is present;
# rows without a Survival_Months value are dropped first.
if 'Survival_Months' in df_imputed.columns:
    df_reg = df_imputed.dropna(subset=['Survival_Months'])
    df_reg.to_csv(regression_data_path, index=False)
