# Import Necessary Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the breast cancer dataset from its repository-relative CSV file.
DATA_PATH = 'dataSets/Breast_Cancer.csv'
df = pd.read_csv(DATA_PATH)

## Analysis

In [3]:
# Preview the first five rows to get a feel for the columns and value formats.
df.head(n=5)

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
0,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive
1,50,White,Married,T2,N2,IIIA,Moderately differentiated,2,Regional,35,Positive,Positive,14,5,62,Alive
2,58,White,Divorced,T3,N3,IIIC,Moderately differentiated,2,Regional,63,Positive,Positive,14,7,75,Alive
3,58,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,18,Positive,Positive,2,1,84,Alive
4,47,White,Married,T2,N1,IIB,Poorly differentiated,3,Regional,41,Positive,Positive,3,1,50,Alive


In [4]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

In [5]:
# List rows that exactly repeat an earlier row (the first occurrence is not flagged).
df.loc[df.duplicated()]

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status
436,63,White,Married,T1,N1,IIA,Moderately differentiated,2,Regional,17,Positive,Positive,9,1,56,Alive


In [6]:
# Keep only the first occurrence of each fully duplicated row.
df = df.drop_duplicates()

In [7]:
# Sanity check: no duplicated rows should remain, so this should display an empty frame.
df.loc[df.duplicated()]

Unnamed: 0,Age,Race,Marital Status,T Stage,N Stage,6th Stage,differentiate,Grade,A Stage,Tumor Size,Estrogen Status,Progesterone Status,Regional Node Examined,Reginol Node Positive,Survival Months,Status


In [8]:
# Print the number of distinct values in every column; this is used to tell
# low-cardinality (categorical) columns apart from continuous ones.
# (The previously declared `mixed_cols` list was never read or filled, so it
# has been removed.)
for col in df.columns:
    print(f"{col}: {df[col].nunique()}")

Age: 40
Race: 3
Marital Status: 5
T Stage : 4
N Stage: 3
6th Stage: 5
differentiate: 4
Grade: 4
A Stage: 2
Tumor Size: 110
Estrogen Status: 2
Progesterone Status: 2
Regional Node Examined: 54
Reginol Node Positive: 38
Survival Months: 107
Status: 2


In [9]:
# For the continuous features (age, tumor size, regional node examined, and reginol node positive), we have to decide
# whether we want to bin any of them.
# We will try both approaches: without binning and with binning.

In [10]:
# Without binning

from sklearn.preprocessing import OrdinalEncoder

# Treat every column with at most 10 distinct values as categorical,
# excluding the target column 'Status', which is left untouched.
uniqueCounts = df.nunique()
categoricalCols = uniqueCounts[uniqueCounts <= 10].drop('Status').index

# Replace each category with its ordinal code, overwriting the columns in df.
ordinalEncoder = OrdinalEncoder()
df[categoricalCols] = ordinalEncoder.fit_transform(df[categoricalCols])

In [11]:
# With binning

# Quantile-bin each continuous feature into up to five buckets. Duplicate bin
# edges are dropped, and labels=False stores the integer bin index. The
# original columns are kept alongside the new '<name> Binned' columns.
for feature in ('Tumor Size', 'Regional Node Examined', 'Reginol Node Positive'):
    df[f'{feature} Binned'] = pd.qcut(df[feature], q=5, duplicates='drop', labels=False)

In [12]:
from sklearn.model_selection import train_test_split

# Separate the label column from the feature matrix.
target = 'Status'
y = df[target]
X = df.drop(columns=[target])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both partitions; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

### Decision Tree Model

In [13]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Hyperparameter combinations to evaluate (2 x 4 x 3 = 24 candidates).
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
}

# Exhaustive search over the grid with 5-fold cross-validation on the
# training split, ranking candidates by accuracy.
base_tree = DecisionTreeClassifier(random_state=42)
model = GridSearchCV(estimator=base_tree, param_grid=param_grid, scoring='accuracy', cv=5)
model.fit(X_train, y_train)

# Estimator with the best cross-validated score.
best_model = model.best_estimator_

In [14]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Score the tuned tree on the held-out test split.
y_pred = best_model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)
test_confusion = confusion_matrix(y_test, y_pred)

# Report the selected hyperparameters followed by the test-set metrics.
print("Best Parameters:", model.best_params_)
print("Accuracy:", test_accuracy)
print("Report:\n", test_report)
print("Confusion Matrix:\n", test_confusion)


Best Parameters: {'criterion': 'entropy', 'max_depth': 5, 'min_samples_split': 5}
Accuracy: 0.9031055900621118
Report:
               precision    recall  f1-score   support

       Alive       0.90      0.99      0.95       682
        Dead       0.88      0.42      0.57       123

    accuracy                           0.90       805
   macro avg       0.89      0.71      0.76       805
weighted avg       0.90      0.90      0.89       805

Confusion Matrix:
 [[675   7]
 [ 71  52]]


In [15]:
# Based on these metrics, let's try to tune it
# What can be seen is that it struggles to identify the 'Dead' class
# So, we will first try to use class_weight='balanced'
#
# random_state is fixed to the same seed as the grid-search estimator so that
# the fitted tree, and therefore the metrics printed below, are reproducible
# from one run to the next.
clf = DecisionTreeClassifier(
    class_weight='balanced',
    criterion='entropy',
    max_depth=3,
    min_samples_split=2,
    random_state=42,
)
clf.fit(X_train, y_train)

# Evaluate the class-weighted tree on the same held-out test split.
y_pred = clf.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7453416149068323
Report:
               precision    recall  f1-score   support

       Alive       0.94      0.74      0.83       682
        Dead       0.35      0.75      0.47       123

    accuracy                           0.75       805
   macro avg       0.64      0.75      0.65       805
weighted avg       0.85      0.75      0.78       805

Confusion Matrix:
 [[508 174]
 [ 31  92]]
