In [156]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [157]:

# Load the dataset from a CSV file expected in the notebook's working directory.
df = pd.read_csv('Breast Cancer Classification.csv')
# Preview the first rows to confirm the file parsed as expected.
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [158]:
# Structural overview of the raw frame: (rows, columns), column names,
# and the number of missing values in each column.
print(df.shape)
print(df.columns)
print(df.isnull().sum())

(569, 33)
Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean  

In [159]:
# Remove the two columns that are not used as features: 'Unnamed: 32' (blank in
# the preview above) and 'id' (presumably a record identifier -- confirm).
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: re-running it no longer raises KeyError because
# the columns were already dropped by a previous run.
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [160]:
# Feature matrix: every remaining column except the label.
X = df.drop('diagnosis',axis=1)
# Target vector: the diagnosis label (values 'B' and 'M' in this dataset).
y = df['diagnosis']

In [161]:
# Hold out 30% of the rows as a test set. The rows are shuffled before splitting,
# and random_state=1 fixes the shuffle so the same split is produced on every run.
Xtrain,Xtest,ytrain,ytest = train_test_split(X,y,test_size=0.3,random_state=1,shuffle=True)


## scikit-learn Implementation of Decision Tree

### Using Gini Index Criterion.

In [162]:
# Baseline decision tree with scikit-learn's default hyper-parameters;
# random_state=1 fixes the estimator's random seed.
model = DecisionTreeClassifier(random_state=1) # default criterion (method) used is Gini Index.
model.fit(Xtrain,ytrain)

In [163]:
# Evaluate the baseline tree on the held-out test set and print per-class
# precision, recall and F1-score.
predictions = model.predict(Xtest)
report = classification_report(ytest,predictions)
print(report)

              precision    recall  f1-score   support

           B       0.90      1.00      0.95       108
           M       1.00      0.81      0.89        63

    accuracy                           0.93       171
   macro avg       0.95      0.90      0.92       171
weighted avg       0.94      0.93      0.93       171



### Tuning the tree structure by specifying the minimum number of samples required to split a node.

In [164]:
# Same Gini tree as before, but with min_samples_split=4.
# NOTE: this rebinds `model`, `predictions` and `report`; the baseline objects
# from the previous cells are no longer available after this cell runs.
model = DecisionTreeClassifier(min_samples_split = 4, random_state=1) # default criterion (method) used is Gini Index.
model.fit(Xtrain,ytrain)

# Evaluate on the same test set so the report is comparable with the baseline.
predictions = model.predict(Xtest)
report = classification_report(ytest,predictions)
print(report)

              precision    recall  f1-score   support

           B       0.93      0.99      0.96       108
           M       0.98      0.87      0.92        63

    accuracy                           0.95       171
   macro avg       0.96      0.93      0.94       171
weighted avg       0.95      0.95      0.95       171



As the classification reports show, requiring a minimum number of samples to split a node (min_samples_split = 4) improves the model's performance on this test set, particularly for the Malignant class. This matters because the model is being trained for breast cancer diagnosis, where missing a malignant case is the costlier error:
Recall: 0.81 => 0.87. (For Malignant Class)


### Using Entropy Criterion.

In [143]:
# Tree that differs from the baseline only in the split criterion.
entropy_model = DecisionTreeClassifier(criterion='entropy',random_state=1) # changing the criterion to Entropy.
entropy_model.fit(Xtrain,ytrain)

In [144]:
# Evaluate the entropy-based tree on the same held-out test set.
entropy_predictions = entropy_model.predict(Xtest)
entropy_report = classification_report(ytest,entropy_predictions)
print(entropy_report)

              precision    recall  f1-score   support

           B       0.93      0.94      0.93       108
           M       0.89      0.87      0.88        63

    accuracy                           0.91       171
   macro avg       0.91      0.90      0.91       171
weighted avg       0.91      0.91      0.91       171



### Specifying min_samples_split along with Entropy criterion.

In [155]:
# Entropy criterion combined with min_samples_split=4.
# NOTE: this rebinds `entropy_model`, `entropy_predictions` and `entropy_report`
# from the previous entropy cells.
entropy_model = DecisionTreeClassifier(min_samples_split=4,criterion='entropy',random_state=1)
entropy_model.fit(Xtrain,ytrain)

# Evaluate on the same test set for a like-for-like comparison.
entropy_predictions = entropy_model.predict(Xtest)
entropy_report = classification_report(ytest,entropy_predictions)
print(entropy_report)

              precision    recall  f1-score   support

           B       0.93      0.94      0.93       108
           M       0.89      0.87      0.88        63

    accuracy                           0.91       171
   macro avg       0.91      0.90      0.91       171
weighted avg       0.91      0.91      0.91       171



Specifying min_samples_split = 4 together with the Entropy criterion has no effect on the model's test-set performance in this case: the two classification reports are identical.

## From-Scratch Decision Tree Implementation (Gini Index and Entropy)

In [146]:

def gini(y):
    """Calculate the Gini impurity of a collection of class labels.

    Parameters
    ----------
    y : array-like
        Class labels of the samples in a node. Any label type that
        ``np.unique`` can sort is accepted (integers, strings, ...), not only
        non-negative integers.

    Returns
    -------
    float
        ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of samples with
        label ``k``. Returns ``0`` for empty input.
    """
    m = len(y)
    if m == 0:
        return 0
    # Count occurrences per distinct label; unlike np.bincount this does not
    # require the labels to be non-negative integers.
    _, counts = np.unique(y, return_counts=True)
    proportions = counts / m
    return 1 - np.sum(proportions ** 2)

def entropy(y):
    """Calculate the Shannon entropy (in bits) of a collection of class labels.

    Parameters
    ----------
    y : array-like
        Class labels of the samples in a node. Any label type that
        ``np.unique`` can sort is accepted (integers, strings, ...), not only
        non-negative integers.

    Returns
    -------
    float
        ``-sum(p_k * log2(p_k))`` over the labels present in ``y``.
        Returns ``0`` for empty input.
    """
    m = len(y)
    if m == 0:
        return 0
    # np.unique only reports labels that actually occur, so every proportion
    # is strictly positive and log2 is always well defined.
    _, counts = np.unique(y, return_counts=True)
    proportions = counts / m
    return -np.sum(proportions * np.log2(proportions))

def best_split(X, y, criterion='gini'):
    """Find the feature index and threshold that minimise weighted child impurity.

    Every distinct value of every feature column is tried as a threshold;
    samples with ``X[:, feature] <= threshold`` go left, the rest go right.
    Splits that leave one side empty are ignored. Ties keep the first
    candidate found (lowest feature index, then lowest threshold).

    Parameters
    ----------
    X : np.ndarray
        2-D array indexed as ``X[:, feature]``.
    y : array-like
        Labels aligned with the rows of ``X``.
    criterion : {'gini', 'entropy'}
        Impurity measure used to score candidate splits.

    Returns
    -------
    tuple
        ``(feature, threshold)`` of the best split, or ``(None, None)`` when
        no split separates the samples into two non-empty groups.

    Raises
    ------
    ValueError
        If ``criterion`` is neither ``'gini'`` nor ``'entropy'``.
    """
    # Resolve the impurity function once. Validating here reports a bad
    # criterion immediately, even when no candidate split would be evaluated,
    # and avoids re-checking it inside the innermost loop.
    if criterion == 'gini':
        impurity_fn = gini
    elif criterion == 'entropy':
        impurity_fn = entropy
    else:
        raise ValueError("Invalid criterion. Choose 'gini' or 'entropy'.")

    # Work on a plain array so boolean-mask indexing behaves the same for
    # lists, pandas Series and ndarrays.
    y = np.asarray(y)

    best_feature, best_threshold, best_score = None, None, float('inf')
    m, n = X.shape

    for feature in range(n):
        column = X[:, feature]  # hoisted: identical for every threshold
        for threshold in np.unique(column):
            left_mask = column <= threshold
            left_y, right_y = y[left_mask], y[~left_mask]

            # A split must put at least one sample on each side.
            if len(left_y) == 0 or len(right_y) == 0:
                continue

            # Size-weighted average impurity of the two children.
            impurity = (len(left_y) * impurity_fn(left_y) + len(right_y) * impurity_fn(right_y)) / m

            if impurity < best_score:
                best_feature, best_threshold, best_score = feature, threshold, impurity

    return best_feature, best_threshold

def build_tree(X, y, depth=0, max_depth=None, min_samples_split=2, criterion='gini'):
    """Recursively build a binary decision tree.

    Parameters
    ----------
    X : np.ndarray
        2-D feature array (rows are samples).
    y : array-like of non-negative int
        Class labels aligned with the rows of ``X`` (``np.bincount`` is used
        for the majority vote, so labels must be non-negative integers).
    depth : int
        Depth of the node being built; callers normally leave this at 0.
    max_depth : int or None
        Maximum depth of the tree; ``None`` means unlimited.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    criterion : {'gini', 'entropy'}
        Impurity measure forwarded to ``best_split``.

    Returns
    -------
    dict or integer
        An internal node is a dict with keys ``'feature'`` (column index),
        ``'threshold'``, ``'left'`` and ``'right'``; a leaf is the majority
        class label of the samples that reached it.

    Raises
    ------
    ValueError
        If called with no samples, or if ``criterion`` is invalid.
    """
    # Plain ndarray so that mask indexing works for lists and pandas Series alike.
    y = np.asarray(y)
    num_samples = X.shape[0]

    if num_samples == 0:
        # A majority class is undefined without samples; fail with a clear message.
        raise ValueError("Cannot build a tree from an empty dataset.")

    # Stopping criteria. '>=' (rather than '==') also stops when the function
    # is entered with depth already beyond max_depth.
    is_leaf = (
        (max_depth is not None and depth >= max_depth)
        or num_samples < min_samples_split
        or bool(np.all(y == y[0]))  # node is pure
    )

    feature = threshold = None
    if not is_leaf:
        feature, threshold = best_split(X, y, criterion)
        # No split separates the samples (e.g. all feature rows identical).
        is_leaf = feature is None

    if is_leaf:
        return np.argmax(np.bincount(y))  # majority class

    left_mask = X[:, feature] <= threshold
    right_mask = ~left_mask

    return {
        'feature': feature,
        'threshold': threshold,
        'left': build_tree(X[left_mask], y[left_mask], depth + 1, max_depth, min_samples_split, criterion),
        'right': build_tree(X[right_mask], y[right_mask], depth + 1, max_depth, min_samples_split, criterion)
    }

def predict_single(x, tree, features=None):
    """Predict the class of a single sample by walking the tree.

    Parameters
    ----------
    x : sequence, np.ndarray or pd.Series
        Feature values of one sample, in the same column order that was used
        to build the tree.
    tree : dict or leaf value
        Tree whose internal nodes are dicts with keys ``'feature'``,
        ``'threshold'``, ``'left'`` and ``'right'``; anything that is not a
        dict is treated as a leaf and returned as the prediction.
    features : sequence of labels, optional
        Column labels. Only needed when a node's ``'feature'`` is a label
        rather than an integer position and ``x`` is not already a Series.

    Returns
    -------
    The leaf value reached by ``x``.
    """
    node = tree
    while isinstance(node, dict):
        key = node['feature']
        if isinstance(key, (int, np.integer)):
            # Integer keys are column *positions*. Looking an integer up with
            # ``series[key]`` on a Series indexed by column names relies on
            # pandas' deprecated positional fallback (FutureWarning), so use
            # explicit positional access instead.
            value = x.iloc[key] if isinstance(x, pd.Series) else x[key]
        else:
            # Label keys need a labelled Series for the lookup.
            if not isinstance(x, pd.Series):
                x = pd.Series(x, index=features)
            value = x[key]
        # Values equal to the threshold go left, matching how the tree is split.
        node = node['left'] if value <= node['threshold'] else node['right']
    return node

def predict(X, tree, features):
    """Predict a class for every row of ``X``.

    ``X`` may be a pandas DataFrame (converted to a NumPy array first) or any
    object that yields one sample per iteration. Each sample is passed to
    ``predict_single`` together with ``tree`` and ``features``; the results
    are returned as a NumPy array in row order.
    """
    if isinstance(X, pd.DataFrame):
        rows = X.to_numpy()
    else:
        rows = X

    labels = []
    for row in rows:
        labels.append(predict_single(row, tree, features))
    return np.array(labels)



# The from-scratch tree relies on np.bincount (e.g. for the majority-class vote
# in build_tree), which needs non-negative integer labels, so the string labels
# are encoded first.
ytrain_encoded = ytrain.map({'B': 0, 'M': 1})  # Mapping 'B' to 0 and 'M' to 1
ytest_encoded = ytest.map({'B': 0, 'M': 1})

# Grow the tree on the training features (as a NumPy array), limited to depth 5.
tree = build_tree(Xtrain.to_numpy(), ytrain_encoded, max_depth=5, min_samples_split=2, criterion='gini')


# Predict on the held-out test set and print the same report as for the
# scikit-learn models. NOTE: this rebinds `predictions` and `report`.
predictions = predict(Xtest, tree, Xtrain.columns)
report = classification_report(ytest_encoded, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.91      0.98      0.95       108
           1       0.96      0.84      0.90        63

    accuracy                           0.93       171
   macro avg       0.94      0.91      0.92       171
weighted avg       0.93      0.93      0.93       171



  feature_val = x[tree['feature']]
