In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import numpy as np
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data'
column_names = ['id', 'clump_thickness', 'uniformity_of_cell_size', 'uniformity_of_cell_shape',
                'marginal_adhesion', 'single_epithelial_cell_size', 'bare_nuclei',
                'bland_chromatin', 'normal_nucleoli', 'mitoses', 'class']
df = pd.read_csv(url, names=column_names)
df = df.replace('?', np.nan)
df = df.dropna()
le = LabelEncoder()
df['class'] = le.fit_transform(df['class'])
X = df.drop(['id', 'class'], axis=1)
y = df['class']
clf = DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5, max_depth=2)
clf.fit(X, y)
feature_index = clf.tree_.feature[0]
threshold = clf.tree_.threshold[0]
feature_name = X.columns[feature_index]
def entropy(y):
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return -np.sum(probabilities * np.log2(probabilities))
def gini(y):
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities**2)
def misclassification_error(y):
    classes, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.max(probabilities)
parent_entropy = entropy(y)
parent_gini = gini(y)
parent_misclassification = misclassification_error(y)
left_indices = X[feature_name] <= threshold
right_indices = X[feature_name] > threshold
left_y = y[left_indices]
right_y = y[right_indices]
left_entropy = entropy(left_y)
right_entropy = entropy(right_y)
left_gini = gini(left_y)
right_gini = gini(right_y)
left_misclassification = misclassification_error(left_y)
right_misclassification = misclassification_error(right_y)
entropy_gain = parent_entropy - (len(left_y) / len(y) * left_entropy + len(right_y) / len(y) * right_entropy)
gini_gain = parent_gini - (len(left_y) / len(y) * left_gini + len(right_y) / len(y) * right_gini)
misclassification_gain = parent_misclassification - (len(left_y) / len(y) * left_misclassification + len(right_y) / len(y) * right_misclassification)

print(f"Entropy of the first split: {entropy_gain}")
print(f"Gini of the first split: {gini_gain}")
print(f"Misclassification Error of the first split: {misclassification_gain}")
print(f"Information Gain: {entropy_gain}")
print(f"Feature selected for the first split: {feature_name}")
print(f"Value determining the decision boundary: {threshold}")

Entropy of the first split: 0.5889187667244618
Gini of the first split: 0.32550820073530584
Misclassification Error of the first split: 0.2767203513909224
Information Gain: 0.5889187667244618
Feature selected for the first split: uniformity_of_cell_size
Value determining the decision boundary: 2.5
