In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2

In [10]:
# Load the data set, drop incomplete rows and the non-informative 'id' column.
data = pd.read_csv('./data/breast-cancer.csv').dropna().drop(columns='id')

# Separate the label column from the candidate feature columns.
target_name = 'diagnosis'
target = data[target_name]
features = data.drop(columns=target_name)

# Score every candidate feature against the label with chi2 and keep the 5 best.
# Only the fitted selector state is needed here; the transformed array that
# fit_transform returns is intentionally discarded.
selector = SelectKBest(score_func=chi2, k=5)
selector.fit_transform(features, target)
feature_name = features.columns[selector.get_support()]
print(feature_name)

# Restrict the feature frame to the selected columns.
features = features[feature_name]
print(features)

Index(['perimeter_mean', 'area_mean', 'area_se', 'perimeter_worst',
       'area_worst'],
      dtype='object')
     perimeter_mean  area_mean  area_se  perimeter_worst  area_worst
0            122.80     1001.0   153.40           184.60      2019.0
1            132.90     1326.0    74.08           158.80      1956.0
2            130.00     1203.0    94.03           152.50      1709.0
3             77.58      386.1    27.23            98.87       567.7
4            135.10     1297.0    94.44           152.20      1575.0
..              ...        ...      ...              ...         ...
564          142.00     1479.0   158.70           166.10      2027.0
565          131.20     1261.0    99.04           155.00      1731.0
566          108.30      858.1    48.55           126.70      1124.0
567          140.10     1265.0    86.22           184.60      1821.0
568           47.92      181.0    19.15            59.16       268.6

[569 rows x 5 columns]


In [11]:
class Node:
    """A single vertex of the decision tree.

    Attributes:
        data: The rows of the training frame that reached this node.
        left_child: Sub-tree for rows below the split threshold, or None.
        right_child: Sub-tree for rows at or above the threshold, or None.
        value: Predicted label when the node is a leaf, otherwise None.
        feature: Name of the column this node splits on, or None.
        threshold: Cut-off value used for the split, or None.
    """

    def __init__(self, data, left_child=None, right_child=None, feature=None, threshold=None, value=None):
        # Payload first, then the links to the sub-trees.
        self.data = data
        self.left_child, self.right_child = left_child, right_child
        # Leaf prediction (unset on internal nodes).
        self.value = value
        # Split rule (unset on leaves).
        self.feature, self.threshold = feature, threshold

In [16]:
class DecisionTree:
    """Binary decision-tree classifier grown greedily by information gain.

    Training rows are held in one DataFrame whose LAST column is the class
    label and whose other columns are numeric features. An internal node sends
    a sample to its left child when ``sample[feature] < threshold`` and to its
    right child otherwise.

    Parameters
    ----------
    max_depth : int
        Maximum number of splits on any root-to-leaf path.
    min_node_size : int
        A node holding this many rows or fewer is turned into a leaf.
    """

    def __init__(self, max_depth=2, min_node_size=2):
        self.max_depth = max_depth
        self.min_node_size = min_node_size
        # Depth of the deepest split made by the most recent fit().
        self.depth = 0
        # Root Node of the fitted tree; stays None until fit() has run.
        self.root = None

    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        An empty ``y`` has entropy 0.
        """
        if len(y) == 0:
            return 0
        _, counts = np.unique(y, return_counts=True)
        p = counts / counts.sum()
        # "0.0 - x" rather than "-x" so a pure node yields 0.0, not -0.0.
        return 0.0 - (p * np.log2(p)).sum()

    def _gain(self, parent_entropy, n_parent, left_y, right_y):
        """Entropy reduction obtained by splitting a parent into two label sets."""
        w_left = len(left_y) / n_parent
        w_right = len(right_y) / n_parent
        return parent_entropy - w_left * self.entropy(left_y) - w_right * self.entropy(right_y)

    def information_gain(self, node):
        """Return the information gain of the split stored on ``node``.

        ``node`` must carry ``data`` plus ``left_child`` / ``right_child``
        nodes whose ``data`` frames are the two sides of the split.
        """
        parent_y = node.data.iloc[:, -1]
        return self._gain(
            self.entropy(parent_y),
            len(node.data),
            node.left_child.data.iloc[:, -1],
            node.right_child.data.iloc[:, -1],
        )

    def split_metric(self, node):
        """Find the (feature, threshold) pair with the largest information gain.

        Every distinct value of every feature column is tried as a threshold.
        Returns ``(None, None)`` when no candidate has a strictly positive
        gain, e.g. when all labels in the node are identical.
        """
        result_feature = None
        result_threshold = None
        max_gain = 0

        labels = node.data.iloc[:, -1].to_numpy()
        n_rows = len(labels)
        # The parent entropy does not depend on the candidate split.
        parent_entropy = self.entropy(labels)

        for feature in node.data.columns[:-1]:
            values = node.data[feature].to_numpy()
            # pd.unique keeps order of first appearance, so ties between
            # equally good thresholds resolve exactly as a row-by-row scan.
            for candidate in pd.unique(values):
                goes_left = values < candidate
                goes_right = values >= candidate
                # A split that leaves one side empty separates nothing.
                if not goes_left.any() or not goes_right.any():
                    continue
                gain = self._gain(parent_entropy, n_rows, labels[goes_left], labels[goes_right])
                if gain > max_gain:
                    result_feature = feature
                    result_threshold = candidate
                    max_gain = gain

        return result_feature, result_threshold

    def split_node(self, node):
        """Split ``node.data`` on the best (feature, threshold) pair.

        Returns ``(left_data, right_data, feature, threshold)``. When no
        useful split exists, ``feature`` and ``threshold`` are None, the left
        frame is empty and the right frame is the unchanged node data.
        """
        feature, threshold = self.split_metric(node)
        if feature is None:
            return node.data.iloc[0:0], node.data, None, None
        left_data = node.data[node.data[feature] < threshold]
        right_data = node.data[node.data[feature] >= threshold]
        return left_data, right_data, feature, threshold

    def _make_leaf(self, dataset):
        """Build a leaf Node predicting the most frequent label in ``dataset``."""
        if len(dataset) == 0:
            raise ValueError("cannot build a leaf from an empty dataset")
        counts = {}
        for label in dataset.iloc[:, -1]:
            counts[label] = counts.get(label, 0) + 1
        # dicts keep insertion order and max() returns the first maximum, so
        # ties go to the label that appears first in the data.
        leaf_value = max(counts, key=counts.get)
        return Node(data=dataset, value=leaf_value)

    def build_tree(self, dataset, depth=0):
        """Recursively grow the (sub-)tree for ``dataset`` and return its root.

        ``depth`` is the number of splits above this node. It is tracked per
        branch so that ``max_depth`` bounds every root-to-leaf path equally.
        """
        # Termination conditions.
        if len(dataset) <= self.min_node_size or depth >= self.max_depth:
            return self._make_leaf(dataset)

        node = Node(data=dataset)
        left_data, right_data, feature, threshold = self.split_node(node)
        if feature is None:
            # No split improves purity (e.g. the node is already pure).
            return self._make_leaf(dataset)

        self.depth = max(self.depth, depth + 1)
        node.feature = feature
        node.threshold = threshold
        node.left_child = self.build_tree(left_data, depth + 1)
        node.right_child = self.build_tree(right_data, depth + 1)
        return node

    def fit(self, X, y):
        """Grow the tree from feature frame ``X`` and labels ``y``.

        ``y`` may be a Series or one-column DataFrame (aligned to ``X`` on the
        index by ``pd.concat``) or a plain array-like, which is matched to the
        rows of ``X`` by position.
        """
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            y = pd.Series(np.asarray(y).ravel(), index=X.index, name="target")
        self.dataset = pd.concat([X, y], axis=1)
        # Reset so that refitting the same instance starts from scratch.
        self.depth = 0
        self.root = self.build_tree(self.dataset)

    def make_prediction(self, sample, node):
        """Return the label predicted for one ``sample`` by the sub-tree at ``node``.

        ``sample`` is anything indexable by feature name (a Series row, a dict).
        """
        # Leaves are the nodes that carry a value; walk down until one is hit.
        while node.value is None:
            if sample[node.feature] >= node.threshold:
                node = node.right_child
            else:
                node = node.left_child
        return node.value

    def predict(self, X):
        """Return a list with the predicted label for every row of DataFrame ``X``."""
        if self.root is None:
            raise ValueError("the tree has not been fitted yet; call fit() first")
        # Traverse from the root to a leaf once per row.
        return [self.make_prediction(sample, self.root) for _, sample in X.iterrows()]

In [17]:
# Train a decision tree on the selected features.
model = DecisionTree()
# Re-wrap the labels as a one-column frame, reusing the index of `features`:
# fit() joins X and y with pd.concat, which aligns on the index, so a fresh
# 0..n-1 index would misalign the labels whenever dropna() removed rows.
target = target.values.reshape(-1,1)
target = pd.DataFrame(target, columns=['diagnosis'], index=features.index)
model.fit(features, target)