In [2]:
import numpy as np
from collections import Counter
def Entropy(value):
    numcount = np.bincount(value)
    probarray = numcount/len(value)
    return -np.sum([p*(np.log2(p)) for p in probarray if p>0])

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None,  value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value
    
    def isLeafNode(self):
        return self.value is not None
    

class DecisionTree:
    def __init__(self, minSamples=2, maxDepth=100, nFeatures=None):
        self.minSamples = minSamples
        self.maxDepth = maxDepth
        self.nFeatures = nFeatures
        self.root = None
        
    def newNode(self, X,y):
        self.nFeatures = X.shape[1] 
        self.root = self.growTree(X,y)
        
    def growTree(self, X, y, depth=0):
        totalsamples = X.shape[0]
        newfeatures = X.shape[1]
        nresults = len(np.unique(y))
        
        if(depth>=self.maxDepth or nresults==1 or totalsamples<self.minSamples):
            leafval = self.mostcommonresult(y)
            return Node(value=leafval)
        
        featureIndices = np.random.choice(newfeatures, self.nFeatures, replace=False)
        bestFeatureIndex, bestThreshold = self.bestSplitCriteria(X,y,featureIndices)
        
        leftIndices, rightIndices = self.split(X[:,bestFeatureIndex], bestThreshold)
        leftTree = self.growTree(X[leftIndices, :], y[leftIndices], depth+1)
        rightTree = self.growTree(X[rightIndices, :], y[rightIndices], depth+1)
        return Node(bestFeatureIndex,bestThreshold, leftTree, rightTree)
    
    
    def bestSplitCriteria(self,X,y,featureIndices):
        bestGain = -1
        bestIndex, bestThresh = None, None
        for index in featureIndices:
            featureColumn = X[:, index]
            thresholds = np.unique(featureColumn)
            for thresh in thresholds:
                gain = self.informationGain(featureColumn, y, thresh)
                if gain>bestGain:
                    bestGain = gain
                    bestIndex = index
                    bestThresh = thresh
        return bestIndex, bestThresh           
    
    
    def informationGain(self, featureColumn, y, thresh):
        parentEntropy = Entropy(y)
        leftIndices, rightIndices = self.split(featureColumn, thresh)
        if len(leftIndices)==0 or len(rightIndices==0):
            return 0
        total = len(y)
        nleft, nright = len(leftIndices), len(rightIndices)
        entleft, entright = Entropy(y[leftIndices]), Entropy(y[rightIndices])
        childEntropy = (nleft/total)*entleft + (nright/total)*entright
        return parentEntropy - childEntropy
      

    def split(self, featureColumn, bestThresh):
        leftIndices = np.argwhere(featureColumn<=bestThresh).flatten()
        rightIndices = np.argwhere(featureColumn>bestThresh).flatten()
        return leftIndices, rightIndices
    
    
    def traverseTree(self, x, node):
        if node.isLeafNode():
            return node.value
        if x[node.feature] <= node.threshold:
            return self.traverseTree(x, node.left)
        return self.traverseTree(x , node.right)
    
    def predict(self, X):
        return np.array([self.traverseTree(x, self.root) for x in X])    
 
    def mostcommonresult(self,y):
        return Counter(y).most_common(1)[0][0]
        

In [3]:
if __name__ == "__main__":
    # Imports
    from sklearn import datasets
    from sklearn.model_selection import train_test_split

    def accuracy(y_true, y_pred):
        accuracy = np.sum(y_true == y_pred) / len(y_true)
        return accuracy

    data = datasets.load_breast_cancer()
    X, y = data.data, data.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=1234
    )

    clf = DecisionTree(maxDepth=10)
    clf.newNode(X_train, y_train)

    y_pred = clf.predict(X_test)
    acc = accuracy(y_test, y_pred)

    print("Accuracy:", acc)

Accuracy: 0.5964912280701754
