In [3]:
pip install scikit-learn


Note: you may need to restart the kernel to use updated packages.


In [15]:
import numpy as np

# entorpy calculation
def entropy(y):
    unique,count= np.unique(y,return_counts=True)
    probabilites= count/count.sum()
    # compute entropy

    return -np.sum(probabilites* np.log2(probabilites))

def information_gain(X_col, y, threshold):
    parent_entropy = entropy(y)
    left_mask = X_col <= threshold
    right_mask = X_col > threshold
    
    # no split
    if len(y[left_mask]) == 0 or len(y[right_mask]) == 0:
        return 0
    
    n = len(y)
    e_left = entropy(y[left_mask])
    e_right = entropy(y[right_mask])
    n_left, n_right = len(y[left_mask]), len(y[right_mask])
    child_entropy = (n_left/n)*e_left + (n_right/n)*e_right
    return parent_entropy - child_entropy




In [17]:
class DecisionTree:
    def __init__(self,max_depth=3):
        # depth of the tree
        self.max_depth=max_depth
        self.tree=None 

    def fit(self,X,y,depth=0):
        """ 
          X: Training features (2D numpy array)
          y: labels (1D Numpy array)
          depth: depth of the tree 
        """

        n_samples,n_feature= X.shape # number of sample and number of feature 
        unique_classes=np.unique(y) # unique 

        # -- stopping Conditon -------

        # if all labels are same return -> that class
        # if max_depth reached -> return majority class 

        if len(unique_classes)==1 or depth >= self.max_depth:
            return np.argmax(np.bincount(y)) # most common label 
        
        # find the best feature and best threasold

        best_gain = -1 
        best_feat,best_thre= None,None 
        # Loop through each feature 
        for feat_idx in range(n_feature):
            thresolds= np.unique(X[:,feat_idx])
            for thresold in thresolds:
                gain= information_gain(X[:,feat_idx],y,thresold)
                if gain>best_gain:
                    best_gain= gain 
                    best_feat= feat_idx
                    best_thre=thresold 
    
        # If no good split found, return the majority class
        if best_gain==0:
            return np.argmax(np.bincount(y))
        
        # split dataset feature and thresold 

        left_subset= X[:,best_feat]<= best_thre 
        right_subset= X[:,best_feat]> best_thre 


        # recursively build the left and right subtree

        left_subtree= self.fit(X[left_subset],y[left_subset],depth+1)
        right_subtree= self.fit(X[right_subset],y[right_subset],depth+1)

       # Store the decision rule as a tuple:
       # (feature_index, threshold, left_subtree, right_subtree)

        self.tree = (best_feat,best_thre,left_subtree,right_subtree)
        return self.tree 
    
    # predict one sample 
    def predict_one(self,X,tree=None):
        """ 
            predict labele for one sample x
        """
        if tree is None:
            tree= self.tree 

        feat,thresh,left,right= tree 
        # If both children are leaves (integers = class labels)
        if isinstance(left,int) and isinstance(right,int):
           return left if X[left]<= thresh else right
        
        if X[feat]<=thresh:
            # if left child is another tree,recurse
            if isinstance(left,tuple):
                return self.predict_one(X,left)
            else:
                return left 
        else: 

            if isinstance(right,tuple):
                return self.predict_one(X,right)
            else:
                return right
    
    def predict(self,X):
        return np.array([self.predict_one(x) for x in X])
    


  
if __name__ == "__main__":
    # Import dataset and helper tools
        from sklearn.datasets import load_iris
        from sklearn.model_selection import train_test_split
        from sklearn.metrics import accuracy_score

        # Load sample data (Iris)
        X, y = load_iris(return_X_y=True)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

        # Create and train decision tree
        tree = DecisionTree(max_depth=3)
        tree.fit(X_train, y_train)

        # Predict and evaluate
        preds = tree.predict(X_test)
        print("Predictions:", preds[:100])
        print("Accuracy:", accuracy_score(y_test, preds))
    


            



     
        

        





Predictions: [0 1 1 2 1 2 2 1 2 2 2 0 2 1 1 1 1 1 2 0 0 1 2 1 2 0 1 2 0 1 2 1 0 1 2 0 2
 2 2 1 1 2 1 1 2]
Accuracy: 0.8666666666666667


AttributeError: 'DecisionTree' object has no attribute 'self'