In [1]:
import numpy as np 
import pandas as pd

In [2]:
dF = pd.read_csv('data//csvs//dataframeV1.csv', index_col=0)
dF = dF.drop(['id', 'uri'], axis = 1)
dF.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,label
0,0.275,0.157,7,-18.752,1,0.0636,0.89,0.842,0.186,0.304,73.289,152280,4,classic
1,0.221,0.126,0,-25.427,1,0.0447,0.989,0.897,0.102,0.216,133.63,139307,4,classic
2,0.289,0.0306,9,-30.79,0,0.0446,0.987,0.911,0.102,0.118,125.61,212067,3,classic
3,0.0753,0.07,2,-27.272,1,0.044,0.918,0.947,0.146,0.0625,79.801,365147,4,classic
4,0.13,0.158,2,-16.132,1,0.035,0.748,0.924,0.1,0.0998,85.031,302093,4,classic


In [3]:
dF.label = pd.Categorical(dF.label)
dF['Y'] = dF.label.cat.codes
dF = dF.drop(['label'], axis = 1)
dF.head()

Unnamed: 0,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature,Y
0,0.275,0.157,7,-18.752,1,0.0636,0.89,0.842,0.186,0.304,73.289,152280,4,0
1,0.221,0.126,0,-25.427,1,0.0447,0.989,0.897,0.102,0.216,133.63,139307,4,0
2,0.289,0.0306,9,-30.79,0,0.0446,0.987,0.911,0.102,0.118,125.61,212067,3,0
3,0.0753,0.07,2,-27.272,1,0.044,0.918,0.947,0.146,0.0625,79.801,365147,4,0
4,0.13,0.158,2,-16.132,1,0.035,0.748,0.924,0.1,0.0998,85.031,302093,4,0


In [4]:
class Node : 

    def __init__(self, feature, thresh,  left = None, right = None, value = None) : 
        self.feature = feature
        self.value = value
        self.left = left
        self.right = right
        self.thresh = thresh
    
    def is_leaf(self): 
        return self.value is not None

In [5]:
class Tree : 

    def __init__(self, max_depth = 6, min_sample_per_split=2) : 

        self.max_depth = max_depth 
        self.min_sample_per_split = min_sample_per_split
        self.root = None

    def _split(self, X, thresh) : 
        """
        Split on X based on thresh

        args : 
            X : axis 
            thresh : value of split
        return : 
            left_idx : where X <= thresh
            right_idx : where X > thresh
        """
        left_idx = np.argwhere(X <= thresh).flatten()
        right_idx = np.argwhere(X > thresh).flatten()

        return (left_idx, right_idx)   

    def _entropyImpurity(Y) : 
        """
        Calculate the entropy impurity on Y
        """
        print(Y)
        a = Y.value_counts() / Y.shape[0]
        entropy = -np.sum(a * np.log2(a + 1e-9))
        return entropy

    def _infGain(self, X, y, thresh) : 
        """
        Calculate the information gain. 
        """
        my_loss = self._entropyImpurity()

        l_idx, r_idx = self._split(X, thresh)

        n, n_l, n_r = len(y), len(l_idx), len(r_idx)

        chid_loss = (n_l / n) * self._entropyImpurity(y[l_idx]) + (n_r / n) * self._entropyImpurity(y(r_idx))

        return my_loss - chid_loss
    
    def _bestSplit(self, X, y, features) :

        split = {
            'score' : -1, 
            'feature' : None, 
            'threshold' : None
        }

        for feature in features : 
            X_feat = X[:, feature]
            thresholds = np.unique(X_feat)
            for t in thresholds : 
                score = self._infGain(X_feat, y, t)
                if score > split['score'] : 
                    split['score'] = score
                    split['feature'] = feature
                    split['threshold'] = t


    # Helper function to stop recursion 
    def _finished(self, depth) : 
        if depth > self.max_depth or self.n_samples < self.min_sample_per_split or self.n_classes == 1 : 
            return True
        return False

    def _build(self, X, y, depth = 0) : 

        self.n_samples, self.n_features = X.shape
        self.n_classes = len(np.bincount(y))

        # base case
        if self._finished(depth) : 
            return Node(value = np.argmax(np.unique(y)))
        
        
        # At the moment we select random features but we can choose the one with the smallest entropy as well
        feats = np.random.choice(self.n_features, self.n_features, replace=False)
        best_feat, best_thresh = self._bestSplit(X, y, feats)

        #recursive step
        l_idx, r_idx = self._split( X[:, best_feat], best_thresh)
        l = self.build( X[l_idx, :], y[l_idx], depth + 1)
        r = self.build(X[r_idx, :], y[r_idx], depth + 1)

        return Node(best_feat, best_thresh, l, r)
    
    def fit(self, X, y): 
        self.root = self._build(X,y)
    

    def _traverse(self, x, node): 
        if node.is_leaf():
            return node.value
        
        if x[node.feature] <= node.threshold:
            return self._traverse(x, node.left)
        return self._traverse(x, node.right)

    def predict(self, X) : 
        predictions = [self._traverse(x, self.root) for x in X]
        return predictions



In [6]:

from sklearn.model_selection import train_test_split

X = dF.drop(['Y'], axis = 1)
y = dF.Y

X_train, X_test, Y_train, Y_test = train_test_split(X, y, train_size=0.8, random_state=88)

tree = Tree()
tree.fit(X_train.to_numpy(), Y_train.to_numpy().reshape(-1))

<__main__.Tree object at 0x0000025211B148E0>


AttributeError: 'Tree' object has no attribute 'value_counts'