# In [1]:
import random

import numpy as np
import pandas as pd

# In [2]:
class Node():
    """A single vertex of the decision tree.

    Attributes:
        node_type: 'Node' for an internal split, 'Leaf' for a terminal vertex.
        side: 'Left' or 'Right' relative to the parent, None for the root.
        feature: feature name handed over by the tree builder.
        split_value: threshold handed over by the tree builder.
        parent: parent Node, or None for the root.
        node_depth: depth of the vertex inside the tree.
        ID: string identifier of the vertex.
        value: value stored in the vertex (only set for leaves).
        children: list of child vertices.
        Np: number of points that reached the vertex (filled in by the tree).
    """

    def __init__(self, node_type='Node', side=None, feature=None, split_value=None, parent=None, *, node_depth=None, ID=None, value=None, children=None): #class initialization
        self.node_type = node_type #Leaf or Node
        self.side = side #Left or Right
        self.feature = feature
        self.split_value = split_value
        self.parent = parent

        self.node_depth = node_depth
        self.ID = ID
        self.value = value #only for Leaf
        # A fresh list per instance: a mutable default would be shared by every
        # node, and the caller's list is copied so add_child() never mutates it.
        self.children = [] if children is None else list(children)

        self.Np = 0 #number of points in the node

    def add_child(self, new_value):
        """Append new_value to the list of children of this vertex."""
        self.children.append(new_value)
#*********************************************************************************************************************
#*********************************************************************************************************************

class MyTreeClf():
    
    def __init__(self, max_depth = 5,  min_samples_split =2, max_leafs = 20, bins = None, criterion = 'entropy'): 
        self.max_depth = max_depth #maximum possible depth of tree
        self.min_samples_split = min_samples_split #minimum sample split
        self.max_leafs = max_leafs #maximum possible number of leaves in a tree
        self.bins = bins
        
        #tree parameters
        self.tree = []
        self.leafs_cnt = 0 #number of created leaves in the tree
        self.potential_leafs_cnt = 1 #counting potential leaves
        self.leafs_sum = 0 #sum of the leaves values
        
        self.histogram = {}
        
        self.criterion = criterion
        
        self.fi = {}
        
        self.N = 0
        
    def __repr__(self):
        return f'MyTreeClf class: max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}'
    
    #------------------------------------------------------------------------------------------------------------
    def fit(self, X, y, N): #receives panda dataframe and series
        if self.bins != None:
            for feature in X.columns:
                self.histogram.update({feature: self.get_hist_delimeters(X[feature].values)})
                
        for feature in X.columns:
            self.fi.update({feature: 0})
        
        self.N = N
        
        #Create root node
        feature, split_value, ig = self.get_best_split(X,y)
        X_left, y_left, X_right, y_right = self.split_dataframe(X, y, feature, split_value)
        if ig == 0.0 or len(y_left) == 0 or len(y_right) == 0:            
            print('All targets belong to class:', np.sum(y.values)/len(y.values) )
        else:
            _node = self.register_Node("Node", None, feature, split_value, None, y)
            #--------feature importance update---
            _node.Np = len(y.values)
            self.update_fi(y, _node, None)
            #----------------------------------
            self.grow_tree(X_left, y_left, 'Left', _node)
            self.grow_tree(X_right, y_right, 'Right', _node)
            
        
    def grow_tree(self, X, y, side, parent): #receives pandas dataframe and series, string and Node
        """Recursively grow the subtree for the sample (X, y) on `side` of `parent`.

        An internal node is registered when the best split has non-zero gain,
        both halves are non-empty, the parent is above max_depth, the sample
        has at least min_samples_split rows and the leaf budget is not
        exhausted; otherwise a leaf is registered.

        Returns the registered node when it is a leaf, None otherwise.
        """
        feature, split_value, ig = self.get_best_split(X, y)
        X_left, y_left, X_right, y_right = self.split_dataframe(X, y, feature, split_value)

        can_split = (
            ig != 0.0
            and len(y_left) != 0
            and len(y_right) != 0
            and (parent.node_depth < self.max_depth)
            and (len(y.values) >= self.min_samples_split)
            and (self.leafs_cnt + self.potential_leafs_cnt < self.max_leafs)
        )

        kind = 'Node' if can_split else 'Leaf'
        _node = self.register_Node(kind, side, feature, split_value, parent, y)
        # feature importance bookkeeping is the same for both kinds of vertex
        _node.Np = len(y.values)
        self.update_fi(y, _node, parent)

        if not can_split:
            return _node

        self.grow_tree(X_left, y_left, 'Left', _node)
        self.grow_tree(X_right, y_right, 'Right', _node)
            
    #-------------------------------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------------------------------
    def register_Node(self, node_type, side, feature, split_value, parent, y):
        """Create a vertex, append it to self.tree and update the leaf counters.

        Args:
            node_type: 'Node' (internal split) or 'Leaf'.
            side: 'Left' or 'Right' relative to the parent; None for the root.
            feature, split_value: stored on the new vertex as given.
            parent: parent Node, or None when the vertex is the root.
            y: pandas Series with the targets that reached the vertex; a leaf
                stores sum(y)/len(y) as its value.

        Returns:
            The newly created Node.

        Raises:
            ValueError: if node_type is neither 'Node' nor 'Leaf'.
        """
        if node_type not in ('Node', 'Leaf'):
            raise ValueError(f"node_type must be 'Node' or 'Leaf', got {node_type!r}")

        is_leaf = node_type == 'Leaf'
        has_parent = parent is not None

        #1.setting node depth
        node_depth = parent.node_depth + 1 if has_parent else 1

        #2.setting node ID: the root is '1', an internal child extends the
        #  parent's ID with '.1' (left) or '.2' (right), a leaf reuses the
        #  parent's ID. A vertex without a parent always gets the root ID.
        if not has_parent:
            ID = '1'
        elif is_leaf:
            ID = parent.ID
        elif side == 'Left':
            ID = parent.ID + '.1'
        elif side == 'Right':
            ID = parent.ID + '.2'
        else:
            ID = '1'

        #3.Setting node value
        value = np.sum(y.values)/len(y.values) if is_leaf else None

        new_node = Node(node_type, side, feature, split_value, parent, node_depth=node_depth, ID=ID, value=value)
        self.tree.append(new_node)

        #add as a child to parent node
        if has_parent:
            parent.add_child(new_node)

        #update counts: a split replaces one pending leaf by two (+1 net),
        #a leaf turns one pending leaf into a created one.
        if is_leaf:
            self.leafs_cnt = self.leafs_cnt + 1
            self.potential_leafs_cnt = self.potential_leafs_cnt - 1
            self.leafs_sum = self.leafs_sum + value
        else:
            self.potential_leafs_cnt = self.potential_leafs_cnt + 1
        return new_node
        
            
    def print_tree_full(self):
        for node in self.tree:
                print(node.__dict__)
    
    def print_tree(self):
        for node in self.tree:
            if node.node_type == 'Node':
                
                print(node.__dict__['ID'], node.__dict__['feature'], '>', node.__dict__['split_value'])
            else:
                print(node.__dict__['ID'],node.__dict__['side'], '-', node.__dict__['value'])
                
    def move_up_the_tree(self, X, _node,i):
        if _node.node_type =='Leaf':
            self.predictions[i]=float(_node.value)
        elif _node.node_type =='Node': 
            if X[_node.feature] <= _node.split_value:
                if _node.children[0].side =='Left':
                    self.move_up_the_tree(X, _node.children[0],i)
                else: self.move_up_the_tree(X, _node.children[1],i)
               
            else:
                if _node.children[0].side =='Right':
                    self.move_up_the_tree(X, _node.children[0],i)
                else: self.move_up_the_tree(X, _node.children[1],i)
    #-------------------------------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------------------------------
    def predict_proba(self,X):
        self.predictions = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            self.move_up_the_tree(X.iloc[i,:], self.tree[0],i)
        return(self.predictions)
            
    def predict(self,X):
        self.predict_proba(X)
        return (self.predictions > 0.5)*1 
    #-------------------------------------------------------------------------------------------------------------
    #-------------------------------------------------------------------------------------------------------------
    #Calculate the best split
    def gini(self, y):
        p1 = len(y[y == 0])/len(y)
        p2 = len(y[y == 1])/len(y)
        return 1 - p1**2 - p2**2
    
    def entropy(self, y): # receives 1D numpy array
        p1 = len(y[y == 0])/len(y)
        p2 = len(y[y == 1])/len(y)
        if p1 == 0 or p2 == 0:
            return 0.0
        else:
            return -p1*np.log2(p1) - p2*np.log2(p2)

    def data_split(self, X, y, threshold): #receives two 1D numpy arrays and a float
        X_left = X[X <= threshold]
        y_left = y[X <= threshold]
        X_right = X[X > threshold]
        y_right = y[X > threshold]
        return X_left, y_left, X_right, y_right
    
    def get_IG(self, X, y, threshold): #receives two numpy arrays and a float
        #split the data by the threshold
        _, y_left, __, y_right = self.data_split(X, y, threshold)
        if len(y_left) == 0 or len(y_right) == 0: #threshold does not split the data
            return 0.0
        else:
            if self.criterion == 'entropy':
                S0 = self.entropy(y)
                S1 = self.entropy(y_left)*len(y_left)/len(y)
                S2 = self.entropy(y_right)*len(y_right)/len(y)
                IG = S0 - S1 -S2
            elif self.criterion == 'gini':
                Gp = self.gini(y)
                Gl = self.gini(y_left)*len(y_left)/len(y)
                Gr = self.gini(y_right)*len(y_right)/len(y)
                IG = Gp - Gl - Gr
            return IG
        
    def get_native_delimeters(self, X): #receives 1D numpy array
        X_unique = np.unique(np.sort(X))
        native_delimeters = [np.mean([X_unique[i-1], X_unique[i]]) for i in range(1, len(X_unique))]
        return native_delimeters
    
    def get_hist_delimeters(self, X): #receives 1D numpy array
        hist_delimeters = np.histogram(X, self.bins)[1][1:-1]
        return hist_delimeters
    
    def get_best_split(self, X, y): #receives panda dataframe and panda series
        feature_best_split = {}
        for feature in X.columns:
            if len(X[feature].values) == 0 or np.max(X[feature].values) == np.min(X[feature].values):
                feature_best_split.update({feature: [None, 0.0]}) #feature has no values or any delimeters
            else:
                if self.bins == None:                    
                    feature_delimeters = self.get_native_delimeters(X[feature].values)
                else: 
                    X_unique = np.unique(np.sort(X))
                    #if len(X_unique) <= self.bins:
                        #feature_delimeters = self.get_native_delimeters(X[feature].values)
                    #else:
                    feature_delimeters = self.histogram[feature]
                    
                feature_igs = [self.get_IG(X[feature].values, y.values, feature_delimeters[i]) for i in range(len(feature_delimeters))]
                feature_best_split.update({feature: [feature_delimeters[np.argmax(feature_igs)],np.max(feature_igs)]})
        
        split_value, ig   = max(feature_best_split.values(), key=lambda x: x[1])
        feature = next(k for k, v in feature_best_split.items() if v == [split_value, ig])
        return feature, split_value, ig
    #-------------------------------------------------------------------------------------------------------------
        
    def split_dataframe(self, X, y, feature, threshold): #X,y - np.arrays, threshold - float
        X_left = X[X[feature] <= threshold].reset_index(drop = True)
        y_left = y[X[feature] <= threshold].reset_index(drop = True)
        X_right = X[X[feature] > threshold].reset_index(drop = True)
        y_right = y[X[feature] > threshold].reset_index(drop = True)
        return X_left, y_left, X_right, y_right
    
    def update_fi(self, y, _node, parent):
        if _node.node_type == 'Node':
            if self.criterion == 'entropy':
                FI = self.entropy(y.values)*_node.Np/self.N
            elif self.criterion == 'gini':
                FI = self.gini(y.values)*_node.Np/self.N
            self.fi.update({_node.feature: self.fi[_node.feature] + FI })
        
        if parent != None:
            if self.criterion == 'entropy':
                FI = self.entropy(y.values)*_node.Np/self.N
            elif self.criterion == 'gini':
                FI = self.gini(y.values)*_node.Np/self.N
            self.fi.update({parent.feature: self.fi[parent.feature] - FI })

# In [3]:
class MyForestClf():
    
    def __init__(self, n_estimators=10, max_features=0.5, max_samples=0.5, max_depth=5, min_samples_split=2, max_leafs=20, bins=16, criterion = 'entropy', random_state=42, oob_score = None):
        self.n_estimators = n_estimators
        self.max_features = max_features
        self.max_samples = max_samples
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.max_leafs = max_leafs
        self.bins = bins
        self.criterion = criterion
        self.random_state = random_state
        
        self.leafs_cnt = 0
        self.forest = []
        
        self.fi = {}
        self.N = 0
        
        self.oob_score = oob_score #accuracy, precision, recall, f1, roc_auc
        self.oob_score_ = 0
        
        self.OOB_predictions = {}
        
        
    def __repr__(self):
        return f'MyForestClf class: n_estimators={self.n_estimators}, max_features={self.max_features}, max_samples={self.max_samples}, max_depth={self.max_depth}, min_samples_split={self.min_samples_split}, max_leafs={self.max_leafs}, bins={self.bins}, criterion={self.criterion}, random_state={self.random_state}'
    
    
    def fit(self,X,y): #receives pandas dataframe and series
        """Train n_estimators trees on random sub-samples of (X, y).

        For every tree a subset of columns and then a subset of rows is drawn
        without replacement (sizes given by max_features / max_samples as
        shares of the data), seeded once by random_state. Feature importances
        and leaf counts of the trees are accumulated on the forest.

        When oob_score is set, each tree also predicts the rows it was not
        trained on; the per-row mean of these predictions is scored with the
        requested metric and stored in oob_score_.

        Args:
            X: pandas DataFrame with the features.
            y: pandas Series with the 0/1 targets, aligned with X row by row.
        """
        n_rows = X.shape[0]
        columns = list(X.columns)
        n_cols_sampled = int(np.round(self.max_features*len(columns), 0))
        n_rows_sampled = int(np.round(self.max_samples*n_rows, 0))
        track_oob = self.oob_score is not None

        # Start from a clean state so a second fit() does not extend the old forest.
        self.forest = []
        self.leafs_cnt = 0
        self.fi = {feature: 0 for feature in columns}
        self.N = len(y.values)
        # One (initially empty) array of OOB predictions per row position.
        self.OOB_predictions = {str(pos): np.zeros(0) for pos in range(n_rows)} if track_oob else {}

        #1.Make a sample from the original dataset taking into account the max_samples and max_features parameters.
        random.seed(self.random_state)

        for _ in range(self.n_estimators):
            cols_idx = random.sample(columns, n_cols_sampled) #select features
            rows_idx = random.sample(range(n_rows), n_rows_sampled) #select samples
            #2.Create a tree, forwarding the forest settings (including the criterion).
            tree_object = MyTreeClf(self.max_depth, self.min_samples_split, self.max_leafs, self.bins, self.criterion)
            #3.Train the tree on the sample. rows_idx holds row positions, hence
            #  iloc; the full sample size is passed so importances share one scale.
            tree_object.fit(X.iloc[rows_idx][cols_idx], y.iloc[rows_idx], self.N)
            #4.Save the tree inside the forest instance.
            self.forest.append(tree_object)
            #4.1 update feature importance
            for feature, feature_importance in tree_object.fi.items():
                self.fi[feature] = self.fi[feature] + feature_importance
            #5.Count the number of resulting leaves in the leafs_cnt variable
            self.leafs_cnt = self.leafs_cnt + tree_object.leafs_cnt
            #6.Collect out-of-bag predictions; a tree that registered no node
            #  cannot predict and is skipped.
            if track_oob and tree_object.tree:
                in_bag = np.zeros(n_rows, dtype=bool)
                in_bag[rows_idx] = True
                oob_positions = np.flatnonzero(~in_bag)
                oob_probs = tree_object.predict_proba(X.iloc[oob_positions])
                for pos, prob in zip(oob_positions, oob_probs):
                    key = str(int(pos))
                    self.OOB_predictions[key] = np.append(self.OOB_predictions[key], prob)

        if track_oob:
            targets = y.values
            actual, predicted, mean_probs = [], [], []
            for key, tree_probs in self.OOB_predictions.items():
                if len(tree_probs) == 0:
                    continue # the row was in the bag of every tree
                mean_prob = np.mean(tree_probs)
                actual.append(targets[int(key)]) #actual values
                predicted.append((mean_prob > 0.5)*1) #prediction
                mean_probs.append(mean_prob)

            y1 = np.array(actual, dtype=float)
            y2 = np.array(predicted, dtype=float)
            scores = np.array(mean_probs, dtype=float)

            TP = np.count_nonzero((y1==1)&(y2==1))
            TN = np.count_nonzero((y1==0)&(y2==0))
            FP = np.count_nonzero((y1==0)&(y2==1))
            FN = np.count_nonzero((y1==1)&(y2==0))

            self.calculate_metric(TP, TN, FP, FN, y1, scores)
    
    #-------------------------------------------------------------------------------------------------------------------
    def predict_proba(self, X):
        predictions = np.zeros(X.shape[0])
        for tree_object in self.forest:
            predictions = predictions + tree_object.predict_proba(X)
        return predictions/self.n_estimators
    
    def predict_class(self, X):
        class_labels = np.zeros([X.shape[0],self.n_estimators])
        for j in range(self.n_estimators):
            for tree_object in self.forest:
                class_labels[:,j] = (tree_object.predict_proba(X)>0.5)*1
        classes = np.array([int(np.unique(class_labels[i,:], return_counts=True)[0][0]) for i in range(X.shape[0])])
        return classes
        
    def predict(self,X,type):
         if type == 'mean':
            predictions = self.predict_proba(X)
            return (predictions >0.5)*1
         elif type == 'vote':
            return self.predict_class(X)
        
    #-------------------------------------------------------------------------------------------------------------------    
    def calculate_metric(self, TP, TN, FP, FN, y_, probs, beta = 1):
        if self.oob_score == 'accuracy':
            self.oob_score_ = (TP+TN)/(TP+TN+FP+FN)
        elif self.oob_score == 'precision':
            self.oob_score_ =  TP/(TP+FP)
        elif self.oob_score == 'recall':
            self.oob_score_ =  TP/(TP+FN)
        elif self.oob_score == 'f1':
            pres = TP/(TP+FP)
            rec = TP/(TP+FN)
            self.oob_score_ =  (1+np.square(beta))*pres*rec/(np.square(beta)*pres + rec)
        elif self.oob_score  == 'roc_auc':
            probs = np.round(probs,10)
            sorted_idx = np.argsort(-probs)
            probs_sorted = probs[sorted_idx]
            y_sorted = y_[sorted_idx]
            
            sum=0.
            P = len(np.where(y_sorted==1)[0])
            N = len(np.where(y_sorted==0)[0])
            
            
            for prob, class_ in zip(probs_sorted,y_sorted):
                if class_ == 0.:
                    sum = sum + len(np.where(y_sorted[probs_sorted > prob]==1)[0])
                    sum = sum + 0.5*len(np.where(y_sorted[probs_sorted == prob]==1)[0])
            self.oob_score_ = sum/(P*N) 