# Brief way to implement a random forest

In [2]:
#Import train_test_split from sklearn
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

path = './data/IRIS.csv'
#Function to read csv data
def read_csv(file_name):
    data = pd.read_csv(file_name)
    return data 

In [3]:
df= read_csv(path)
print(df.head())

   sepal_length  sepal_width  petal_length  petal_width      species
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa


In [None]:
#take a look at the data
print(df.describe())
print(df.columns.names)

       sepal_length  sepal_width  petal_length  petal_width
count    150.000000   150.000000    150.000000   150.000000
mean       5.843333     3.054000      3.758667     1.198667
std        0.828066     0.433594      1.764420     0.763161
min        4.300000     2.000000      1.000000     0.100000
25%        5.100000     2.800000      1.600000     0.300000
50%        5.800000     3.000000      4.350000     1.300000
75%        6.400000     3.300000      5.100000     1.800000
max        7.900000     4.400000      6.900000     2.500000
[None]


In [19]:
class Node():
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, info_gain=None, value=None):

        '''Constructor'''
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.inofo_gain = info_gain

        # for leaf node
        self.value = value
        

In [20]:
class DecisionTreeClassifier():
    def __init__(self, min_samples_split=2,max_depth=2):
        '''Constructor'''
        # initialize the root of the tree
        self.root = None
        # Stopping conditions
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        
    def build_tree(self, dataset, curr_depth=0):
        '''Building the tree with the recursivity function'''

        X, Y = dataset[:,:-1], dataset[:,-1]
        num_samples, num_features = np.shape(X)
        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            # find the best split
            best_split = self.get_best_split(dataset, num_samples, num_features)
            # check if information gain is positive
            if best_split['info_gain'] > 0:
                # recur left and right
                left_tree = self.build_tree(best_split["dataset_left"], curr_depth+1)
                right_tree = self.build_tree(best_split["dataset_right"], curr_depth+1)
                # return decision node
                return Node(best_split['feature_index'], best_split['threshold'], left_tree, right_tree, best_split['info_gain'])
            else:
                # return leaf node
                return Node(value=self.get_majority_class(Y))

        # compute leaf node
        leaf_value = self.calculate_leaf_value(Y)
        # return leaf node
        return Node(value=leaf_value)


    def get_best_split(self,dataset, num_samples, num_features):
        '''Function to finde the best split'''
        best_split = {} 
        max_info_gain = -float("inf") 
        # loop over all the features values present in the dataset
        for feature_index in range(num_features):
            # get the unique values for the feature
            feature_values = dataset[:, feature_index]
            possible_thresholds = np.unique(feature_values)
            # loop over all the feature values present in the datasetç
            for threshold in possible_thresholds:
                # get current split dataset
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # check if childs are not null 
                if len(dataset_left) > 0 and len(dataset_right) > 0:
                    y, left_y, right_y = dataset[:,1], dataset_left[:,-1], dataset_right[:,-1]
                    # now, we compute the information gain 
                    info_gain = self.information_gain(y, left_y, right_y, "gini")
                    # update the best split if needed
                    if info_gain > max_info_gain:
                        best_split["feature_index"] = feature_index
                        best_split["threshold"] = threshold
                        best_split["dataset_left"] = dataset_left
                        best_split["dataset_right"] = dataset_right
                        best_split["info_gain"] = info_gain
                        max_info_gain = info_gain

        return best_split


def split(self, dataset, feature_index, threshold):
    '''Function to split the dataset'''
    
    dataset_left = np.array([row for row in dataset if row[feature_index]<=threshold])
    dataset_right = np.array([row for row in dataset if row[feature_index]>threshold])
    
    return dataset_left, dataset_right


def information_gain(self, parent, l_child, r_child, mode ="entropy"):
    """Function to compute the information gain"""

    weight_l = len(l_child) / len(parent)
    weight_r = len(r_child) / len(parent)
    if mode == "gini":
        gain = self.gini_index(parent) - (weight_l * self.gini_index(l_child) + weight_r * self.gini_index(r_child))
    else:
        gain = self.entropy(parent) - (weight_l * self.entropy(l_child) + weight_r * self.entropy(r_child))
    return gain

def entropy(self, y):
    """Function to compute the entropy"""

    class_labels = np.unique(y)
    entropy = 0
    for cls in class_labels:
        p_cls = len(y[y == cls]) / len(y)
        entropy += -p_cls * np.log2(p_cls)
    return entropy


def gini_index(self, y):
    """Function to compute the gini index"""
    
    class_labels = np.unique(y)
    gini = 0
    for cls in class_labels:
        p_cls = len(y[y == cls]) / len(y)
        gini += p_cls ** 2
    return 1 - gini


def calculate_leaf_value(self, Y):
    """Function to calculate the leaf value"""

    Y = list(Y)
    return max(set(Y), key=Y.count)

def print_tree(self,tree=None, indent=""):
    '''Function to print the tree'''
    if tree is None:
        tree = self.root
    if tree.value is not None:
        print(tree.value)
    else:
        print("X_"+str(tree.feature_index), "<=",tree.threshold, "?",tree.info_gain)
        print("%sleft:" % (indent), end="")
        self.print_tree(tree.left,indent+"  ")
        print("%sright:" % (indent), end="")
        self.print_tree(tree.right,indent+indent)

def fit(self, X,Y):
    '''Function to fit the model'''
    dataset = np.concatenate((X,Y), axis=1)
    self.root = self.build_tree(dataset)

def predict(self,X):
    ''''Function to predict the labels'''
    predictions = [self.make_prediction(x, self.root) for x in X]

    return predictions


def make_prediction(self, x, tree): 
    '''Function to make the prediction'''
    if tree.value!=None: return tree.value
    feature_val = x[tree.feature_index]
    if feature_val <= tree.threshold:
        return self.make_prediction(x, tree.left)
    else:
        return self.make_prediction(x, tree.right)
    





In [28]:
#TRAIN  TEST SPLIT
X = df.iloc[:,:-1].values
Y = df.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=2, random_state=41)

In [31]:
#FIT THE MODEL
classifier = DecisionTreeClassifier(min_samples_split=3, max_depth=3)

classifier.fit(X_train,Y_train)
classifier.print_tree()

AttributeError: 'DecisionTreeClassifier' object has no attribute 'fit'

In [30]:
https://github.com/Suji04/ML_from_Scratch/blob/master/decision%20tree%20classification.ipynb

AttributeError: 'DecisionTreeClassifier' object has no attribute 'fit'

In [10]:
#Split the data into training and testing sets
X = df.drop(' Species', axis=1)
y = df[' Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)


KeyError: "[' Species'] not found in axis"