# **Classification using random forest**

In [1]:
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
class Node:
    """One node of a binary decision tree.

    A node is either an internal decision node (it carries a split rule and
    two children) or a leaf (it carries only ``leaf_node_value``).

    Args:
        split_feature_idx: Column index of the feature tested at this node.
        treshhold: Split value; samples whose feature is ``<= treshhold``
            are routed to ``left_subtree``, the rest to ``right_subtree``.
        left_subtree: Child node for samples at or below the threshold.
        right_subtree: Child node for samples above the threshold.
        node_variance_reduction: Quality score of the split. Despite the
            name, the classifier in this file stores the information gain
            of the split here.
        leaf_node_value: Predicted class for a leaf; ``None`` (the default)
            for internal nodes.
    """

    def __init__(
        self,
        split_feature_idx=None,
        treshhold=None,
        left_subtree=None,
        right_subtree=None,
        node_variance_reduction=None,
        leaf_node_value=None
    ):
        # Split rule: which feature to test and the value to compare against.
        self.split_feature_idx, self.treshhold = split_feature_idx, treshhold
        # Children reached when the test passes (left) or fails (right).
        self.left_subtree, self.right_subtree = left_subtree, right_subtree
        # Score of the split that produced the two children.
        self.node_variance_reduction = node_variance_reduction

        # Only leaves carry a prediction.
        self.leaf_node_value = leaf_node_value

In [3]:
class Decision_tree_classifier:
    """Binary decision tree classifier driven by entropy-based information gain.

    Every internal node tests one feature against a threshold: samples with
    ``feature <= treshhold`` go to the left subtree, the others to the right.
    A leaf predicts the most frequent class among the training samples that
    reached it.

    Args:
        n_min_samples: Minimum number of samples a node must contain to be
            considered for splitting.
        max_depth: A node is split only while its depth (root = 0) is
            ``<= max_depth``, so leaves can sit as deep as ``max_depth + 1``.

    Attributes:
        root: Root ``Node`` of the fitted tree, or ``None`` before ``fit``.
    """

    def __init__(self, n_min_samples=2, max_depth=2):
        self.n_min_samples = n_min_samples
        self.max_depth = max_depth

        self.root = None

    def fit(self, X, y):
        """Grow the tree from training data.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: Class labels, either ``(n_samples,)`` or ``(n_samples, 1)``.
        """
        # Normalise the targets to a column so 1-D label vectors work too.
        y = np.asarray(y, dtype=object).reshape(-1, 1)
        # Features and labels travel together as one object-dtype table whose
        # last column is the label; this keeps rows aligned through splits.
        data = np.concatenate((X, y), axis=1, dtype=object)
        self.root = self.build_tree(data)

    def build_tree(self, data, current_depth=0):
        """Recursively build the subtree for ``data``.

        Args:
            data: 2-D array whose last column holds the labels.
            current_depth: Depth of the node being built (root = 0).

        Returns:
            A decision ``Node`` when a split with positive gain exists and the
            stopping rules allow it, otherwise a leaf ``Node``.
        """
        X = data[:, :-1]
        y = data[:, -1]

        n_rows, n_columns = X.shape

        if n_rows >= self.n_min_samples and current_depth <= self.max_depth:
            best_split_values = self.get_best_split(data, n_columns)

            # An empty dict means no threshold separated the samples; a
            # non-positive gain means the split would not reduce impurity.
            if best_split_values and best_split_values["node_info_gain"] > 0:
                left_subtree = self.build_tree(
                    best_split_values["data_left_split"], current_depth + 1
                )
                right_subtree = self.build_tree(
                    best_split_values["data_right_split"], current_depth + 1
                )

                return Node(
                    best_split_values["split_feature_idx"],
                    best_split_values["treshhold"],
                    left_subtree,
                    right_subtree,
                    best_split_values["node_info_gain"]
                )

        return Node(leaf_node_value=self.calculate_leaf_node(y))

    def get_best_split(self, data, n_columns):
        """Search every feature/threshold pair for the highest information gain.

        Args:
            data: 2-D array whose last column holds the labels.
            n_columns: Number of feature columns to scan (label excluded).

        Returns:
            A dict with keys ``split_feature_idx``, ``treshhold``,
            ``data_left_split``, ``data_right_split`` and ``node_info_gain``
            describing the best split, or an empty dict when no threshold
            leaves samples on both sides.
        """
        max_info_gain = -float("inf")
        best_split_values = {}
        # The parent labels are the same for every candidate split.
        targets = data[:, -1]

        for split_feature_idx in range(n_columns):
            # Every distinct observed value is a candidate threshold.
            for treshhold in np.unique(data[:, split_feature_idx]):
                data_left_split, data_right_split = self.make_split(
                    data, split_feature_idx, treshhold
                )

                # A split that leaves one side empty separates nothing.
                if len(data_left_split) == 0 or len(data_right_split) == 0:
                    continue

                info_gain = self.calculate_info_gain(
                    targets,
                    data_left_split[:, -1],
                    data_right_split[:, -1]
                )

                if info_gain > max_info_gain:
                    max_info_gain = info_gain
                    best_split_values = {
                        "split_feature_idx": split_feature_idx,
                        "treshhold": treshhold,
                        "data_left_split": data_left_split,
                        "data_right_split": data_right_split,
                        "node_info_gain": info_gain,
                    }

        return best_split_values

    def make_split(
        self, data, split_feature_idx, treshhold
    ):
        """Partition the rows of ``data`` by one feature and a threshold.

        Args:
            data: 2-D NumPy array of samples (last column = label).
            split_feature_idx: Column to compare.
            treshhold: Split value.

        Returns:
            ``(left, right)`` where ``left`` holds the rows with
            ``feature <= treshhold`` and ``right`` the rows with
            ``feature > treshhold``. Both keep the column layout of ``data``,
            also when a side is empty.
        """
        column = data[:, split_feature_idx]
        # Two explicit masks (rather than one and its negation) so a value
        # that compares False both ways, e.g. NaN, lands on neither side.
        goes_left = np.asarray(column <= treshhold, dtype=bool)
        goes_right = np.asarray(column > treshhold, dtype=bool)

        return data[goes_left], data[goes_right]

    def calculate_info_gain(
        self, targets, left_split_targets, right_split_targets
    ):
        """Return the impurity removed by a split.

        The value is ``n * H(parent) - (n_left * H(left) + n_right * H(right))``,
        i.e. entropies are weighted by sample *counts*, not fractions, so the
        result is the usual information gain scaled by the parent size.
        """
        data_impurity = len(targets) * self.calculate_entropy(targets)
        split_impurity = (
            len(left_split_targets) * self.calculate_entropy(left_split_targets)
            + len(right_split_targets)
            * self.calculate_entropy(right_split_targets)
        )

        return data_impurity - split_impurity

    def calculate_entropy(self, data):
        """Return the Shannon entropy (base 2) of the labels in ``data``.

        An empty input yields ``0``.
        """
        n_objects = len(data)
        # One pass gives the frequency of every label.
        _, label_counts = np.unique(data, return_counts=True)
        data_entropy = 0

        for n_label_objects in label_counts:
            p_label = n_label_objects / n_objects
            data_entropy += -p_label * np.log2(p_label)

        return data_entropy

    def calculate_leaf_node(self, targets):
        """Return the most frequent label in ``targets``.

        Ties are resolved in favour of the label that appears first.

        Raises:
            ValueError: If ``targets`` is empty.
        """
        targets = list(targets)

        if not targets:
            raise ValueError("cannot compute a leaf value from an empty target set")

        # Counter keeps first-seen order, and most_common(1) returns the first
        # label with the maximal count.
        return Counter(targets).most_common(1)[0][0]

    def predict(self, data):
        """Predict a label for every row of ``data``.

        Returns:
            Array of shape ``(n_samples, 1)`` with the predicted labels.
        """
        predictions = np.array(
            [self.make_prediction(sample, self.root) for sample in data]
        )

        return predictions.reshape(-1, 1)

    def make_prediction(self, sample, current_node):
        """Route one sample from ``current_node`` down to a leaf.

        Args:
            sample: 1-D sequence of feature values.
            current_node: Node at which the descent starts.

        Returns:
            The ``leaf_node_value`` of the leaf the sample ends up in.
        """
        node = current_node

        # A node is a leaf exactly when it carries a prediction. Identity
        # comparison with None stays unambiguous for any label type.
        while node.leaf_node_value is None:
            if sample[node.split_feature_idx] <= node.treshhold:
                node = node.left_subtree
            else:
                node = node.right_subtree

        return node.leaf_node_value

In [4]:
class Random_forest_classification:
    """Random forest classifier: majority vote over ``Decision_tree_classifier`` trees.

    Every tree is trained on its own random subset of rows (drawn without
    replacement) restricted to a random subset of feature columns.

    Args:
        n_min_samples: Passed to every tree as its ``n_min_samples``.
        max_depth: Passed to every tree as its ``max_depth``.
        n_trees: Number of trees in the forest.
        subset_size: Number of training rows drawn for each tree.
        n_random_features: Number of feature columns drawn for each tree;
            ``-1`` means ``int(sqrt(n_features))``, resolved against the data
            each time a subset is drawn.

    Attributes:
        data_subsets: Subsets produced by the most recent ``fit``.
        base_algorithms: One dict per fitted tree with the keys ``model`` and
            ``random_feature_indices``.
    """

    def __init__(
        self,
        n_min_samples=2,
        max_depth=2,
        n_trees=10,
        subset_size=10,
        n_random_features=-1
    ):
        self.n_min_samples = n_min_samples
        self.max_depth = max_depth
        self.n_trees = n_trees
        self.subset_size = subset_size
        self.n_random_features = n_random_features
        self.data_subsets = []
        self.base_algorithms = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, replacing any previously fitted ones.

        Args:
            X: 2-D array of shape ``(n_samples, n_features)``.
            y: Class labels, either ``(n_samples,)`` or ``(n_samples, 1)``.
        """
        # Normalise the targets to a column so 1-D label vectors work too.
        y = np.asarray(y, dtype=object).reshape(-1, 1)
        # Object dtype (as in the tree's own fit) keeps numeric features
        # numeric when they are stored next to string labels.
        data = np.concatenate((X, y), axis=1, dtype=object)

        self.data_subsets = self.make_subsets(
            data, self.n_trees, self.subset_size
        )

        # Build into a fresh list so a second fit does not keep stale trees.
        base_algorithms = []

        for data_subset in self.data_subsets:
            subset = data_subset["subset"]

            model = Decision_tree_classifier(
                n_min_samples=self.n_min_samples,
                max_depth=self.max_depth
            )
            model.fit(subset[:, :-1], subset[:, -1].reshape(-1, 1))

            base_algorithms.append(
                {
                    "model": model,
                    "random_feature_indices": data_subset["random_feature_indices"],
                }
            )

        self.base_algorithms = base_algorithms

    def make_subsets(self, data, n_subsets, subset_size):
        """Return ``n_subsets`` independently drawn subsets of ``data``."""
        return [self.make_subset(data, subset_size) for _ in range(n_subsets)]

    def make_subset(self, data, subset_size):
        """Draw one random rows-by-features subset of ``data``.

        Args:
            data: 2-D NumPy array whose last column holds the labels.
            subset_size: Number of rows to draw (without replacement).

        Returns:
            A dict with ``subset`` (the sampled rows restricted to the chosen
            feature columns, followed by the label column) and
            ``random_feature_indices`` (sorted indices of the chosen columns).

        Raises:
            ValueError: If ``subset_size`` exceeds the number of rows, or more
                feature columns are requested than exist.
        """
        n_rows, n_columns = data.shape
        n_features = n_columns - 1  # the last column is the label

        # Resolve the -1 sentinel locally so the hyper-parameter is never
        # overwritten and is re-derived for every data set.
        n_random_features = self.n_random_features
        if n_random_features == -1:
            n_random_features = int(np.sqrt(n_features))

        # Sampling indices draws the same rows as sampling the rows
        # themselves, without first materialising a list of row arrays.
        row_indices = np.array(
            random.sample(range(n_rows), subset_size), dtype=int
        )
        data_rows_subset = data[row_indices]

        random_feature_indices = np.sort(
            np.random.choice(
                np.arange(n_features),
                n_random_features,
                replace=False
            )
        )

        # Chosen feature columns first, label column last.
        column_indices = np.append(random_feature_indices, n_features)
        data_subset = data_rows_subset[:, column_indices]

        return {
            "subset": data_subset,
            "random_feature_indices": random_feature_indices
        }

    def predict(self, X):
        """Predict labels by majority vote over all fitted trees.

        Args:
            X: 2-D NumPy array of shape ``(n_samples, n_features)`` with the
                same column layout as the training features.

        Returns:
            Array of shape ``(n_samples, 1)``. When trees tie, the label
            voted by the earliest tree among the tied ones wins.
        """
        # One row of votes per tree; every tree only sees its own columns.
        all_predictions = [
            base_algorithm["model"]
            .predict(X[:, base_algorithm["random_feature_indices"]])
            .reshape(-1)
            for base_algorithm in self.base_algorithms
        ]

        # Transpose to one row of votes per sample.
        votes_per_sample = np.transpose(all_predictions)

        main_predictions = [
            Counter(votes.tolist()).most_common(1)[0][0]
            for votes in votes_per_sample
        ]

        return np.array(main_predictions).reshape(-1, 1)

In [5]:
# Location of the Iris CSV in the notebook's runtime.
path = "/content/sample_data/Iris.csv"

# Read the file, then discard its first column (presumably a row Id - confirm
# against the CSV) so only the measurements and the label remain.
data = pd.read_csv(path)
data = data.iloc[:, 1:]
data

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
# Features: every column except the last one, as a NumPy array.
# `to_numpy()` is the accessor pandas recommends over `.values`, and it always
# returns an ndarray (a Series' `.values` may be an ExtensionArray that has no
# `reshape`).
X = data.iloc[:, :-1].to_numpy()
# Targets: the last column as an (n_samples, 1) column vector, the layout the
# classifiers' `fit` concatenates next to the features.
y = data.iloc[:, -1].to_numpy().reshape(-1, 1)

In [7]:
# Hold out 40% of the samples for evaluation; the fixed random_state makes the
# partition repeatable.
splits = train_test_split(X, y, test_size=0.4, random_state=42)
X_train, X_test, y_train, y_test = splits

In [8]:
# The forest draws its row subsets with `random.sample` and its feature
# subsets with `np.random.choice`; seed both generators so the fitted forest,
# and therefore the reported accuracy, is reproducible across runs.
random.seed(42)
np.random.seed(42)

# 30 trees, each grown on 20 randomly drawn training rows.
model = Random_forest_classification(
    n_min_samples=2,
    max_depth=17,
    n_trees=30,
    subset_size=20
)
model.fit(X_train, y_train)

In [9]:
# Majority-vote predictions of the forest for the held-out samples; the result
# is an (n_samples, 1) column, the same layout as y_test.
y_pred = model.predict(X_test)

In [10]:
# Share of held-out samples whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
# Bare expression: displays the value as the notebook cell's output.
accuracy

1.0

In [11]:
# Held-out features with the true label appended as the last column, for a
# side-by-side comparison with the predictions.
test_data = pd.DataFrame(np.hstack((X_test, y_test)))
test_data

Unnamed: 0,0,1,2,3,4
0,6.1,2.8,4.7,1.2,Iris-versicolor
1,5.7,3.8,1.7,0.3,Iris-setosa
2,7.7,2.6,6.9,2.3,Iris-virginica
3,6.0,2.9,4.5,1.5,Iris-versicolor
4,6.8,2.8,4.8,1.4,Iris-versicolor
5,5.4,3.4,1.5,0.4,Iris-setosa
6,5.6,2.9,3.6,1.3,Iris-versicolor
7,6.9,3.1,5.1,2.3,Iris-virginica
8,6.2,2.2,4.5,1.5,Iris-versicolor
9,5.8,2.7,3.9,1.2,Iris-versicolor


In [12]:
# Held-out features with the predicted label appended as the last column.
pred_data = pd.DataFrame(np.hstack((X_test, y_pred)))
pred_data

Unnamed: 0,0,1,2,3,4
0,6.1,2.8,4.7,1.2,Iris-versicolor
1,5.7,3.8,1.7,0.3,Iris-setosa
2,7.7,2.6,6.9,2.3,Iris-virginica
3,6.0,2.9,4.5,1.5,Iris-versicolor
4,6.8,2.8,4.8,1.4,Iris-versicolor
5,5.4,3.4,1.5,0.4,Iris-setosa
6,5.6,2.9,3.6,1.3,Iris-versicolor
7,6.9,3.1,5.1,2.3,Iris-virginica
8,6.2,2.2,4.5,1.5,Iris-versicolor
9,5.8,2.7,3.9,1.2,Iris-versicolor
