# **Classification using random forest**

In [1]:
import random
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
class Node:
    """Plain record holding everything one node of a decision tree stores.

    A node built with only ``leaf_node_value`` acts as a leaf; a node built
    with the split fields acts as an internal (decision) node.
    """

    def __init__(
        self,
        split_feature_index=None,
        split_treshhold=None,
        left_subtree=None,
        right_subtree=None,
        node_variance_reduction=None,
        leaf_node_value=None,
    ):
        """Store the given values on the instance without any validation.

        Args:
            split_feature_index: index of the feature column the node splits on.
            split_treshhold: threshold the feature value is compared against.
            left_subtree: child node for the "left" side of the split.
            right_subtree: child node for the "right" side of the split.
            node_variance_reduction: quality score recorded for the split
                (the classifier in this file passes the information gain here).
            leaf_node_value: value held by the node when it is a leaf.
        """

        # Description of the split itself.
        self.split_feature_index, self.split_treshhold = (
            split_feature_index,
            split_treshhold,
        )

        # Children reached from this node.
        self.left_subtree, self.right_subtree = left_subtree, right_subtree

        # Score of the split, then the payload used when the node is a leaf.
        self.node_variance_reduction = node_variance_reduction
        self.leaf_node_value = leaf_node_value

In [3]:
class Decision_tree_classifier:
    """Decision tree classifier that greedily splits on entropy-based information gain.

    Args:
        node_min_samples_quant: minimum number of rows a node must hold to be
            considered for splitting.
        max_depth: a node is split only while its depth (root = 0) is
            ``<= max_depth``; leaves can therefore sit at depth ``max_depth + 1``.
    """

    def __init__(self, node_min_samples_quant=2, max_depth=2):
        """Store the hyper-parameters; the tree itself is built by ``fit``."""

        self._node_min_samples_quant = node_min_samples_quant
        self._max_depth = max_depth

        self._root = None

    def fit(self, X, Y):
        """Build the tree from the training set.

        Args:
            X: 2-D array of feature values, one row per sample.
            Y: class labels, either a column of shape (n_samples, 1) or a
                1-D sequence of length n_samples.
        """

        # Accept flat label vectors as well as the (n, 1) column layout.
        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        # dtype=object keeps numeric features numeric next to string labels.
        dataset = np.concatenate((X, Y), axis=1, dtype=object)
        self._root = self._build_tree(dataset)

    def _build_tree(self, dataset, current_depth=0):
        """Recursively build a (sub)tree for ``dataset`` and return its root node.

        ``dataset`` holds the features in all columns but the last and the
        class labels in the last column.
        """

        X = dataset[:, :-1]
        Y = dataset[:, -1]

        dataset_rows_quant, dataset_features_quant = X.shape

        may_split = (
            dataset_rows_quant >= self._node_min_samples_quant
            and current_depth <= self._max_depth
        )

        if may_split:
            best_split_values = self._get_best_split(dataset, dataset_features_quant)

            # An empty dict means no threshold produced two non-empty sides;
            # a non-positive gain means splitting does not reduce impurity.
            if best_split_values and best_split_values["node_information_gain"] > 0:
                left_subtree = self._build_tree(
                    best_split_values["dataset_left_split"], current_depth + 1
                )
                right_subtree = self._build_tree(
                    best_split_values["dataset_right_split"], current_depth + 1
                )

                return Node(
                    best_split_values["split_feature_index"],
                    best_split_values["split_treshhold"],
                    left_subtree,
                    right_subtree,
                    best_split_values["node_information_gain"],
                )

        return Node(leaf_node_value=self._calculate_leaf_node_value(Y))

    def _get_best_split(self, dataset, dataset_features_quant):
        """Try every (feature, unique value) pair and return the best split.

        Returns:
            A dict with keys ``split_feature_index``, ``split_treshhold``,
            ``dataset_left_split``, ``dataset_right_split`` and
            ``node_information_gain``; an empty dict when no candidate yields
            two non-empty sides.
        """

        max_information_gain = -float("inf")
        best_split_values = {}

        # The parent's targets are the same for every candidate split.
        dataset_targets = dataset[:, -1]

        for current_split_feature_idx in range(dataset_features_quant):
            possible_treshholds = np.unique(dataset[:, current_split_feature_idx])

            for current_split_treshhold in possible_treshholds:
                dataset_left_split, dataset_right_split = self._make_split(
                    dataset, current_split_feature_idx, current_split_treshhold
                )

                if len(dataset_left_split) == 0 or len(dataset_right_split) == 0:
                    continue

                current_information_gain = self._calculate_information_gain(
                    dataset_targets,
                    dataset_left_split[:, -1],
                    dataset_right_split[:, -1],
                )

                # Strict ">" keeps the first candidate among equally good ones.
                if current_information_gain > max_information_gain:
                    best_split_values = {
                        "split_feature_index": current_split_feature_idx,
                        "split_treshhold": current_split_treshhold,
                        "dataset_left_split": dataset_left_split,
                        "dataset_right_split": dataset_right_split,
                        "node_information_gain": current_information_gain,
                    }
                    max_information_gain = current_information_gain

        return best_split_values

    def _make_split(
        self, dataset, dataset_split_feature_index, dataset_split_treshhold
    ):
        """Partition the rows of ``dataset`` by one feature and a threshold.

        Returns:
            ``(left, right)`` where ``left`` holds the rows whose feature value
            is ``<=`` the threshold and ``right`` the rows where it is ``>``.
        """

        feature_column = dataset[:, dataset_split_feature_index]

        # Two independent masks (rather than mask / ~mask) so that a value that
        # is neither <= nor > the threshold lands in neither side.
        goes_left = np.asarray(feature_column <= dataset_split_treshhold, dtype=bool)
        goes_right = np.asarray(feature_column > dataset_split_treshhold, dtype=bool)

        return dataset[goes_left], dataset[goes_right]

    def _calculate_information_gain(
        self, dataset_targets, dataset_left_split_targets, dataset_right_split_targets
    ):
        """Return the parent's impurity minus the summed impurity of both children.

        Impurity here is entropy multiplied by the number of samples (it is
        not normalised by the parent size).
        """

        dataset_impurity = len(dataset_targets) * self._calculate_entropy(
            dataset_targets
        )
        split_impurity = len(dataset_left_split_targets) * self._calculate_entropy(
            dataset_left_split_targets
        ) + len(dataset_right_split_targets) * self._calculate_entropy(
            dataset_right_split_targets
        )

        return dataset_impurity - split_impurity

    def _calculate_entropy(self, data):
        """Return the base-2 Shannon entropy of the class labels in ``data``."""

        # One pass gives every class count instead of rescanning per class.
        _, class_counts = np.unique(data, return_counts=True)
        data_size = len(data)
        data_entropy = 0

        for current_class_objects_quant in class_counts:
            p_current_class = current_class_objects_quant / data_size
            data_entropy += -p_current_class * np.log2(p_current_class)

        return data_entropy

    def _calculate_leaf_node_value(self, dataset_targets):
        """Return the most frequent label; ties go to the label seen first."""

        # Counter keeps first-seen order, so max() breaks ties by first occurrence.
        label_counts = Counter(list(dataset_targets))

        return max(label_counts, key=label_counts.get)

    def predict(self, data):
        """Predict a class label for every row of ``data``.

        Returns:
            An array of shape (n_samples, 1) with the predicted labels.
        """

        predictions = np.array(
            [self._make_prediction(data_sample, self._root) for data_sample in data]
        )

        return predictions.reshape(-1, 1)

    def _make_prediction(self, sample, current_node):
        """Walk from ``current_node`` down to a leaf for ``sample`` and return its value."""

        node = current_node

        # A node is a leaf exactly when it carries a value; identity comparison
        # keeps falsy labels such as 0 working.
        while node.leaf_node_value is None:
            if sample[node.split_feature_index] <= node.split_treshhold:
                node = node.left_subtree
            else:
                node = node.right_subtree

        return node.leaf_node_value

In [4]:
class Classification_using_random_forest:
    """Random forest classifier built from `Decision_tree_classifier` trees.

    Every tree is trained on a random subset of rows (drawn without
    replacement) restricted to a random subset of feature columns; the forest
    predicts the label that most trees vote for.

    Args:
        tree_node_min_samples_quant: forwarded to every tree as
            ``node_min_samples_quant``.
        trees_max_depth: forwarded to every tree as ``max_depth``.
        trees_count: number of trees to train.
        subset_size: number of rows sampled for each tree.
        random_features_quant: number of feature columns sampled for each
            tree; ``-1`` means ``int(sqrt(number of features))``, resolved on
            every call to ``fit``.
    """

    def __init__(
        self,
        tree_node_min_samples_quant=2,
        trees_max_depth=2,
        trees_count=10,
        subset_size=10,
        random_features_quant=-1,
    ):
        """Store the hyper-parameters; nothing is trained until ``fit``."""

        self._tree_node_min_samples_quant = tree_node_min_samples_quant
        self._trees_max_depth = trees_max_depth
        self._trees_count = trees_count
        self._subset_size = subset_size
        self._random_features_quant = random_features_quant
        # Feature count actually used when sampling; kept apart from the user
        # setting so that the -1 ("auto") choice survives repeated fits.
        self._features_per_tree = random_features_quant
        self._dataset_subsets = []
        self._decision_trees_base_algorithms = []

    def fit(self, X, Y):
        """Train ``trees_count`` trees, replacing any previously trained ones.

        Args:
            X: 2-D array of feature values, one row per sample.
            Y: class labels, either a column of shape (n_samples, 1) or a
                1-D sequence of length n_samples.

        Raises:
            ValueError: propagated from the samplers when ``subset_size``
                exceeds the number of rows or more features are requested
                than exist.
        """

        X = np.asarray(X)
        Y = np.asarray(Y)
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        # dtype=object (as the tree's own fit does) stops NumPy from promoting
        # numeric features to strings when the labels are a string array.
        dataset = np.concatenate((X, Y), axis=1, dtype=object)

        _, dataset_features_quant = X.shape

        if self._random_features_quant == -1:
            self._features_per_tree = int(np.sqrt(dataset_features_quant))
        else:
            self._features_per_tree = self._random_features_quant

        self._dataset_subsets = self._make_subsets(
            dataset, self._trees_count, self._subset_size
        )

        # Train into a fresh list so that calling fit again does not keep the
        # trees of an earlier fit in the vote.
        trained_trees = []

        for dataset_subset in self._dataset_subsets:
            subset_X = dataset_subset["subset"][:, :-1]
            subset_Y = dataset_subset["subset"][:, -1].reshape(-1, 1)

            model = Decision_tree_classifier(
                node_min_samples_quant=self._tree_node_min_samples_quant,
                max_depth=self._trees_max_depth,
            )
            model.fit(subset_X, subset_Y)

            # Remember which columns this tree saw; predict needs them.
            trained_trees.append(
                {
                    "model": model,
                    "random_feature_indices": dataset_subset["random_feature_indices"],
                }
            )

        self._decision_trees_base_algorithms = trained_trees

    def _make_subsets(self, dataset, subsets_quant, subset_size):
        """Return a list of ``subsets_quant`` independent random subsets of ``dataset``."""

        return [self._make_subset(dataset, subset_size) for _ in range(subsets_quant)]

    def _make_subset(self, dataset, subset_size):
        """Draw one random training subset.

        Returns:
            A dict with two keys:
            ``"subset"`` - the sampled rows, restricted to the sampled feature
            columns, with the label column appended last;
            ``"random_feature_indices"`` - sorted indices of the sampled
            feature columns.
        """

        _, dataset_columns_quant = dataset.shape
        # The last column holds the labels and is never sampled as a feature.
        dataset_features_quant = dataset_columns_quant - 1

        # Sampling row positions picks the same rows as sampling the rows
        # themselves, without building a Python list of row arrays.
        row_indices = random.sample(range(len(dataset)), subset_size)
        dataset_rows_subset = dataset[row_indices]

        random_feature_indices = np.sort(
            np.random.choice(
                np.arange(dataset_features_quant),
                self._features_per_tree,
                replace=False,
            )
        )

        dataset_subset = np.concatenate(
            (
                dataset_rows_subset[:, random_feature_indices],
                dataset_rows_subset[:, -1].reshape(-1, 1),
            ),
            axis=1,
        )

        return {
            "subset": dataset_subset,
            "random_feature_indices": random_feature_indices,
        }

    def predict(self, X):
        """Predict a class label for every row of ``X`` by majority vote.

        Ties are resolved in favour of the label voted by the earliest tree.

        Returns:
            An array of shape (n_samples, 1) with the predicted labels.

        Raises:
            RuntimeError: if the forest holds no trained trees.
        """

        if not self._decision_trees_base_algorithms:
            raise RuntimeError("The model must be fitted before calling predict")

        X = np.asarray(X)

        # One row per tree; each tree only sees the columns it was trained on.
        all_predictions = [
            base_algorithm["model"]
            .predict(X[:, base_algorithm["random_feature_indices"]])
            .reshape(-1)
            for base_algorithm in self._decision_trees_base_algorithms
        ]

        main_predictions = []

        # After the transpose every row holds all votes for a single sample.
        for sample_votes in np.transpose(all_predictions):
            vote_counts = Counter(list(sample_votes))
            main_predictions.append(max(vote_counts, key=vote_counts.get))

        return np.array(main_predictions).reshape(-1, 1)

In [5]:
# Read the iris CSV, then discard its first column (presumably a row-id
# column - confirm against the file) so only features and the label remain.
input_data_path = "D://ADMP_Anastasia/Git_reposeroties/ADMP_machine_learning_practice/datasets/iris.csv"
raw_df = pd.read_csv(input_data_path)
df = raw_df.iloc[:, 1:]
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [6]:
# Features are every column but the last; the last column holds the labels.
features_df = df.iloc[:, :-1]
labels = df.iloc[:, -1]

X = features_df.values
# Labels are kept as a single column of shape (n_samples, 1).
Y = labels.values.reshape(-1, 1)

In [7]:
# Hold out 40% of the rows for testing; the fixed random_state makes the
# split repeatable between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.4, random_state=42
)

In [8]:
# The forest samples rows with `random` and feature columns with `np.random`;
# seed both so the trained model (and the score below) is reproducible, in
# line with the fixed random_state used for the train/test split.
random.seed(42)
np.random.seed(42)

# 30 trees, each trained on 20 sampled rows; the number of features per tree
# is left at its default.
model = Classification_using_random_forest(
    tree_node_min_samples_quant=2,
    trees_max_depth=17,
    trees_count=30,
    subset_size=20,
)
model.fit(X_train, Y_train)

In [9]:
# Labels predicted by the fitted forest for the held-out rows.
Y_pred = model.predict(X_test)

In [10]:
# Score the predictions against the true held-out labels.
accuracy_score_value = accuracy_score(Y_test, Y_pred)
# Bare expression: shown as the cell output.
accuracy_score_value

0.9833333333333333

In [11]:
# Held-out features with the true label appended as the last column.
test_rows = np.concatenate((X_test, Y_test), axis=1)
test_df = pd.DataFrame(test_rows)
test_df

Unnamed: 0,0,1,2,3,4
0,6.1,2.8,4.7,1.2,Iris-versicolor
1,5.7,3.8,1.7,0.3,Iris-setosa
2,7.7,2.6,6.9,2.3,Iris-virginica
3,6.0,2.9,4.5,1.5,Iris-versicolor
4,6.8,2.8,4.8,1.4,Iris-versicolor
5,5.4,3.4,1.5,0.4,Iris-setosa
6,5.6,2.9,3.6,1.3,Iris-versicolor
7,6.9,3.1,5.1,2.3,Iris-virginica
8,6.2,2.2,4.5,1.5,Iris-versicolor
9,5.8,2.7,3.9,1.2,Iris-versicolor


In [12]:
# Held-out features with the predicted label appended as the last column.
pred_rows = np.concatenate((X_test, Y_pred), axis=1)
pred_df = pd.DataFrame(pred_rows)
pred_df

Unnamed: 0,0,1,2,3,4
0,6.1,2.8,4.7,1.2,Iris-versicolor
1,5.7,3.8,1.7,0.3,Iris-setosa
2,7.7,2.6,6.9,2.3,Iris-virginica
3,6.0,2.9,4.5,1.5,Iris-versicolor
4,6.8,2.8,4.8,1.4,Iris-versicolor
5,5.4,3.4,1.5,0.4,Iris-setosa
6,5.6,2.9,3.6,1.3,Iris-versicolor
7,6.9,3.1,5.1,2.3,Iris-virginica
8,6.2,2.2,4.5,1.5,Iris-versicolor
9,5.8,2.7,3.9,1.2,Iris-versicolor
