# **Regression using bagging on decision trees**

## **library import**

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## **class Node**

In [None]:
class Node():
    def __init__(self, split_feature_index = None, split_treshhold = None, left_subtree = None,
                 right_subtree = None, node_variance_reduction = None, leaf_node_value = None):
        '''Plain data holder for a single node (vertex) of the decision tree.

        Every argument is stored unchanged on the instance under the same name;
        anything that is not passed stays None.
        '''

        # Leaf nodes: these only carry a value
        self.leaf_node_value = leaf_node_value

        # Internal nodes: these carry the split predicate ...
        self.split_feature_index, self.split_treshhold = split_feature_index, split_treshhold
        # ... the two child subtrees ...
        self.left_subtree, self.right_subtree = left_subtree, right_subtree
        # ... and the variance reduction recorded for the split
        self.node_variance_reduction = node_variance_reduction

## **class Decision_tree_regression**

In [None]:
class Decision_tree_regression():
    '''Regression tree grown greedily by maximising variance reduction.

    Internally the features and the target are kept in one 2-D array whose
    last column is the target; every leaf predicts the mean target of the
    training rows that reached it.
    '''

    def __init__(self, node_min_samples_quant = 2, max_depth = 2):
        '''Store the stopping hyperparameters; the tree itself is built by fit().

        node_min_samples_quant -- a node is considered for splitting only if it
                                  holds at least this many rows.
        max_depth -- a node is considered for splitting only while its depth
                     (the root has depth 0) is <= max_depth, so leaves may end
                     up at depth max_depth + 1.
        '''

        self._node_min_samples_quant = node_min_samples_quant
        self._max_depth = max_depth

        self._root = None

    def fit(self, X, Y):
        '''Train the tree on the training set.

        X -- 2-D array-like of features, one row per sample.
        Y -- targets, either 1-D (n_samples,) or 2-D (n_samples, 1).

        Raises ValueError when the training set has no rows.
        '''

        X = np.asarray(X, dtype = object)
        Y = np.asarray(Y, dtype = object)

        # A flat target vector cannot be concatenated with X along axis 1,
        # so turn it into a single column first.
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        if len(X) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set")

        dataset = np.concatenate((X, Y), axis = 1, dtype = object)
        self._root = self._build_tree(dataset)

    def _build_tree(self, dataset, current_depth = 0):
        '''Recursively build the (sub)tree for `dataset` and return its root Node.'''

        X = dataset[:, :-1]
        Y = dataset[:, -1]

        dataset_rows_quant, dataset_features_quant = X.shape

        if dataset_rows_quant >= self._node_min_samples_quant and current_depth <= self._max_depth:
            best_split_values = self._get_best_split(dataset, dataset_features_quant)

            # _get_best_split returns an empty dict when no threshold separates
            # the rows (e.g. all rows share the same feature values), hence .get().
            if best_split_values.get("node_variance_reduction", 0) > 0:
                left_subtree = self._build_tree(best_split_values["dataset_left_split"], current_depth + 1)
                right_subtree = self._build_tree(best_split_values["dataset_right_split"], current_depth + 1)

                return Node(best_split_values["split_feature_index"], best_split_values["split_treshhold"],
                            left_subtree, right_subtree, best_split_values["node_variance_reduction"])

        # Stopping criterion reached or no useful split: make a leaf.
        leaf_node_value = self._calculate_leaf_node_value(Y)

        return Node(leaf_node_value = leaf_node_value)

    def _get_best_split(self, dataset, dataset_features_quant):
        '''Try every (feature, unique value) pair as a split and return the best one.

        Returns a dict with the keys "split_feature_index", "split_treshhold",
        "dataset_left_split", "dataset_right_split" and "node_variance_reduction",
        or an empty dict when no candidate yields two non-empty parts.
        '''

        max_variance_reduction = -float("inf")
        best_split_values = {}

        for current_split_feature_idx in range(dataset_features_quant):
            possible_treshholds = np.unique(dataset[:, current_split_feature_idx])

            for current_split_treshhold in possible_treshholds:
                dataset_left_split, dataset_right_split = self._make_split(dataset, current_split_feature_idx,
                                                                           current_split_treshhold)

                # A split that leaves one side empty separates nothing.
                if len(dataset_left_split) == 0 or len(dataset_right_split) == 0:
                    continue

                current_variance_reduction = self._calculate_variance_reduction(dataset, dataset_left_split,
                                                                                dataset_right_split)

                if current_variance_reduction > max_variance_reduction:
                    best_split_values = {
                        "split_feature_index": current_split_feature_idx,
                        "split_treshhold": current_split_treshhold,
                        "dataset_left_split": dataset_left_split,
                        "dataset_right_split": dataset_right_split,
                        "node_variance_reduction": current_variance_reduction,
                    }
                    max_variance_reduction = current_variance_reduction

        return best_split_values

    def _make_split(self, dataset, dataset_split_feature_index, dataset_split_treshhold):
        '''Split `dataset` on one feature.

        Rows whose feature value is <= the threshold go left, rows whose value
        is > the threshold go right. Both parts keep the 2-D layout of
        `dataset`, even when a part is empty.
        '''

        feature_column = dataset[:, dataset_split_feature_index]

        # Two independent masks (instead of one mask and its negation) so that
        # a value comparing False both ways, such as NaN, lands in neither part.
        goes_left = np.asarray(feature_column <= dataset_split_treshhold, dtype = bool)
        goes_right = np.asarray(feature_column > dataset_split_treshhold, dtype = bool)

        return dataset[goes_left], dataset[goes_right]

    def _calculate_variance_reduction(self, dataset, dataset_left_split, dataset_right_split):
        '''Return the size-weighted variance of the parent's targets minus the
        sum of the size-weighted variances of the two children's targets.'''

        # Targets live in an object-dtype array; convert them to float once so
        # that numpy computes the variance numerically.
        dataset_targets = np.asarray(dataset[:, -1], dtype = float)
        dataset_left_split_targets = np.asarray(dataset_left_split[:, -1], dtype = float)
        dataset_right_split_targets = np.asarray(dataset_right_split[:, -1], dtype = float)

        dataset_variance = len(dataset_targets) * np.var(dataset_targets)
        split_variance = len(dataset_left_split_targets) * np.var(dataset_left_split_targets) + \
                         len(dataset_right_split_targets) * np.var(dataset_right_split_targets)

        return dataset_variance - split_variance

    def _calculate_leaf_node_value(self, dataset_targets):
        '''Return the value stored in a leaf: the mean of the targets that reached it.'''

        return float(np.mean(np.asarray(dataset_targets, dtype = float)))

    def predict(self, data):
        '''Predict a target value for every sample (row) of `data`.

        Returns a numpy array with one prediction per sample.
        '''

        predictions = np.array([self._make_prediction(data_sample, self._root) for data_sample in data])

        return predictions

    def _make_prediction(self, sample, current_node):
        '''Walk the tree from `current_node` down to a leaf and return that leaf's value.'''

        # Only leaves carry a value; internal nodes keep it as None.
        if current_node.leaf_node_value is not None:
            return current_node.leaf_node_value

        feature_value = sample[current_node.split_feature_index]

        if feature_value <= current_node.split_treshhold:
            return self._make_prediction(sample, current_node.left_subtree)

        return self._make_prediction(sample, current_node.right_subtree)

## **class Bagging_using_decision_trees**

In [None]:
class Bagging_using_decision_trees():
    '''Bagging ensemble of Decision_tree_regression models.

    Every tree is trained on its own bootstrap set (rows drawn with
    replacement from the training data); the ensemble prediction is the mean
    of the predictions of the individual trees.
    '''

    def __init__(self, dt_node_min_samples_quant = 2, dt_max_depth = 2, dt_quant = 10, bootstrap_set_size = 10,
                 random_state = None):
        '''Store the ensemble hyperparameters; the trees are built by fit().

        dt_node_min_samples_quant -- passed to every tree as node_min_samples_quant.
        dt_max_depth -- passed to every tree as max_depth.
        dt_quant -- number of trees (and of bootstrap sets).
        bootstrap_set_size -- number of rows drawn for every bootstrap set.
        random_state -- seed (or anything numpy.random.default_rng accepts) for
                        the bootstrap sampling; None gives a different draw
                        on every call.
        '''

        self._dt_node_min_samples_quant = dt_node_min_samples_quant
        self._dt_max_depth = dt_max_depth
        self._dt_quant = dt_quant
        self._bootstrap_set_size = bootstrap_set_size
        self._random_state = random_state
        self._dataset_bootstrap_sets = []
        self._decision_trees_base_algorithms = []

    def fit(self, X, Y):
        '''Train the ensemble on the training set.

        X -- 2-D array-like of features, one row per sample.
        Y -- targets, either 1-D (n_samples,) or 2-D (n_samples, 1).

        Raises ValueError when the training set has no rows.
        '''

        X = np.asarray(X, dtype = object)
        Y = np.asarray(Y, dtype = object)

        # A flat target vector must become a column before it can be glued to X.
        if Y.ndim == 1:
            Y = Y.reshape(-1, 1)

        dataset = np.concatenate((X, Y), axis = 1, dtype = object)

        if len(dataset) == 0:
            raise ValueError("Cannot fit a bagging ensemble on an empty training set")

        self._dataset_bootstrap_sets = self._bootstrap(dataset)

        # Start from scratch so that a repeated fit() does not keep stale trees.
        self._decision_trees_base_algorithms = []

        for dataset_bootstrap_set in self._dataset_bootstrap_sets:
            bootstrap_X = dataset_bootstrap_set[:, :-1]
            # Slice with -1: (not -1) to keep the target as a 2-D column.
            bootstrap_Y = dataset_bootstrap_set[:, -1:]

            model = Decision_tree_regression(node_min_samples_quant = self._dt_node_min_samples_quant,
                                             max_depth = self._dt_max_depth)
            model.fit(bootstrap_X, bootstrap_Y)

            self._decision_trees_base_algorithms.append(model)

    def _bootstrap(self, dataset):
        '''Return a list of dt_quant bootstrap sets.

        Each set consists of bootstrap_set_size rows of `dataset` drawn
        uniformly at random with replacement.
        '''

        rng = np.random.default_rng(self._random_state)
        dataset_rows_quant = len(dataset)

        return [dataset[rng.integers(0, dataset_rows_quant, size = self._bootstrap_set_size)]
                for _ in range(self._dt_quant)]

    def predict(self, data):
        '''Predict a target value for every sample (row) of `data`.

        Returns a 1-D numpy array holding, for each sample, the mean of the
        predictions of all trained trees. Raises ValueError if fit() has not
        been called yet.
        '''

        return self._predict(data)

    def _predict(self, data):
        '''Average the per-tree predictions for `data` (implementation of predict()).'''

        if not self._decision_trees_base_algorithms:
            raise ValueError("The model has not been fitted yet; call fit() first")

        # Shape (number of trees, number of samples).
        trees_predictions = np.array([model.predict(data) for model in self._decision_trees_base_algorithms],
                                     dtype = float)

        return trees_predictions.mean(axis = 0)

## **initialize input data**

In [2]:
# Load the dataset from a CSV file. The absolute path on drive D: is specific
# to the author's machine and has to be adjusted to run the notebook elsewhere.
df = pd.read_csv("D://ADMP_Anastasia/machine_learning/datasets/dataset_for_bagging_using_decision_trees.csv")
# Bare expression: as the last statement of a notebook cell it displays the DataFrame.
df

NameError: name 'pd' is not defined

In [None]:
# Features: every column of df except the last one.
X = df.iloc[:, :-1].values
# Targets: the last column of df, reshaped into a single-column 2-D array.
Y = df.iloc[:, -1].values.reshape(-1,1)

In [None]:
# Hold out one third of the rows as the test set; the fixed random_state
# makes the shuffle, and therefore the split, reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 1/3, random_state = 42)

# Number of rows in the test features and in the test targets.
X_test_len = len(X_test)
Y_test_len = len(Y_test)

## **creating and training the bagging regression model on our dataset**

In [None]:
# Create the bagging model with explicit hyperparameters (the dt_* values are
# the settings for the underlying decision trees) and train it on the
# training part of the data.
model = Bagging_using_decision_trees(dt_node_min_samples_quant = 2, dt_max_depth = 7,
                                     dt_quant = 10, bootstrap_set_size = 20)
model.fit(X_train, Y_train)

## **making predictions**

In [None]:
# Predict the targets of the test samples.
Y_pred = model.predict(X_test)
# Reshape into a single-column 2-D array, the same layout Y was given above.
Y_pred = Y_pred.reshape(-1, 1)
# Number of predictions.
Y_pred_len = len(Y_pred)

## **displaying a graph with a test sample, true targets and model predictions**

In [None]:
# Plot data: the first feature column on the horizontal axis, the true test
# targets and the model predictions on the vertical axis.
graph_X_values = X_test[:, 0]
graph_Y_test_values = Y_test
grapg_Y_pred_values = Y_pred

# One 7 x 5 inch figure with a single pair of axes.
figure, ax = plt.subplots()
figure.set_size_inches(7, 5)

ax.set_title('Тестовая выборка и предсказания модели')

# True targets are drawn as crosses ...
ax.scatter(
    graph_X_values,
    graph_Y_test_values,
    color = "tomato",
    label = "Тестовая выборка",
    marker = "x",
)
# ... and predictions as half-transparent circles on top of them.
ax.scatter(
    graph_X_values,
    grapg_Y_pred_values,
    color = "gold",
    edgecolors = 'orange',
    label = "Предсказания модели",
    alpha = 0.5,
)

ax.legend()

plt.show()

## **displaying the mean squared error of our predictions**

In [None]:
# Mean squared error between the true test targets and the predictions.
mean_squared_error_value = mean_squared_error(Y_test, Y_pred)
# Bare expression: as the last statement of a notebook cell it displays the value.
mean_squared_error_value

## **displaying a table with test data and targets**

In [None]:
# Table of the test features next to the true targets. The five column names
# require X_test to have exactly four feature columns.
test_df = pd.DataFrame(np.concatenate((X_test, Y_test), axis = 1),
                       columns = ["x", "x**3", "x**5", "x**7", "y"])
# Display the first rows of the table.
test_df.head()

## **displaying a table with test data and our model predictions**

In [None]:
# Table of the test features next to the model predictions; here the "y"
# column holds Y_pred, not the true targets. The five column names require
# X_test to have exactly four feature columns.
pred_df = pd.DataFrame(np.concatenate((X_test, Y_pred), axis = 1),
                       columns = ["x", "x**3", "x**5", "x**7", "y"])
# Display the first rows of the table.
pred_df.head()

NameError: name 'pd' is not defined