# **Decision tree regression**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter 

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
class Node:
    """One node of the regression tree.

    A node built with split information (feature index, threshold and the
    two subtrees) acts as an internal decision node; a node built with only
    ``leaf_node_value`` acts as a leaf. Every field defaults to ``None``.
    """

    def __init__(
        self,
        split_feature_idx=None,
        treshhold=None,
        left_subtree=None,
        right_subtree=None,
        variance_reduction=None,
        leaf_node_value=None
    ):
        # --- decision-node fields -------------------------------------
        # Column index of the feature that is compared at this node.
        self.split_feature_idx = split_feature_idx
        # Value the feature is compared against.
        self.treshhold = treshhold
        # Child nodes on either side of the split.
        self.left_subtree = left_subtree
        self.right_subtree = right_subtree
        # Gain recorded for the split stored on this node.
        self.variance_reduction = variance_reduction

        # --- leaf-node field ------------------------------------------
        # Prediction stored at a leaf; stays None on decision nodes.
        self.leaf_node_value = leaf_node_value

In [3]:
class Decision_tree_regression:
    """Regression tree grown greedily by maximising variance reduction.

    Internal nodes hold a feature index and a threshold: samples whose
    feature value is ``<=`` the threshold go to the left subtree, the others
    to the right subtree. Leaves hold the mean target of the training rows
    that reached them. Nodes are instances of the ``Node`` class defined in
    this file.

    Parameters
    ----------
    n_min_samples : int, default 2
        A node is only considered for splitting when it holds at least this
        many rows.
    max_depth : int, default 2
        A node is only considered for splitting while its depth (root = 0)
        is ``<= max_depth``; leaves can therefore sit at depth
        ``max_depth + 1``.
    """

    def __init__(self, n_min_samples=2, max_depth=2):
        self.n_min_samples = n_min_samples
        self.max_depth = max_depth

        # Root ``Node`` of the fitted tree; stays None until ``fit`` is called.
        self.root = None

    def fit(self, X, y):
        """Grow the tree from features ``X`` and targets ``y``.

        ``X`` must be 2-D ``(n_samples, n_features)``. ``y`` may be a column
        ``(n_samples, 1)`` or a flat ``(n_samples,)`` vector.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.ndim == 1:
            # Make y a column so it can be appended to X as the last column.
            y = y.reshape(-1, 1)

        data = np.concatenate((X, y), axis=1)
        self.root = self.build_tree(data)

    def build_tree(self, data, current_depth=0):
        """Recursively build the subtree for ``data`` and return its root.

        ``data`` is a 2-D array whose last column is the target and whose
        other columns are the features.
        """
        X = data[:, :-1]
        y = data[:, -1]
        n_rows, n_columns = X.shape

        may_split = (
            n_rows >= self.n_min_samples
            and current_depth <= self.max_depth
        )
        if may_split:
            best_split_values = self.get_best_split(data, n_columns)

            # An empty dict means no threshold separated the rows; a
            # non-positive gain means splitting would not reduce variance.
            if best_split_values and best_split_values["variance_reduction"] > 0:
                left_subtree = self.build_tree(
                    best_split_values["data_left_split"], current_depth + 1
                )
                right_subtree = self.build_tree(
                    best_split_values["data_right_split"], current_depth + 1
                )

                return Node(
                    best_split_values["split_feature_index"],
                    best_split_values["treshhold"],
                    left_subtree,
                    right_subtree,
                    best_split_values["variance_reduction"]
                )

        return Node(leaf_node_value=self.calculate_leaf_node_value(y))

    def get_best_split(self, data, n_columns):
        """Search every feature/threshold pair for the largest gain.

        Candidate thresholds are the unique values of each of the first
        ``n_columns`` columns. Returns a dict with the keys
        ``split_feature_index``, ``treshhold``, ``data_left_split``,
        ``data_right_split`` and ``variance_reduction``, or an empty dict
        when no candidate leaves rows on both sides.
        """
        max_variance_reduction = -1
        best_split_values = {}

        for split_feature_idx in range(n_columns):
            possible_treshholds = np.unique(data[:, split_feature_idx])

            for treshhold in possible_treshholds:
                data_left_split, data_right_split = self.make_split(
                    data, split_feature_idx, treshhold
                )

                # A split that leaves one side empty separates nothing.
                if len(data_left_split) == 0 or len(data_right_split) == 0:
                    continue

                variance_reduction = self.calculate_variance_reduction(
                    data, data_left_split, data_right_split
                )

                # Strict ">" keeps the first candidate among equal gains.
                if variance_reduction > max_variance_reduction:
                    best_split_values = {
                        "split_feature_index": split_feature_idx,
                        "treshhold": treshhold,
                        "data_left_split": data_left_split,
                        "data_right_split": data_right_split,
                        "variance_reduction": variance_reduction,
                    }
                    max_variance_reduction = variance_reduction

        return best_split_values

    def make_split(
        self, data, split_feature_index, treshhold
    ):
        """Partition the rows of ``data`` on one feature.

        Returns ``(left, right)`` where ``left`` holds the rows whose value
        in column ``split_feature_index`` is ``<= treshhold`` and ``right``
        the rows whose value is ``> treshhold``. An empty side is returned
        as an array with zero rows and the same number of columns.
        """
        data = np.asarray(data)
        feature_values = data[:, split_feature_index]

        # Two explicit comparisons (rather than negating one mask) so that
        # rows failing both tests, e.g. NaN values, land on neither side.
        data_left_split = data[feature_values <= treshhold]
        data_right_split = data[feature_values > treshhold]

        return data_left_split, data_right_split

    def calculate_variance_reduction(
        self, data, data_left_split, data_right_split
    ):
        """Return the size-weighted variance removed by a split.

        Computed as ``n * var(parent targets)`` minus
        ``n_left * var(left targets) + n_right * var(right targets)``,
        where the targets are the last column of each array.
        """
        data_targets = data[:, -1]
        data_left_split_targets = data_left_split[:, -1]
        data_right_split_targets = data_right_split[:, -1]

        data_variance = len(data_targets) * np.var(data_targets)
        split_variance = (
            len(data_left_split) * np.var(data_left_split_targets)
            + len(data_right_split) * np.var(data_right_split_targets)
        )

        return data_variance - split_variance

    def calculate_leaf_node_value(self, targets):
        """Return the prediction stored in a leaf: the mean of ``targets``."""
        return np.mean(targets)

    def predict(self, data):
        """Predict a target for every row of ``data``.

        Returns an array of shape ``(n_samples, 1)``.
        """
        predictions = np.array(
            [self.make_prediction(sample, self.root) for sample in data]
        )

        return predictions.reshape(-1, 1)

    def make_prediction(self, sample, node):
        """Walk from ``node`` down to a leaf for one ``sample``.

        At each decision node the sample goes left when its value for the
        node's split feature is ``<=`` the node's threshold, otherwise
        right. Returns the ``leaf_node_value`` of the leaf that is reached.
        """
        # A node is a leaf exactly when it carries a leaf value.
        while node.leaf_node_value is None:
            feature_value = sample[node.split_feature_idx]

            if feature_value <= node.treshhold:
                node = node.left_subtree
            else:
                node = node.right_subtree

        return node.leaf_node_value

In [4]:
# Load the regression dataset from a hard-coded local path; the path is
# machine-specific and has to be adjusted when running elsewhere.
data = pd.read_csv(
    "D://admp_northallerton/my_machine_learning/datas/regression_data.csv"
)
# Bare expression so the notebook cell displays the loaded frame.
data

Unnamed: 0,x,x**3,x**5,x**7,y
0,-2.074295,-8.925074,-3.840193e+01,-1.652321e+02,-3.768735
1,2.359096,13.129152,7.306810e+01,4.066483e+02,3.923851
2,2.820229,22.431223,1.784110e+02,1.419026e+03,2.512261
3,-0.035897,-0.000046,-5.960887e-08,-7.681310e-11,-0.000046
4,-0.791447,-0.495754,-3.105348e-01,-1.945155e-01,-0.445595
...,...,...,...,...,...
295,-2.650994,-18.630574,-1.309314e+02,-9.201555e+02,-3.311162
296,-2.554505,-16.669404,-1.087761e+02,-7.098178e+02,-3.614725
297,-2.871734,-23.682768,-1.953083e+02,-1.610679e+03,-2.198575
298,-2.158768,-10.060459,-4.688454e+01,-2.184950e+02,-3.877667


In [5]:
# Features: every column except the last one.
X = data.iloc[:, :-1].values
# Target: the last column, reshaped to an (n, 1) column so that fit() can
# concatenate it with X along axis 1.
y = data.iloc[:, -1].values.reshape(-1, 1)

In [6]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [7]:
# Nodes are only considered for splitting when they hold at least 3 rows and
# sit at depth <= 6 (root = 0).
model = Decision_tree_regression(n_min_samples=3, max_depth=6)
model.fit(X_train, y_train)

In [8]:
# Predict the held-out rows; predict() returns an (n, 1) column.
y_pred = model.predict(X_test)

In [9]:
# Mean squared error between the held-out targets and the predictions.
mse = mean_squared_error(y_test, y_pred)
# Bare expression so the notebook cell displays the score.
mse

0.013693893093191491

In [10]:
# Test features side by side with the true targets (last column "y").
test_data = pd.DataFrame(
    np.concatenate((X_test, y_test), axis=1), columns=["x", "x**3", "x**5", "x**7", "y"]
)
test_data.head()

Unnamed: 0,x,x**3,x**5,x**7,y
0,-2.918149,-24.849758,-211.610369,-1801.987284,-1.886965
1,-3.112051,-30.139792,-291.899774,-2827.009467,-0.286062
2,2.194387,10.566701,50.882178,245.014604,3.909025
3,-2.355538,-13.069849,-72.518855,-402.375298,-3.925998
4,-2.265922,-11.634161,-59.73448,-306.700953,-3.943084


In [11]:
# Same test features, but the last column "y" now holds the model's
# predictions, for comparison with test_data.
pred_data = pd.DataFrame(
    np.concatenate((X_test, y_pred), axis=1), columns=["x", "x**3", "x**5", "x**7", "y"]
)
pred_data.head()

Unnamed: 0,x,x**3,x**5,x**7,y
0,-2.918149,-24.849758,-211.610369,-1801.987284,-1.896429
1,-3.112051,-30.139792,-291.899774,-2827.009467,-0.57175
2,2.194387,10.566701,50.882178,245.014604,3.887526
3,-2.355538,-13.069849,-72.518855,-402.375298,-3.819603
4,-2.265922,-11.634161,-59.73448,-306.700953,-3.819603
