# **Decision tree regression**

## **Libraries**

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from collections import Counter 

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

## **Decision tree regressor components**

### **Node class**

In [2]:
class Node:
    """One vertex of the regression tree.

    A decision node carries the split description (feature index, threshold,
    both child subtrees and the variance reduction of the split). A leaf is
    created with only ``leaf_node_value`` set, leaving every other field at
    its ``None`` default.
    """

    def __init__(
        self,
        split_idx=None,
        treshhold=None,
        l_subtree=None,
        r_subtree=None,
        var_reduction=None,
        leaf_node_value=None,
    ):
        # Prediction stored in a leaf; None marks a decision node.
        self.leaf_node_value = leaf_node_value

        # Split rule: rows with feature[split_idx] <= treshhold go left.
        self.split_idx, self.treshhold = split_idx, treshhold

        # Children reached by the left / right branch of the split.
        self.l_subtree, self.r_subtree = l_subtree, r_subtree

        # Variance reduction obtained by this split.
        self.var_reduction = var_reduction

### **DT regressor**

In [3]:
class Decision_tree_regression:
    """Regression tree grown greedily by maximising variance reduction.

    Every decision node sends a row left when ``row[split_idx] <= treshhold``
    and right otherwise; every leaf predicts the mean target of the training
    rows that reached it.

    Parameters
    ----------
    n_min_samples : int, default=2
        A node is only considered for splitting when it holds at least this
        many rows.
    max_depth : int, default=2
        A node at depth ``d`` (the root is depth 0) is considered for
        splitting while ``d <= max_depth``, so the deepest leaves can sit at
        depth ``max_depth + 1``.
    """

    def __init__(self, n_min_samples=2, max_depth=2):
        self.n_min_samples = n_min_samples
        self.max_depth = max_depth

        # Root node of the fitted tree; stays None until fit() has run.
        self.root = None

    def fit(self, X, y):
        """Grow the tree from a feature matrix and its targets.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,) or (n_samples, 1)
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # A flat target vector is the common case; turn it into a column so
        # it can be appended to X as the last column.
        if y.ndim == 1:
            y = y.reshape(-1, 1)

        data = np.concatenate((X, y), axis=1)
        self.root = self.build_tree(data)

    def build_tree(self, data, current_depth=0):
        """Recursively build the subtree for ``data`` and return its root.

        ``data`` holds the features in all columns but the last, and the
        target in the last column.
        """
        n_rows = data.shape[0]
        n_columns = data.shape[1] - 1

        if n_rows >= self.n_min_samples and current_depth <= self.max_depth:
            best_split_values = self.get_best_split(data, n_columns)

            # An empty dict means no threshold produced two non-empty sides;
            # a non-positive reduction means splitting would not help.
            if best_split_values and best_split_values["var_reduction"] > 0:
                l_subtree = self.build_tree(
                    best_split_values["l_split"], current_depth + 1
                )
                r_subtree = self.build_tree(
                    best_split_values["r_split"], current_depth + 1
                )

                return Node(
                    best_split_values["split_idx"],
                    best_split_values["treshhold"],
                    l_subtree,
                    r_subtree,
                    best_split_values["var_reduction"]
                )

        return Node(leaf_node_value=self.calculate_leaf_node(data[:, -1]))

    def get_best_split(self, data, n_columns):
        """Search every feature/threshold pair for the largest variance reduction.

        Returns a dict with keys ``split_idx``, ``treshhold``, ``l_split``,
        ``r_split`` and ``var_reduction``, or an empty dict when no candidate
        leaves rows on both sides. Ties keep the first candidate found.
        """
        max_var_reduction = -1
        best_split_values = {}

        for split_feature_idx in range(n_columns):
            # np.unique returns sorted values. The last one can never leave
            # rows on the right side (nothing is greater than it), so it is
            # dropped up front instead of being split and then rejected.
            treshholds = np.unique(data[:, split_feature_idx])[:-1]

            for treshhold in treshholds:
                l_split, r_split = self.make_split(
                    data, split_feature_idx, treshhold
                )

                if len(l_split) == 0 or len(r_split) == 0:
                    continue

                var_reduction = self.calculate_var_reduction(
                    data, l_split, r_split
                )

                if var_reduction > max_var_reduction:
                    best_split_values = {
                        "split_idx": split_feature_idx,
                        "treshhold": treshhold,
                        "l_split": l_split,
                        "r_split": r_split,
                        "var_reduction": var_reduction,
                    }
                    max_var_reduction = var_reduction

        return best_split_values

    def make_split(self, data, split_idx, treshhold):
        """Partition rows into ``feature <= treshhold`` and ``feature > treshhold``.

        Both parts are returned as 2-D arrays, also when one of them is empty.
        """
        data = np.asarray(data)
        feature = data[:, split_idx]

        # Two explicit comparisons (rather than one mask and its negation)
        # so that a NaN feature value lands on neither side, exactly as a
        # row-by-row comparison would behave.
        l_split = data[feature <= treshhold]
        r_split = data[feature > treshhold]

        return l_split, r_split

    def calculate_var_reduction(self, data, l_split, r_split):
        """Return the size-weighted variance removed by a split.

        ``n * var(targets)`` is the sum of squared deviations from the mean,
        so the result is the parent's squared error minus the combined
        squared error of the two children.
        """
        targets = data[:, -1]
        l_split_targets = l_split[:, -1]
        r_split_targets = r_split[:, -1]

        data_var = len(targets) * np.var(targets)
        l_split_var = len(l_split_targets) * np.var(l_split_targets)
        r_split_var = len(r_split_targets) * np.var(r_split_targets)

        return data_var - (l_split_var + r_split_var)

    def calculate_leaf_node(self, targets):
        """Return the value a leaf predicts: the mean of its targets."""
        return np.mean(targets)

    def predict(self, data):
        """Predict a target for every row of ``data``.

        Returns an array of shape ``(n_rows, 1)``.

        Raises
        ------
        AttributeError
            If the model has not been fitted. This is the exception type an
            unfitted model already produced; it now carries a clear message.
        """
        if self.root is None:
            raise AttributeError(
                "Decision_tree_regression is not fitted yet; call fit() first."
            )

        predictions = np.array([
            self.make_prediction(x, self.root) for x in data
        ])

        return predictions.reshape(-1, 1)

    def make_prediction(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        # Identity check: a leaf is any node whose value is not None.
        while node.leaf_node_value is None:
            if x[node.split_idx] <= node.treshhold:
                node = node.l_subtree
            else:
                node = node.r_subtree

        return node.leaf_node_value

## **Working with data**

### **Load data**

In [4]:
# NOTE(review): absolute path on the author's machine; point it at your own
# copy of the CSV before running this cell elsewhere.
csv_path = "D://admp_northallerton/my_machine_learning/datas/regression_data.csv"
data = pd.read_csv(csv_path)
data

Unnamed: 0,x,x**3,x**5,x**7,y
0,-2.074295,-8.925074,-3.840193e+01,-1.652321e+02,-3.768735
1,2.359096,13.129152,7.306810e+01,4.066483e+02,3.923851
2,2.820229,22.431223,1.784110e+02,1.419026e+03,2.512261
3,-0.035897,-0.000046,-5.960887e-08,-7.681310e-11,-0.000046
4,-0.791447,-0.495754,-3.105348e-01,-1.945155e-01,-0.445595
...,...,...,...,...,...
295,-2.650994,-18.630574,-1.309314e+02,-9.201555e+02,-3.311162
296,-2.554505,-16.669404,-1.087761e+02,-7.098178e+02,-3.614725
297,-2.871734,-23.682768,-1.953083e+02,-1.610679e+03,-2.198575
298,-2.158768,-10.060459,-4.688454e+01,-2.184950e+02,-3.877667


### **Preprocessing data**

In [5]:
# Every column but the last is a feature; the last column is the target,
# kept as an (n, 1) column vector so it can be concatenated with X.
X = data.iloc[:, :-1].to_numpy()
y = data.iloc[:, -1].to_numpy().reshape(-1, 1)

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
split_arrays = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_arrays

## **Model**

In [7]:
# Only nodes holding at least 3 rows are split, and nodes deeper than
# depth 6 are never split.
tree_params = {"n_min_samples": 3, "max_depth": 6}
model = Decision_tree_regression(**tree_params)
model.fit(X_train, y_train)

## **Test**

In [8]:
# One prediction per test row, returned as an (n, 1) column vector.
y_pred = model.predict(X_test)

In [9]:
# Mean squared error between the held-out targets and the tree's predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

0.013693893093191491

In [10]:
# Test features with the true target appended as the last column.
test_columns = ["x", "x**3", "x**5", "x**7", "y"]
test_matrix = np.concatenate([X_test, y_test], axis=1)
test_data = pd.DataFrame(test_matrix, columns=test_columns)
test_data.head()

Unnamed: 0,x,x**3,x**5,x**7,y
0,-2.918149,-24.849758,-211.610369,-1801.987284,-1.886965
1,-3.112051,-30.139792,-291.899774,-2827.009467,-0.286062
2,2.194387,10.566701,50.882178,245.014604,3.909025
3,-2.355538,-13.069849,-72.518855,-402.375298,-3.925998
4,-2.265922,-11.634161,-59.73448,-306.700953,-3.943084


In [11]:
# Same test features, but here the "y" column holds the model's predictions.
pred_columns = ["x", "x**3", "x**5", "x**7", "y"]
pred_matrix = np.concatenate([X_test, y_pred], axis=1)
pred_data = pd.DataFrame(pred_matrix, columns=pred_columns)
pred_data.head()

Unnamed: 0,x,x**3,x**5,x**7,y
0,-2.918149,-24.849758,-211.610369,-1801.987284,-1.896429
1,-3.112051,-30.139792,-291.899774,-2827.009467,-0.57175
2,2.194387,10.566701,50.882178,245.014604,3.887526
3,-2.355538,-13.069849,-72.518855,-402.375298,-3.819603
4,-2.265922,-11.634161,-59.73448,-306.700953,-3.819603
