In [1]:
# Data wrangling 
import pandas as pd 

# Array math
import numpy as np 

# Quick value count calculator
from collections import Counter


class NodeRegression():
    """
    Class to grow a regression decision tree
    """
    def __init__(
        this, 
        Y: list,
        X: pd.DataFrame,
        min_samples_split=None,
        max_depth=None,
        depth=None,
        node_type=None,
        rule=None
    ):
        # Saving the data to the node 
        this.Y = Y 
        this.X = X

        # Saving the hyper parameters
        this.min_samples_split = min_samples_split if min_samples_split else 20
        this.max_depth = max_depth if max_depth else 5

        # Default current depth of node 
        this.depth = depth if depth else 0

        # Extracting all the features
        this.features = list(this.X.columns)

        # Type of node 
        this.node_type = node_type if node_type else 'root'

        # Rule for spliting 
        this.rule = rule if rule else ""

        # Getting the mean of Y 
        this.ymean = np.mean(Y)

        # Getting the residuals 
        this.residuals = this.Y - this.ymean

        # Calculating the mse of the node 
        this.mse = this.get_mse(Y, this.ymean)

        # Saving the number of observations in the node 
        this.n = len(Y)

        # Initiating the left and right nodes as empty nodes
        this.left = None 
        this.right = None 

        # Default values for splits
        this.best_feature = None 
        this.best_value = None 

    @staticmethod
    def get_mse(ytrue, yhat) -> float:
        """
        Method to calculate the mean squared error 
        """
        # Getting the total number of samples
        n = len(ytrue)

        # Getting the residuals 
        r = ytrue - yhat 

        # Squering the residuals 
        r = r ** 2

        # Suming 
        r = np.sum(r)

        # Getting the average and returning 
        return r / n

    @staticmethod
    def ma(x: np.array, window: int) -> np.array:
        """
        Calculates the moving average of the given list. 
        """
        return np.convolve(x, np.ones(window), 'valid') / window

    def best_split(this) -> tuple:
        """
        Given the X features and Y targets calculates the best split 
        for a decision tree
        """
        # Creating a dataset for spliting
        df = this.X.copy()
        df['Y'] = this.Y

        # Getting the GINI impurity for the base input 
        mse_base = this.mse

        # Finding which split yields the best GINI gain 
        #max_gain = 0

        # Default best feature and split
        best_feature = None
        best_value = None

        for feature in this.features:
            # Droping missing values
            Xdf = df.dropna().sort_values(feature)

            # Sorting the values and getting the rolling average
            xmeans = this.ma(Xdf[feature].unique(), 2)

            for value in xmeans:
                # Getting the left and right ys 
                left_y = Xdf[Xdf[feature]<value]['Y'].values
                right_y = Xdf[Xdf[feature]>=value]['Y'].values

                # Getting the means 
                left_mean = np.mean(left_y)
                right_mean = np.mean(right_y)

                # Getting the left and right residuals 
                res_left = left_y - left_mean 
                res_right = right_y - right_mean

                # Concatenating the residuals 
                r = np.concatenate((res_left, res_right), axis=None)

                # Calculating the mse 
                n = len(r)
                r = r ** 2
                r = np.sum(r)
                mse_split = r / n

                # Checking if this is the best split so far 
                if mse_split < mse_base:
                    best_feature = feature
                    best_value = value 

                    # Setting the best gain to the current one 
                    mse_base = mse_split

        return (best_feature, best_value)

    def grow_tree(this):
        """
        Recursive method to create the decision tree
        """
        # Making a df from the data 
        df = this.X.copy()
        df['Y'] = this.Y

        # If there is GINI to be gained, we split further 
        if (this.depth < this.max_depth) and (this.n >= this.min_samples_split):

            # Getting the best split 
            best_feature, best_value = this.best_split()

            if best_feature is not None:
                # Saving the best split to the current node 
                this.best_feature = best_feature
                this.best_value = best_value

                # Getting the left and right nodes
                left_df, right_df = df[df[best_feature]<=best_value].copy(), df[df[best_feature]>best_value].copy()

                # Creating the left and right nodes
                left = NodeRegression(
                    left_df['Y'].values.tolist(), 
                    left_df[this.features], 
                    depth=this.depth + 1, 
                    max_depth=this.max_depth, 
                    min_samples_split=this.min_samples_split, 
                    node_type='left_node',
                    rule=f"{best_feature} <= {round(best_value, 3)}"
                    )

                this.left = left 
                this.left.grow_tree()

                right = NodeRegression(
                    right_df['Y'].values.tolist(), 
                    right_df[this.features], 
                    depth=this.depth + 1, 
                    max_depth=this.max_depth, 
                    min_samples_split=this.min_samples_split,
                    node_type='right_node',
                    rule=f"{best_feature} > {round(best_value, 3)}"
                    )

                this.right = right
                this.right.grow_tree()

    def print_info(this, width=4):
        """
        Method to print the infromation about the tree
        """
        # Defining the number of spaces 
        const = int(this.depth * width ** 1.5)
        spaces = "-" * const
        
        if this.node_type == 'root':
            print("Root")
        else:
            print(f"|{spaces} Split rule: {this.rule}")
        print(f"{' ' * const}   | MSE of the node: {round(this.mse, 2)}")
        print(f"{' ' * const}   | Count of observations in node: {this.n}")
        print(f"{' ' * const}   | Prediction of node: {round(this.ymean, 3)}")   

    def print_tree(this):
        """
        Prints the whole tree from the current node to the bottom
        """
        this.print_info() 
        
        if this.left is not None: 
            this.left.print_tree()
        
        if this.right is not None:
            this.right.print_tree()

In [10]:
# Reading data
d = pd.read_csv("auto-mpg.csv")
# Subsetting
d = d[d['horsepower']!='?']
# Constructing the X and Y matrices
features = ['horsepower', 'weight']
# Ensuring the correct types 
for ft in features:
    d[ft] = pd.to_numeric(d[ft])
# Constructing the X and Y matrices
X = d[features]
Y = d['mpg'].values.tolist()

  res_values = method(rvalues)


In [7]:
# Initiating the Node
root = NodeRegression(Y, X, max_depth=2, min_samples_split=3)
# Growing the tree
root.grow_tree()

In [7]:
# Reading data
d_vw = pd.read_csv("Data/vw.csv")
d_vw.reset_index(drop=True, inplace=True)

# Constructing the X and Y matrices
features = ["year","mileage","tax","mpg","engineSize"]
# Ensuring the correct types
for ft in features:
    d_vw[ft] = pd.to_numeric(d_vw[ft])

# Constructing the X and Y matrices
X_vw = d_vw[features]
Y_vw = d_vw['price'].values.tolist()

In [14]:
 from sklearn.model_selection import train_test_split
 X_train, X_test, y_train, y_test = train_test_split(X_vw, Y_vw, random_state=0)

In [15]:
# Initiating the Node
root = NodeRegression(y_train, X_train, max_depth=2, min_samples_split=3)
# Growing the tree
root.grow_tree()

In [17]:
# Printing tree
root.print_tree()

Root
   | MSE of the node: 59808615.31
   | Count of observations in node: 11367
   | Prediction of node: 16830.805
|-------- Split rule: mpg <= 50.9
           | MSE of the node: 61054768.38
           | Count of observations in node: 4938
           | Prediction of node: 22258.837
|---------------- Split rule: mpg <= 35.1
                   | MSE of the node: 91222696.96
                   | Count of observations in node: 579
                   | Prediction of node: 34684.128
|---------------- Split rule: mpg > 35.1
                   | MSE of the node: 33816543.2
                   | Count of observations in node: 4359
                   | Prediction of node: 20608.403
|-------- Split rule: mpg > 50.9
           | MSE of the node: 18839036.48
           | Count of observations in node: 6429
           | Prediction of node: 12661.63
|---------------- Split rule: engineSize <= 1.3
                   | MSE of the node: 6798986.55
                   | Count of observations in node: 2548

In [21]:
from sklearn.tree import DecisionTreeRegressor
from sklearn import tree

DTR = DecisionTreeRegressor(max_depth=2, min_samples_split=3)
DTR.fit(X_vw, Y_vw)
y_pred = DTR.predict(X_test)


In [22]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test,y_pred)

25715893.696507894

In [11]:
print(tree.export_text(DTR))

|--- feature_3 <= 50.90
|   |--- feature_3 <= 35.10
|   |   |--- value: [34664.28]
|   |--- feature_3 >  35.10
|   |   |--- value: [20660.66]
|--- feature_3 >  50.90
|   |--- feature_4 <= 1.30
|   |   |--- value: [10030.64]
|   |--- feature_4 >  1.30
|   |   |--- value: [14425.99]



SCALING DATA:

In [25]:
data_vw = pd.read_csv("Data/vw.csv")
data_vw_expanded = pd.get_dummies(data_vw)

In [26]:
from sklearn.preprocessing import StandardScaler

std = StandardScaler()
data_vw_expanded_std = std.fit_transform(data_vw_expanded)
data_vw_expanded_std = pd.DataFrame(data_vw_expanded_std, columns = data_vw_expanded.columns)
print(data_vw_expanded_std.shape)
data_vw_expanded_std.head()

(15157, 40)


Unnamed: 0,year,price,mileage,tax,mpg,engineSize,model_ Amarok,model_ Arteon,model_ Beetle,model_ CC,...,model_ Touareg,model_ Touran,model_ Up,transmission_Automatic,transmission_Manual,transmission_Semi-Auto,fuelType_Diesel,fuelType_Hybrid,fuelType_Other,fuelType_Petrol
0,0.849595,1.052392,-0.387209,0.50812,-0.304459,0.864902,-0.085892,-0.128974,-0.074204,-0.079418,...,-0.156643,-0.154194,-0.248868,2.594834,-1.280856,-0.576411,1.174175,-0.09828,-0.075981,-1.138035
1,0.849595,1.295211,-0.828948,0.50812,-0.304459,0.864902,-0.085892,-0.128974,-0.074204,-0.079418,...,-0.156643,-0.154194,-0.248868,2.594834,-1.280856,-0.576411,1.174175,-0.09828,-0.075981,-1.138035
2,0.849595,0.407627,-0.69409,0.50812,-0.245816,0.864902,-0.085892,-0.128974,-0.074204,-0.079418,...,-0.156643,-0.154194,-0.248868,-0.385381,0.780728,-0.576411,1.174175,-0.09828,-0.075981,-1.138035
3,0.849595,2.147462,-0.816512,0.50812,-1.557966,0.864902,-0.085892,-0.128974,-0.074204,-0.079418,...,-0.156643,-0.154194,-0.248868,2.594834,-1.280856,-0.576411,-0.851661,-0.09828,-0.075981,0.878707
4,0.849595,0.781591,-0.737309,0.586884,-1.022843,-0.218101,-0.085892,-0.128974,-0.074204,-0.079418,...,-0.156643,-0.154194,-0.248868,-0.385381,-1.280856,1.734874,-0.851661,-0.09828,-0.075981,0.878707


In [38]:
# Reading data
raw_d_vw = pd.read_csv("Data/vw.csv")
raw_d_vw.reset_index(drop=True, inplace=True)

# Constructing the X and Y matrices
features = ["year","mileage","tax","mpg","engineSize","price"]
# Ensuring the correct types
for ft in features:
    d_vw[ft] = pd.to_numeric(raw_d_vw[ft])

# Constructing the X and Y matrices
#X_vw = d_vw[features]
#Y_vw = d_vw['price'].values.tolist()



In [40]:
std = StandardScaler()
d_vw_scaled = std.fit_transform(d_vw)
d_vw_scaled = pd.DataFrame(d_vw_scaled, columns = feat)
print(d_vw_scaled.shape)
d_vw_scaled.head()

AttributeError: 'numpy.ndarray' object has no attribute 'columns'

In [27]:
X_train, X_test, y_train, y_test = train_test_split(data_vw_expanded_std.drop(columns = ['price']), data_vw_expanded_std[['price']])

In [28]:
# Initiating the Node
root = NodeRegression(y_train, X_train, max_depth=2, min_samples_split=3)
# Growing the tree
root.grow_tree()
# Printing tree
root.print_tree()

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [30]:
DTR = DecisionTreeRegressor(max_depth=2, min_samples_split=3)
DTR.fit(X_train, y_train)
y_pred = DTR.predict(X_test)
mean_squared_error(y_test,y_pred)

0.4234713519350358