In [40]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.tree import DecisionTreeRegressor

##### Breakdown of functions

* **`__init__`**: <br>

> - This function initializes a tree with x (features matrix) <br>
- y (the label) <br>
- min_leaf (minimum number of samples in leaf node) <br>
- n (total number of rows) <br>
- c (number of columns) <br>
- val (mean of label at a node) <br>
- score (initialized to inf) -- stores the weighted variance of the best split found so far <br>
- find_varsplit() -- splits the node and creates the left and right subtrees <br>

* __find_better_split__: Find the best split for a single feature column in `self.x` <br>

> - set x and y to feature column and label
 - sort the values of x and corresponding y
 - `_cnt` stores the number of data points in a node
 - `_sum` stores sum of labels in a node
 - `_sum2` stores the sum of the squared labels in a node
 - `_sum` and `_sum2` are used for the variance calculation: variance = `_sum2/_cnt - (_sum/_cnt)**2`
>> - Initially all the elements are assigned to the right node
    - Then move elements one at a time from the right node to the left node; once the left node holds at least `min_leaf` elements, calculate the weighted variance across the left and right nodes after every element moved
    - Store the split that gives the lowest resulting variance

* __find_varsplit__: 

>- Check for the best split and score(variance of left+right) across all columns
- Initialize the left and right node with the resulting data of split

* __split_col__:
>- returns a column that was chosen by find_better_split
* __split_name__:
>- returns the name of the column that was chosen by find_better_split
* __predict__:
>- Predict output by passing each data point through predict_row, that returns the mean of observations in the leaf node

In [34]:
class DecisionTree():
    """Regression tree grown greedily, one node per ``DecisionTree`` instance.

    Constructing a node immediately searches every feature column for the
    split with the lowest count-weighted score (as computed by ``std_agg``)
    and, if one exists, recursively builds the ``lhs`` / ``rhs`` children.
    A node for which no admissible split exists stays a leaf and predicts
    ``val``, the mean label of its rows.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature matrix; ``.values`` and ``.columns`` are used.
    y : array-like
        Labels, indexed positionally (converted with ``np.asarray``).
    idxs : array-like of int, optional
        Row positions belonging to this node. Defaults to all rows.
    min_leaf : int
        Minimum number of rows each side of a split must keep. Values
        below 1 are treated as 1. The same value is used for every
        descendant node.
    """
    def __init__(self, x, y, idxs = None, min_leaf=2):
        # Positional indexing below needs real arrays (not lists / labelled Series).
        y = np.asarray(y)
        idxs = np.arange(len(y)) if idxs is None else np.asarray(idxs)
        self.x,self.y,self.idxs,self.min_leaf = x,y,idxs,min_leaf
        self.n,self.c = len(idxs), x.shape[1]
        self.val = np.mean(y[idxs])
        # inf means "no split found"; is_leaf relies on this sentinel.
        self.score = float('inf')
        self.find_varsplit()

    def find_varsplit(self):
        """Pick the best split over all columns and build the two children."""
        for i in range(self.c): self.find_better_split(i)
        if self.is_leaf: return
        x = self.split_col
        lhs = np.nonzero(x<=self.split)[0]
        rhs = np.nonzero(x>self.split)[0]
        # Propagate min_leaf: without it every child silently reverted to the
        # constructor default and ignored the caller's setting.
        self.lhs = DecisionTree(self.x, self.y, self.idxs[lhs], min_leaf=self.min_leaf)
        self.rhs = DecisionTree(self.x, self.y, self.idxs[rhs], min_leaf=self.min_leaf)

    def find_better_split(self, var_idx):
        """Scan column ``var_idx`` and record its best split if it beats ``self.score``.

        Rows are sorted by the feature value and moved one at a time from the
        right side to the left side while running count / sum / sum-of-squares
        are maintained, so each candidate is scored without re-reading the data.
        On improvement ``self.var_idx``, ``self.score`` and ``self.split`` are
        overwritten; rows with feature value <= ``self.split`` go left.
        """
        x,y = self.x.values[self.idxs,var_idx], self.y[self.idxs]
        sort_idx = np.argsort(x)
        sort_y,sort_x = y[sort_idx], x[sort_idx]
        rhs_cnt,rhs_sum,rhs_sum2 = self.n, sort_y.sum(), (sort_y**2).sum()
        lhs_cnt,lhs_sum,lhs_sum2 = 0,0.,0.
        # At least one row per side; this also keeps sort_x[i+1] in bounds.
        min_leaf = max(1, self.min_leaf)

        # After moving row i: lhs_cnt == i+1 and rhs_cnt == n-i-1, so stopping at
        # i == n-min_leaf-1 leaves exactly min_leaf rows on the right at the end.
        for i in range(self.n-min_leaf):
            xi,yi = sort_x[i],sort_y[i]
            lhs_cnt += 1; rhs_cnt -= 1
            lhs_sum += yi; rhs_sum -= yi
            lhs_sum2 += yi**2; rhs_sum2 -= yi**2
            # Skip while the left side is too small, and never split between
            # two equal feature values (the threshold could not separate them).
            if lhs_cnt<min_leaf or xi==sort_x[i+1]:
                continue

            lhs_std = std_agg(lhs_cnt, lhs_sum, lhs_sum2)
            rhs_std = std_agg(rhs_cnt, rhs_sum, rhs_sum2)
            curr_score = lhs_std*lhs_cnt + rhs_std*rhs_cnt
            if curr_score<self.score:
                self.var_idx,self.score,self.split = var_idx,curr_score,xi

    @property
    def split_name(self):
        """Name of the column this node splits on (only set on non-leaf nodes)."""
        return self.x.columns[self.var_idx]

    @property
    def split_col(self):
        """Values of the chosen split column for this node's rows."""
        return self.x.values[self.idxs,self.var_idx]

    @property
    def is_leaf(self):
        """True when no split was found, i.e. ``score`` is still inf."""
        return self.score == float('inf')

    def __repr__(self):
        s = f'n: {self.n}; val:{self.val}'
        if not self.is_leaf:
            s += f'; score:{self.score}; split:{self.split}; var:{self.split_name}'
        return s

    def predict(self, x):
        """Return an array with one prediction per row of ``x`` (DataFrame or 2-D array)."""
        if isinstance(x, pd.DataFrame):
            x = x.values

        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        """Walk a single row down the tree and return the mean label of the leaf reached."""
        if self.is_leaf: return self.val
        t = self.lhs if xi[self.var_idx]<=self.split else self.rhs
        return t.predict_row(xi)

    
def std_agg(cnt, s1, s2):
    """Variance of a group from its running aggregates.

    cnt -- number of values in the group
    s1  -- sum of the values
    s2  -- sum of the squared values

    Despite the name, no square root is taken: the result is
    E[y^2] - (E[y])^2, i.e. the variance rather than the standard deviation.
    """
    mean_of_squares = s2 / cnt
    mean = s1 / cnt
    return mean_of_squares - mean**2

In [35]:
# Load the scikit-learn diabetes dataset and expose its features as a named-column DataFrame.
db = load_diabetes()
df = pd.DataFrame(db.data, columns=db.feature_names)

In [36]:
# idxs is left at its default, which already selects every row of the data.
dt_r = DecisionTree(df, db.target, min_leaf=3)

`__repr__` returns this

In [50]:
# Predict on the training data with the hand-written tree and report the residual variance.
y_pred = dt_r.predict(df)
(y_pred - db.target).var()

783.8222473604827

In [43]:
# Reference model: scikit-learn's regressor with the same minimum leaf size.
sk_dt = DecisionTreeRegressor(min_samples_leaf=3).fit(df, db.target)
sk_dt

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=3,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [49]:
# Same residual-variance measure for the scikit-learn tree, for a like-for-like comparison.
y_pred_sk = sk_dt.predict(df)
(y_pred_sk - db.target).var()

783.8222473604827

Pretty accurate! The residual variance of our tree matches scikit-learn's exactly.

This implementation is borrowed from this amazing post https://www.kaggle.com/grroverpr/gradient-boosting-simplified/