In [22]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy import optimize
from sklearn import datasets, cross_validation, metrics, neighbors
from matplotlib.colors import ListedColormap
from pandas import DataFrame
%pylab inline

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy


In [23]:
# Load the Boston housing data; the split cell reads its 'data' and 'target' entries.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, so this
# cell needs an older scikit-learn (or a different dataset) — confirm the pinned version.
boston = datasets.load_boston()

In [40]:
# Hold out 25% of the objects for testing; random_state=1 makes the split repeatable.
# NOTE(review): the sklearn.cross_validation module was removed in scikit-learn 0.20;
# newer versions provide train_test_split in sklearn.model_selection — confirm against
# the installed version before re-running.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    boston['data'], boston['target'], test_size=0.25, random_state=1
)

In [41]:
def H(data):
    """Impurity of a group of targets: their population variance.

    Computes ``sum((y - mean(y)) ** 2) / len(y)`` over ``data``.

    Parameters
    ----------
    data : sequence of numbers
        Target values of the objects that fell into one group.

    Returns
    -------
    float
        The variance (no Bessel correction). For an empty sequence the
        result is ``nan``: every ordering comparison against ``nan`` is
        False, so a "smaller than the best so far" test never accepts an
        empty group.
    """
    values = np.asarray(data, dtype=float)
    if len(values) == 0:
        # Explicit instead of relying on a 0/0 division and its RuntimeWarning.
        return float('nan')
    # The mean is computed once; evaluating it inside the per-element
    # expression made the function quadratic in len(data).
    deviations = values - values.mean()
    return np.sum(deviations ** 2) / len(values)

In [42]:
def G(data_1, data_2):
    """Quality of a two-way split: size-weighted sum of the groups' impurities.

    Each group's impurity ``H`` is weighted by the fraction of all objects
    that landed in that group. Lower is better.

    Parameters
    ----------
    data_1, data_2 : sequences of numbers
        Target values of the first and second group.

    Returns
    -------
    float
        ``|g1|/N * H(g1) + |g2|/N * H(g2)`` with ``N = |g1| + |g2|``.
    """
    n_first, n_second = len(data_1), len(data_2)
    n_total = n_first + n_second
    first_term = n_first / n_total * H(data_1)
    second_term = n_second / n_total * H(data_2)
    return first_term + second_term

In [43]:
def depth(node):
    """Return the 1-based depth of ``node``.

    Follows ``parent`` links upwards until a node whose ``parent`` equals
    ``0`` (the "no parent" sentinel) is reached; that root node has depth 1.
    """
    level = 1
    current = node
    while current.parent != 0:
        level += 1
        current = current.parent
    return level

In [44]:
class Node:
    """One node of the regression tree.

    Attributes
    ----------
    feature : index of the feature this node tests.
    step : threshold compared against that feature's value.
    answer : value predicted when this node acts as a leaf (starts at 0).
    parent, kid_1, kid_2 : links to neighbouring nodes; the integer ``0``
        is the sentinel for "no such node".
    """

    def __init__(self, feature, step, parent = 0, kid_1 = 0, kid_2 = 0):
        # Split rule stored in this node.
        self.feature, self.step = feature, step
        # Links to the rest of the tree (0 means absent).
        self.parent = parent
        self.kid_1, self.kid_2 = kid_1, kid_2
        # Leaf prediction; filled in later by the tree builder.
        self.answer = 0

### Подбираем параметры дерева для лучшего обучения

In [45]:
# Hyperparameters of the hand-written tree; tree() reads them as module-level globals.

# Each feature's [min, max] range is cut into steps_num equal intervals, which gives
# steps_num - 1 candidate thresholds per feature.
steps_num = 70
# The children of a split are grown further only if BOTH hold more than this many objects.
objects_num = 9
# The children of a node are grown further only while depth(node) < depth_num
# (the root has depth 1).
depth_num = 7

In [46]:
def tree(self, data, targets, node):
    """Recursively grow a regression tree below ``node``.

    For every feature, ``steps_num - 1`` evenly spaced thresholds between
    the feature's minimum and maximum are tried; the (feature, threshold)
    pair with the smallest ``G`` of the resulting target groups is stored in
    ``node``. Two children are attached, each predicting the mean target of
    its group. The children are grown further only if ``depth(node)`` is
    below ``depth_num`` and both groups hold more than ``objects_num``
    objects.

    Parameters
    ----------
    self : object with a ``nodes`` list
        Every node processed by this function is appended to ``self.nodes``.
    data : sequence of feature rows (2-D array or list of rows)
    targets : sequence of target values, aligned with ``data``
    node : Node
        The node to fill in; modified in place.

    Raises
    ------
    ValueError
        If ``data`` is empty.

    Notes
    -----
    Reads the module-level globals ``steps_num``, ``objects_num`` and
    ``depth_num``. If no threshold separates the data into two non-empty
    groups (e.g. all rows are identical), ``node`` is left without children
    and predicts the mean of ``targets``.
    """
    if len(data) == 0:
        raise ValueError("tree() needs at least one training object")

    # Take the feature count from the data instead of assuming 13 columns.
    n_features = len(data[0])

    best_feature = 0
    best_step = 0
    best_parts = None
    # Start from +inf so that a split is accepted however large the impurity is.
    min_G = float('inf')

    for feature in range(n_features):
        column = [row[feature] for row in data]
        min_value = np.min(column)
        max_value = np.max(column)
        for i in range(steps_num - 1):
            step = min_value + (i + 1)*(max_value - min_value) / steps_num
            # Split once per candidate and reuse the result everywhere below.
            parts = split(data, targets, feature, step)
            if len(parts[2]) == 0 or len(parts[3]) == 0:
                # A one-sided "split" (constant feature, rounding) separates nothing.
                continue
            score = G(parts[2], parts[3])
            # Strict '<' keeps the first of several equally good candidates.
            if score < min_G:
                min_G = score
                best_feature = feature
                best_step = step
                best_parts = parts

    if best_parts is None:
        # Nothing to separate: keep the node as a leaf with a well-defined answer
        # rather than attaching a child whose answer is the mean of an empty list.
        node.answer = np.mean(targets)
        self.nodes.append(node)
        return

    left_data, right_data, left_targets, right_targets = best_parts

    node.feature = best_feature
    node.step = best_step
    node.kid_1 = Node(0, 0)
    node.kid_2 = Node(0, 0)
    node.kid_1.answer = np.mean(left_targets)
    node.kid_2.answer = np.mean(right_targets)
    node.kid_1.parent = node
    node.kid_2.parent = node
    self.nodes.append(node)

    if (depth(node) < depth_num and len(left_data) > objects_num and
        len(right_data) > objects_num):
        tree(self, left_data, left_targets, node.kid_1)
        tree(self, right_data, right_targets, node.kid_2)

In [47]:
def split(data, targets, feature, step):
    """Partition objects by comparing one feature against a threshold.

    An object goes to the left group when ``data[i][feature] < step`` and to
    the right group otherwise; its target follows it into the same group.
    The original order of objects is preserved inside each group.

    Returns
    -------
    tuple of four lists
        ``(left_rows, right_rows, left_targets, right_targets)``.
    """
    left_rows, left_values = [], []
    right_rows, right_values = [], []
    for idx in range(len(data)):
        row = data[idx]
        if row[feature] < step:
            rows, values = left_rows, left_values
        else:
            rows, values = right_rows, right_values
        rows.append(row)
        values.append(targets[idx])
    return (left_rows, right_rows, left_values, right_values)

In [48]:
class DecisionTree(BaseEstimator):
    """Regression tree with a scikit-learn style ``fit`` / ``predict`` interface.

    The tree itself is built by the module-level ``tree`` function, which
    takes its hyperparameters from module-level globals rather than from
    this object.

    Attributes (set by ``fit``)
    ---------------------------
    head : root Node of the fitted tree.
    nodes : list of the nodes processed while growing the tree.
    """

    def fit(self, X_data, y_data):
        """Grow a fresh tree on the given objects and targets.

        Any previously fitted tree is discarded.

        Returns
        -------
        self
            The fitted estimator, following the scikit-learn convention so
            that calls can be chained (``DecisionTree().fit(X, y).predict(X)``).
        """
        self.nodes = []
        self.head = Node(0, 0)
        tree(self, X_data, y_data, self.head)
        return self

    def predict(self, X_data):
        """Predict a target for every row of ``X_data``.

        Each row is routed from the root downwards: to ``kid_1`` when the
        node's feature value is below the node's threshold, to ``kid_2``
        otherwise, until a node without children is reached.

        Returns
        -------
        list
            One prediction (the reached node's ``answer``) per input row.
        """
        predictions = []
        for i in range(len(X_data)):
            row = X_data[i]
            node = self.head
            # 0 is the "no child" sentinel used by Node.
            while node.kid_1 != 0 or node.kid_2 != 0:
                if row[node.feature] < node.step:
                    node = node.kid_1
                else:
                    node = node.kid_2
            predictions.append(node.answer)
        return predictions

In [49]:
# Instantiate the hand-written tree; it takes no constructor arguments.
algo = DecisionTree()

In [50]:
# Grow the tree on the training part of the split.
algo.fit(X_train, y_train)

In [51]:
# Predictions of the hand-written tree for the held-out objects
# (despite its name, `test` holds predictions, not test data).
test = algo.predict(X_test)

In [54]:
# Test-set error of the hand-written tree.
print("MSE for my tree:", mean_squared_error(y_test, test))
# Baseline: the variance of y_test equals the MSE of always answering with the mean of y_test.
print("MSE for tests:", np.var(y_test))

MSE for my tree: 17.7551008228
MSE for tests: 99.0584735569


In [55]:
# Reference model: scikit-learn's regression tree with max_depth = 7.
# It is bound to `sk_tree`, not `tree`: rebinding `tree` would shadow the tree()
# function that DecisionTree.fit calls, so re-running any earlier fit cell in the
# same kernel would fail with "object is not callable".
sk_tree = DecisionTreeRegressor(max_depth = 7)
sk_tree.fit(X_train, y_train)
print("MSE for sklearn.tree:", mean_squared_error(y_test, sk_tree.predict(X_test)))

MSE for sklearn.tree: 15.0583676059


### Из сравнения видно, что написанный класс работает почти так же, как реализация sklearn (MSE 17.76 против 15.06), и гораздо лучше, чем константный ответ средним значением (MSE 99.06).