### Random forest

Библиотеки

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.metrics import r2_score

In [2]:
def mse(y):
    """Return the mean squared deviation of ``y`` from its own mean.

    ``y`` must provide ``.mean()`` and support element-wise arithmetic
    (e.g. a NumPy array of target values).
    """
    centered = y - y.mean()
    return np.mean(centered ** 2)

In [3]:
def gain(root_data, right_data, left_data):
    """Return the MSE reduction achieved by splitting a node into two parts.

    Args:
        root_data: target values of the node before the split.
        right_data: target values that fall into the right part.
        left_data: target values that fall into the left part.

    Returns:
        ``mse(root)`` minus the size-weighted MSE of the two parts.
    """
    n_right = right_data.shape[0]
    n_left = left_data.shape[0]

    # Share of the objects that went right; the remainder went left.
    right_share = float(n_right) / (n_right + n_left)
    left_share = 1 - right_share

    return mse(root_data) - right_share*mse(right_data) - left_share*mse(left_data)

In [4]:
def split(X, y, index, t):
    """Partition objects and targets by thresholding one feature column.

    Args:
        X: 2-D array of objects (rows) by features (columns).
        y: targets aligned with the rows of ``X``.
        index: column of ``X`` to compare against the threshold.
        t: threshold value.

    Returns:
        ``(true_data, false_data, true_pred, false_pred)`` where the "true"
        parts hold the rows with ``X[:, index] <= t`` and the "false" parts
        hold the rows with ``X[:, index] > t``.
    """
    column = X[:, index]

    # Two explicit comparisons (rather than one mask and its negation) so a
    # row whose value satisfies neither condition ends up in neither part.
    goes_true = column <= t
    goes_false = column > t

    return X[goes_true], X[goes_false], y[goes_true], y[goes_false]

In [5]:
def find_best_split(X, y, min_samples_leaf=5, max_features=1):
    """Search a random subset of features for the threshold with the best gain.

    Args:
        X: 2-D array of objects (rows) by features (columns).
        y: targets aligned with the rows of ``X``.
        min_samples_leaf: minimum number of objects each side of a split
            must contain for the split to be considered. Defaults to 5,
            the value that used to be hard-coded.
        max_features: how many distinct feature columns to sample (without
            replacement) as split candidates. Defaults to 1, the value that
            used to be hard-coded; it is clamped to the number of columns.

    Returns:
        ``(best_gain, best_t, best_index)``. When no admissible split with
        a positive gain exists the result is ``(0, None, None)``.
    """
    best_gain = 0
    best_t = None
    best_index = None

    # Never ask for more columns than exist: sampling without replacement
    # would raise otherwise.
    n_candidates = min(max_features, X.shape[1])
    feature_indices = np.random.choice(X.shape[1], size=n_candidates, replace=False)

    for index in feature_indices:
        # Every distinct value of the column is tried as a threshold.
        for t in np.unique(X[:, index]):
            true_data, false_data, true_pred, false_pred = split(X, y, index, t)

            # Skip splits that would leave either side too small.
            if len(false_data) < min_samples_leaf or len(true_data) < min_samples_leaf:
                continue

            current_gain = gain(y, false_pred, true_pred)

            # Strict comparison: only a strictly positive improvement
            # replaces the initial "no split" state.
            if current_gain > best_gain:
                best_gain = current_gain
                best_t = t
                best_index = index

    return best_gain, best_t, best_index

In [6]:
def build_tree(X, y):
    """Recursively grow a regression tree on ``X``/``y``.

    Returns a ``Leaf`` when ``find_best_split`` reports a gain of 0,
    otherwise a ``Node`` whose branches are grown on the two parts
    produced by ``split``.
    """
    split_gain, threshold, feature = find_best_split(X, y)

    # A gain of exactly 0 means no split was selected: stop and make a leaf.
    if split_gain == 0:
        return Leaf(X, y)

    left_X, right_X, left_y, right_y = split(X, y, feature, threshold)

    # The "true" (<= threshold) subtree is grown before the "false" one.
    subtree_true = build_tree(left_X, left_y)
    subtree_false = build_tree(right_X, right_y)
    return Node(feature, threshold, subtree_true, subtree_false)

In [7]:
class Node:
    """Internal node of a decision tree: a threshold test on one feature."""

    def __init__(self, index, t, true_branch, false_branch):
        # index -- index of the feature whose value is compared with the threshold
        # t     -- the threshold value
        self.index, self.t = index, t
        # true_branch  -- subtree for objects that satisfy the condition
        # false_branch -- subtree for objects that do not satisfy the condition
        self.true_branch, self.false_branch = true_branch, false_branch
    

In [8]:
class Leaf:
    """Terminal node of a regression tree.

    Keeps the objects and targets that reached it and exposes their mean
    target as the ``predict`` attribute (a value, not a method).
    """

    def __init__(self, X, y):
        self.X, self.y = X, y
        # Constant prediction of this leaf: the mean of its targets.
        self.predict = y.mean()

In [9]:
def classify_object(obj, node):
    """Route a single object down the tree and return the leaf's prediction.

    Args:
        obj: one object, indexable by feature index.
        node: root of the (sub)tree, either a ``Node`` or a ``Leaf``.

    Returns:
        The ``predict`` value stored in the leaf the object ends up in.
    """
    current = node

    # Descend until a leaf is reached.
    while not isinstance(current, Leaf):
        if obj[current.index] <= current.t:
            current = current.true_branch
        else:
            current = current.false_branch

    return current.predict

In [10]:
def predict(data, tree):
    """Return a list with the tree's prediction for every object in ``data``."""
    return [classify_object(sample, tree) for sample in data]

In [11]:
def fit_random_forest(data, y, count_trees):
    """Grow ``count_trees`` trees, each on its own bootstrap sample.

    Args:
        data: 2-D array of objects (rows) by features (columns).
        y: targets aligned with the rows of ``data``.
        count_trees: number of trees to build.

    Returns:
        ``(trees, bootstraps)``: the list of built trees and, for each tree,
        the array of row indices it was trained on.
    """
    n_objects = data.shape[0]
    forest, samples = [], []

    for _ in range(count_trees):
        # Bootstrap: draw n_objects row indices with replacement.
        picked = np.random.randint(0, n_objects, n_objects)
        forest.append(build_tree(data[picked], y[picked]))
        samples.append(picked)

    return forest, samples

### Инициализируем данные

In [12]:

from sklearn.model_selection import train_test_split

# Synthetic regression problem: 50 objects, one feature, fixed seed.
X, y = make_regression(n_samples=50, n_features=1, noise=2, random_state=42)

# Hold out 30% of the objects for testing (fixed seed).
train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.3, random_state=1
)

In [13]:
# Build a forest of five trees on the training split; the bootstrap row
# indices used for each tree are kept alongside.
random_forest, bootstraps = fit_random_forest(data=train_X, y=train_y, count_trees=5)

In [14]:
def RF_PRED(forest, test_data, test_y=None):
    """Predict with every tree of the forest and average the results.

    The average is taken over however many trees ``forest`` contains. The
    previous implementation summed exactly five trees and divided by 5, so
    it failed for smaller forests and ignored trees beyond the fifth.

    Args:
        forest: iterable of trees accepted by ``predict``.
        test_data: objects to predict for.
        test_y: unused; kept only so existing three-argument calls keep
            working. Predictions do not depend on the true targets.

    Returns:
        ``(pred_trees, pred_mean_y)``: an array with one row of predictions
        per tree, and a list with the per-object mean over all trees.

    Raises:
        ValueError: if ``forest`` contains no trees.
    """
    per_tree = [predict(test_data, tree) for tree in forest]
    if not per_tree:
        raise ValueError("forest must contain at least one tree")

    pred_trees = np.array(per_tree)

    # Average over the tree axis; a list is returned, as before.
    pred_mean_y = list(pred_trees.mean(axis=0))

    return pred_trees, pred_mean_y

In [15]:
# Per-tree predictions and their forest-wide average on the test split.
pred_trees, pred_mean_y = RF_PRED(forest=random_forest, test_data=test_X, test_y=test_y)

In [16]:
# R^2 of the averaged forest prediction on the test split.
r2_score(y_true=test_y, y_pred=pred_mean_y)

0.8738846052276187