In [66]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import model_selection, metrics, datasets, ensemble, tree

In [2]:
def write_answer(filename, answer):
    file_obj = open(filename, "w")
    file_obj.write(' '.join(np.array(answer, dtype=str)))
    file_obj.close()
    
    

In [128]:
data, target = datasets.load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = model_selection.train_test_split(data, 
                                                                    target,
                                                                    test_size=0.25,
                                                                    shuffle=False)
#X_train, X_test = np.split(data, [int(data.shape[0]*0.75)])
#y_train, y_test = np.split(target, [int(data.shape[0]*0.75)])

In [159]:
# in case where L(y, z) = (y - z)^2
def get_ans_for_learning(target, prediction):
    return target - prediction


def gbm_predict(X, trees, coeffs):
    assert np.array(trees).size == np.array(coeffs).size
    
    return [sum([coeff * algo.predict([x])[0] for algo, coeff in 
                 zip(trees, coeffs)]) for x in X]


def my_gbm_predict(X, trees, coeffs):
    assert np.array(trees).size == np.array(coeffs).size
    
    predictions = np.array(list(map(lambda tree: tree.predict(X), trees)))
    return (predictions.T @ coeffs[:, np.newaxis]).ravel()

In [165]:
n_trees = 50
eta = 0.9
my_tree = tree.DecisionTreeRegressor(max_depth=5, random_state=42)

def gbm_fit(X_train, trees, coeffs):
    assert np.array(trees).size == np.array(coeffs).size
    
    n_trees = np.array(trees).size
    # a_i(x) = a_i-1(x) + b_i(x)
    current_ans = np.zeros(X_train.shape[0])

    for i in range(n_trees):
        learning_ans = get_ans_for_learning(y_train, current_ans)
        trees[i].fit(X_train, learning_ans)
        current_ans += coeffs[i] * trees[i].predict(X_train)
        
    return trees

In [184]:
my_tree = tree.DecisionTreeRegressor(max_depth=5, random_state=42)
coeffs = np.ones(n_trees) * eta

trees = []
for i in range(50):
    trees.append(tree.DecisionTreeRegressor(max_depth=5, random_state=42))

trees = gbm_fit(X_train, trees, coeffs)
y_pred = gbm_predict(X_test, trees, coeffs)
rmse = metrics.mean_squared_error(y_test, y_pred) ** 0.5
print(rmse)
write_answer('2.txt', [rmse])

5.455623403859612


In [185]:
desc_coeffs = eta / (1 + np.arange(50))
desc_trees = gbm_fit(X_train, np.repeat([my_tree], n_trees), desc_coeffs)

y_desc_pred = gbm_predict(X_test, trees, desc_coeffs)
desc_rmse = metrics.mean_squared_error(y_test, y_desc_pred) ** 0.5
print(desc_rmse)
write_answer('3.txt', [desc_rmse])

4.459384603684657


In [113]:
trees_depth = np.append(np.arange(3, 22, 2), None)
depth_scoring = []

for depth in trees_depth:
    depth_tree = tree.DecisionTreeRegressor(max_depth=depth, random_state=42)
    depth_trees = gbm_fit(X_train, np.repeat([depth_tree], n_trees), desc_coeffs)
    
    y_depth_pred = gbm_predict(X_test, depth_trees, desc_coeffs)
    depth_rmse = metrics.mean_squared_error(y_test, y_depth_pred) ** 0.5
    depth_scoring.append(depth_rmse)
    
print('test:', list(map(lambda x: round(x, 3), depth_scoring)))

[18.204, 14.599, 15.506, 15.66, 15.671, 15.654, 15.648, 15.662, 15.667, 15.667, 15.667]


In [105]:
n_trees = (np.arange(30, 201, 20))
trees_cnt_scoring = []

for trees_cnt in n_trees:
    coeffs = eta / (1 + np.arange(trees_cnt))
    trees = gbm_fit(X_train, np.repeat([my_tree], trees_cnt), coeffs)
    
    y_pred = gbm_predict(X_test, trees, coeffs)
    rmse = metrics.mean_squared_error(y_test, y_pred) ** 0.5
    trees_cnt_scoring.append(rmse)
    
print('test:', list(map(lambda x: round(x, 3), trees_cnt_scoring)))

[14.887, 16.251, 14.939, 15.855, 15.851, 15.889, 15.45, 15.893, 15.747]


In [114]:
write_answer('4.txt', [2, 3])

In [115]:
from sklearn.linear_model import LinearRegression

In [118]:
gb_regressor = ensemble.GradientBoostingRegressor()
gb_regressor.fit(X_train, y_train)
gb_pred = gb_regressor.predict(X_test)
gb_rmse = metrics.mean_squared_error(y_test, gb_pred) ** 0.5

In [123]:
lin_regressor = LinearRegression()
lin_regressor.fit(X_train, y_train)
lin_pred = lin_regressor.predict(X_test)
lin_rmse = metrics.mean_squared_error(y_test, lin_pred) ** 0.5

In [122]:
write_answer('5.txt', [lin_rmse])

In [154]:
base_algorithms = []
coefficients = []
y_train_cur = y_train 

for i in range(50):
    alg = tree.DecisionTreeRegressor(max_depth=5, random_state=42) 
    alg.fit(X_train, y_train_cur)
    coefficients.append(0.9)
    base_algorithms.append(alg)
    y_train_cur = y_train - gbm_predict(X_train, base_algorithms, coefficients)

mse = metrics.mean_squared_error(y_test, gbm_predict(X_test, base_algorithms, coefficients)) 
print(mse ** 0.5)

25.074406332453822 50.0 11.8
2.5074406332453805 12.29 -5.44576923076923
0.2507440633245382 7.594213948883663 -5.270147901175655
0.025074406332453718 4.0188476307123295 -4.773003052038462
0.0025074406332451517 3.6723038406349175 -4.251845824804832
0.0002507440633245611 3.346403674619868 -4.051409807713149
2.507440633285966e-05 2.8765511894043883 -2.6183794237498077
2.5074406324516874e-06 2.6066456628886208 -2.5495702041753425
2.507440638797828e-07 1.7524028681485753 -1.8999288571317123
2.507440655577135e-08 1.7221407138708926 -1.9301910114093914
5.455623403859612


In [146]:
oth_y_pred = np.array(gbm_predict(X_test, base_algorithms, coefficients))
oth_y_pred

array([14.07851444, 15.48786565, 14.13609346, 15.70622297, 15.11445482,
       17.078542  , 15.36294293, 15.37267209, 17.74131403, 15.41355703,
       14.45699626, 15.77397676, 18.3122189 , 15.41390164, 21.6588864 ,
       16.37525386, 14.99893368, 14.06318248, 15.15590961, 16.06629272,
       17.77940234, 15.73642428, 13.87693573, 14.07380499, 15.44217311,
       16.87034485, 16.29958999, 14.10248166, 21.48993512, 14.90161664,
       13.88148319, 21.86606246, 13.11590633, 14.90354601, 14.94967086,
       14.87408912, 13.14713407, 13.76279542, 17.15169761, 15.06958081,
       16.25777534, 19.3346192 , 19.77314098, 22.14869987, 15.83400893,
       19.76974598, 15.09879127, 23.50417031, 22.91850711, 16.84440225,
       13.0452222 , 16.11313796, 14.02891462, 25.83760453, 16.60420662,
       20.95645446, 13.88937221, 13.58687476, 14.80785   , 16.17790777,
       17.26127072, 16.13939501, 15.54619476, 15.18358496, 15.68702452,
       15.44501466, 13.13905932, 15.34435838, 16.68813196, 15.83