<a href="https://colab.research.google.com/github/CS129-18-A-2019-2020/GBM-From-Scratch/blob/master/GBMs_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%pylab inline
from sklearn import tree, metrics
import pandas as pd

### Gradient Boosted Machines (GBMs)

GBMs are generally composed of additive decision trees: trees are generated sequentially, and each new tree is fit to the errors (residuals) left by the combined prediction of all the trees before it. The final prediction is the sum of every tree's output. TL;DR: Use the residuals of the ensemble built so far as the new values to be predicted.


#### Generating Sample Data

In [0]:
# Seed the global NumPy RNG so the noisy targets -- and therefore every plot
# and MAE further down -- are identical after a kernel restart.
np.random.seed(42)

# Feature: the integers 0..49 as a single-column DataFrame.
x = pd.DataFrame({'x': np.arange(0, 50)})

# Target: five consecutive runs of 10 points, each drawn uniformly from its
# own band, which gives a step-shaped signal along x.
y1 = np.random.uniform(10,15,10)
y2 = np.random.uniform(20,25,10)
y3 = np.random.uniform(0,5,10)
y4 = np.random.uniform(30,32,10)
y5 = np.random.uniform(13,17,10)

y = np.concatenate((y1,y2,y3,y4,y5))
y = y[:,None]  # add a trailing axis: shape (50,) -> (50, 1)

In [0]:
# Show the raw data before any model is fit.
fig, ax = plt.subplots(figsize=(7,5))
ax.plot(x, y, 'o')
ax.set_title("Scatter plot of x vs. y")
ax.set_xlabel("x")
ax.set_ylabel("y")
plt.show()

### Creating a Simple GBM

In [0]:
# Boosting state consumed and updated by the training loop.
trees = []           # fitted trees, in training order
pred_final = None    # running sum of tree predictions; None until the first tree is fit
error_current = 0    # placeholder; overwritten with the residuals on every iteration
y_current = y        # target for the next tree: the raw y first, residuals afterwards

In [0]:
# Hyperparameters of this toy GBM.
N_ROUNDS = 10   # number of boosting iterations (one tree per iteration)
MAX_DEPTH = 2   # depth of each individual tree

# (Re)initialise the boosting state in this cell so that re-running it always
# trains a fresh ensemble instead of appending more trees to leftover state.
y_true = y[:, 0]      # 1-D view of the (n, 1) target column
y_current = y_true    # target for the next tree; replaced by the residuals below
pred_final = None
error_current = 0
trees = []

for i in range(N_ROUNDS):
  tree_current = tree.DecisionTreeRegressor(max_depth=MAX_DEPTH, random_state=42)
  tree_current.fit(x, y_current)
  pred_current = tree_current.predict(x)

  # The ensemble prediction is the running sum of every tree's output.
  if pred_final is None:
    pred_final = pred_current
  else:
    pred_final = pred_final + pred_current

  # Residuals of the ensemble so far become the next tree's target.
  error_current = y_true - pred_final
  y_current = error_current

  trees.append(tree_current)

  # Plot out the current predictions and residuals
  xs = np.array(x)
  ys = np.array(pred_final)
  mae = np.round(metrics.mean_absolute_error(y_true, pred_final), 4)

  f, (ax1, ax2) = plt.subplots(1, 2, sharey=True, figsize = (13,2.5))

  ax1.plot(x,y, 'o')
  ax1.plot(xs, ys, 'r')
  ax1.set_title(f'Prediction (Iteration {i+1}) | Current MAE: {mae}')
  ax1.set_xlabel('x')
  ax1.set_ylabel('y / y_pred')

  ax2.plot(x, error_current, 'go')
  ax2.set_title(f'Residuals vs. x (Iteration {i+1})')
  ax2.set_xlabel('x')
  ax2.set_ylabel('Residuals')
  plt.show()
  plt.close(f)  # release the figure; one is created per iteration

In [0]:
def predict_with_gbm(trees, x):
  """Predict with the boosted ensemble by summing every tree's prediction.

  Args:
    trees: Iterable of fitted models, each exposing ``predict(x)``.
    x: Feature matrix, passed unchanged to every model's ``predict``.

  Returns:
    The element-wise sum of all the models' predictions. When ``trees`` is
    empty, an array of zeros with ``len(x)`` entries is returned, so the
    result is always array-like rather than the bare scalar ``0``.
  """
  preds = None
  # The loop variable is deliberately not called ``tree`` so it does not
  # shadow the ``sklearn.tree`` module imported at the top of the notebook.
  for fitted_tree in trees:
    tree_pred = fitted_tree.predict(x)
    preds = tree_pred if preds is None else preds + tree_pred
  if preds is None:
    # No models: an empty ensemble contributes nothing to any sample.
    return np.zeros(len(x))
  return preds

In [0]:
# Rebuild the ensemble prediction from the stored trees and report its
# mean absolute error against y, rounded to 4 decimals.
gbm_pred = predict_with_gbm(trees, x)
gbm_mae = metrics.mean_absolute_error(y[:, 0], gbm_pred)
np.round(gbm_mae, 4)