Modify the Gradient Boosting scratch code in our lecture such that:
- Notice that we are still using max_depth = 1.  Try tuning min_samples_split and max_depth for the regression and see whether we can achieve a better MSE on our Boston data.
- Notice that we only wrote scratch code for gradient boosting regression; add some code so that it also works for binary classification.  Load the breast cancer data from sklearn and check that it works.
- Further change the code so that it works for multiclass classification.  Load the digits data from sklearn and check that it works.
- Put everything into a class.

In [90]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
import numpy as np

In [91]:
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

In [92]:
# Boston housing regression data, split 70% train / 30% test with a fixed seed.
# NOTE(review): load_boston is not available in recent scikit-learn releases
# (reportedly removed in 1.2) -- confirm the pinned version before re-running.
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

In [93]:
class GradientBoosting:
    """Gradient boosting built from scikit-learn regression trees.

    The ensemble starts from a constant model (a ``DummyRegressor`` that
    predicts the mean of ``y``) and then adds ``n_estimators``
    ``DecisionTreeRegressor`` stages.  Each stage is fitted to the residual
    ``y - current_prediction`` and contributes to the final score scaled by
    ``learning_rate``.

    NOTE: the ``regression`` flag is named the opposite of what it does.  It
    is kept as-is so existing callers continue to work:

    * ``regression=False`` (default): plain regression.  ``pred`` returns the
      raw boosted values.
    * ``regression=True``: classification.  ``y`` given to ``fit`` must be
      one-hot encoded with shape ``(n_samples, n_classes)``; raw scores are
      passed through a row-wise softmax and ``pred`` returns class indices
      (or probabilities when ``argmax=False``).

    Parameters
    ----------
    n_estimators : number of boosted trees.
    max_depth : ``max_depth`` forwarded to every tree.
    min_samples_split : ``min_samples_split`` forwarded to every tree.
    regression : see the note above; ``True`` enables the softmax/argmax path.
    learning_rate : shrinkage applied to every tree's contribution
        (defaults to 0.1, the value that used to be hard-coded in ``pred``).
    """

    def __init__(self, n_estimators, max_depth, min_samples_split,
                 regression=False, learning_rate=0.1):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.regression = regression
        self.learning_rate = learning_rate
        tree_params = {
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
        }
        self.models = [
            DecisionTreeRegressor(**tree_params)
            for _ in range(self.n_estimators)
        ]

    def grad(self, y, h):
        """Return the residual ``y - h`` that the next tree is fitted to.

        This is the negative gradient of the squared-error loss with respect
        to the prediction ``h``, and also of softmax cross-entropy with
        respect to the raw scores when ``h`` holds the softmax probabilities.
        """
        return y - h

    def fit(self, X, y):
        """Fit the constant start model and then every tree, stage by stage.

        ``y`` is a 1-D target for regression, or a one-hot matrix of shape
        ``(n_samples, n_classes)`` when the softmax path is enabled.
        """
        # A DummyRegressor predicting the mean is a cheap, sensible start model.
        first_model = DummyRegressor(strategy='mean')
        first_model.fit(X, y)
        self.models_trained = [first_model]

        # Keep a running sum of the shrunken tree outputs on the training
        # data instead of re-predicting with every earlier tree on each
        # iteration (which made fitting quadratic in n_estimators).  The
        # summation order matches the one used in ``pred``.
        f0 = first_model.predict(X)
        boosting = 0

        for model in self.models:
            # Prediction of all the weak learners trained up to this point.
            y_pred = self._activate(f0 + boosting)

            # Total error made by the ensemble so far.
            residual = self.grad(y, y_pred)

            # Fit the next model on that residual.
            model.fit(X, residual)
            self.models_trained.append(model)

            boosting = boosting + self.learning_rate * model.predict(X)

    def softmax(self, theta_t_x):
        """Row-wise softmax of a 2-D score matrix.

        The row maximum is subtracted before exponentiating.  That leaves the
        result mathematically unchanged but prevents ``exp`` from overflowing
        to ``inf`` (and the division from yielding ``nan``) for large scores.
        """
        scores = np.asarray(theta_t_x)
        shifted = scores - np.max(scores, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

    def _activate(self, raw_score):
        """Map raw boosted scores to outputs: softmax when classifying, else identity."""
        if self.regression:
            return self.softmax(raw_score)
        return raw_score

    def pred(self, X, argmax=True):
        """Predict with all fitted stages.

        Returns the raw boosted values in regression mode.  In classification
        mode (``regression=True``) returns class indices, or the softmax
        probabilities when ``argmax`` is False.  ``argmax`` is ignored in
        regression mode.
        """
        models = self.models_trained
        f0 = models[0].predict(X)  # constant start model
        boosting = sum(
            self.learning_rate * model.predict(X) for model in models[1:]
        )
        yhat = self._activate(f0 + boosting)

        if self.regression and argmax:
            yhat = np.argmax(yhat, axis=1)

        return yhat

In [94]:
# Regression run: ``regression`` keeps its default (False), so ``pred``
# returns the raw boosted values.
models = GradientBoosting(
    n_estimators=200,
    max_depth=3,
    min_samples_split=2,
)
models.fit(X_train, y_train)
y_pred = models.pred(X_test)

# Report the mean squared error on the held-out split.
print("Our MSE: ", mean_squared_error(y_test, y_pred))

Our MSE:  7.7006643758730515


In [95]:
from sklearn.datasets import load_breast_cancer
X, y = load_breast_cancer(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One-hot encode the training labels: column k is 1.0 where y_train == k,
# for k in 0 .. (number of distinct labels in y) - 1.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

In [96]:
# Binary classification run.  NOTE: ``regression=True`` is what switches
# GradientBoosting onto its softmax + argmax path, despite the flag's name.
models = GradientBoosting(
    n_estimators=200,
    max_depth=3,
    min_samples_split=2,
    regression=True,
)
models.fit(X_train, y_train_encoded)
yhat = models.pred(X_test)

# Report accuracy of the predicted class indices on the held-out split.
from sklearn.metrics import accuracy_score
print("Our accuracy: ", accuracy_score(y_test, yhat))

Our accuracy:  0.9649122807017544


In [97]:
from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.ensemble import GradientBoostingClassifier

X, y = load_digits(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One-hot encode the training labels: column k is 1.0 where y_train == k,
# for k in 0 .. (number of distinct labels in y) - 1.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

In [98]:
# Multiclass classification run.  NOTE: ``regression=True`` is what switches
# GradientBoosting onto its softmax + argmax path, despite the flag's name.
models = GradientBoosting(
    n_estimators=200,
    max_depth=3,
    min_samples_split=2,
    regression=True,
)
models.fit(X_train, y_train_encoded)
yhat = models.pred(X_test)

# Report accuracy of the predicted class indices on the held-out split.
from sklearn.metrics import accuracy_score
print("Our accuracy: ", accuracy_score(y_test, yhat))

Our accuracy:  0.9314814814814815
