Modify the Gradient Boosting scratch code in our lecture such that:
- Notice that we are still using max_depth = 1.  Try tweaking min_samples_split and max_depth for the regression and see whether we can achieve a better MSE on our Boston data.
- Notice that we only wrote scratch code for gradient boosting for regression; add some code so that it also works for binary classification.  Load the breast cancer data from sklearn and see that it works.
- Further change the code so that it works for multiclass classification.  Load the digits data from sklearn and see that it works
- Put everything into a class

In [20]:
from scipy.special import expit
from sklearn.tree import DecisionTreeRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_digits

In [21]:
class GradientBoosting:
    """Gradient boosting built from scikit-learn regression trees.

    The ensemble consists of a ``DummyRegressor`` (mean predictor) followed by
    ``S`` ``DecisionTreeRegressor`` models, each fitted on the residual
    ``y - h`` of the ensemble built so far.

    With ``regression=False`` the raw ensemble scores are passed through a
    row-wise softmax, so ``y`` given to :meth:`fit` must be a 2-D one-hot
    matrix (one column per class) and :meth:`predict` returns class indices.

    Parameters
    ----------
    S : int
        Number of boosted trees (in addition to the initial mean model).
    learning_rate : float
        Factor applied to every tree's prediction before it is added.
    max_depth : int
        ``max_depth`` forwarded to each ``DecisionTreeRegressor``.
    min_samples_split : int
        ``min_samples_split`` forwarded to each ``DecisionTreeRegressor``.
    regression : bool, default True
        ``True`` for regression, ``False`` for softmax classification.
    """

    def __init__(self, S, learning_rate, max_depth, min_samples_split, regression=True):
        self.S = S
        self.learning_rate = learning_rate
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.regression = regression
        tree_params = {
            'max_depth': self.max_depth,
            'min_samples_split': self.min_samples_split,
        }
        # models[0] is the constant (mean) starting model; models[1:] are the
        # S boosted trees that are fitted on residuals.
        self.models = [DummyRegressor(strategy='mean')]
        self.models.extend(DecisionTreeRegressor(**tree_params) for _ in range(S))

    def grad(self, y, h):
        """Return the residual ``y - h`` that the next tree is fitted on."""
        return y - h

    def softmax(self, z):
        """Return the row-wise softmax of the 2-D score array ``z``.

        The row maximum is subtracted before exponentiating. This leaves the
        result mathematically unchanged but prevents ``exp`` from overflowing
        to ``inf`` (and the ratio from becoming ``nan``) for large scores.
        """
        z = np.asarray(z, dtype=float)
        shifted = z - np.max(z, axis=1, keepdims=True)
        exp_z = np.exp(shifted)
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)

    def fit(self, X, y):
        """Fit the initial model and then each tree on the current residual.

        Parameters
        ----------
        X : array-like
            Training features, passed unchanged to the underlying models.
        y : array-like
            Regression targets, or a one-hot matrix when ``regression=False``.

        Returns
        -------
        GradientBoosting
            ``self``, so that calls can be chained.
        """
        self.models[0].fit(X, y)
        f0 = self.models[0].predict(X)
        # Running sum of the scaled tree predictions. Keeping it incrementally
        # avoids re-predicting with every earlier tree on each round, while
        # adding the terms in the same order as ``predict`` does.
        boosting = 0
        for tree in self.models[1:]:
            raw = f0 + boosting
            yhat = raw if self.regression else self.softmax(raw)
            tree.fit(X, self.grad(y, yhat))
            boosting = boosting + self.learning_rate * tree.predict(X)
        return self

    def predict(self, X, models=None, with_argmax=True):
        """Predict with the ensemble (or with a given prefix of it).

        Parameters
        ----------
        X : array-like
            Samples to predict for.
        models : list, optional
            Models to use; ``models[0]`` is the starting model and the rest
            are added with ``learning_rate``. Defaults to ``self.models``.
        with_argmax : bool, default True
            Only used when ``regression=False``: return class indices if
            ``True``, otherwise the softmax probabilities.

        Returns
        -------
        numpy.ndarray
            Raw ensemble output for regression; probabilities or class
            indices for classification.
        """
        if models is None:
            models = self.models

        f0 = models[0].predict(X)
        boosting = sum(self.learning_rate * model.predict(X) for model in models[1:])
        yhat = f0 + boosting
        if not self.regression:
            yhat = self.softmax(yhat)
            if with_argmax:
                yhat = np.argmax(yhat, axis=1)
        return yhat
    

In [22]:
# Regression: compare decision stumps against deeper trees on the Boston data.
X, y = load_boston(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Settings shared by both ensembles; only the tree shape differs between them.
shared_params = dict(S=200, learning_rate=0.1, regression=True)

# Model 1: depth-1 trees with the smallest possible split size.
model1 = GradientBoosting(max_depth=1, min_samples_split=2, **shared_params)
model1.fit(X_train, y_train)
yhat = model1.predict(X_test)
print("MSE for model 1: ", mean_squared_error(y_test, yhat))

# Model 2: deeper trees (depth 3) that need at least 3 samples to split.
model2 = GradientBoosting(max_depth=3, min_samples_split=3, **shared_params)
model2.fit(X_train, y_train)
yhat = model2.predict(X_test)
print("MSE for model 2: ", mean_squared_error(y_test, yhat))


MSE for model 1:  12.945557601580582
MSE for model 2:  7.934273216916185


<b>Mean squared error decreases when we increase max_depth and min_samples_split.</b>

In [23]:
# Binary classification on the breast cancer data
X, y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One-hot encode the training labels: column k holds 1.0 where the label is k
# and 0.0 elsewhere, with one column per distinct label in y.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

model = GradientBoosting(
    S=200,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=3,
    regression=False,
)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)
print("Accuracy for Binary classificaton:", accuracy_score(y_test, yhat))

Accuracy for Binary classificaton: 0.9649122807017544


In [24]:
# Multiclass classification on the digits data
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# One-hot encode the training labels: column k holds 1.0 where the label is k
# and 0.0 elsewhere, with one column per distinct label in y.
n_classes = len(set(y))
y_train_encoded = (y_train[:, None] == np.arange(n_classes)).astype(float)

model = GradientBoosting(
    S=200,
    learning_rate=0.1,
    max_depth=3,
    min_samples_split=3,
    regression=False,
)
model.fit(X_train, y_train_encoded)
yhat = model.predict(X_test)
print("Accuracy for Multiclass classificaton:", accuracy_score(y_test, yhat))

Accuracy for Multiclass classificaton: 0.9314814814814815
