# Chapter 23 - Ensemble Methods
## Building Machine Learning and Deep Learning Models on Google Cloud Platform
### Ekaba Bisong

## Classification and Regression Trees (CART)

### Classification Tree with Scikit-learn

In [0]:
# import packages
from sklearn.tree import DecisionTreeClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
# load dataset
data = datasets.load_iris()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
tree_classifier = DecisionTreeClassifier()

In [0]:
# fit the model on the training set
tree_classifier.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [0]:
# make predictions on the test set
predictions = tree_classifier.predict(X_test)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.2f" % accuracy_score(y_test, predictions))

Accuracy: 0.97


### Regression Tree with Scikit-learn

In [0]:
# import packages
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [0]:
# load dataset
data = datasets.load_boston()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
tree_reg = DecisionTreeRegressor()

In [0]:
# fit the model on the training set
tree_reg.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
                      max_leaf_nodes=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      presort=False, random_state=None, splitter='best')

In [0]:
# make predictions on the test set
predictions = tree_reg.predict(X_test)

In [0]:
# evaluate the model performance using the root mean square error metric
print("Root Mean squared error: %.2f" % sqrt(mean_squared_error(y_test, predictions)))

Root Mean squared error: 4.93


## Random Forests

### Random Forests for Classification

In [0]:
# import packages
from sklearn.ensemble import RandomForestClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
# load dataset
data = datasets.load_iris()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
rf_classifier = RandomForestClassifier()

In [0]:
# fit the model on the training set
rf_classifier.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [0]:
# make predictions on the test set
predictions = rf_classifier.predict(X_test)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.2f" % accuracy_score(y_test, predictions))

Accuracy: 1.00


### Random Forests for Regression

In [0]:
# import packages
from sklearn.ensemble import RandomForestRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [0]:
# load dataset
data = datasets.load_boston()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
rf_reg = RandomForestRegressor()

In [0]:
# fit the model on the training set
rf_reg.fit(X_train, y_train)



RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=10,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [0]:
# make predictions on the test set
predictions = rf_reg.predict(X_test)

In [0]:
# evaluate the model performance using the root mean square error metric
print("Root Mean squared error: %.2f" % sqrt(mean_squared_error(y_test, predictions)))

Root Mean squared error: 3.18


## Stochastic Gradient Boosting (SGB)

### SGB for Classification

In [0]:
# import packages
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
# load dataset
data = datasets.load_iris()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
sgb_classifier = GradientBoostingClassifier()

In [0]:
# fit the model on the training set
sgb_classifier.fit(X_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [0]:
# make predictions on the test set
predictions = sgb_classifier.predict(X_test)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.2f" % accuracy_score(y_test, predictions))

Accuracy: 0.92


### SGB for Regression

In [0]:
# import packages
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [0]:
# load dataset
data = datasets.load_boston()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
sgb_reg = GradientBoostingRegressor()

In [0]:
# fit the model on the training set
sgb_reg.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [0]:
# make predictions on the test set
predictions = sgb_reg.predict(X_test)

In [0]:
# evaluate the model performance using the root mean square error metric
print("Root Mean squared error: %.2f" % sqrt(mean_squared_error(y_test, predictions)))

Root Mean squared error: 2.86


## Extreme Gradient Boosting (XGBoost)

### XGBoost for Classification

In [0]:
# import packages
from xgboost import XGBClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [0]:
# load dataset
data = datasets.load_iris()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
xgboost_classifier = XGBClassifier()

In [0]:
# fit the model on the training set
xgboost_classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='multi:softprob', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [0]:
# make predictions on the test set
predictions = xgboost_classifier.predict(X_test)

In [0]:
# evaluate the model performance using accuracy metric
print("Accuracy: %.2f" % accuracy_score(y_test, predictions))

Accuracy: 0.95


### XGBoost for Regression

In [0]:
# import packages
from xgboost import XGBRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [0]:
# load dataset
data = datasets.load_boston()

# separate features and target
X = data.data
y = data.target

In [0]:
# split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, shuffle=True)

In [0]:
# create the model
xgboost_reg = XGBRegressor()

In [0]:
# fit the model on the training set
xgboost_reg.fit(X_train, y_train)



XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

In [0]:
# make predictions on the test set
predictions = xgboost_reg.predict(X_test)

In [0]:
# evaluate the model performance using the root mean square error metric
print("Root Mean squared error: %.2f" % sqrt(mean_squared_error(y_test, predictions)))

Root Mean squared error: 3.69
