In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.datasets import load_breast_cancer

# Random Forests

In [None]:
# Load the breast cancer dataset and separate the feature matrix from the labels.
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target

# Hold out part of the data for evaluation. No test_size is given, so
# train_test_split falls back to its default 75% / 25% split; random_state
# fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# A random forest trains many separate decision trees and combines their
# predictions, which makes the ensemble less prone to overfitting than any
# single tree. Note that scikit-learn combines the trees by averaging their
# predicted class probabilities (rather than counting one hard vote per tree)
# and predicts the class with the highest averaged probability.

# max_features=8: each tree considers only 8 features when looking for the
# best split. fit() returns the fitted estimator, so it can be chained here.
clf = RandomForestClassifier(max_features=8, random_state=0).fit(X_train, y_train)

# Report mean accuracy on the data the model was fit on and on the held-out data;
# a large gap between the two numbers indicates overfitting.
print(
    'Accuracy of RF classifier on training set: {:.2f}'
    .format(clf.score(X_train, y_train))
)
print(
    'Accuracy of RF classifier on test set: {:.2f}'
    .format(clf.score(X_test, y_test))
)

# Gradient Boosted Decision Trees

In [None]:
# Gradient boosted decision trees differ from random forests in that the trees are
# built one after another: each new tree is fit to correct the errors made by the
# trees that came before it.

# Hyperparameter settings to compare, in reporting order. An empty dict means
# "use the scikit-learn defaults", which are learning_rate=0.1 and max_depth=3.
gbdt_settings = [
    {},
    {'learning_rate': 0.01, 'max_depth': 2},
]

for run_index, params in enumerate(gbdt_settings):
    if run_index > 0:
        print()  # blank line separating one report from the next

    # `clf` is deliberately the loop variable: after the loop it refers to the
    # last model fitted, exactly as it did when the cell was written out by hand.
    clf = GradientBoostingClassifier(random_state=0, **params)
    clf.fit(X_train, y_train)

    # Build the header from the estimator's own parameters instead of a
    # hand-typed string, so the label can never disagree with the model.
    print("lr = {}, maxd = {}".format(clf.learning_rate, clf.max_depth))
    print('Accuracy of GBDT classifier on training set: {:.2f}'.format(clf.score(X_train, y_train)))
    print('Accuracy of GBDT classifier on test set: {:.2f}'.format(clf.score(X_test, y_test)))