In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons
from sklearn.metrics import accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [26]:
X, y = make_moons(n_samples=500, noise=0.30, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [27]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard')

voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', RandomFo...f', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='hard', weights=None)

In [28]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.88
SVC 0.888
VotingClassifier 0.904




In [31]:
log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='soft')

voting_clf.fit(X_train, y_train)



VotingClassifier(estimators=[('lr', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)), ('rf', RandomFo...bf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False))],
         flatten_transform=None, n_jobs=None, voting='soft', weights=None)

In [32]:
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.888
SVC 0.888
VotingClassifier 0.92




In [38]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1
)

bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)
print(accuracy_score(y_test, y_pred))

0.92


In [43]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8986666666666666

In [44]:
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [45]:
bag_clf.oob_decision_function_

array([[0.30769231, 0.69230769],
       [0.35227273, 0.64772727],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.07894737, 0.92105263],
       [0.43010753, 0.56989247],
       [0.01764706, 0.98235294],
       [1.        , 0.        ],
       [0.97326203, 0.02673797],
       [0.79329609, 0.20670391],
       [0.00502513, 0.99497487],
       [0.79569892, 0.20430108],
       [0.8342246 , 0.1657754 ],
       [0.95744681, 0.04255319],
       [0.03333333, 0.96666667],
       [0.        , 1.        ],
       [0.98823529, 0.01176471],
       [0.96875   , 0.03125   ],
       [0.99438202, 0.00561798],
       [0.01197605, 0.98802395],
       [0.36111111, 0.63888889],
       [0.91219512, 0.08780488],
       [1.        , 0.        ],
       [0.95604396, 0.04395604],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.62765957, 0.37234043],
       [0.

In [54]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.92

In [55]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
y_pred = rnd_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.904

In [51]:
from sklearn.datasets import load_iris
iris = load_iris()

rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
rnd_clf.fit(iris["data"], iris["target"])

for name, score in zip(iris["feature_names"], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.0935777366329314
sepal width (cm) 0.02190735522250755
petal length (cm) 0.449369702624257
petal width (cm) 0.43514520552030395


In [63]:
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1), n_estimators=200,
    algorithm="SAMME.R", learning_rate=0.5
)
ada_clf.fit(X_train, y_train)
y_pred= ada_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.896

In [65]:
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=3, learning_rate=1.0)
gbrt.fit(X, y)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=1.0, loss='ls', max_depth=2, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=3, n_iter_no_change=None, presort='auto',
             random_state=None, subsample=1.0, tol=0.0001,
             validation_fraction=0.1, verbose=0, warm_start=False)

# Exercises

In [3]:
#8
from sklearn.datasets import fetch_openml
import numpy as np
import os
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt

def sort_by_target(mnist):  #From Geron HoM Github
    reorder_train = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[:60000])]))[:, 1]
    reorder_test = np.array(sorted([(target, i) for i, target in enumerate(mnist.target[60000:])]))[:, 1]
    mnist.data[:60000] = mnist.data[reorder_train]
    mnist.target[:60000] = mnist.target[reorder_train]
    mnist.data[60000:] = mnist.data[reorder_test + 60000]
    mnist.target[60000:] = mnist.target[reorder_test + 60000]

In [4]:
mnist = fetch_openml('mnist_784', version=1, cache=True)
mnist.target = mnist.target.astype(np.int8) 
sort_by_target(mnist) 

X,y = mnist["data"],mnist["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train,y_train, random_state=42)

In [5]:
from sklearn.ensemble import RandomForestClassifier

RF = RandomForestClassifier(random_state = 42)
RF.fit(X_train, y_train)

y_pred = RF.predict(X_val)
accuracy_score(y_val, y_pred)



0.9376761904761904

In [6]:
from sklearn.ensemble import ExtraTreesClassifier

ET = ExtraTreesClassifier(random_state = 42)
ET.fit(X_train, y_train)

y_pred = ET.predict(X_val)
accuracy_score(y_val, y_pred)



0.942247619047619

In [12]:
from sklearn.svm import LinearSVC

SVM = LinearSVC(random_state = 42, verbose = 1)
SVM.fit(X_train, y_train)

y_pred = SVM.predict(X_val)
accuracy_score(y_val, y_pred)

[LibLinear]



0.8663619047619048

In [22]:
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

extra_clf = ExtraTreesClassifier()
rnd_clf = RandomForestClassifier()
svm_clf = LinearSVC()
mlp = MLPClassifier()

voting_clf_h = VotingClassifier(
    estimators=[('xt', extra_clf), ('rf', rnd_clf), ('svc', svm_clf),('mlp', mlp)],
    voting='hard')

voting_clf_h.fit(X_train, y_train)
y_pred = voting_clf_h.predict(X_val)
accuracy_score(y_val, y_pred)



0.9558095238095238

In [24]:
voting_clf_h = VotingClassifier(
    estimators=[('xt', extra_clf), ('rf', rnd_clf),('mlp', mlp)],
    voting='hard')

voting_clf_h.fit(X_train, y_train)
y_pred = voting_clf_h.predict(X_val)
accuracy_score(y_val, y_pred)



0.9619047619047619

In [25]:
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

extra_clf = ExtraTreesClassifier()
rnd_clf = RandomForestClassifier()
svm_clf = LinearSVC()
mlp = MLPClassifier()

voting_clf_s = VotingClassifier(
    estimators=[('xt', extra_clf), ('rf', rnd_clf),('mlp', mlp)],
    voting='soft')

voting_clf_s.fit(X_train, y_train)
y_pred = voting_clf_s.predict(X_val)
accuracy_score(y_val, y_pred)



0.9651809523809524

In [18]:
y_pred = RF.predict(X_test)
accuracy_score(y_test, y_pred)

0.9401714285714285

In [19]:
y_pred = ET.predict(X_test)
accuracy_score(y_test, y_pred)

0.9453142857142857

In [20]:
y_pred = SVM.predict(X_test)
accuracy_score(y_test, y_pred)

0.8708571428571429

In [31]:
from sklearn.neural_network import MLPClassifier

MLP = MLPClassifier()
MLP.fit(X_train,y_train)
y_pred = MLP.predict(X_test)
accuracy_score(y_test, y_pred)

0.9640571428571428

In [26]:
y_pred = voting_clf_h.predict(X_test)
accuracy_score(y_test, y_pred)

0.9621714285714286

In [27]:
y_pred = voting_clf_s.predict(X_test)
accuracy_score(y_test, y_pred)

0.9651428571428572

In [64]:
#9
val_pred_rf = RF.predict(X_val).reshape((-1, 1)) 
val_pred_et = ET.predict(X_val).reshape((-1, 1)) 
val_pred_svm = SVM.predict(X_val).reshape((-1, 1)) 
val_pred_mlp = MLP.predict(X_val).reshape((-1, 1)) 
val_pred_rf

array([[4],
       [0],
       [4],
       ...,
       [9],
       [7],
       [9]], dtype=int8)

In [71]:
new_train = np.concatenate((val_pred_rf,val_pred_et,val_pred_svm,val_pred_mlp),axis=1)
new_train

array([[4, 4, 4, 4],
       [0, 0, 0, 0],
       [4, 4, 4, 4],
       ...,
       [9, 9, 9, 9],
       [7, 7, 7, 7],
       [9, 9, 7, 9]], dtype=int8)

In [74]:
y_val.reshape((-1, 1)) 

array([[4],
       [0],
       [4],
       ...,
       [9],
       [7],
       [9]], dtype=int8)

In [75]:
mlp_blender = MLPClassifier()

mlp_blender.fit(new_train,y_val)



MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [76]:
#Test stack
#Predict test set values to be fed to blender 
val_pred_rf = RF.predict(X_test).reshape((-1, 1)) 
val_pred_et = ET.predict(X_test).reshape((-1, 1)) 
val_pred_svm = SVM.predict(X_test).reshape((-1, 1)) 
val_pred_mlp = MLP.predict(X_test).reshape((-1, 1))

new_test = np.concatenate((val_pred_rf,val_pred_et,val_pred_svm,val_pred_mlp),axis=1)

#Aggregate predictions with trained blender
blend_predict = mlp_blender.predict(new_test)


In [77]:
accuracy_score(blend_predict,y_test)

0.9557142857142857