In [2]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

X = iris.data[:, 2:] # length and width of petals
y = iris.target

decisionTree = DecisionTreeClassifier(max_depth=2)
decisionTree.fit(X, y)

In [4]:
from sklearn.tree import export_graphviz

export_graphviz(decisionTree,
                out_file="iris_tree.dot",
                feature_names=iris.feature_names[2:],
                class_names=iris.target_names,
                rounded=True, 
                filled=True)

In [5]:
decisionTree.predict_proba([[5, 1.5]])

array([[0.        , 0.90740741, 0.09259259]])

In [6]:
decisionTree.predict([[5, 1.5]])

array([1])

In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

logistic_clf = LogisticRegression()
random_forest_clf =  RandomForestClassifier()
svm_clf = SVC()

# ensemble time!
voting_clf = VotingClassifier(estimators=[
    ('lr', logistic_clf),
    ('rf', random_forest_clf),
    ('svc', svm_clf)
], voting="hard")

voting_clf.fit(X, y)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_ideal, y_train, y_ideal = train_test_split(X, y)
X_cv, X_test, y_cv, y_test = train_test_split(X_ideal, y_ideal)

In [10]:
from sklearn.ensemble import BaggingClassifier

baggingClassifier = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100,
    bootstrap=True, # bootstrap = True means we are sampling WITH replacement (bagging). bootstrap = False means we'd be sampling WITHOUT replacement (pasting)
    n_jobs = -1 # number of CPU cores to use. -1 means use all available CPU cores
)

baggingClassifier.fit(X_train, y_train)

In [11]:
baggingClassifier.predict(X_test)

array([0, 1, 1, 0, 1, 2, 0, 1, 0, 2])

In [38]:
from sklearn.metrics import accuracy_score

accuracy_score(baggingClassifier.predict(X_cv), y_cv)

1.0

In [15]:
from sklearn.ensemble import AdaBoostClassifier

ada_classifier = AdaBoostClassifier(DecisionTreeClassifier(max_depth=2), n_estimators=128, algorithm='SAMME.R', learning_rate=0.5)
ada_classifier.fit(X_train, y_train)

In [28]:
from sklearn.tree import DecisionTreeRegressor

# Gradient Boosting concept

n_trees = 3
tree_regs = [None] * n_trees

tree_regs[0] = DecisionTreeRegressor(max_depth=2)
tree_regs[0].fit(X_train, y_train)
y_current = y_train

for i in range(1, n_trees):
    tree_regs[i] = DecisionTreeRegressor(max_depth=2)
    
    y_current = y_current - tree_regs[i - 1].predict(X_train)
    tree_regs[i].fit(X_train, y_current)



In [30]:
y_pred = sum(tree.predict(X_cv) for tree in tree_regs)
print(y_pred)

[ 2.          1.04824356 -0.0088993  -0.0088993   1.53407249 -0.0088993
  2.          1.00656716  1.96264392  1.00656716  1.04824356  1.00656716
 -0.0088993  -0.0088993  -0.0088993   1.04824356  1.04824356 -0.0088993
  2.          1.04824356  2.          1.96264392 -0.0088993  -0.0088993
  1.04824356 -0.0088993  -0.0088993  -0.0088993 ]


In [64]:
from sklearn.ensemble import GradientBoostingRegressor

gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=30, learning_rate=1.0)
gbrt.fit(X, y)

In [65]:
import numpy as np
from sklearn.metrics import mean_squared_error

def optimal_num_trees(gradient_boost_reg, X_train, X_val, y_train, y_val):
    gradient_boost_reg.fit(X_train, y_train)
    
    # use staged_predict() in order to find optimal number of trees
    errors = [mean_squared_error(y_val, y_pred) for y_pred in gradient_boost_reg.staged_predict(X_val)]
    
    return np.argmin(errors) + 1


In [66]:
best_n_estimators = optimal_num_trees(GradientBoostingRegressor(max_depth=2, n_estimators=30, learning_rate=1.0), X_train, X_cv, y_train, y_cv)
print(f"optimal number of estimators: {best_n_estimators}")

optimal_gradboost_reg = GradientBoostingRegressor(max_depth=2, n_estimators=best_n_estimators)
optimal_gradboost_reg.fit(X_train, y_train)

optimal number of estimators: 22


In [67]:
mean_squared_error(y_cv, gbrt.predict(X_cv))

6.606315831664515e-06

In [68]:
mean_squared_error(y_cv, optimal_gradboost_reg.predict(X_cv))

0.01398886486868693

In [69]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml("mnist_784", version=1)

In [70]:
import matplotlib.pyplot as plt

X = mnist["data"].to_numpy()
y = mnist["target"].astype(np.uint8)

In [72]:
X_train = X[0:50000]
y_train = y[0:50000]

X_cv = X[50000:60000]
y_cv = y[50000:60000]

X_test = X[60000:]
y_test = y[60000:]

In [86]:
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import SparseCategoricalCrossentropy

rf_classifier = RandomForestClassifier(n_estimators=128, criterion="entropy")
svm_classifier = SVC(kernel='rbf', C=20, probability=True)
extra_trees_clf = ExtraTreesClassifier(n_estimators=90, criterion="gini")

ann_clf = Sequential([
    Dense(28, activation="tanh"),
    Dense(14, activation="relu"),
    Dense(10, activation="linear")
])

ann_clf.compile(loss=SparseCategoricalCrossentropy(from_logits=True), optimizer="adam")

In [74]:
rf_classifier.fit(X_train, y_train)
svm_classifier.fit(X_train, y_train)
extra_trees_clf.fit(X_train, y_train)
ann_clf.fit(X_train, y_train)



<keras.callbacks.History at 0x27c0ba26710>

In [76]:
ann_clf.predict(X_cv)



array([[-3.9210749 , -1.827042  ,  0.5893475 , ..., -7.184546  ,
         1.0236403 , -3.462791  ],
       [-3.8795195 , -2.403033  , -1.1862909 , ..., -3.9193647 ,
         2.037393  , -1.8166441 ],
       [ 0.316308  , -3.7934346 ,  0.96625316, ..., -1.7287471 ,
        -3.9398944 , -1.4988756 ],
       ...,
       [ 0.97462887, -3.5712934 , -2.4642565 , ..., -2.2767558 ,
         2.811289  , -1.4239818 ],
       [ 1.9875188 , -3.6504054 ,  0.69313776, ..., -1.6141804 ,
        -2.483895  , -2.4330978 ],
       [-0.78028107, -4.711868  , -2.0219183 , ..., -5.945429  ,
         2.8253288 , -1.8425511 ]], dtype=float32)

In [78]:
import tensorflow as tf

ann_clf.predict(tf.convert_to_tensor([X_cv[0]], dtype=tf.float64))



array([[-3.921072  , -1.8270508 ,  0.5893477 ,  1.1326705 , -2.7784014 ,
        -0.31802616, -2.5882146 , -7.184548  ,  1.0236413 , -3.4627874 ]],
      dtype=float32)

In [81]:
print(y_cv)

50000    3
50001    8
50002    6
50003    9
50004    6
        ..
59995    8
59996    3
59997    5
59998    6
59999    8
Name: class, Length: 10000, dtype: uint8


In [83]:
from sklearn.metrics import mean_squared_error as mse

print("RandomForest Error: " + str(mse(y_cv, rf_classifier.predict(X_cv))))
print("SVM Error: " + str(mse(y_cv, svm_classifier.predict(X_cv))))
print("ExtraTrees Error: " + str(mse(y_cv, extra_trees_clf.predict(X_cv))))

RandomForest Error: 0.5395
SVM Error: 0.316
ExtraTrees Error: 0.4921


In [87]:
voting_classifier = VotingClassifier(estimators=[
    ('rf', rf_classifier),
    ('svm', svm_classifier),
    ('extratrees', extra_trees_clf)
], voting="soft")

voting_classifier.fit(X_train, y_train)

In [88]:
mse(y_cv, voting_classifier.predict(X_cv))

0.3144