## Apress - Industrialized Machine Learning Examples

Andreas Francois Vermeulen
2019

### This is an example add-on to a book and needs to be accepted as part of that copyright.

# Chapter 005 Example 011

In [1]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, IsolationForest
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix
import numpy as np

In [2]:
sresult=False

ncnt=5000
nfcnt=50
md=2

X, y = make_classification(n_samples=ncnt, 
                           n_features=nfcnt,
                           n_informative=(nfcnt-1), 
                           n_redundant=1, 
                           random_state=1963, 
                           shuffle=True)

## Part A - RandomForestClassifier

In [3]:
clf1 = RandomForestClassifier(n_estimators=1963,
                             criterion='entropy',
                             max_depth=md,
                             n_jobs = -1,
                             oob_score = True,
                             verbose = True,
                             bootstrap = True,
                             max_features = 'auto',
                             class_weight = 'balanced',
                             random_state=0)
clf1.fit(X, y)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    9.5s
[Parallel(n_jobs=-1)]: Done 1963 out of 1963 | elapsed:   10.3s finished


RandomForestClassifier(bootstrap=True, class_weight='balanced',
                       criterion='entropy', max_depth=2, max_features='auto',
                       max_leaf_nodes=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=1963, n_jobs=-1, oob_score=True,
                       random_state=0, verbose=True, warm_start=False)

In [4]:
print(clf1.feature_importances_)

[4.97396223e-02 9.74828467e-02 5.97330913e-04 4.07509424e-03
 1.44892833e-04 7.46150082e-02 2.42676474e-04 1.11253546e-02
 4.11941044e-03 3.12631291e-03 3.39133678e-03 1.15197872e-02
 4.68975119e-03 7.15522222e-02 1.21933898e-02 6.84408438e-04
 4.18919789e-03 9.64334219e-02 6.39028280e-03 4.12087691e-03
 2.48175259e-03 9.48618676e-02 1.79644552e-03 6.39878772e-05
 5.67609601e-03 1.04049445e-02 6.28165287e-02 4.88071197e-03
 7.00870610e-04 2.36958879e-04 0.00000000e+00 6.66422249e-04
 3.92788868e-05 5.04616792e-03 9.22286740e-03 7.12673498e-02
 9.52951654e-05 8.79350629e-03 2.25189464e-03 4.94988315e-03
 4.03290645e-02 5.37926876e-02 6.57346121e-05 1.78733976e-03
 1.17731029e-02 1.15296887e-03 8.76926937e-02 8.56873901e-03
 4.69032307e-02 1.24838378e-03]


In [5]:
print(clf1.oob_score_)

0.8066


In [6]:
y_pred=clf1.predict(X).reshape(-1,1)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.6s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 1963 out of 1963 | elapsed:    1.4s finished


In [7]:
if sresult:
    rcnt=0
    pcnt=0

    for i in range(X.shape[0]):
        rvalue=int(y[i])
        pvalue=int(y_pred[i])
        if rvalue==pvalue:
            print(X[i], rvalue,pvalue, 'True')
            rcnt+=1
        else:
            print(X[i], rvalue,pvalue, 'False')
            pcnt+=1
    
    print('True: %0.3f %% False: %0.3f %%' % ((rcnt/ncnt)*100,(pcnt/ncnt)*100))

In [8]:
print('Score %0.3f %%' % (clf1.score(X,y)*100))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    0.8s


Score 83.140 %


[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 1963 out of 1963 | elapsed:    1.3s finished


In [9]:
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
print('(tn=%d, fp=%d, fn=%d, tp=%d)' % (tn, fp, fn, tp))

(tn=2101, fp=396, fn=447, tp=2056)


## Part B - ExtraTreesClassifier

In [10]:
clf2 = ExtraTreesClassifier(n_estimators=1963,
                             criterion='entropy',
                             max_depth=md,
                             n_jobs = -1,
                             oob_score = True,
                             verbose = True,
                             bootstrap = True,
                             max_features = 'auto',
                             class_weight = 'balanced',
                             random_state=0)
clf2.fit(X, y)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:    0.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 1963 out of 1963 | elapsed:    4.1s finished


ExtraTreesClassifier(bootstrap=True, class_weight='balanced',
                     criterion='entropy', max_depth=2, max_features='auto',
                     max_leaf_nodes=None, min_impurity_decrease=0.0,
                     min_impurity_split=None, min_samples_leaf=1,
                     min_samples_split=2, min_weight_fraction_leaf=0.0,
                     n_estimators=1963, n_jobs=-1, oob_score=True,
                     random_state=0, verbose=True, warm_start=False)

In [11]:
print(clf2.feature_importances_)

[0.04794546 0.0663212  0.0016558  0.0099317  0.00078419 0.05179468
 0.00088954 0.01373917 0.008435   0.00554951 0.01327338 0.01711741
 0.01179464 0.05988521 0.02264271 0.00217359 0.00905864 0.05695083
 0.00959911 0.01250907 0.00511433 0.06079211 0.00865406 0.0031948
 0.01101608 0.01664324 0.05353967 0.00992054 0.00072473 0.00116221
 0.00030575 0.00140448 0.0009536  0.01285905 0.013292   0.06249037
 0.00081099 0.01352454 0.00781178 0.01025766 0.05524751 0.05518514
 0.00075804 0.00734788 0.0170684  0.00791823 0.06468601 0.01856507
 0.05361366 0.00308725]


In [12]:
print(clf1.oob_score_)

0.8066


In [13]:
y_pred=clf2.predict(X).reshape(-1,1)

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 1963 out of 1963 | elapsed:    1.3s finished


In [14]:
print('Score %0.3f %%' % (clf2.score(X,y)*100))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 1242 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done 1963 out of 1963 | elapsed:    1.4s finished


Score 87.080 %


In [15]:
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
print('(tn=%d, fp=%d, fn=%d, tp=%d)' % (tn, fp, fn, tp))

(tn=2205, fp=292, fn=354, tp=2149)


## Done

In [16]:
import datetime
now = datetime.datetime.now()
print('Done!',str(now))

Done! 2019-10-19 17:54:31.915453
