In [49]:
import numpy as np
import os
import pickle
from bayes_opt import BayesianOptimization
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix
# import lsanomaly
import copy

np.random.seed(1)

In [43]:
with open("X.pickle","rb") as f:
    scaler = pickle.load(f)
    X_train = pickle.load(f)
    X_val = pickle.load(f)
    X_test = pickle.load(f)
    
idx = np.random.randint(0, X_test.shape[0], X_test.shape[0])

X_val = X_val[idx]
X_test = X_test[idx]

best_score = 0
best_params = None
best_model = None
y_val = None
y_test = None

In [44]:
X_val.shape

(8424, 14)

In [45]:
def get_diff_score(n_estimators, max_features, bootstrap):
    global best_score, best_params, best_model, y_val, y_test
    
    if max_features > 1:
        max_features = 1.
    n_estimators = int(round(n_estimators))
    bootstrap = bool(round(bootstrap))
    
    clf = IsolationForest(n_estimators=n_estimators, max_features=max_features,
                          bootstrap=bootstrap, n_jobs=os.cpu_count()//2,
                          random_state=0, contamination=0)
        
    clf.fit(X_train)
    
    scaler = MinMaxScaler()
    y_train = scaler.fit_transform(clf.score_samples(X_train).reshape(-1, 1)).reshape(-1)
    y_val = scaler.transform(clf.score_samples(X_val).reshape(-1, 1)).reshape(-1)
    y_test = scaler.transform(clf.score_samples(X_test).reshape(-1, 1)).reshape(-1)
    
    threshold = np.percentile(y_train, 50)
    
    len_val = y_val.shape[0]
    len_test = y_test.shape[0]
    
    TP = y_test[y_test < threshold].shape[0]
    FP = y_val[y_val < threshold].shape[0]
    TN = len_val - FP
    
    acc = (TP + TN) / (len_val + len_test)
    precision = TP / (TP + FP)
    recall = TP / len_test
    f1 = (2*precision*recall) / (precision + recall)
    
    score = 100 * acc
    
    print(f"precision {precision}, recall {recall}")
    print(f"acc {acc}")
    
    if score > best_score:
        best_score = score
        best_model = copy.deepcopy(clf)
        best_params = best_model.get_params()
    
    return score


In [46]:
pbounds = {'n_estimators': (50, 3500), 'max_features': (0.01, 1.001),
           'bootstrap': (0, 1)}
optimizer = BayesianOptimization(f=get_diff_score, pbounds=pbounds, random_state=1)
optimizer.maximize(init_points=15, n_iter=100)
print(best_params)

with open("best_model_if.pickle", "wb") as f:
    pickle.dump(best_model, f)

|   iter    |  target   | bootstrap | max_fe... | n_esti... |
-------------------------------------------------------------
precision 0.5331223398450289, recall 0.5798907882241215
acc 0.5360280151946819
| [0m 1       [0m | [0m 53.6    [0m | [0m 0.417   [0m | [0m 0.7238  [0m | [0m 50.39   [0m |
precision 0.5517932489451477, recall 0.6209639126305793
acc 0.5582858499525166
| [95m 2       [0m | [95m 55.83   [0m | [95m 0.3023  [0m | [95m 0.1554  [0m | [95m 368.6   [0m |
precision 0.5429864253393665, recall 0.5982905982905983
acc 0.5473646723646723
| [0m 3       [0m | [0m 54.74   [0m | [0m 0.1863  [0m | [0m 0.3525  [0m | [0m 1.419e+0[0m |
precision 0.5381019676051745, recall 0.5876068376068376
acc 0.5416073124406457
| [0m 4       [0m | [0m 54.16   [0m | [0m 0.5388  [0m | [0m 0.4254  [0m | [0m 2.414e+0[0m |
precision 0.53323810570267, recall 0.5760921177587844
acc 0.53590930674264
| [0m 5       [0m | [0m 53.59   [0m | [0m 0.2045  [0m | [0m 0.880

precision 0.552054794520548, recall 0.6219135802469136
acc 0.558641975308642
| [0m 47      [0m | [0m 55.86   [0m | [0m 0.1032  [0m | [0m 0.06663 [0m | [0m 1.325e+0[0m |
precision 0.5563636363636364, recall 0.6356837606837606
acc 0.5643993352326686
| [0m 48      [0m | [0m 56.44   [0m | [0m 0.3536  [0m | [0m 0.09243 [0m | [0m 453.1   [0m |
precision 0.5473051365889674, recall 0.6159781576448243
acc 0.5532407407407407
| [0m 49      [0m | [0m 55.32   [0m | [0m 0.04217 [0m | [0m 0.1258  [0m | [0m 1.781e+0[0m |
precision 0.5563738201431386, recall 0.6367521367521367
acc 0.5645180436847104
| [0m 50      [0m | [0m 56.45   [0m | [0m 0.9912  [0m | [0m 0.109   [0m | [0m 368.8   [0m |
precision 0.542295580410468, recall 0.5928300094966762
acc 0.5462369420702754
| [0m 51      [0m | [0m 54.62   [0m | [0m 0.899   [0m | [0m 0.3309  [0m | [0m 368.7   [0m |
precision 0.5517932489451477, recall 0.6209639126305793
acc 0.5582858499525166
| [0m 52      [0m

precision 0.5340008802816901, recall 0.5760921177587844
acc 0.5366809116809117
| [0m 94      [0m | [0m 53.67   [0m | [0m 0.5957  [0m | [0m 0.9519  [0m | [0m 2.1e+03 [0m |
precision 0.5382697311361665, recall 0.5893874643874644
acc 0.5419040835707503
| [0m 95      [0m | [0m 54.19   [0m | [0m 0.8311  [0m | [0m 0.8428  [0m | [0m 1.904e+0[0m |
precision 0.5569489066224479, recall 0.6379392212725546
acc 0.565230294396961
| [0m 96      [0m | [0m 56.52   [0m | [0m 0.5549  [0m | [0m 0.02606 [0m | [0m 371.1   [0m |
precision 0.5500688778213415, recall 0.6162155745489079
acc 0.5560897435897436
| [0m 97      [0m | [0m 55.61   [0m | [0m 0.7926  [0m | [0m 0.1496  [0m | [0m 2.783e+0[0m |
precision 0.552457118804588, recall 0.6232193732193733
acc 0.55917616334283
| [0m 98      [0m | [0m 55.92   [0m | [0m 0.7058  [0m | [0m 0.1942  [0m | [0m 371.2   [0m |
precision 0.55806953814219, recall 0.6382953466286799
acc 0.5664173789173789
| [0m 99      [0m | 

In [47]:
with open("best_model_if.pickle", "rb") as f:
    best_model = pickle.load(f)

In [48]:
scaler = MinMaxScaler()
y_train = scaler.fit(best_model.score_samples(X_train).reshape(-1, 1))
y_val = scaler.transform(best_model.score_samples(X_val).reshape(-1, 1)).reshape(-1)
y_test = scaler.transform(best_model.score_samples(X_test).reshape(-1, 1)).reshape(-1)

len_val = y_val.shape[0]
len_test = y_test.shape[0]

TP = y_test[y_test < 0.8746].shape[0]
FP = y_val[y_val < 0.8746].shape[0]
FN = y_test.shape[0] - TP
TN = y_val.shape[0] - FP

In [58]:
print(TP, FP, TN, FN)
print("False positive rate: ", FP/(TP + TN))

4458 3398 5026 3966
False positive rate:  0.3582876423450021
