In [2]:
from pyod.models.auto_encoder import AutoEncoder
from data import read_dataset_v
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.utils import shuffle

In [3]:
# Load the dataset through the project-local `data.read_dataset_v` helper.
# The two results are passed below to `create_balanced_folds` as the feature
# table (`x`) and the label vector (`y`).
# NOTE(review): their exact types/schema are not visible in this notebook — confirm.
x, y = read_dataset_v()

In [7]:
def create_balanced_folds(X, Y, percentage, rs, at='target'):
    """Split ``X``/``Y`` into train and test parts with an equal-sized test slice per class.

    Every class contributes the same number of rows to the test set,
    ``ceil(size_of_smallest_class * percentage)``; the remaining rows of each
    class go to the training set. Rows are shuffled with seed ``rs`` before
    slicing. The returned parts are concatenated class by class (they are not
    re-shuffled afterwards).

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. It is copied internally; the caller's frame is never
        modified.
    Y : array-like or pandas.Series
        One class label per row of ``X``.
    percentage : float
        Fraction of the smallest class size taken from *each* class for the
        test set.
    rs : int or numpy.random.RandomState
        Seed forwarded to ``shuffle`` as ``random_state``.
    at : hashable, default 'target'
        Name of the temporary label column attached to the working copy. A
        column of ``X`` that already has this name is overwritten in the copy
        and is absent from the returned feature tables.

    Returns
    -------
    x_train, y_train, x_test, y_test
        Feature tables (without the ``at`` column) and the matching label
        Series for the train and test parts.
    """
    size_minority = min(Counter(Y).values())

    # Attach the labels to a private copy: assigning into the caller's frame
    # would leave the label column behind in their feature table.
    labelled = X.copy()
    labelled[at] = Y

    # Shuffle once so the per-class head slices below are random but seeded.
    labelled = shuffle(labelled, random_state=rs)

    # Same absolute number of test rows for every class.
    p = int(np.ceil(size_minority * percentage))
    train = []
    test = []
    for classe in labelled[at].unique():
        df_class = labelled[labelled[at] == classe]

        test.append(df_class.iloc[:p])
        train.append(df_class.iloc[p:])

    df_train = pd.concat(train)
    df_test = pd.concat(test)

    y_train = df_train[at]
    y_test = df_test[at]

    x_train = df_train.drop([at], axis=1)
    x_test = df_test.drop([at], axis=1)

    return x_train, y_train, x_test, y_test


In [54]:
# Hold out ceil(15% of the smallest class) rows from every class as the test
# set, shuffling with seed 10.
# `at=False` means the temporary label column used inside the helper is
# literally named False; the helper drops it again before returning.
x_train_raw, y_train_raw, x_test_raw, y_test_raw = create_balanced_folds(
    X=x,
    Y=y,
    percentage=0.15,
    rs=10,
    at=False,
)

In [35]:
from pyod.utils.utility import standardizer

# Standardize the train split and apply the same transform to the test split.
# NOTE(review): none of the cells visible below read X_train_norm / X_test_norm;
# the detectors are fitted on the *_raw frames — confirm this is intended.
X_train_norm, X_test_norm = standardizer(
    X=x_train_raw,
    X_t=x_test_raw,
)

In [42]:
# AutoEncoder detector: symmetric 15-10-2-10-15 hidden stack, 600 training
# epochs, and 5% of the training rows flagged as outliers.
atcdr = AutoEncoder(
    hidden_neurons=[15, 10, 2, 10, 15],
    contamination=0.05,
    epochs=600,
)
# Fitted on the features only; the labels are not passed to this detector.
atcdr.fit(x_train_raw)

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_50 (Dense)            (None, 7)                 56        
                                                                 
 dropout_43 (Dropout)        (None, 7)                 0         
                                                                 
 dense_51 (Dense)            (None, 7)                 56        
                                                                 
 dropout_44 (Dropout)        (None, 7)                 0         
                                                                 
 dense_52 (Dense)            (None, 15)                120       
                                                                 
 dropout_45 (Dropout)        (None, 15)                0         
                                                                 
 dense_53 (Dense)            (None, 10)               

AutoEncoder(batch_size=32, contamination=0.05, dropout_rate=0.2, epochs=600,
      hidden_activation='relu', hidden_neurons=[15, 10, 2, 10, 15],
      l2_regularizer=0.1,
      loss=<function mean_squared_error at 0x7fa0e5cd45e0>,
      optimizer='adam', output_activation='sigmoid', preprocessing=True,
      random_state=None, validation_size=0.1, verbose=1)

In [43]:
# Tally the labels the fitted AutoEncoder stored for the training rows
# (one count per distinct label value).
Counter(atcdr.labels_)

Counter({0: 1237, 1: 66})

In [45]:
# Apply the fitted AutoEncoder to the held-out rows.
# NOTE(review): `y_test_pred` is reassigned by the XGBOD and CBLOF cells further
# down, so its value depends on which cell ran last.
y_test_pred = atcdr.predict(x_test_raw)  # predicted labels for the test rows
y_test_score_pred = atcdr.decision_function(x_test_raw)  # per-row outlier scores
# Display the scores as the cell output.
y_test_score_pred



array([1.86318192, 2.39864551, 1.86856813, 1.7432144 , 1.13851118,
       2.2798784 , 1.7221694 , 1.91722083, 2.45006753, 1.95711604,
       3.64032817, 1.69997512, 1.9184544 , 1.26664541])

In [47]:
from pyod.models.xgbod import XGBOD

# XGBOD is fitted with the labels (unlike the AutoEncoder above).
# `n_components` was removed: it is not an XGBOD argument, so it was forwarded
# to XGBoost, which only warned that the parameter might not be used.
xgbod = XGBOD(random_state=100)
xgbod.fit(x_train_raw, y_train_raw)



Parameters: { "n_components", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.




XGBOD(base_score=0.5, booster='gbtree', colsample_bylevel=1,
   colsample_bytree=1,
   estimator_list=[KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=1, p=2,
  radius=1.0), LOF(algorithm='auto', contamination=0.1, leaf_size=30, metric='minkowski',
  metric_params=None, n_jobs=1, n_neighbors=1, no..._features=1.0,
    max_samples='auto', n_estimators=200, n_jobs=1, random_state=100,
    verbose=0)],
   gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
   min_child_weight=1, n_estimators=100, n_jobs=1, nthread=None,
   objective='binary:logistic', random_state=100, reg_alpha=0,
   reg_lambda=1, scale_pos_weight=1, silent=True,
   standardization_flag_list=[True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, True, True, True, True, True, True, True, True, True, True, True, False, False, False, False,

In [48]:
# Apply the fitted XGBOD model to the held-out rows.
# NOTE(review): `y_test_pred` / `y_test_scores` are shared with the other
# detector cells, so they hold the result of whichever cell ran last.
y_test_pred = xgbod.predict(x_test_raw)  # outlier labels (0 or 1)
y_test_scores = xgbod.decision_function(x_test_raw) # outlier scores
# Display the scores as the cell output.
y_test_scores  

array([0.05431375, 0.0226093 , 0.03901829, 0.01540772, 0.00375976,
       0.0326301 , 0.0063637 , 0.02441703, 0.11061703, 0.04983323,
       0.04258086, 0.02165323, 0.03383661, 0.02127785], dtype=float32)

In [32]:
# Labels XGBOD stored for the training rows during fit, tallied per value.
# NOTE(review): this cell's execution count (32) is lower than the cell that
# creates `xgbod` (47); on a fresh kernel it only works when run in file order.
y_train_pred = xgbod.labels_
Counter(y_train_pred)

Counter({0: 1287, 1: 16})

In [55]:
from pyod.models.cblof import CBLOF
# CBLOF with all default settings.
# NOTE(review): no random_state is passed (the repr below shows
# random_state=None), so results may differ between runs — consider seeding
# once a seed that yields a valid cluster separation is confirmed.
cblof = CBLOF() 
# Labels are passed alongside the features.
# NOTE(review): CBLOF is presumably unsupervised and ignores them — confirm against the pyod docs.
cblof.fit(x_train_raw, y_train_raw)



CBLOF(alpha=0.9, beta=5, check_estimator=False, clustering_estimator=None,
   contamination=0.1, n_clusters=8, n_jobs=None, random_state=None,
   use_weights=False)

In [57]:
# Apply the fitted CBLOF model to the held-out rows.
# NOTE(review): overwrites the `y_test_pred` / `y_test_scores` produced by the
# earlier detector cells.
y_test_pred = cblof.predict(x_test_raw)  # outlier labels (0 or 1)
y_test_scores = cblof.decision_function(x_test_raw) # outlier scores
# Display the predicted labels as the cell output.
y_test_pred

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0])