## LOF算法实践（pyod）

In [4]:
import numpy as np
import pandas as pd
import pyod
from sklearn.model_selection import train_test_split

In [5]:
# 打印cell中的多个输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [6]:
# 数据源来自kaggle的信用卡欺诈数据 https://www.kaggle.com/mlg-ulb/creditcardfraud
path = 'dataverse_files/'
data = pd.read_csv(path+r"creditcard.csv")

In [7]:
data.shape

(284807, 31)

In [8]:
X = data.iloc[:,data.columns != "Class"]
y = data.iloc[:,data.columns == "Class"]
Xtrain,Xtest,Ytrain,Ytest = train_test_split(X,y,test_size=0.3)

In [9]:
from pyod.models.lof import LOF
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize

In [10]:
clf_name="lof"
clf=LOF(contamination=492/28487)
# clf=LOF()
clf.fit(Xtrain)

LOF(algorithm='auto', contamination=0.017271035911117352, leaf_size=30,
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=20, p=2)

In [11]:
y_train_pred = clf.labels_
y_train_scores = clf.decision_scores_
y_test_pred = clf.predict(Xtest)
y_test_scores = clf.decision_function(Xtest)

In [12]:
# evaluate and print the results
print("\nOn Training Data:")
evaluate_print(clf_name, Ytrain, y_train_scores)
print("\nOn Test Data:")
evaluate_print(clf_name, Ytest, y_test_scores)

from sklearn.metrics import precision_score
precision_score(Ytrain, y_train_pred, average='binary') 

from sklearn.metrics import classification_report
print(classification_report(Ytrain, y_train_pred))
print(classification_report(Ytest, y_test_pred))


On Training Data:
lof ROC:0.7506, precision @ rank n:0.0377

On Test Data:
lof ROC:0.6911, precision @ rank n:0.0136


0.014808362369337979

              precision    recall  f1-score   support

           0       1.00      0.98      0.99    199019
           1       0.01      0.15      0.03       345

    accuracy                           0.98    199364
   macro avg       0.51      0.57      0.51    199364
weighted avg       1.00      0.98      0.99    199364

              precision    recall  f1-score   support

           0       1.00      0.98      0.99     85296
           1       0.01      0.12      0.02       147

    accuracy                           0.98     85443
   macro avg       0.50      0.55      0.51     85443
weighted avg       1.00      0.98      0.99     85443



## LOF算法实践（sklearn）

In [15]:
from sklearn.neighbors import LocalOutlierFactor

In [14]:
# 数据源来自kaggle的信用卡欺诈数据 https://www.kaggle.com/mlg-ulb/creditcardfraud
path = 'dataverse_files/'
data = pd.read_csv(path+r"creditcard.csv")

X = data.iloc[:,data.columns != "Class"]
y = data.iloc[:,data.columns == "Class"]
X_train,X_test,Y_train,Y_test = train_test_split(X,y,test_size=0.3)

In [16]:
# fit the model for novelty detection (novelty=True)
clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=0.1)
# clf = LocalOutlierFactor(n_neighbors=20, novelty=True, contamination=492/28487)
clf.fit(X_train)

LocalOutlierFactor(contamination=0.017271035911117352, novelty=True)

In [17]:
# DO NOT use predict, decision_function and score_samples on X_train as this
# would give wrong results but only on new unseen data (not used in X_train),
# e.g. X_test, X_outliers or the meshgrid
y_pred_test = clf.predict(X_test[Y_test['Class']==0])
y_pred_outliers = clf.predict(X_test[Y_test['Class']==1])
n_error_test = y_pred_test[y_pred_test == -1].size
n_error_outliers = y_pred_outliers[y_pred_outliers == 1].size

In [18]:
# 将非异常点识别为异常点
n_error_test
# 将异常点识别为非异常点
n_error_outliers

1473

133