In [47]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pyod.models.xgbod import XGBOD
from pyod.models.iforest import IForest
from pyod.utils.data import evaluate_print
from pyod.utils.example import visualize
from MulticoreTSNE import MulticoreTSNE as TSNE
import os
from sklearn.model_selection import train_test_split
from pyod.models.knn import KNN
from sklearn.metrics import accuracy_score,recall_score

## 一、Wave Benchmarks

### 合并数据集，得到大概是9万条数据

In [48]:
path = '.\\wave_benchmarks\\wave\\benchmarks'
data_wave = pd.read_csv('.\\wave_benchmarks\\wave\\benchmarks\\wave_benchmark_0001.csv')
for i, file_name in enumerate(os.listdir('.\\wave_benchmarks\\wave\\benchmarks')[:30]):
    data_wave_i = pd.read_csv(path + '\\' + file_name)
    data_wave = pd.concat([data_wave, data_wave_i], ignore_index=True, sort=False)
print(len(data_wave))

93310


### 数据摘要

In [49]:
data_wave.head(5)

Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,V,V.1,V.2,V.3,...,noise..54,noise..55,noise..56,noise..57,noise..58,noise..59,noise..60,noise..61,noise..62,noise..63
0,wave_point_0242,wave,multiclass,2,0.00113,nominal,0.440395,-0.293023,-1.374141,-2.382361,...,,,,,,,,,,
1,wave_point_4490,wave,multiclass,0,0.47435,anomaly,0.093905,-0.093717,1.891936,1.002227,...,,,,,,,,,,
2,wave_point_0454,wave,multiclass,2,0.000153,nominal,1.123475,-0.558765,0.688201,-1.336601,...,,,,,,,,,,
3,wave_point_2033,wave,multiclass,0,0.354121,anomaly,1.30167,-1.479368,-1.087939,-1.0257,...,,,,,,,,,,
4,wave_point_2294,wave,multiclass,0,0.349918,anomaly,-0.896067,-0.055754,-0.38085,1.221271,...,,,,,,,,,,


### 去除属性对应的缺失值

In [50]:
print("缺失值总数：", len(data_wave.dropna(axis=1)))
data_wave = data_wave.dropna(axis=1)
data_wave.head(5)

缺失值总数： 93310


Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,V,V.1,V.2,V.3,...,V.11,V.12,V.13,V.14,V.15,V.16,V.17,V.18,V.19,V.20
0,wave_point_0242,wave,multiclass,2,0.00113,nominal,0.440395,-0.293023,-1.374141,-2.382361,...,0.235635,0.58192,0.745048,0.674833,0.397361,0.008963,1.677061,0.800549,-0.441398,-1.58327
1,wave_point_4490,wave,multiclass,0,0.47435,anomaly,0.093905,-0.093717,1.891936,1.002227,...,-0.134625,-1.791554,-0.698042,-0.59822,-0.894968,-0.823398,0.799381,-0.744561,0.418648,-1.613359
2,wave_point_0454,wave,multiclass,2,0.000153,nominal,1.123475,-0.558765,0.688201,-1.336601,...,1.53816,1.629397,1.017757,-0.241567,-0.094166,0.272444,-1.012604,-1.337549,-0.885293,-0.219266
3,wave_point_2033,wave,multiclass,0,0.354121,anomaly,1.30167,-1.479368,-1.087939,-1.0257,...,-0.167684,1.193453,1.012076,2.022189,3.236067,2.170706,0.990489,2.095101,2.767593,1.435593
4,wave_point_2294,wave,multiclass,0,0.349918,anomaly,-0.896067,-0.055754,-0.38085,1.221271,...,-1.536327,-1.500924,-0.976433,-0.261381,-0.34269,-1.571924,-0.40389,-0.56917,-1.181223,-0.640502


###  异常数据一共31204条， 正常数据62106条

In [51]:
data_wave['ground.truth'].value_counts()

nominal    62106
anomaly    31204
Name: ground.truth, dtype: int64

In [52]:
def anomally_target(df):
    if df['ground.truth'] == 'anomaly':
        return 1
    else:
        return 0
data_wave_x = data_wave.loc[:, 'V':'V.20']
data_wave['is_anomaly'] = data_wave.apply(anomally_target, axis=1)
data_wave_y = data_wave.loc[:, 'is_anomaly']

In [53]:
data_wave_x_train, data_wave_x_test, data_wave_y_train, data_wave_y_test = train_test_split(data_wave_x, data_wave_y, test_size=0.2, random_state=0)

In [54]:
assert len(data_wave_x_train) == len(data_wave_y_train)

In [55]:
print("训练集数和验证集数分别是：", len(data_wave_x_train), len(data_wave_x_test))

训练集数和验证集数分别是： 74648 18662


### 使用knn进行训练

In [56]:
clf_name = 'KNN'
clf_knn = KNN()
clf_knn.fit(data_wave_x_train)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

In [57]:
data_wave_y_train_pred = clf_knn.labels_
data_wave_y_train_scores = clf_knn.decision_scores_

In [58]:
print(data_wave_y_train_scores)
print(data_wave_y_train_pred)

[0. 0. 0. ... 0. 0. 0.]
[0 0 0 ... 0 0 0]


In [60]:
data_wave_y_test_pred = clf_knn.predict(data_wave_x_test)  # 返回未知数据上的分类标签 (0: 正常值, 1: 异常值)
data_wave_y_test_scores = clf_knn.decision_function(data_wave_x_test)
print(data_wave_y_test_pred)
print(data_wave_y_test_scores)

[0 0 0 ... 0 0 0]
[0. 0. 0. ... 0. 0. 0.]


In [61]:
evaluate_print(clf_name,data_wave_y_test , data_wave_y_test_scores)

KNN ROC:0.4998, precision @ rank n:0.0


In [66]:
acc, recall =  accuracy_score(data_wave_y_test,data_wave_y_test_pred),recall_score(data_wave_y_test, data_wave_y_test_pred)
print(acc, recall)

0.665148429964634 0.0


## Wine Benchmarks¶


In [82]:
path = '.\\wine_benchmarks\\wine\\benchmarks'
data_wine = pd.read_csv('.\\wine_benchmarks\\wine\\benchmarks\\wine_benchmark_0001.csv')
for i, file_name in enumerate(os.listdir('.\\wine_benchmarks\\wine\\benchmarks')[1:20]):
    data_wine_i = pd.read_csv(path + '\\' + file_name)
    data_wine = pd.concat([data_wine, data_wine_i], ignore_index=True, sort=False)
print(len(data_wine))

74060


### 数据摘要


In [83]:
data_wine.head(5)

Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,...,noise..24,noise..25,noise..26,noise..27,noise..28,noise..29,noise..30,noise..31,noise..32,noise..33
0,wine_point_3594,wine,regression,7,0.050492,nominal,-1.245962,-0.362411,-0.265853,-0.261304,...,,,,,,,,,,
1,wine_point_5089,wine,regression,5,0.082237,anomaly,0.75954,0.973867,0.215849,-0.53454,...,,,,,,,,,,
2,wine_point_1912,wine,regression,6,0.290201,nominal,-0.088942,-0.969809,-0.403482,-0.870829,...,,,,,,,,,,
3,wine_point_4908,wine,regression,5,0.053559,anomaly,0.219597,0.973867,0.284664,0.138039,...,,,,,,,,,,
4,wine_point_2246,wine,regression,7,0.4203,nominal,0.219597,-0.180191,-0.541112,0.34822,...,,,,,,,,,,


### 去除属性对应的缺失值

In [84]:
print("缺失值总数：", len(data_wave.dropna(axis=1)))
data_wine = data_wine.dropna(axis=1)
data_wine.head(5)

缺失值总数： 93310


Unnamed: 0,point.id,motherset,origin,original.label,diff.score,ground.truth,fixed.acidity,volatile.acidity,citric.acid,residual.sugar,chlorides,free.sulfur.dioxide,total.sulfur.dioxide,density,pH,sulphates,alcohol
0,wine_point_3594,wine,regression,7,0.050492,nominal,-1.245962,-0.362411,-0.265853,-0.261304,-0.343495,1.209882,0.747594,-0.899276,0.817846,-0.613338,0.17456
1,wine_point_5089,wine,regression,5,0.082237,anomaly,0.75954,0.973867,0.215849,-0.53454,0.598458,-0.536656,0.199134,0.968217,0.071518,0.596292,-0.915394
2,wine_point_1912,wine,regression,6,0.290201,nominal,-0.088942,-0.969809,-0.403482,-0.870829,-0.429127,-0.592996,-0.791633,-0.699187,-1.110168,1.402712,-0.496181
3,wine_point_4908,wine,regression,5,0.053559,anomaly,0.219597,0.973867,0.284664,0.138039,0.427194,-0.762016,-0.243173,1.034913,0.817846,1.805921,0.006874
4,wine_point_2246,wine,regression,7,0.4203,nominal,0.219597,-0.180191,-0.541112,0.34822,-0.714567,-0.142276,0.446826,-0.242318,-0.36384,-1.016548,0.090717


### 统计异常数据， 异常点有40638条，非异常数据有70452条

In [85]:
data_wine['ground.truth'].value_counts()

nominal    47049
anomaly    27011
Name: ground.truth, dtype: int64

In [86]:
def anomally_target(df):
    if df['ground.truth'] == 'anomaly':
        return 1
    else:
        return 0
data_wine_x = data_wine.loc[:, 'fixed.acidity':'alcohol']
data_wine['is_anomaly'] = data_wave.apply(anomally_target, axis=1)
data_wine_y = data_wine.loc[:, 'is_anomaly']
data_wine_x = data_wine.loc[:, 'fixed.acidity':'alcohol']
data_wine['is_anomaly'] = data_wine.apply(anomally_target, axis=1)
data_wine_y = data_wine.loc[:, 'is_anomaly']

In [87]:
data_wine_x_train, data_wine_x_test, data_wine_y_train, data_wine_y_test = train_test_split(data_wine_x, data_wine_y, test_size=0.2, random_state=0)

In [88]:
assert len(data_wine_x_train) == len(data_wine_y_train)

In [89]:
print("训练集数和验证集数分别是：", len(data_wine_x_train), len(data_wine_x_test))

训练集数和验证集数分别是： 59248 14812


### 使用KNN进行训练

In [90]:
clf_knn = KNN()
clf_knn.fit(data_wine_x_train)

KNN(algorithm='auto', contamination=0.1, leaf_size=30, method='largest',
  metric='minkowski', metric_params=None, n_jobs=1, n_neighbors=5, p=2,
  radius=1.0)

In [91]:
data_wine_y_train_pred = clf_knn.labels_
data_wine_y_train_scores = clf_knn.decision_scores_
print(data_wine_y_train_scores)
print(data_wine_y_train_pred)

[0. 0. 0. ... 0. 0. 0.]
[0 0 0 ... 0 0 0]


In [92]:
data_wine_y_test_pred = clf_knn.predict(data_wine_x_test)  # 返回未知数据上的分类标签 (0: 正常值, 1: 异常值)
data_wine_y_test_scores = clf_knn.decision_function(data_wine_x_test)
print(data_wine_y_test_pred)
print(data_wine_y_test_scores)

[0 0 0 ... 0 0 0]
[0. 0. 0. ... 0. 0. 0.]


In [93]:
evaluate_print(clf_name,data_wine_y_test , data_wine_y_test_scores)

KNN ROC:0.4993, precision @ rank n:0.3494


In [94]:
acc, recall =  accuracy_score(data_wave_y_test,data_wave_y_test_pred),recall_score(data_wave_y_test, data_wave_y_test_pred)
print(acc, recall)

0.665148429964634 0.0
