In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, jaccard_score, fowlkes_mallows_score, rand_score

## 自构造数据测试

In [52]:
demo = KMeans(n_clusters=2, n_init=10)
X = np.array([
    [1.3, 'add', 'good'],
    [0.6, 'afwe', 'bad'], 
    [0.8, 'add', 'good'], 
    [5, 'add', 'good'], 
    [6, 'afwe', 'bad'], 
    [4.3, 'afwe', 'bad']
])

# print(X)
LE = LabelEncoder()
X[:, 1] = LE.fit_transform(X[:, 1])
print(X)

demo.fit(X[:, 0:2])
print(demo.labels_)

[['1.3' '0' 'good']
 ['0.6' '1' 'bad']
 ['0.8' '0' 'good']
 ['5' '0' 'good']
 ['6' '1' 'bad']
 ['4.3' '1' 'bad']]
[1 1 1 0 0 0]


## 案例测试

### 加载数据

In [3]:
data = pd.read_csv('./dataset/kddcup.data_10_percent_corrected', sep=',')

In [6]:
columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'outcome'
]
discrete_columns = [
    'protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login'
]
data.columns = columns

### 数据预处理

In [8]:
X = data.drop('outcome', axis=1)
LE = LabelEncoder()
for column in discrete_columns:
    X[column] = LE.fit_transform(X[column]).astype(np.float64)

### 模型训练

In [22]:
N = 5
model = KMeans(n_clusters=N, n_init=10)
model.fit(X)

In [27]:
res = model.labels_
for i in range(N):
    print(i, np.sum(res == i))
print(type(res))
print(res.dtype)

0 493913
1 1
2 59
3 23
4 24
<class 'numpy.ndarray'>
int32


### 性能评估 (外部指标)

In [31]:
label_normal = ['normal.']
label_probe = ['ipsweep.', 'mscan.', 'nmap.', 'portsweep.', 'saint.', 'satan.']
label_dos = [
    'apache2.', 'back.', 'land.', 'neptune.', 'mailbomb.', 'pod.', 'processtable.',
    'smurf.', 'teardrop.', 'udpstorm.'
]
label_u2r = [
    'buffer_overflow.', 'httptunnel.','loadmodule.', 'perl.',
    'ps.', 'rootkit.', 'sqlattack.', 'xterm.'
]
label_r2l = [
    'ftp_write.', 'guess_passwd.', 'imap.', 'multihop.', 'named.', 'phf.',
    'sendmail.', 'snmpgetattack.', 'snmpguess.', 'spy.', 'warezclient.', 'warezmaster.',
    'worm.', 'xlock.', 'xsnoop.'
]

y = data['outcome'].copy()
y[y.isin(label_normal)] = 0
y[y.isin(label_probe)] = 1
y[y.isin(label_dos)] = 2
y[y.isin(label_u2r)] = 3
y[y.isin(label_r2l)] = 4
y = y.astype(np.int32)
print(y.value_counts())

outcome
2    391458
0     97277
1      4107
4      1126
3        52
Name: count, dtype: int64


In [38]:
jc = jaccard_score(y, res, average='micro')
fm = fowlkes_mallows_score(y, res)
ri = rand_score(y, res)

print(f"Jaccard Coefficient (JC): {jc}")
print(f"Fowlkes-Mallows Index (FM): {fm}")
print(f"Rand Index (RI): {ri}")

Jaccard Coefficient (JC): 0.10917026645951018
Fowlkes-Mallows Index (FM): 0.8166832927515518
Rand Index (RI): 0.6671158396250134
