## Data Exploration

In [None]:
!pip install sklearn
!pip install xgboost
!pip install lightgbm

 ## Library Description
Axes3D: 绘制3D图形  
StandardScaler: 用于标准化处理  
SelectFromModel: 特征选择  
GridSearchCV: 网格搜索
joblib: 用于保存模型  
preprocessing: 用于数据预处理  
Consult: https://blog.csdn.net/weixin_40807247/article/details/82793220

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.pyplot import plot
from mpl_toolkits.mplot3d import Axes3D
from sklearn import preprocessing
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.preprocessing import StandardScaler

try:
    # joblib is a standalone package; recent scikit-learn no longer re-exports it.
    import joblib
except ImportError:
    # Fallback for old environments where only the bundled copy is available.
    from sklearn.externals import joblib

%matplotlib inline

## Data Description

In [None]:
# Names of the 41 feature columns, in file order. The last column of the data
# (the attack label) is handled separately and appended below as 'target'.
cols = """
    duration protocol_type service flag src_bytes dst_bytes land
    wrong_fragment urgent hot num_failed_logins logged_in num_compromised
    root_shell su_attempted num_root num_file_creations num_shells
    num_access_files num_outbound_cmds is_host_login is_guest_login
    count srv_count serror_rate srv_serror_rate rerror_rate srv_rerror_rate
    same_srv_rate diff_srv_rate srv_diff_host_rate
    dst_host_count dst_host_srv_count dst_host_same_srv_rate
    dst_host_diff_srv_rate dst_host_same_src_port_rate
    dst_host_srv_diff_host_rate dst_host_serror_rate
    dst_host_srv_serror_rate dst_host_rerror_rate dst_host_srv_rerror_rate
""".split()
# Label column
cols.append('target')

In [None]:
# Group the raw attack labels of the source data into attack categories.
_category_members = {
    'dos': ('back', 'land', 'neptune', 'pod', 'smurf', 'teardrop'),
    'u2r': ('buffer_overflow', 'loadmodule', 'perl', 'rootkit'),
    'r2l': ('ftp_write', 'guess_passwd', 'imap', 'multihop', 'phf', 'spy',
            'warezclient', 'warezmaster'),
    'probe': ('ipsweep', 'nmap', 'portsweep', 'satan'),
}
# label -> category lookup: 'normal' first, then the attack labels alphabetically.
attacks_type = {'normal': 'normal'}
attacks_type.update(sorted(
    (label, category)
    for category, members in _category_members.items()
    for label in members
))

In [None]:
# Read the data, naming the columns after `cols`
df = pd.read_csv(
    "../input/kdd-cup-1999-data/kddcup.data_10_percent/kddcup.data_10_percent",
    names=cols,
)
# Store the attack category in the 'Attack' column: drop the last character of
# every raw label and look the remainder up in `attacks_type`.
df['Attack'] = [attacks_type[raw_label[:-1]] for raw_label in df['target']]
print("The data shape is (lines, columns):",df.shape)
# df['service'].unique()

In [None]:
# Column renaming table: alternative column name (key) -> column name as listed
# in `cols` (value).
hajar_to_cup = {
    'is_hot_login': 'is_host_login',
    'urg': 'urgent',
    'protocol': 'protocol_type',
}
# Names that carry a `_sec` suffix.
hajar_to_cup.update({
    'count_sec': 'count',
    'srv_count_sec': 'srv_count',
    'serror_rate_sec': 'serror_rate',
    'srv_serror_rate_sec': 'srv_serror_rate',
    'rerror_rate_sec': 'rerror_rate',
    'srv_error_rate_sec': 'srv_rerror_rate',
    'same_srv_rate_sec': 'same_srv_rate',
    'diff_srv_rate_sec': 'diff_srv_rate',
    'srv_diff_host_rate_sec': 'srv_diff_host_rate',
})
# Names that carry a `_100` suffix map to the same stem prefixed with `dst_host_`.
hajar_to_cup.update(
    (stem + '_100', 'dst_host_' + stem)
    for stem in (
        'count',
        'srv_count',
        'same_srv_rate',
        'diff_srv_rate',
        'same_src_port_rate',
        'srv_diff_host_rate',
        'serror_rate',
        'srv_serror_rate',
        'rerror_rate',
        'srv_rerror_rate',
    )
)

In [None]:
# Number of records per attack category
df['Attack'].value_counts()

In [None]:
# Distinct raw labels and distinct attack categories
df['target'].unique(), df['Attack'].unique()

In [None]:
# Plot the correlation matrix
def plotCorrelationMatrix(df, graphWidth, dataframeName):
    """Draw the pairwise correlation matrix of the numeric columns of ``df``.

    Columns containing NaN, non-numeric columns and columns with a single
    unique value are left out. When fewer than two columns remain, a message
    is printed and nothing is drawn.

    Args:
        df: pandas DataFrame to analyse.
        graphWidth: width and height passed to ``figsize`` (square figure).
        dataframeName: label used in the figure title.

    Returns:
        None.
    """
    filename = dataframeName
    # `axis` must be given by keyword: pandas >= 2.0 rejects the positional form.
    df = df.dropna(axis='columns') # drop columns that contain NaN
    # Correlation is only defined for numeric data. Recent pandas raises on
    # string columns instead of silently skipping them, so select explicitly.
    df = df.select_dtypes(include=['number', 'bool'])
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns with more than one unique value
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    # Correlation coefficients between the remaining columns
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    corrMat = plt.matshow(corr, fignum = 1)
    # Label both axes with the column names
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    # Colour scale
    plt.colorbar(corrMat)
    plt.title(f'Correlation Matrix for {filename}', fontsize=15)
    plt.show()


In [None]:
# Plot per-column distributions
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot the value distribution of each suitable column of ``df``.

    Only columns with more than 1 and fewer than 70 unique values are
    considered. Numeric columns are drawn as histograms, all others as bar
    charts of their value counts.

    Args:
        df: pandas DataFrame whose columns are plotted.
        nGraphShown: maximum number of columns to plot.
        nGraphPerRow: number of plots placed side by side in one row.

    Returns:
        None.
    """
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 70]]
    nRow, nCol = df.shape
    if nCol == 0:
        # Nothing to draw; a zero-height figure would make matplotlib fail.
        print('No distribution plots shown: no column has between 2 and 69 unique values')
        return
    columnNames = list(df)
    # Integer (ceiling) division: plt.subplot requires an integer row count.
    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (8 * nGraphPerRow, 10 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        # The type of the first value decides between bar chart and histogram.
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


## Observe the distribution characteristics of data
(观察数据分布特征)  
Following the blog post https://blog.csdn.net/Eastmount/article/details/103189405, we draw histograms of the features below.

In [None]:
# Features whose value distribution is inspected
distribution_features = [
    'protocol_type',
    'service',
    'flag',
    'logged_in',
    'srv_serror_rate',
    'srv_diff_host_rate',
]
plotPerColumnDistribution(df[distribution_features], nGraphShown=30, nGraphPerRow=2)

We can see that ICMP is the most frequently used of these protocols, followed by TCP, while UDP accounts for only about 20000 packets. In addition, only about 70000 packets belong to a successful login.

## The correlation among data

In [None]:
# Inspect how strongly the columns correlate with one another
plotCorrelationMatrix(df, 20, "Packets")

# Data preprocessing

In [None]:
# Report the number of missing values in every column
for column_name, missing_count in df.isnull().sum().items():
    print("%20s : %d"%(column_name, missing_count))
# Reference: https://blog.csdn.net/qq_39072607/article/details/89387907

## 数据标记/编码

## 选取特征（依据方差，相关性）
* 方差:

方差为0，说明该项特征对于所有记录是一样的，最理想的特征应有较大的方差，说明不同类型的记录在该特征上表现出了差异性。因此移除方差为0的特征。

In [None]:
# Standard deviation of every numeric feature. `numeric_only=True` is required
# because the frame still holds string columns, on which recent pandas raises
# instead of skipping them.
df_std = df.std(numeric_only=True)
df_std = df_std.sort_values(ascending=True) # smallest spread first
df_std

In [None]:
# One green dot per feature: the feature name on the x axis, its value from
# `df_std` on the y axis.
plt.figure(figsize=(15,10))
plt.plot(df_std.index.tolist(), df_std.values.tolist(), 'go')
plt.show()

* 相关性:

可以消除一些完全相关的特征。如（srv_serror_rate，serror_rate）与（dst_host_srv_count，dst_host_count）相关，则在这种情况下，可以消除srv_serror_rate和dst_host_count。

In [None]:
# Bring a frame to the column layout used by the rest of the notebook
def standardize_columns(df, cols_map=hajar_to_cup):
    """Drop the 'service' column and rename columns according to ``cols_map``.

    Args:
        df: pandas DataFrame to normalise; it is not modified.
        cols_map: dict mapping existing column names to their new names.
            Columns that do not appear as a key keep their name.

    Returns:
        A new DataFrame without the 'service' column (if it was present) and
        with the columns renamed.
    """
    if 'service' in df.columns:
        df = df.drop(['service'], axis = 1)
    # DataFrame.rename returns a new frame, so its result has to be kept;
    # calling it without using the result leaves the names unchanged.
    return df.rename(columns = cols_map)

# Normalise the working frame, then show the resulting column names
df = standardize_columns(df, hajar_to_cup)
#df = df.drop(['is_host_login','num_outbound_cmds','dst_host_count','srv_serror_rate'], axis = 1)
#df.head(10)
df.columns.values
# Reference: https://blog.csdn.net/u010652755/article/details/105612332

## 将数据分为训练集和测试集

In [None]:
# The raw label is superseded by the 'Attack' category, so drop it
df = df.drop(columns=['target'])
print(df.shape)
# Label y: the 'Attack' column; input X: every other remaining column
y = df['Attack']
X = df.drop(columns=['Attack'])
# Random train/test split (33 % test) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print(X_train.shape, X_test.shape)
print(y_train.shape, y_test.shape)

In [None]:
le_y = preprocessing.LabelEncoder()
le_X_cols = {}

# Integer-encode every object-typed feature column. Each encoder is fitted on
# the training data only and then applied to the test data.
for column, column_dtype in X_train.dtypes.items():
    if column_dtype != object:
        continue
    encoder = preprocessing.LabelEncoder()
    X_train[column] = encoder.fit_transform(X_train[column])
    X_test[column] = encoder.transform(X_test[column])
    le_X_cols[column] = encoder

# Encode the target the same way
y_train = le_y.fit_transform(y_train.values)
y_test = le_y.transform(y_test.values)

# Persist the fitted encoders
joblib.dump(le_X_cols, 'le_X_cols.pkl')
joblib.dump(le_y, 'le_y.pkl')
# Reference: https://www.cnblogs.com/caimuqing/p/9074046.html

In [None]:
# Class names known to the target encoder, and the encoded values present in y_train
class_names = le_y.classes_
class_index = np.unique(y_train)
class_names, class_index

In [None]:
# Feature scaling of the two byte-count columns: fit on the training data,
# then apply the same transformation to the test data.
scaled_columns = ['dst_bytes','src_bytes']
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)
X_train[scaled_columns] = scaler.fit_transform(X_train[scaled_columns])
X_test[scaled_columns] = scaler.transform(X_test[scaled_columns])
# Persist the fitted scaler
joblib.dump(scaler, 'scaler_1.pkl')


## 两种分类模型（随机森林和XGBoost）

### 1- 随机森林分类模型

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Baseline model: no parameter tuning, no feature selection
classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 42)
classifier.fit(X_train, y_train)
# Score each split once and reuse the values below
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print("训练准确度:", train_accuracy)
print("测试准确度:",test_accuracy)
# Gap between training and test accuracy
diff_base = abs(train_accuracy - test_accuracy)
print("模型的过度/不足拟合：", diff_base)

In [None]:
# Predict the test set
y_pred = classifier.predict(X_test)

# Translate encoded classes back to their names
reversefactor = dict(zip(class_index,class_names))
decode_classes = np.vectorize(reversefactor.get)
y_test_rev = decode_classes(y_test)
y_pred_rev = decode_classes(y_pred)
# Confusion matrix
confusion = pd.crosstab(
    y_test_rev,
    y_pred_rev,
    rownames=['Actual packets attacks'],
    colnames=['Predicted packets attcks'],
)
print(confusion)

#fig, ax = plt.subplots(figsize=(15, 10))
#plot.confusion_matrix(y_test_rev, y_pred_rev, ax=ax)
#plt.show()


#### 1-1 特征选择

In [None]:
# Fit a forest and keep the features whose importance reaches the threshold.
# The forest is seeded so that the selected feature set - which every later
# model in this notebook is trained on - is the same on every run.
clf = RandomForestClassifier(n_estimators=30, random_state=42)
clf = clf.fit(X_train, y_train)
fti = clf.feature_importances_
# prefit=True: reuse the forest fitted above instead of fitting it again
model = SelectFromModel(clf, prefit=True, threshold= 0.005)
X_train_new = model.transform(X_train)
X_test_new = model.transform(X_test)
# Names of the columns that passed the selection
selcted_features = X_train.columns[model.get_support()]
print(X_train_new.shape)

In [None]:
# The selected features
selcted_features

#### 1-2参数调整

In [None]:
# Hyper-parameter grid for the random forest
parameters = dict(
    n_estimators=[20,40,128,130],
    max_depth=[None,14, 15, 17],
    criterion=['gini','entropy'],
    random_state=[42],
    #max_features=['auto'],
)
# Exhaustive search with 2-fold cross-validation on the selected features
clf = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
    verbose=5,
)
clf.fit(X_train_new, y_train)

In [None]:
# Show the winning estimator and its parameters
for caption, value in (
    ("clf.best_estimator_:", clf.best_estimator_),
    ("clf.best_params_", clf.best_params_),
):
    print(caption, value)
#print("results:",clf.cv_results_)

In [None]:
# Best cross-validation score versus the score on the held-out test set
cv_accuracy = clf.best_score_
test_accuracy = clf.score(X_test_new, y_test)
print("CV训练准确率：",cv_accuracy)
print("CV测试准确率：",test_accuracy)
diff_fst = abs(cv_accuracy - test_accuracy)
print("准确率差：", diff_fst)
# True when the gap is smaller than the gap of the baseline model
print("模型表现提升？", diff_base > diff_fst)


In [None]:
# Confusion matrix
# Predict the test set
y_pred = clf.predict(X_test_new)

# Translate encoded classes back to their names
reversefactor = dict(zip(class_index,class_names))
decode_classes = np.vectorize(reversefactor.get)
y_test_rev = decode_classes(y_test)
y_pred_rev = decode_classes(y_pred)
# Build and print the confusion matrix
confusion = pd.crosstab(
    y_test_rev,
    y_pred_rev,
    rownames=['Actual packets attacks'],
    colnames=['Predicted packets attcks'],
)
print(confusion)

#fig, ax = plt.subplots(figsize=(15, 10))
#plot.confusion_matrix(y_test_rev, y_pred_rev, ax=ax)
#plt.show()

#### 保存模型

In [None]:
# Persist the fitted grid search (it wraps the best random forest)
joblib.dump(clf, 'random_forest_classifier.pkl') 
# To load it: clf_load = joblib.load('random_forest_classifier.pkl')

### 2- XGBoost模型

#### 2-1基础模型

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# One-vs-rest wrapper around an XGBoost classifier
clf = OneVsRestClassifier(
    XGBClassifier(n_jobs=-1, max_depth=4, n_estimators=70, random_state=42,verbosity=1)
)

# LabelBinarizer turns the encoded target into a 0/1 indicator matrix.
# It is fitted on the training labels and reused for the test labels.
lb = preprocessing.LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)
y_train_xgb = lb.fit_transform(y_train)
y_test_xgb = lb.transform(y_test)
# Train on the selected features
clf.fit(X_train[selcted_features], y_train_xgb)


In [None]:
# Predict
selected_test = X_test[selcted_features]
y_pred_xgb = clf.predict(selected_test)

# Score each split once and reuse the values
train_accuracy = clf.score(X_train[selcted_features], y_train_xgb)
test_accuracy = clf.score(selected_test, y_test_xgb)  # data not seen during training
print("训练准确率：", train_accuracy)
print("测试准确率：",test_accuracy)
diff_xgb = abs(train_accuracy - test_accuracy)
print("准确率差：", diff_xgb)


In [None]:
# Confusion matrix
# `clf.predict` yields one 0/1 indicator column per class, and a row may hold
# no 1 at all; argmax over such a row would silently report class 0. Pick the
# most probable class from the per-class probabilities instead. Reading them
# from `clf` (instead of overwriting the previous `y_pred_xgb` with its own
# argmax) also keeps this cell safe to run more than once.
y_pred_xgb = np.argmax(clf.predict_proba(X_test[selcted_features]), axis=1)

# Translate encoded classes back to their names
reversefactor = dict(zip(class_index,class_names))
y_test_rev = np.vectorize(reversefactor.get)(y_test)
y_pred_rev = np.vectorize(reversefactor.get)(y_pred_xgb)
# Build and print the confusion matrix
print(pd.crosstab(y_test_rev, y_pred_rev, rownames=['Actual packets attacks'], colnames=['Predicted packets attcks']))


#fig, ax = plt.subplots(figsize=(15, 10))
#plot.confusion_matrix(y_test_rev, y_pred_rev, ax=ax)
#plt.show()

#### 2-2 参数调整

In [None]:
import xgboost as xgb
print(X_train.shape)

xgb_model = OneVsRestClassifier(XGBClassifier(n_jobs=-1, max_depth=4, n_estimators=70, random_state=42,verbosity=1))

# The `estimator__` prefix routes each entry to the XGBClassifier wrapped by
# the OneVsRestClassifier.
parameters = {'estimator__nthread':[4,], # number of threads
              'estimator__objective':['binary:logistic',],
              'estimator__learning_rate': [0.1,0.08], # the 'eta' value
              'estimator__max_depth': [4,6],
              'estimator__min_child_weight': [1,],
              # `silent` is no longer an XGBoost parameter; `verbosity=0`
              # is its replacement for "print no messages".
              'estimator__verbosity': [0,],
              'estimator__subsample': [1,],
              'estimator__colsample_bytree': [1,],
              'estimator__n_estimators': [70,100], # number of trees
              'estimator__random_state':[42],
              }


clf = GridSearchCV(xgb_model, parameters, 
                   cv=2, n_jobs=-1, verbose=5, refit=True)
# Train on the selected features
clf.fit(X_train[selcted_features], y_train_xgb)

In [None]:
# Best cross-validation score versus the score on the held-out test set
cv_accuracy = clf.best_score_
test_accuracy = clf.score(X_test[selcted_features], y_test_xgb)
print("CV训练准确率：",cv_accuracy)
print("参数：", clf.best_params_)
print("CV测试准确率：",test_accuracy)
diff_fst = abs(cv_accuracy - test_accuracy)
print("准确率差：", diff_fst)


In [None]:
# Confusion matrix of the best XGB model
# `clf.predict` yields one 0/1 indicator column per class, and a row may hold
# no 1 at all; argmax over such a row would silently report class 0. Pick the
# most probable class from the per-class probabilities instead.
y_pred_xgb = np.argmax(clf.predict_proba(X_test[selcted_features]), axis=1)
# Translate encoded classes back to their names
reversefactor = dict(zip(class_index,class_names))
y_test_rev = np.vectorize(reversefactor.get)(y_test)
y_pred_rev = np.vectorize(reversefactor.get)(y_pred_xgb)
# Build and print the confusion matrix
print(pd.crosstab(y_test_rev, y_pred_rev, rownames=['Actual packets attacks'], colnames=['Predicted packets attcks']))

#fig, ax = plt.subplots(figsize=(15, 10))
#plot.confusion_matrix(y_test_rev, y_pred_rev, ax=ax)
#plt.show()

#### 保存模型

In [None]:
# Persist the fitted grid search (it wraps the best XGBoost model)
joblib.dump(clf, 'xgboost_classifier.pkl') 
# To load it: clf_load = joblib.load('xgboost_classifier.pkl')

### 组装模型
#### 根据上述实现的两个模型构建一个模型


In [None]:
# Translate the selected feature names back through `hajar_to_cup`: a feature
# that is the target of one or more renames is replaced by the corresponding
# key(s); any other feature keeps its own name.
needed_cols_dump = []
for feature in selcted_features:
    aliases = [alias for alias, renamed in hajar_to_cup.items() if renamed == feature]
    needed_cols_dump.extend(aliases if aliases else [feature])
print(len(needed_cols_dump), len(selcted_features))
print(needed_cols_dump)

In [None]:
def do_what_we_want(X, 
                    scaler_1, 
                    le_X_cols, 
                    selcted_features, 
                    map_cols,
                    rdf_clf,
                    xgb_clf,
                    PathX=False):
    """Preprocess raw records and run both classifiers on them.

    Args:
        X: pandas DataFrame of raw records; it is not modified. Ignored when
            ``PathX`` is given.
        scaler_1: fitted scaler applied to the 'dst_bytes' and 'src_bytes' columns.
        le_X_cols: dict mapping a column name to the fitted encoder for that column.
        selcted_features: names of the columns passed to the classifiers.
        map_cols: column renaming map handed to ``standardize_columns``.
        rdf_clf: fitted classifier whose results are stored under the 'rd_*' keys.
        xgb_clf: fitted classifier whose results are stored under the 'xgb_*' keys.
        PathX: optional CSV path; when truthy, the first 30000 rows of that
            file are read (with the column names in ``cols``) and used as X.

    Returns:
        dict with the keys 'rd_prd_prb', 'rd_prd', 'xgb_prd_prb' and 'xgb_prd'
        holding the predict_proba / predict output of the two classifiers.
    """
    if PathX:
        X = pd.read_csv(PathX, names=cols, nrows=30000)
    # Rename columns, then work on a private copy so that the scaling and
    # encoding below never leak into the caller's frame.
    X = standardize_columns(X, cols_map=map_cols).copy()
    byte_columns = ['dst_bytes','src_bytes']
    # Apply the scaler exactly as it was fitted. Re-fitting it here would
    # scale the new data with its own statistics, not with the statistics
    # the classifiers were trained with.
    X[byte_columns] = scaler_1.transform(X[byte_columns])
    # Explicit copy: the encoded columns are written back into this frame.
    X = X[selcted_features].copy()
    for c in X.columns:
        if str(X[c].dtype) == 'object':
            X[c] = le_X_cols[c].transform(X[c])

    return {
        'rd_prd_prb': rdf_clf.predict_proba(X),
        'rd_prd': rdf_clf.predict(X),
        'xgb_prd_prb': xgb_clf.predict_proba(X),
        'xgb_prd': xgb_clf.predict(X),
    }

In [None]:
# Reload the artifacts written earlier in this notebook, in this order:
# the fitted scaler, the per-column feature encoders, the target encoder,
# the XGBoost grid search and the random forest grid search.
scaler_1, le_X_cols, le_y, xgb_clf, rdf_clf = (
    joblib.load(artifact_path)
    for artifact_path in (
        'scaler_1.pkl',
        'le_X_cols.pkl',
        'le_y.pkl',
        'xgboost_classifier.pkl',
        'random_forest_classifier.pkl',
    )
)

In [None]:
# Read the first 100000 records
X = pd.read_csv(
    "../input/kdd-cup-1999-data/kddcup.data_10_percent/kddcup.data_10_percent",
    names=cols,
    nrows=100000,
)
# Attack category of every record (the last character of the raw label is dropped)
Y = X['target'].map(lambda raw_label: attacks_type[raw_label[:-1]])

# Run both models on the records
res = do_what_we_want(
    X,
    scaler_1=scaler_1,
    le_X_cols=le_X_cols,
    selcted_features=selcted_features,
    map_cols=hajar_to_cup,
    rdf_clf=rdf_clf,
    xgb_clf=xgb_clf,
    PathX=False,
)
res.keys()

### 使用Logistic回归进行汇总以堆叠预测结果

In [None]:
atks = ['dos', 'normal', 'probe', 'r2l', 'u2r']
# One probability column per class and model, prefixed 'rd_' / 'xg_'
rd_prd_df = pd.DataFrame(data=res['rd_prd_prb']).rename(
    columns={position: 'rd_' + name for position, name in enumerate(atks)}
)
xg_prd_df = pd.DataFrame(data=res['xgb_prd_prb']).rename(
    columns={position: 'xg_' + name for position, name in enumerate(atks)}
)

# Place the outputs of both models side by side
df = pd.concat([rd_prd_df, xg_prd_df], axis=1)
df.head()

In [None]:
# Grid for the logistic regression that is trained on the stacked probabilities
params = dict(C=np.logspace(-7,7,7), penalty=["l2"], multi_class=['auto','ovr'])
lg = LogisticRegression(C=4.5, random_state = 42, multi_class = 'ovr', solver = 'lbfgs', max_iter = 1000)
clf = GridSearchCV(lg, params, cv=3)
# The first 20000 rows are used for fitting, the remaining rows for testing
fit_rows = slice(None, 20000)
held_out_rows = slice(20000, None)
clf.fit(df[fit_rows], Y[fit_rows])
print("训练准确率：", clf.score(df[fit_rows], Y[fit_rows]))
print("测试准确率：",clf.score(df[held_out_rows], Y[held_out_rows]))