In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
import missingno as mn # for missing values visualization
from wordcloud import WordCloud as wc # for most highlighted words visualization
from wordcloud import STOPWORDS as sw # this will help ignoring english stop words in string value
from sklearn.preprocessing import StandardScaler # for stardardizing the data to the normal scale
from sklearn.model_selection import train_test_split # for splitting data into train and test
from sklearn.tree import DecisionTreeClassifier # Decision Tree model classifier
from sklearn.ensemble import RandomForestClassifier # RandomForest model classification
from sklearn.linear_model import LogisticRegression # Logistic Regression
from sklearn.metrics import classification_report,confusion_matrix # classification report purposes
# import warnings
# warnings.filterwarnings("ignore")

In [None]:
# Load the NSL-KDD train and test splits.
# NOTE(review): NSL_KDD_DIR is an absolute, machine-specific location; point it at
# your local copy of the dataset before running. It is defined once so both file
# paths are derived from the same directory.
NSL_KDD_DIR = r"D:\Download\zyFile\Cyberthreat_Cognitive_System\CTCS_Code\attack_datasets\NSL-KDD"
file_path_train = rf"{NSL_KDD_DIR}\KDDTrain+.txt"
file_path_test = rf"{NSL_KDD_DIR}\KDDTest+.txt"

# Column names supplied explicitly because the files are read with header=None.
data_columns = ["duration", "protocol_type", "service", "flag", "src_bytes",
                "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins",
                "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root",
                "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds",
                "is_host_login", "is_guest_login", "count", "srv_count", "serror_rate",
                "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
                "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count",
                "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
                "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate",
                "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label", "difficulty"]

# Read both splits with the same column schema.
train_data = pd.read_csv(file_path_train, header=None, names=data_columns)
test_data = pd.read_csv(file_path_test, header=None, names=data_columns)

In [None]:
# Render the training DataFrame as this cell's output.
train_data

In [None]:
# Render the test DataFrame as this cell's output.
test_data

In [None]:
# Print the DataFrame.info() summary of each split, training set first.
for split_frame in (train_data, test_data):
    split_frame.info()

In [None]:
# Count missing values per column in both splits.
# A notebook cell renders only its last expression, so the two counts are combined
# into a single table; evaluated as two separate statements, the training-set
# counts would be computed and then silently discarded.
pd.DataFrame({
    "train_missing": train_data.isnull().sum(),
    "test_missing": test_data.isnull().sum(),
})

In [None]:
# missingno bar chart for the training split (missing-value visualization).
mn.bar(train_data, figsize=(20, 15), color='yellowgreen')

In [None]:
# missingno bar chart for the test split (missing-value visualization).
mn.bar(test_data, figsize=(20, 15), color='lightskyblue')

In [None]:
# Percentage of KDDTrain+ rows carrying each value of the 'label' column.
classLabel_counts = train_data.groupby('label').size()
per_classLabels = classLabel_counts/train_data.shape[0]*100
# Percentages rounded to two decimals, used as the text above each bar.
r_ = [round(each, 2) for each in per_classLabels.values]

fig, ax = plt.subplots(figsize=(20, 10))
# The six-colour palette is shorter than the number of bars; matplotlib cycles it.
ax.bar(per_classLabels.index, per_classLabels.values,
       color=['#413E85', '#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'],
       edgecolor='black')
# Rotate the existing category tick labels. Calling set_xticklabels() on an axis
# whose tick positions were not fixed first makes matplotlib emit a warning, and
# the labels are already the category names, so only the rotation is needed.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel("KDDTrain+ dataset attack Name", fontsize=20)
ax.set_ylabel("Percentage", fontsize=20)
ax.set_title("KDDTrain+ dataset attack label percentage", fontsize=20)

# Write the rounded percentage 0.5 units above the top of each bar.
for i in range(len(per_classLabels.values)):
    ax.annotate(f"{r_[i]}%",
                xy=(per_classLabels.index[i], per_classLabels.values[i] + 0.5),
                ha='center', va='bottom')
plt.show()

In [None]:
# Percentage of KDDTest+ rows carrying each value of the 'label' column.
classLabel_counts = test_data.groupby('label').size()
per_classLabels = classLabel_counts/test_data.shape[0]*100
# Percentages rounded to two decimals, used as the text above each bar.
r_ = [round(each, 2) for each in per_classLabels.values]

fig, ax = plt.subplots(figsize=(20, 10))
# The six-colour palette is shorter than the number of bars; matplotlib cycles it.
ax.bar(per_classLabels.index, per_classLabels.values,
       color=['#413E85', '#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'],
       edgecolor='black')
# Rotate the existing category tick labels. Calling set_xticklabels() on an axis
# whose tick positions were not fixed first makes matplotlib emit a warning, and
# the labels are already the category names, so only the rotation is needed.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel("KDDTest+ dataset attack Name", fontsize=20)
ax.set_ylabel("Percentage", fontsize=20)
ax.set_title("KDDTest+ dataset attack label percentage", fontsize=20)

# Write the rounded percentage 0.5 units above the top of each bar.
for i in range(len(per_classLabels.values)):
    ax.annotate(f"{r_[i]}%",
                xy=(per_classLabels.index[i], per_classLabels.values[i] + 0.5),
                ha='center', va='bottom')
plt.show()

In [None]:
# Build the lookup "raw attack name -> attack category" from the attack_name file.
# Only lines that contain exactly two whitespace-separated tokens are used.
attack_mapping = {}
with open(r'D:\Download\zyFile\Cyberthreat_Cognitive_System\CTCS_Code\attack_datasets\NSL-KDD\attack_name',
          'r') as file:
    for line in file:
        # split() with no argument accepts tabs and repeated spaces; split(' ')
        # would produce extra empty fields for those and the line would be
        # silently skipped by the length check below.
        parts = line.split()
        if len(parts) == 2:
            attack, category = parts
            attack_mapping[attack] = category

# The relabelling below overwrites the 'label' column in place. Adding an identity
# entry for every category keeps the cell idempotent: re-running it leaves already
# converted labels unchanged instead of turning them into NaN. Entries read from
# the file take precedence; labels unknown to the file still map to NaN.
label_to_category = {category: category for category in attack_mapping.values()}
label_to_category.update(attack_mapping)

# Replace each raw attack name with its category in both splits.
train_data['label'] = train_data['label'].map(label_to_category)
test_data['label'] = test_data['label'].map(label_to_category)

In [None]:
# Display order of the attack categories on the x axis.
custom_order = ['normal', 'dos', 'probe', 'r2l', 'u2r']
# Row count per category, arranged in the display order (grouped once and reused).
count_data = train_data.groupby('label').size().loc[custom_order]
# Share of all training rows per category, in percent.
percent_data = count_data / train_data.shape[0] * 100

fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(percent_data.index, percent_data.values,
       color=['#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'], edgecolor='black')
# Rotate the existing category tick labels. Calling set_xticklabels() on an axis
# whose tick positions were not fixed first makes matplotlib emit a warning.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel("KDDTrain+ dataset attack type", fontsize=20)
ax.set_ylabel("Percentage", fontsize=20)
ax.set_title("KDDTrain+ dataset attack type and percentage", fontsize=20)

# Above each bar, show the absolute count followed by the percentage.
for i, (label, value) in enumerate(percent_data.items()):
    ax.annotate(f"{count_data[label]} ({value:.2f}%)",
                xy=(i, value), ha='center', va='bottom')
plt.show()

In [None]:
# Display order of the attack categories on the x axis.
custom_order = ['normal', 'dos', 'probe', 'r2l', 'u2r']
# Row count per category, arranged in the display order (grouped once and reused).
count_data = test_data.groupby('label').size().loc[custom_order]
# Share of all test rows per category, in percent.
percent_data = count_data / test_data.shape[0] * 100

fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(percent_data.index, percent_data.values,
       color=['#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'], edgecolor='black')
# Rotate the existing category tick labels. Calling set_xticklabels() on an axis
# whose tick positions were not fixed first makes matplotlib emit a warning.
ax.tick_params(axis='x', labelrotation=45)
ax.set_xlabel("KDDTest+ dataset attack type", fontsize=20)
ax.set_ylabel("Percentage", fontsize=20)
ax.set_title("KDDTest+ dataset attack type and percentage", fontsize=20)

# Above each bar, show the absolute count followed by the percentage.
# The percentage is taken from percent_data itself rather than from a list
# computed in another cell, so every annotation matches the bar it sits on.
for i, label in enumerate(custom_order):
    count = count_data[label]
    percentage = percent_data[label]
    ax.annotate(f"{count} ({percentage:.2f}%)", xy=(i, percentage + 0.5),
                ha='center', va='bottom')
plt.show()

In [None]:
# Number of KDDTrain+ rows per protocol type (missing values get their own bucket).
# matplotlib.pyplot is already imported as plt in the notebook's import cell.
value_counts = train_data['protocol_type'].value_counts(dropna=False)

# Plot on an explicitly created Axes rather than pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(10, 8))
value_counts.plot.bar(color=['#1F928B', '#91D542', '#F8E620'], ax=ax)

ax.set_xlabel('Protocol Type', fontsize=14)
ax.set_ylabel('Quantity', fontsize=14)
ax.set_title('KDDTrain+ dataset distribution of Protocol Types', fontsize=16)

# Write each count 10 points above the top of its bar.
for index, value in enumerate(value_counts):
    ax.annotate(value, (index, value), textcoords="offset points", xytext=(0, 10), ha='center')
plt.show()

In [None]:
# Number of KDDTrain+ rows per service (missing values get their own bucket).
# The earlier value_counts(normalize=True) call was dropped: its result was never
# assigned or displayed, so it had no effect.
service_counts = train_data['service'].value_counts(dropna=False)

# Plot on an explicitly created Axes rather than pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(25, 16))
# The six-colour palette is shorter than the number of bars; the bar plot repeats it.
service_counts.plot.bar(color=['#413E85', '#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'], ax=ax)

ax.set_xlabel('Service Type', fontsize=14)
ax.set_ylabel('Quantity', fontsize=14)
# Title wording aligned with the matching KDDTest+ figure (stray "of" removed).
ax.set_title('KDDTrain+ dataset distribution of Service Type', fontsize=16)
plt.show()

In [None]:
# Number of KDDTrain+ rows per connection flag (missing values get their own bucket).
# matplotlib.pyplot is already imported as plt in the notebook's import cell.
value_counts = train_data['flag'].value_counts(dropna=False)

fig, ax = plt.subplots(figsize=(10, 8))
# The six-colour palette is shorter than the number of bars; the bar plot repeats it.
value_counts.plot.bar(color=['#413E85', '#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'], ax=ax)

ax.set_xlabel('Flag', fontsize=14)
ax.set_ylabel('Quantity', fontsize=14)
ax.set_title('KDDTrain+ dataset distribution of Flags', fontsize=16)

# Write the exact count directly on top of each bar.
for index, value in enumerate(value_counts):
    ax.text(index, value, str(value), ha='center', va='bottom', fontsize=9)
plt.show()

In [None]:
# Number of KDDTest+ rows per protocol type (missing values get their own bucket).
# matplotlib.pyplot is already imported as plt in the notebook's import cell.
value_counts = test_data['protocol_type'].value_counts(dropna=False)

# Plot on an explicitly created Axes rather than pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(10, 8))
value_counts.plot.bar(color=['#1F928B', '#91D542', '#F8E620'], ax=ax)

ax.set_xlabel('Protocol Type', fontsize=14)
ax.set_ylabel('Quantity', fontsize=14)
ax.set_title('KDDTest+ dataset distribution of Protocol Types', fontsize=16)

# Write each count 10 points above the top of its bar.
for index, value in enumerate(value_counts):
    ax.annotate(value, (index, value), textcoords="offset points", xytext=(0, 10), ha='center')
plt.show()

In [None]:
# Number of KDDTest+ rows per service (missing values get their own bucket).
# The earlier value_counts(normalize=True) call was dropped: its result was never
# assigned or displayed, so it had no effect.
service_counts = test_data['service'].value_counts(dropna=False)

# Plot on an explicitly created Axes rather than pyplot's implicit current axes.
fig, ax = plt.subplots(figsize=(25, 16))
# The six-colour palette is shorter than the number of bars; the bar plot repeats it.
service_counts.plot.bar(color=['#413E85', '#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'], ax=ax)

ax.set_xlabel('Service Type', fontsize=14)
ax.set_ylabel('Quantity', fontsize=14)
ax.set_title('KDDTest+ dataset distribution of Service Type', fontsize=16)
plt.show()

In [None]:
# Number of KDDTest+ rows per connection flag (missing values get their own bucket).
# matplotlib.pyplot is already imported as plt in the notebook's import cell.
value_counts = test_data['flag'].value_counts(dropna=False)

fig, ax = plt.subplots(figsize=(10, 8))
# The six-colour palette is shorter than the number of bars; the bar plot repeats it.
value_counts.plot.bar(color=['#413E85', '#30688D', '#1F928B', '#35B777', '#91D542', '#F8E620'], ax=ax)

ax.set_xlabel('Flag', fontsize=14)
ax.set_ylabel('Quantity', fontsize=14)
ax.set_title('KDDTest+ dataset distribution of Flags', fontsize=16)

# Write the exact count directly on top of each bar.
for index, value in enumerate(value_counts):
    ax.text(index, value, str(value), ha='center', va='bottom', fontsize=9)
plt.show()

In [None]:
# Column order of the attack categories in the stacked bars and the legend.
custom_order = ['normal', 'dos', 'probe', 'r2l', 'u2r']

# Contingency table: rows are difficulty levels, columns are label values, cells
# are row counts. Selecting custom_order keeps only those columns, in that order
# (and raises KeyError if one of them is absent).
avg_pro = pd.crosstab(train_data['difficulty'], train_data['label'])
avg_pro = avg_pro[custom_order]

# Normalise every row so the categories of one difficulty level sum to 1.
avg_pro_percentage = avg_pro.div(avg_pro.sum(axis=1).astype(float), axis=0)

# DataFrame.plot creates the figure; keep the returned Axes for labelling.
ax = avg_pro_percentage.plot(kind='bar', stacked=True,
                             color=['#413E85', '#30688D', '#1F928B', '#35B777', '#F8E620'])
ax.set_title('Attack Type Difficulty Distribution in KDDTrain+ Dataset', fontsize=20)
ax.set_xlabel('Difficulty Levels', fontsize=20)
# The bars show per-level fractions, so label the y axis accordingly.
ax.set_ylabel('Proportion', fontsize=20)
# Anchor the legend's upper-left corner at the Axes' upper-right corner, i.e.
# outside the plotting area.
ax.legend(title='Attack type', loc='upper left', bbox_to_anchor=(1, 1))
# Re-fit the layout so tick labels and the outside legend are not clipped.
plt.tight_layout()
plt.show()

In [None]:
# Column order of the attack categories in the stacked bars and the legend.
custom_order = ['normal', 'dos', 'probe', 'r2l', 'u2r']

# Contingency table: rows are difficulty levels, columns are label values, cells
# are row counts. Selecting custom_order keeps only those columns, in that order
# (and raises KeyError if one of them is absent).
avg_pro = pd.crosstab(test_data['difficulty'], test_data['label'])
avg_pro = avg_pro[custom_order]

# Normalise every row so the categories of one difficulty level sum to 1.
avg_pro_percentage = avg_pro.div(avg_pro.sum(axis=1).astype(float), axis=0)

# DataFrame.plot creates the figure; keep the returned Axes for labelling.
ax = avg_pro_percentage.plot(kind='bar', stacked=True,
                             color=['#413E85', '#30688D', '#1F928B', '#35B777', '#F8E620'])
ax.set_title('Attack Type Difficulty Distribution in KDDTest+ Dataset', fontsize=20)
ax.set_xlabel('Difficulty Levels', fontsize=20)
# The bars show per-level fractions, so label the y axis accordingly.
ax.set_ylabel('Proportion', fontsize=20)
# Anchor the legend's upper-left corner at the Axes' upper-right corner, i.e.
# outside the plotting area.
ax.legend(title='Attack type', loc='upper left', bbox_to_anchor=(1, 1))
# Re-fit the layout so tick labels and the outside legend are not clipped.
plt.tight_layout()
plt.show()