### 读取所有数据

In [2]:
import pandas as pd
import glob
import numpy as np
def read_csv_with_fallback(path):
    """Read one CSV file, retrying with Latin-1 if UTF-8 decoding fails.

    Parameters
    ----------
    path : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The parsed file contents.
    """
    try:
        return pd.read_csv(path)
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this retry cannot fail
        # on decoding (it may still mis-render text that is not Latin-1).
        return pd.read_csv(path, encoding='latin1')


# Collect every CSV in the current directory. glob returns paths in
# arbitrary, filesystem-dependent order, so sort them: the row order of the
# concatenated frame (and anything built from consecutive rows) then stays
# the same from run to run and machine to machine.
csv_files = sorted(glob.glob('./*.csv'))

# One DataFrame per file, in the same order as csv_files.
dataframes = [read_csv_with_fallback(file) for file in csv_files]


In [3]:
# Stack every per-file frame into one table with a fresh 0..n-1 index.
combined_df = pd.concat(dataframes, ignore_index=True)

# DataFrame.info() prints its summary itself and returns None, so call it
# directly; wrapping it in print() emits a stray "None" line after the report.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408219 entries, 0 to 1408218
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   ID              1408219 non-null  int64 
 1   DATA_0          1408219 non-null  int64 
 2   DATA_1          1408219 non-null  int64 
 3   DATA_2          1408219 non-null  int64 
 4   DATA_3          1408219 non-null  int64 
 5   DATA_4          1408219 non-null  int64 
 6   DATA_5          1408219 non-null  int64 
 7   DATA_6          1408219 non-null  int64 
 8   DATA_7          1408219 non-null  int64 
 9   label           1408219 non-null  object
 10  category        1408219 non-null  object
 11  specific_class  1408219 non-null  object
dtypes: int64(9), object(3)
memory usage: 128.9+ MB
None


In [4]:
# Show the column names of the combined frame.
combined_df.columns

Index(['ID', 'DATA_0', 'DATA_1', 'DATA_2', 'DATA_3', 'DATA_4', 'DATA_5',
       'DATA_6', 'DATA_7', 'label', 'category', 'specific_class'],
      dtype='object')

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the combined frame.
combined_df.describe()

Unnamed: 0,ID,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,537.2079,71.0866,69.98925,55.01127,57.45364,45.28517,53.88261,71.74914,60.27477
std,322.48,88.97717,95.58374,72.76584,90.32077,64.45835,94.33612,101.6872,99.96547
min,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,357.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,516.0,16.0,12.0,13.0,0.0,6.0,0.0,0.0,0.0
75%,578.0,127.0,128.0,125.0,92.0,86.0,63.0,138.0,80.0
max,1438.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


合并'label', 'category', 'specific_class'列

In [24]:
# Merge the three label columns into one space-separated 'Label' string.
# Vectorised string concatenation is used instead of a row-wise
# DataFrame.apply(axis=1), which makes one Python call per row.
label_parts = ['label', 'category', 'specific_class']
combined_df['Label'] = (
    combined_df['label'].astype(str)
    + ' ' + combined_df['category'].astype(str)
    + ' ' + combined_df['specific_class'].astype(str)
)

# The source columns are now redundant; rebind to the reduced frame rather
# than dropping in place.
combined_df = combined_df.drop(columns=label_parts)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408219 entries, 0 to 1408218
Data columns (total 10 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   ID      1408219 non-null  int64 
 1   DATA_0  1408219 non-null  int64 
 2   DATA_1  1408219 non-null  int64 
 3   DATA_2  1408219 non-null  int64 
 4   DATA_3  1408219 non-null  int64 
 5   DATA_4  1408219 non-null  int64 
 6   DATA_5  1408219 non-null  int64 
 7   DATA_6  1408219 non-null  int64 
 8   DATA_7  1408219 non-null  int64 
 9   Label   1408219 non-null  object
dtypes: int64(9), object(1)
memory usage: 107.4+ MB


查看合并以后的类别情况。

In [25]:
# Count how often each distinct value occurs in the 'Label' column.
label_counts = combined_df['Label'].value_counts()

# Print the per-label counts.
print(label_counts)

BENIGN BENIGN BENIGN              1223737
ATTACK DoS DoS                      74663
ATTACK SPOOFING RPM                 54900
ATTACK SPOOFING SPEED               24951
ATTACK SPOOFING STEERING_WHEEL      19977
ATTACK SPOOFING GAS                  9991
Name: Label, dtype: int64


In [26]:
# Show the column names again, after the label columns were merged into 'Label'.
combined_df.columns

Index(['ID', 'DATA_0', 'DATA_1', 'DATA_2', 'DATA_3', 'DATA_4', 'DATA_5',
       'DATA_6', 'DATA_7', 'Label'],
      dtype='object')

In [27]:
# Summary statistics of the frame after the label merge.
combined_df.describe()

Unnamed: 0,ID,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,537.2079,71.0866,69.98925,55.01127,57.45364,45.28517,53.88261,71.74914,60.27477
std,322.48,88.97717,95.58374,72.76584,90.32077,64.45835,94.33612,101.6872,99.96547
min,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,357.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,516.0,16.0,12.0,13.0,0.0,6.0,0.0,0.0,0.0
75%,578.0,127.0,128.0,125.0,92.0,86.0,63.0,138.0,80.0
max,1438.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


检查ID列的情况

In [28]:
# Report whether every value in the 'ID' column occurs exactly once.
id_is_unique = combined_df['ID'].is_unique
print(
    "ID is unique."
    if id_is_unique
    else "ID is not unique. Further investigation needed."
)


ID is not unique. Further investigation needed.


结果显示，ID 并不唯一，因此原计划不把它作为训练特征，以防止模型过拟合。注意：下面删除 ID 列的代码目前已被注释掉，所以后续的归一化和图像生成步骤实际上仍然包含 ID 列。

In [10]:
# NOTE: dropping the 'ID' column is currently disabled (the statements below
# are commented out), so later cells still find 'ID' in combined_df.
# NOTE(review): the 9-column info() output shown under this cell appears to be
# left over from an earlier run in which the drop was active; confirm and clear it.
# if 'ID' in combined_df.columns:
#     combined_df = combined_df.drop('ID', axis=1)
# combined_df.info()    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408219 entries, 0 to 1408218
Data columns (total 9 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   DATA_0  1408219 non-null  int64 
 1   DATA_1  1408219 non-null  int64 
 2   DATA_2  1408219 non-null  int64 
 3   DATA_3  1408219 non-null  int64 
 4   DATA_4  1408219 non-null  int64 
 5   DATA_5  1408219 non-null  int64 
 6   DATA_6  1408219 non-null  int64 
 7   DATA_7  1408219 non-null  int64 
 8   Label   1408219 non-null  object
dtypes: int64(8), object(1)
memory usage: 96.7+ MB


In [31]:
# Print the column/dtype summary of the frame as it stands before scaling.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408219 entries, 0 to 1408218
Data columns (total 10 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   ID      1408219 non-null  int64 
 1   DATA_0  1408219 non-null  int64 
 2   DATA_1  1408219 non-null  int64 
 3   DATA_2  1408219 non-null  int64 
 4   DATA_3  1408219 non-null  int64 
 5   DATA_4  1408219 non-null  int64 
 6   DATA_5  1408219 non-null  int64 
 7   DATA_6  1408219 non-null  int64 
 8   DATA_7  1408219 non-null  int64 
 9   Label   1408219 non-null  object
dtypes: int64(9), object(1)
memory usage: 107.4+ MB


数据归一化

In [32]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale: 'ID' plus the eight payload columns DATA_0 .. DATA_7.
data_columns = ['ID'] + [f'DATA_{i}' for i in range(8)]

# Fit a min-max scaler on those columns, then overwrite them with the
# scaled values (each column ends up spanning its own min..max as 0..1).
scaler = MinMaxScaler()
scaler.fit(combined_df[data_columns])
combined_df[data_columns] = scaler.transform(combined_df[data_columns])

In [33]:
# Summary statistics after min-max scaling.
combined_df.describe()

Unnamed: 0,ID,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,0.3439242,0.278771,0.2744676,0.2157305,0.2253084,0.1775889,0.2113044,0.2813692,0.2363716
std,0.2348725,0.3489301,0.3748382,0.2853562,0.3541991,0.2527778,0.3699456,0.3987733,0.3920214
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.212673,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.3284778,0.0627451,0.04705882,0.05098039,0.0,0.02352941,0.0,0.0,0.0
75%,0.3736344,0.4980392,0.5019608,0.4901961,0.3607843,0.3372549,0.2470588,0.5411765,0.3137255
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
# Preview the first rows of the scaled frame.
combined_df.head()

Unnamed: 0,ID,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7,Label
0,0.164603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ATTACK DoS DoS
1,0.164603,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
2,0.164603,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
3,0.164603,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
4,0.164603,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS


对数据进行放缩，将其转为[0,255]的形式

In [35]:
from sklearn.preprocessing import QuantileTransformer
# Map every numeric feature onto [0, 1] with a quantile (rank-based) transform.
# Select numeric columns explicitly so the string 'Label' column is never
# included, whatever dtype pandas gives it.
numeric_features = combined_df.select_dtypes(include='number').columns

# QuantileTransformer estimates its quantiles from a random subsample when the
# data has more rows than its `subsample` limit; without a fixed random_state
# the transformed values (and so the generated images) differ between runs.
scaler = QuantileTransformer(random_state=0)
combined_df[numeric_features] = scaler.fit_transform(combined_df[numeric_features])

In [36]:
# Stretch the numeric features by a factor of 255 so they span [0, 255].
# Running this cell twice multiplies the values twice.
combined_df[numeric_features] = combined_df[numeric_features] * 255

In [37]:
# Summary statistics after the quantile transform and the x255 scaling.
combined_df.describe()

Unnamed: 0,ID,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,127.5251,115.6957,109.0651,113.158,95.05923,101.877,97.30816,95.16561,83.10284
std,73.86103,90.93228,97.81772,89.91678,101.9052,96.469,104.528,101.466,106.4624
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,66.87688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,132.3498,126.6066,127.3724,128.521,0.0,124.1817,0.0,0.0,0.0
75%,189.9099,190.1652,194.3769,189.527,193.3559,188.8889,187.8679,191.3138,189.2718
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [38]:
# Count how often each distinct value occurs in the 'Label' column.
label_counts = combined_df['Label'].value_counts()

# Print the per-label counts.
print(label_counts)

BENIGN BENIGN BENIGN              1223737
ATTACK DoS DoS                      74663
ATTACK SPOOFING RPM                 54900
ATTACK SPOOFING SPEED               24951
ATTACK SPOOFING STEERING_WHEEL      19977
ATTACK SPOOFING GAS                  9991
Name: Label, dtype: int64


对数据进行分类

In [39]:
# Split the frame into one feature-only frame per class (the 'Label' column
# is removed from each): df0 is benign traffic, df1..df5 are the attack classes.
df0, df1, df2, df3, df4, df5 = (
    combined_df[combined_df['Label'] == class_name].drop(['Label'], axis=1)
    for class_name in (
        'BENIGN BENIGN BENIGN',
        'ATTACK DoS DoS',
        'ATTACK SPOOFING RPM',
        'ATTACK SPOOFING SPEED',
        'ATTACK SPOOFING STEERING_WHEEL',
        'ATTACK SPOOFING GAS',
    )
)


## 将数据集转为9 * 9 * 3的图像形式
1. 对每个类别，按顺序每次取连续的27行数据（27行 × 9列 = 243个数值）生成一张图像
2. 对于末尾不足27行的剩余数据，改用该类别的前27行再生成一张图像
3. 将选取的数据reshape为 9，9，3的形式

In [42]:
import numpy as np
from PIL import Image
import os

# combined_df is the scaled frame built above. Each class is written to its
# own folder, numbered by its position in this list.
labels = [
    'BENIGN BENIGN BENIGN', 
    'ATTACK DoS DoS', 
    'ATTACK SPOOFING RPM', 
    'ATTACK SPOOFING SPEED', 
    'ATTACK SPOOFING STEERING_WHEEL', 
    'ATTACK SPOOFING GAS'
]
image_counts = []

needed_pixels = 9 * 9 * 3  # values per image: 9 x 9 pixels, 3 channels
rows_needed = 27  # consecutive rows consumed per image


def rows_to_image(block_values):
    """Convert a block of feature rows into a (9, 9, 3) uint8 image array.

    The block is flattened row by row. If it holds fewer than
    ``needed_pixels`` values, it is padded by repeating its own values from
    the start. Values are then cast to uint8 (fractions are truncated).

    Parameters
    ----------
    block_values : array-like
        Feature values of the rows that make up one image.

    Returns
    -------
    numpy.ndarray
        Array of shape (9, 9, 3) and dtype uint8.

    Raises
    ------
    ValueError
        If the block is empty, or holds more than ``needed_pixels`` values
        (the reshape then fails).
    """
    flat = np.asarray(block_values).flatten()
    if flat.size == 0:
        raise ValueError("cannot build an image from an empty block")
    if flat.size < needed_pixels:
        # np.resize fills the larger array with repeated copies of `flat`.
        flat = np.resize(flat, needed_pixels)
    return flat.reshape((9, 9, 3)).astype(np.uint8)


for idx, label in enumerate(labels):
    df = combined_df[combined_df['Label'] == label].drop(['Label'], axis=1)
    image_path = f"./train/{idx}/"
    os.makedirs(image_path, exist_ok=True)

    # Convert once; slicing the array is cheaper than repeated DataFrame.iloc.
    class_values = df.to_numpy()
    num_images = len(df) // rows_needed  # number of complete 27-row blocks

    # Every complete block of consecutive rows becomes one image.
    blocks = [
        class_values[i * rows_needed:(i + 1) * rows_needed]
        for i in range(num_images)
    ]

    # Leftover rows (fewer than rows_needed) are not used directly; one extra
    # image is built from the first rows of the class instead. When the class
    # has at least one full block this repeats image 0, so the same picture
    # can land in both train and test after a random split. When the class
    # has fewer than rows_needed rows in total, rows_to_image pads the short
    # block instead of failing in reshape.
    if len(df) % rows_needed != 0:
        blocks.append(class_values[:rows_needed])

    image_count = 0  # images written for the current class
    for block in blocks:
        new_image = Image.fromarray(rows_to_image(block))
        new_image.save(os.path.join(image_path, f'{image_count}.png'))
        image_count += 1

    image_counts.append((label, image_count))

# Report how many images were written per class.
for label, count in image_counts:
    print(f"Category: {label} - Images: {count}")


Category: BENIGN BENIGN BENIGN - Images: 45324
Category: ATTACK DoS DoS - Images: 2766
Category: ATTACK SPOOFING RPM - Images: 2034
Category: ATTACK SPOOFING SPEED - Images: 925
Category: ATTACK SPOOFING STEERING_WHEEL - Images: 740
Category: ATTACK SPOOFING GAS - Images: 371


## 数据划分，转换后的数据划分为测试数据和训练数据
1. 将处理好的数据分割，分割一部分作为测试数据
2. 其中测试数据占总的30%

In [20]:
import os
import shutil
from random import sample

from random import Random

# Source directory of the generated images and destination of the test split.
base_dir = './train'
test_dir = './test'
os.makedirs(test_dir, exist_ok=True)  # make sure the test root exists

# Class folder names (one sub-directory per class).
labels = ['0', '1', '2', '3', '4', '5']

# A dedicated, seeded generator plus a sorted file listing make the split
# reproducible: os.listdir returns names in arbitrary order and the module
# level random.sample is unseeded, so the selection would otherwise change
# on every run.
split_rng = Random(42)

# NOTE: files are moved, not copied, so re-running this cell moves a further
# 30% of whatever is still left in ./train.
for label in labels:
    class_dir = os.path.join(base_dir, label)  # directory of this class
    images = sorted(os.listdir(class_dir))  # deterministic file order
    test_sample = split_rng.sample(images, k=int(len(images) * 0.3))  # 30% for testing

    # Create the matching class directory under the test root.
    test_class_dir = os.path.join(test_dir, label)
    os.makedirs(test_class_dir, exist_ok=True)

    # Move the selected images into the test directory.
    for image in test_sample:
        src_path = os.path.join(class_dir, image)
        dst_path = os.path.join(test_class_dir, image)
        shutil.move(src_path, dst_path)  # use shutil.copy(src_path, dst_path) to copy instead

# Completion message.
print("Test datasets have been created successfully.")


Test datasets have been created successfully.
