### 读取所有数据

In [4]:
import pandas as pd
import glob
import numpy as np
# 获取ROAD文件夹下所有CSV文件的路径
csv_files = glob.glob('./*.csv')

# 使用列表推导式读取所有文件并存储在列表中，尝试使用不同的编码
dataframes = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
    except UnicodeDecodeError:
        df = pd.read_csv(file, encoding='latin1')  # 尝试使用Latin-1编码
    dataframes.append(df)


In [10]:
# 遍历dataframes列表，查看每个DataFrame的行数、列数和列名
for i, df in enumerate(dataframes):
    rows, cols = df.shape
    column_names = df.columns.tolist()
    print(f"DataFrame {i+1}: {rows} 行, {cols} 列")
    print(f"列名: {column_names}")
    print("-" * 50)  # 添加分隔线以清晰区分每个DataFrame的输出

DataFrame 1: 3838859 行, 12 列
列名: ['1478195721.903877', '0545', '8', 'd8', '00', '00.1', '8a', '00.2', '00.3', '00.4', '00.5', 'R']
--------------------------------------------------
DataFrame 2: 4621701 行, 12 列
列名: ['1478191030.045114', '0316', '8', '05', '22', '68', '09', '22.1', '20', '00', '75', 'R']
--------------------------------------------------
DataFrame 3: 3665770 行, 12 列
列名: ['1478198376.389427', '0316', '8', '05', '21', '68', '09', '21.1', '21.2', '00', '6f', 'R']
--------------------------------------------------
DataFrame 4: 4443141 行, 12 列
列名: ['1478193190.056566', '0140', '8', '00', '00.1', '00.2', '00.3', '10', '29', '2a', '24', 'R']
--------------------------------------------------


In [11]:
dataframes[0].head(10)

Unnamed: 0,1478195721.903877,0545,8,d8,00,00.1,8a,00.2,00.3,00.4,00.5,R
0,1478196000.0,02b0,5,ff,7f,0,05,49,R,,,
1,1478196000.0,0002,8,00,00,0,00,00,01,7.0,15,R
2,1478196000.0,0153,8,00,21,10,ff,00,ff,0.0,00,R
3,1478196000.0,0130,8,19,80,0,ff,fe,7f,7.0,60,R
4,1478196000.0,0131,8,17,80,0,00,65,7f,7.0,9f,R
5,1478196000.0,0140,8,00,00,0,00,02,20,27.0,a8,R
6,1478196000.0,0350,8,05,20,14,68,78,00,0.0,21,R
7,1478196000.0,02c0,8,15,00,0,00,00,00,0.0,00,R
8,1478196000.0,0370,8,00,20,0,00,00,00,0.0,00,R
9,1478196000.0,043f,8,10,40,60,ff,7d,8c,9.0,00,R


In [12]:
# 遍历每个DataFrame，打印出其所有列名
for i, df in enumerate(dataframes):
    column_names = df.columns.tolist()
    print(f"文件 {csv_files[i]} 的列名:")
    print(column_names)
    print("-" * 50)  # 添加分隔线以清晰区分每个文件的输出

文件 ./Fuzzy_dataset.csv 的列名:
['1478195721.903877', '0545', '8', 'd8', '00', '00.1', '8a', '00.2', '00.3', '00.4', '00.5', 'R']
--------------------------------------------------
文件 ./RPM_dataset.csv 的列名:
['1478191030.045114', '0316', '8', '05', '22', '68', '09', '22.1', '20', '00', '75', 'R']
--------------------------------------------------
文件 ./DoS_dataset.csv 的列名:
['1478198376.389427', '0316', '8', '05', '21', '68', '09', '21.1', '21.2', '00', '6f', 'R']
--------------------------------------------------
文件 ./gear_dataset.csv 的列名:
['1478193190.056566', '0140', '8', '00', '00.1', '00.2', '00.3', '10', '29', '2a', '24', 'R']
--------------------------------------------------


In [6]:
# 合并所有DataFrame
combined_df = pd.concat(dataframes, ignore_index=True)

# 显示合并后的DataFrame的信息，如需
print(combined_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16569471 entries, 0 to 16569470
Data columns (total 32 columns):
 #   Column             Dtype  
---  ------             -----  
 0   1478195721.903877  float64
 1   0545               object 
 2   8                  int64  
 3   d8                 object 
 4   00                 object 
 5   00.1               object 
 6   8a                 object 
 7   00.2               object 
 8   00.3               object 
 9   00.4               object 
 10  00.5               object 
 11  R                  object 
 12  1478191030.045114  float64
 13  0316               object 
 14  05                 object 
 15  22                 object 
 16  68                 object 
 17  09                 object 
 18  22.1               object 
 19  20                 object 
 20  75                 object 
 21  1478198376.389427  float64
 22  21                 object 
 23  21.1               object 
 24  21.2               object 
 25  6f              

In [7]:
combined_df.columns

Index(['1478195721.903877', '0545', '8', 'd8', '00', '00.1', '8a', '00.2',
       '00.3', '00.4', '00.5', 'R', '1478191030.045114', '0316', '05', '22',
       '68', '09', '22.1', '20', '75', '1478198376.389427', '21', '21.1',
       '21.2', '6f', '1478193190.056566', '0140', '10', '29', '2a', '24'],
      dtype='object')

合并'label', 'category', 'specific_class'列

In [8]:
combined_df['Label'] = combined_df.apply(lambda row: ' '.join([str(row['label']), str(row['category']), str(row['specific_class'])]), axis=1)
combined_df.drop(['label', 'category', 'specific_class'], axis=1, inplace=True)
combined_df.info()

KeyError: 'label'

查看合并以后的类别情况。

In [24]:
# 查看 'Label' 列的所有唯一值及其出现次数
label_counts = combined_df['Label'].value_counts()

# 打印结果
print(label_counts)

BENIGN BENIGN BENIGN              1223737
ATTACK DoS DoS                      74663
ATTACK SPOOFING RPM                 54900
ATTACK SPOOFING SPEED               24951
ATTACK SPOOFING STEERING_WHEEL      19977
ATTACK SPOOFING GAS                  9991
Name: Label, dtype: int64


In [26]:
combined_df.columns

Index(['ID', 'DATA_0', 'DATA_1', 'DATA_2', 'DATA_3', 'DATA_4', 'DATA_5',
       'DATA_6', 'DATA_7', 'Label'],
      dtype='object')

In [25]:
combined_df.describe()

Unnamed: 0,ID,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,537.2079,71.0866,69.98925,55.01127,57.45364,45.28517,53.88261,71.74914,60.27477
std,322.48,88.97717,95.58374,72.76584,90.32077,64.45835,94.33612,101.6872,99.96547
min,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,357.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,516.0,16.0,12.0,13.0,0.0,6.0,0.0,0.0,0.0
75%,578.0,127.0,128.0,125.0,92.0,86.0,63.0,138.0,80.0
max,1438.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


检查ID列的情况

In [27]:
if combined_df['ID'].is_unique:
    print("ID is unique.")
else:
    print("ID is not unique. Further investigation needed.")


ID is not unique. Further investigation needed.


结果显示，ID为非唯一的，因此这里不用作训练列数，防止模型过拟合。

In [28]:
if 'ID' in combined_df.columns:
    combined_df = combined_df.drop('ID', axis=1)
combined_df.info()    

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408219 entries, 0 to 1408218
Data columns (total 9 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   DATA_0  1408219 non-null  int64 
 1   DATA_1  1408219 non-null  int64 
 2   DATA_2  1408219 non-null  int64 
 3   DATA_3  1408219 non-null  int64 
 4   DATA_4  1408219 non-null  int64 
 5   DATA_5  1408219 non-null  int64 
 6   DATA_6  1408219 non-null  int64 
 7   DATA_7  1408219 non-null  int64 
 8   Label   1408219 non-null  object
dtypes: int64(8), object(1)
memory usage: 96.7+ MB


数据归一化

In [29]:
from sklearn.preprocessing import MinMaxScaler

# 创建归一化器实例
scaler = MinMaxScaler()

# 归一化'DATA_0'到'DATA_7'这些列
data_columns = ['DATA_0', 'DATA_1', 'DATA_2', 'DATA_3', 'DATA_4', 'DATA_5', 'DATA_6', 'DATA_7']
combined_df[data_columns] = scaler.fit_transform(combined_df[data_columns])

In [30]:
combined_df.describe()

Unnamed: 0,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,0.278771,0.2744676,0.2157305,0.2253084,0.1775889,0.2113044,0.2813692,0.2363716
std,0.3489301,0.3748382,0.2853562,0.3541991,0.2527778,0.3699456,0.3987733,0.3920214
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0627451,0.04705882,0.05098039,0.0,0.02352941,0.0,0.0,0.0
75%,0.4980392,0.5019608,0.4901961,0.3607843,0.3372549,0.2470588,0.5411765,0.3137255
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
combined_df.head()

Unnamed: 0,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ATTACK DoS DoS
1,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
2,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
3,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
4,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS


对数据进行放缩，将其转为[0,255]的形式

In [36]:
from sklearn.preprocessing import QuantileTransformer
# Transform all features into the scale of [0,1]
numeric_features = combined_df.dtypes[combined_df.dtypes != 'object'].index
scaler = QuantileTransformer() 
combined_df[numeric_features] = scaler.fit_transform(combined_df[numeric_features])

In [37]:
# Multiply the feature values by 255 to transform them into the scale of [0,255]
combined_df[numeric_features] = combined_df[numeric_features].apply(
    lambda x: (x*255))

In [38]:
combined_df.describe()

Unnamed: 0,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,115.9415,109.0081,113.3562,95.11913,102.2382,97.25848,95.23261,83.23352
std,90.968,97.79881,89.99221,101.9489,96.77039,104.4887,101.5058,106.5898
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,127.2447,127.1171,128.7763,0.0,124.8198,0.0,0.0,0.0
75%,190.2928,194.3769,189.6547,193.7387,189.6547,187.7402,191.3138,189.7823
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [40]:
# 查看 'Label' 列的所有唯一值及其出现次数
label_counts = combined_df['Label'].value_counts()

# 打印结果
print(label_counts)

BENIGN BENIGN BENIGN              1223737
ATTACK DoS DoS                      74663
ATTACK SPOOFING RPM                 54900
ATTACK SPOOFING SPEED               24951
ATTACK SPOOFING STEERING_WHEEL      19977
ATTACK SPOOFING GAS                  9991
Name: Label, dtype: int64


对数据进行分类

In [41]:
df0=combined_df[combined_df['Label']=='BENIGN BENIGN BENIGN'].drop(['Label'],axis=1)
df1=combined_df[combined_df['Label']=='ATTACK DoS DoS'].drop(['Label'],axis=1)
df2=combined_df[combined_df['Label']=='ATTACK SPOOFING RPM'].drop(['Label'],axis=1)
df3=combined_df[combined_df['Label']=='ATTACK SPOOFING SPEED'].drop(['Label'],axis=1)
df4=combined_df[combined_df['Label']=='ATTACK SPOOFING STEERING_WHEEL'].drop(['Label'],axis=1)
df5=combined_df[combined_df['Label']=='ATTACK SPOOFING GAS'].drop(['Label'],axis=1)


将数据集转为32*32*3的图像形式

In [45]:
import numpy as np
from PIL import Image
import os

# 假设 combined_df 是你的原始 DataFrame
labels = [
    'BENIGN BENIGN BENIGN', 
    'ATTACK DoS DoS', 
    'ATTACK SPOOFING RPM', 
    'ATTACK SPOOFING SPEED', 
    'ATTACK SPOOFING STEERING_WHEEL', 
    'ATTACK SPOOFING GAS'
]
image_counts = []

needed_pixels = 32 * 32 * 3

for idx, label in enumerate(labels):
    df = combined_df[combined_df['Label'] == label].drop(['Label'], axis=1)
    image_path = f"./train/{idx}/"
    os.makedirs(image_path, exist_ok=True)
    
    image_count = 0  # 初始化当前类别的图像计数器
    start_index = 0
    while start_index < len(df):
        if start_index + 4 > len(df):
            # 最后一组可能不足四条
            data_block = df.iloc[start_index:].values.flatten()
            data_block = np.tile(data_block, (4, 1)).flatten()[:32]  # 用最后一组数据填充到足够长度
        else:
            # 正常情况下获取4条数据
            data_block = df.iloc[start_index:start_index + 4].values.flatten()
        
        repeated_data = np.tile(data_block, needed_pixels // len(data_block) + 1)[:needed_pixels]
        image_data = repeated_data.reshape((32, 32, 3))
        image_data = np.array(image_data, dtype=np.uint8)
        new_image = Image.fromarray(image_data)
        new_image.save(os.path.join(image_path, f'{image_count}.png'))
        
        image_count += 1
        start_index += 4
    
    image_counts.append((label, image_count))

# 输出每个类别的图像数量
for label, count in image_counts:
    print(f"Category: {label} - Images: {count}")


Category: BENIGN BENIGN BENIGN - Images: 305935
Category: ATTACK DoS DoS - Images: 18666
Category: ATTACK SPOOFING RPM - Images: 13725
Category: ATTACK SPOOFING SPEED - Images: 6238
Category: ATTACK SPOOFING STEERING_WHEEL - Images: 4995
Category: ATTACK SPOOFING GAS - Images: 2498
