### 读取所有数据

In [1]:
import pandas as pd
import glob
import numpy as np
# Collect every CSV file in the current working directory.
# glob returns paths in a filesystem-dependent order, so sort them to keep the
# row order of the concatenated frame reproducible across runs and machines.
csv_files = sorted(glob.glob('./*.csv'))

# Read each file with pandas' default encoding first; if decoding fails,
# retry with Latin-1 (which can decode any byte sequence).
dataframes = []
for file in csv_files:
    try:
        df = pd.read_csv(file)
    except UnicodeDecodeError:
        df = pd.read_csv(file, encoding='latin1')
    dataframes.append(df)


In [2]:
# Concatenate all per-file frames into a single frame with a fresh 0..n-1 index.
combined_df = pd.concat(dataframes, ignore_index=True)

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1453835 entries, 0 to 1453834
Data columns (total 51 columns):
 #   Column  Non-Null Count    Dtype  
---  ------  --------------    -----  
 0   1694    152303 non-null   float64
 1   4       311860 non-null   float64
 2   64      410217 non-null   float64
 3   4.1     152303 non-null   float64
 4   126     62285 non-null    float64
 5   31      257558 non-null   float64
 6   192     99861 non-null    float64
 7   21      152303 non-null   float64
 8   66      152303 non-null   float64
 9   0.0     1453835 non-null  float64
 10  263     324556 non-null   float64
 11  0       1391550 non-null  float64
 12  0.1     1159285 non-null  float64
 13  0.2     773404 non-null   float64
 14  0.3     521689 non-null   float64
 15  0.4     521689 non-null   float64
 16  0.5     521689 non-null   float64
 17  0.6     521689 non-null   float64
 18  0.7     324556 non-null   float64
 19  124     90018 non-null    float64
 20  32      655024 non-null 

In [3]:
# Display the column labels of the concatenated frame.
combined_df.columns

Index(['1694', '4', '64', '4.1', '126', '31', '192', '21', '66', '0.0', '263',
       '0', '0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '0.7', '124', '32',
       '852', '50', '5', '168', '128', '51', '7', '15', '7.1', '208', '255',
       '3', '227', '412', '24', '230', '2', '40', '737', '1176', '132', '251',
       '1', '10', '116', '526', '78', '160', '63', '239'],
      dtype='object')

合并'label', 'category', 'specific_class'列

In [22]:
# Build a single 'Label' column as "<label> <category> <specific_class>".
# Each source column is converted with str() element-wise and concatenated
# column-wise; this avoids a Python-level row-by-row apply over the whole frame.
label_parts = ['label', 'category', 'specific_class']
combined_df['Label'] = (
    combined_df['label'].map(str)
    + ' ' + combined_df['category'].map(str)
    + ' ' + combined_df['specific_class'].map(str)
)

# Remove the three source columns by rebinding instead of mutating in place.
combined_df = combined_df.drop(columns=label_parts)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408219 entries, 0 to 1408218
Data columns (total 10 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   ID      1408219 non-null  int64 
 1   DATA_0  1408219 non-null  int64 
 2   DATA_1  1408219 non-null  int64 
 3   DATA_2  1408219 non-null  int64 
 4   DATA_3  1408219 non-null  int64 
 5   DATA_4  1408219 non-null  int64 
 6   DATA_5  1408219 non-null  int64 
 7   DATA_6  1408219 non-null  int64 
 8   DATA_7  1408219 non-null  int64 
 9   Label   1408219 non-null  object
dtypes: int64(9), object(1)
memory usage: 107.4+ MB


查看合并以后的类别情况。

In [24]:
# Count how many rows carry each distinct value of the 'Label' column.
label_counts = combined_df.loc[:, 'Label'].value_counts()

# Show the per-class counts.
print(label_counts)

BENIGN BENIGN BENIGN              1223737
ATTACK DoS DoS                      74663
ATTACK SPOOFING RPM                 54900
ATTACK SPOOFING SPEED               24951
ATTACK SPOOFING STEERING_WHEEL      19977
ATTACK SPOOFING GAS                  9991
Name: Label, dtype: int64


In [26]:
# Display the column labels of the frame at this point in the notebook.
combined_df.columns

Index(['ID', 'DATA_0', 'DATA_1', 'DATA_2', 'DATA_3', 'DATA_4', 'DATA_5',
       'DATA_6', 'DATA_7', 'Label'],
      dtype='object')

In [25]:
# Display summary statistics (count, mean, std, min, quartiles, max) of the frame.
combined_df.describe()

Unnamed: 0,ID,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,537.2079,71.0866,69.98925,55.01127,57.45364,45.28517,53.88261,71.74914,60.27477
std,322.48,88.97717,95.58374,72.76584,90.32077,64.45835,94.33612,101.6872,99.96547
min,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,357.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,516.0,16.0,12.0,13.0,0.0,6.0,0.0,0.0,0.0
75%,578.0,127.0,128.0,125.0,92.0,86.0,63.0,138.0,80.0
max,1438.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


检查ID列的情况

In [27]:
# Report whether every value in the 'ID' column occurs exactly once.
id_message = (
    "ID is unique."
    if combined_df['ID'].is_unique
    else "ID is not unique. Further investigation needed."
)
print(id_message)


ID is not unique. Further investigation needed.


结果显示，ID 列并不唯一，因此这里不把它作为训练特征，以防止模型过拟合。

In [28]:
# Drop the 'ID' column when it is present; errors='ignore' makes this a no-op
# if the column has already been removed (e.g. when the cell is re-run).
combined_df = combined_df.drop(columns=['ID'], errors='ignore')
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408219 entries, 0 to 1408218
Data columns (total 9 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   DATA_0  1408219 non-null  int64 
 1   DATA_1  1408219 non-null  int64 
 2   DATA_2  1408219 non-null  int64 
 3   DATA_3  1408219 non-null  int64 
 4   DATA_4  1408219 non-null  int64 
 5   DATA_5  1408219 non-null  int64 
 6   DATA_6  1408219 non-null  int64 
 7   DATA_7  1408219 non-null  int64 
 8   Label   1408219 non-null  object
dtypes: int64(8), object(1)
memory usage: 96.7+ MB


数据归一化

In [29]:
from sklearn.preprocessing import MinMaxScaler

# Names of the eight columns to rescale: DATA_0 .. DATA_7.
data_columns = [f'DATA_{i}' for i in range(8)]

# Learn each column's min and max, then rescale those columns with the
# scaler's default feature range.
scaler = MinMaxScaler()
scaler.fit(combined_df[data_columns])
combined_df[data_columns] = scaler.transform(combined_df[data_columns])

In [30]:
# Display summary statistics again; min/max show the current value range of each column.
combined_df.describe()

Unnamed: 0,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,0.278771,0.2744676,0.2157305,0.2253084,0.1775889,0.2113044,0.2813692,0.2363716
std,0.3489301,0.3748382,0.2853562,0.3541991,0.2527778,0.3699456,0.3987733,0.3920214
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0627451,0.04705882,0.05098039,0.0,0.02352941,0.0,0.0,0.0
75%,0.4980392,0.5019608,0.4901961,0.3607843,0.3372549,0.2470588,0.5411765,0.3137255
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [34]:
# Display the first five rows of the frame.
combined_df.head()

Unnamed: 0,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ATTACK DoS DoS
1,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
2,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
3,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS
4,0.054902,0.043137,0.015686,0.015686,0.011765,0.011765,0.031373,0.047059,ATTACK DoS DoS


对数据进行放缩，将其转为[0,255]的形式

In [36]:
from sklearn.preprocessing import QuantileTransformer

# Select every non-object (i.e. numeric) column; the string 'Label' column is skipped.
numeric_features = combined_df.dtypes[combined_df.dtypes != 'object'].index

# Map each numeric feature onto [0, 1] through its empirical quantiles
# (QuantileTransformer's default output distribution is uniform).
# The transformer estimates quantiles from a random subsample when the frame is
# larger than its `subsample` limit, so the seed is fixed to make the
# transformed values reproducible from run to run.
scaler = QuantileTransformer(random_state=0)
combined_df[numeric_features] = scaler.fit_transform(combined_df[numeric_features])

In [37]:
# Stretch the numeric features by a factor of 255 (values in [0, 1] end up in [0, 255]).
# NOTE: re-running this cell multiplies the columns by 255 again.
combined_df[numeric_features] = combined_df[numeric_features] * 255

In [38]:
# Display summary statistics of the frame after the multiplication by 255.
combined_df.describe()

Unnamed: 0,DATA_0,DATA_1,DATA_2,DATA_3,DATA_4,DATA_5,DATA_6,DATA_7
count,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0,1408219.0
mean,115.9415,109.0081,113.3562,95.11913,102.2382,97.25848,95.23261,83.23352
std,90.968,97.79881,89.99221,101.9489,96.77039,104.4887,101.5058,106.5898
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,127.2447,127.1171,128.7763,0.0,124.8198,0.0,0.0,0.0
75%,190.2928,194.3769,189.6547,193.7387,189.6547,187.7402,191.3138,189.7823
max,255.0,255.0,255.0,255.0,255.0,255.0,255.0,255.0


In [40]:
# Count how many rows carry each distinct value of the 'Label' column.
label_counts = combined_df.loc[:, 'Label'].value_counts()

# Show the per-class counts.
print(label_counts)

BENIGN BENIGN BENIGN              1223737
ATTACK DoS DoS                      74663
ATTACK SPOOFING RPM                 54900
ATTACK SPOOFING SPEED               24951
ATTACK SPOOFING STEERING_WHEEL      19977
ATTACK SPOOFING GAS                  9991
Name: Label, dtype: int64


按类别（Label）拆分数据

In [41]:
# Full 'Label' strings in class order: the i-th entry produces frame df<i>.
class_labels = [
    'BENIGN BENIGN BENIGN',
    'ATTACK DoS DoS',
    'ATTACK SPOOFING RPM',
    'ATTACK SPOOFING SPEED',
    'ATTACK SPOOFING STEERING_WHEEL',
    'ATTACK SPOOFING GAS',
]

# One feature-only frame per class: keep the rows whose 'Label' matches,
# then drop the 'Label' column itself.
df0, df1, df2, df3, df4, df5 = (
    combined_df[combined_df['Label'] == class_label].drop(['Label'], axis=1)
    for class_label in class_labels
)


将数据集转为32*32*3的图像形式

In [5]:
import numpy as np
from PIL import Image
import os


def save_rows_as_images(frame, image_dir, rows_per_image=4, image_shape=(32, 32, 3)):
    """Write consecutive groups of rows from ``frame`` as PNG images.

    Every ``rows_per_image`` consecutive rows are flattened into one vector,
    the vector is repeated until it fills ``image_shape``, cast to uint8 and
    saved as ``<image_dir>/<k>.png`` where ``k`` is the group index.

    If the last group is short, it is completed with the rows that immediately
    precede it (as many as are available), appended after the short group.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric feature rows; values are cast to uint8 when written.
    image_dir : str
        Output directory; created if it does not exist.
    rows_per_image : int
        Number of consecutive rows that make up one image.
    image_shape : tuple of int
        Shape of the array handed to PIL for each image.

    Returns
    -------
    int
        Number of images written.

    Raises
    ------
    ValueError
        If ``frame`` has rows but no columns (nothing to tile).
    """
    os.makedirs(image_dir, exist_ok=True)

    # Convert once; slicing a NumPy array per group is much cheaper than
    # calling DataFrame.iloc inside the loop.
    values = frame.to_numpy()
    n_rows = len(values)
    needed_pixels = int(np.prod(image_shape))

    n_images = 0
    for start in range(0, n_rows, rows_per_image):
        block = values[start:start + rows_per_image]
        missing = rows_per_image - len(block)
        if missing > 0:
            # Short final group: borrow the preceding rows. Clamp at 0 so a
            # frame with fewer than rows_per_image rows never produces a
            # negative slice start.
            pad_start = max(0, start - missing)
            block = np.concatenate([block, values[pad_start:start]])

        flat = block.ravel()
        if flat.size == 0:
            raise ValueError("frame has no columns; nothing to render")

        # Repeat the group's values until the image is full, then truncate.
        pixels = np.tile(flat, needed_pixels // flat.size + 1)[:needed_pixels]
        image = Image.fromarray(pixels.reshape(image_shape).astype(np.uint8))
        image.save(os.path.join(image_dir, f'{start // rows_per_image}.png'))
        n_images += 1

    return n_images


image_path = "./train/0/"

# NOTE(review): the original cell iterated over `df`, which is only the
# leftover loop variable from the CSV-loading cell. `df0` (the
# 'BENIGN BENIGN BENIGN' rows) matches the ./train/0/ output directory -
# confirm this is the intended input.
save_rows_as_images(df0, image_path)


KeyboardInterrupt: 