In [5]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle

In [6]:
# Load the data
# Feature subset used by the later cells (column order matters for downstream selection).
fs = [
    'CAN_ID',
    'DATA[1]',
    'DATA[0]',
    'DATA[4]',
    'DATA[2]',
    'DATA[5]',
    'DATA[3]',
    'time_stamp',
    'DATA[7]',
]

# Column names applied to every raw CSV: timestamp, CAN ID, DLC,
# the eight DATA[i] columns and finally the label column.
raw_columns = ['time_stamp', 'CAN_ID', 'DLC'] + [f'DATA[{i}]' for i in range(8)] + ['Label']

dos = pd.read_csv('./DoS_dataset.csv', names=raw_columns)

fuzzy = pd.read_csv('./Fuzzy_dataset.csv', names=raw_columns)

gear = pd.read_csv('./gear_dataset.csv', names=raw_columns)

rpm = pd.read_csv('RPM_dataset.csv', names=raw_columns)
# df = pd.concat([dos,fuzzy,gear,rpm],axis=0,ignore_index=True)
#
# df

In [7]:
# Fill missing values with zeros
def zerofilling(df):
    """Move misplaced 'R'/'T' flags into 'Label' and fill missing cells with '00'.

    For every row whose 'DLC' is below 8, the first 'DATA[i]' column
    (i = 0..7) holding 'R' or 'T' is treated as a misplaced label: its value
    is copied into 'Label' and the cell itself is replaced by '00'. Afterwards
    every remaining missing value in the frame is replaced by the string '00'.

    The work is done column-wise with boolean masks rather than a Python loop
    over rows, so it does not depend on the index being 0..n-1, and the
    caller's frame is left untouched.

    Args:
        df: DataFrame with columns 'DLC', 'DATA[0]'..'DATA[7]' and 'Label'.

    Returns:
        A new DataFrame with the fixes applied.
    """
    out = df.copy()
    # Rows still waiting for their first 'R'/'T' cell. Removing rows once they
    # are handled reproduces the "stop at the first match" rule per row.
    pending = out['DLC'] < 8
    for i in range(8):
        col = 'DATA[' + str(i) + ']'
        hit = pending & out[col].isin(['R', 'T'])
        if hit.any():
            out['Label'] = out['Label'].where(~hit, out[col])
            out[col] = out[col].where(~hit, '00')
            pending = pending & ~hit

    return out.fillna('00')

# Apply the zero-filling step to each dataset, rebinding each name to the returned frame.
dos = zerofilling(dos)
fuzzy = zerofilling(fuzzy)
gear = zerofilling(gear)
rpm = zerofilling(rpm)

In [8]:
# Remove the strong correlation between consecutive timestamps and, in the same pass,
# convert CAN_ID from a hexadecimal string to an integer

def trans_timestamp_CANID(df):
    """Turn timestamps into inter-arrival deltas and parse CAN IDs as hex.

    The first column (position 0) is replaced by the difference between each
    row's value and the previous row's value, with the first row set to 0.0.
    The second column (position 1) is replaced by ``int(value, 16)``.

    Both columns are computed as whole columns instead of cell by cell, which
    avoids a Python-level loop over rows and avoids writing numbers one at a
    time into columns that still hold strings.

    Args:
        df: DataFrame whose first two columns are the timestamp and the
            hexadecimal CAN ID string. Modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    if len(df.index) == 0:
        return df

    ts_col, id_col = df.columns[0], df.columns[1]

    # Compute both replacement columns before touching df, so a bad value
    # raises without leaving the frame half-converted.
    deltas = df[ts_col].astype(float).diff()
    deltas.iloc[0] = 0.0  # the first row has no predecessor
    can_ids = [int(value, 16) for value in df[id_col]]

    df[ts_col] = deltas.to_numpy()
    df[id_col] = can_ids
    return df


# df.to_csv('normal_run_data.csv')
# print(df)

# Convert timestamps and CAN IDs for all four datasets, in the order dos, fuzzy, gear, rpm.
dos, fuzzy, gear, rpm = (
    trans_timestamp_CANID(frame) for frame in (dos, fuzzy, gear, rpm)
)

In [9]:
# Convert the data-field columns from hexadecimal strings to integers

def trans_datafield(df):
    """Parse the eight 'DATA[i]' columns as base-16 integers.

    Each column is converted with plain ``int(value, 16)`` calls and then
    replaced as a whole. This avoids ``DataFrame.applymap``, which recent
    pandas releases deprecate.

    Args:
        df: DataFrame with string columns 'DATA[0]'..'DATA[7]'. Modified in place.

    Returns:
        The same DataFrame object, for call chaining.
    """
    data_cols = ['DATA[' + str(i) + ']' for i in range(8)]
    # Convert everything first so an invalid hex string raises before any
    # column of df has been overwritten.
    converted = {col: [int(value, 16) for value in df[col]] for col in data_cols}
    for col, values in converted.items():
        df[col] = values
    return df

# Convert the data-field columns of all four datasets, in the order dos, fuzzy, gear, rpm.
dos, fuzzy, gear, rpm = (
    trans_datafield(frame) for frame in (dos, fuzzy, gear, rpm)
)

In [10]:
# Encode the label column as floats: 'R' becomes 0.0, every other value becomes 1.0.
for frame in (dos, fuzzy, gear, rpm):
    frame.loc[:, 'Label'] = frame.loc[:, 'Label'].apply(lambda x: 0.0 if x == 'R' else 1.0)

In [11]:
# Cast every column of each dataset to float.
dos = dos.astype(float)
fuzzy = fuzzy.astype(float)
gear = gear.astype(float)
rpm = rpm.astype(float)

# Persist the converted datasets; index=0 is falsy, so no index column is written.
for frame, path in (
    (dos, './trans_dos.csv'),
    (fuzzy, './trans_fuzzy.csv'),
    (gear, './trans_gear.csv'),
    (rpm, './trans_rpm.csv'),
):
    frame.to_csv(path, index=0)

In [13]:
# Keep only the selected feature columns followed by the label.
selected = fs + ['Label']
dos = dos[selected]
fuzzy = fuzzy[selected]
gear = gear[selected]
rpm = rpm[selected]

# normalization

In [14]:
# Load previously saved statistics from 'hy_params.pkl'.
# NOTE(review): pickle.load can execute arbitrary code, so only open files from a trusted source.
with open('hy_params.pkl', 'rb') as handle:
    params = pickle.load(handle)

km_hy_mean, km_hy_std = params['hy_mean'], params['hy_std']
print(f'km_mean:{km_hy_mean},km_std:{km_hy_std}')

km_mean:time_stamp      0.000638
CAN_ID        704.903056
DLC             7.936663
DATA[0]        58.097140
DATA[1]        45.526462
DATA[2]        40.036807
DATA[3]        77.731593
DATA[4]        51.284253
DATA[5]        61.157021
DATA[6]        26.003214
DATA[7]        53.561850
dtype: float64,km_std:time_stamp      0.001130
CAN_ID        397.875027
DLC             0.589138
DATA[0]        89.915449
DATA[1]        54.446672
DATA[2]        58.416384
DATA[3]       102.117568
DATA[4]        73.203852
DATA[5]        77.163061
DATA[6]        57.129318
DATA[7]        80.002116
dtype: float64


In [15]:
# Stack the four datasets row-wise with a fresh 0..n-1 index.
df = pd.concat([dos, fuzzy, gear, rpm], axis=0, ignore_index=True)
# Per-feature mean and std of the hybrid dataset (label column excluded).
hybrid_features = df.drop('Label', axis=1)
hy_mean = hybrid_features.mean()
hy_std = hybrid_features.std()
del hybrid_features  # temporary copy, not needed afterwards
print(f'mean:{hy_mean},std:{hy_std}')

mean:CAN_ID        704.884227
DATA[1]        45.525033
DATA[0]        58.042142
DATA[4]        51.285084
DATA[2]        40.035023
DATA[5]        61.143568
DATA[3]        77.776993
time_stamp      0.001600
DATA[7]        53.544871
dtype: float64,std:CAN_ID        397.754650
DATA[1]        54.458214
DATA[0]        89.859080
DATA[4]        73.180462
DATA[2]        58.375761
DATA[5]        77.165713
DATA[3]       102.143626
time_stamp      2.427758
DATA[7]        79.942758
dtype: float64


In [18]:
# Show the loaded mean and std restricted to (and ordered by) the selected features.
for saved_stat in (km_hy_mean, km_hy_std):
    print(saved_stat[fs])

CAN_ID        704.903056
DATA[1]        45.526462
DATA[0]        58.097140
DATA[4]        51.284253
DATA[2]        40.036807
DATA[5]        61.157021
DATA[3]        77.731593
time_stamp      0.000638
DATA[7]        53.561850
dtype: float64
CAN_ID        397.875027
DATA[1]        54.446672
DATA[0]        89.915449
DATA[4]        73.203852
DATA[2]        58.416384
DATA[5]        77.163061
DATA[3]       102.117568
time_stamp      0.001130
DATA[7]        80.002116
dtype: float64


In [20]:
# Difference between the freshly computed statistics and the loaded ones, per selected feature.
for current, saved in ((hy_mean, km_hy_mean), (hy_std, km_hy_std)):
    print(current - saved[fs])

CAN_ID       -0.018829
DATA[1]      -0.001429
DATA[0]      -0.054999
DATA[4]       0.000831
DATA[2]      -0.001784
DATA[5]      -0.013453
DATA[3]       0.045400
time_stamp    0.000962
DATA[7]      -0.016979
dtype: float64
CAN_ID       -0.120378
DATA[1]       0.011542
DATA[0]      -0.056369
DATA[4]      -0.023390
DATA[2]      -0.040623
DATA[5]       0.002652
DATA[3]       0.026058
time_stamp    2.426628
DATA[7]      -0.059358
dtype: float64


In [23]:

def normalize_with_params(df, mean, std):
    """Standardize every column except 'Label' using the supplied mean and std.

    Computes ``(features - mean) / std`` on the non-label columns and returns
    the result as a new DataFrame with the original 'Label' values appended
    as the last column. The input frame is not modified.
    """
    scaled = (df.drop('Label', axis=1) - mean) / std
    # assign() returns a new frame, so the label values are attached to a copy
    # of the standardized features and 'Label' is carried over unchanged.
    return scaled.assign(Label=df['Label'].values)

def normalize(df):
    """Standardize every column except 'Label' with a StandardScaler fitted on df itself.

    The scaled values are written back to the same columns they were read
    from, selected by name. Writing back by position would corrupt the frame
    whenever 'Label' is not the last column.

    Args:
        df: DataFrame containing a 'Label' column. Modified in place; pass a
            copy to keep the original.

    Returns:
        The same DataFrame object with its feature columns standardized.

    Raises:
        KeyError: if df has no 'Label' column.
    """
    feature_cols = list(df.columns.drop('Label'))
    scaler = StandardScaler()
    df[feature_cols] = scaler.fit_transform(df[feature_cols])
    return df

# n_dos = normalize_with_params(dos, dos.drop('Label',axis=1).mean(), dos.drop('Label',axis=1).std())
# n_fuzzy = normalize_with_params(fuzzy, fuzzy.drop('Label',axis=1).mean(), fuzzy.drop('Label',axis=1).mean().std())
# n_gear = normalize_with_params(gear, gear.drop('Label',axis=1).mean(), gear.drop('Label',axis=1).mean().std())
# n_rpm = normalize_with_params(rpm, rpm.drop('Label',axis=1).mean(), rpm.drop('Label',axis=1).mean().std())
# n_hybrid = normalize_with_params(df.copy(), hy_mean, hy_std)

# Standardize a copy of each dataset with its own scaler; the unscaled frames stay available.
n_dos, n_fuzzy, n_gear, n_rpm = (
    normalize(frame.copy()) for frame in (dos, fuzzy, gear, rpm)
)


# Persist the normalized datasets; index=0 is falsy, so no index column is written.
for frame, path in (
    (n_dos, './norm_dos.csv'),
    (n_fuzzy, './norm_fuzzy.csv'),
    (n_gear, './norm_gear.csv'),
    (n_rpm, './norm_rpm.csv'),
):
    frame.to_csv(path, index=0)
# n_hybrid.to_csv('./norm_hybrid.csv',index=0)
# features = df.drop('Label',axis=1)
# scaler = StandardScaler()
# features = scaler.fit_transform(features)
# df.iloc[:,:-1]=features

# Feature engineering (IG-KPCA)
## IG

In [16]:
# Read the sampled dataset
# NOTE(review): in the visible part of this notebook the only code that would write
# './norm_hybrid.csv' is commented out, so the file must already exist on disk — confirm its origin.
df=pd.read_csv('./norm_hybrid.csv')

In [17]:
# Feature matrix: every column except 'Label'.
X = df.drop(['Label'], axis=1).values
# Target vector: the last column of the frame, flattened to 1-D.
y = np.ravel(df.iloc[:, -1].values)

In [18]:
from sklearn.feature_selection import mutual_info_classif
# Mutual information between each feature and the label. The estimator adds
# small random noise to continuous features, so the seed is fixed to make the
# scores (and the feature list derived from them) reproducible across runs.
importances = mutual_info_classif(X, y, random_state=0)

In [19]:
# Names of all non-label columns, in frame order (same order as the columns of X).
features = df.columns[df.columns != 'Label']
# (importance, name) pairs ordered from most to least important.
f_list = list(zip(importances, features))
f_list.sort(reverse=True)

In [20]:
# Total importance over all features, accumulated in ranked order.
Sum = 0
for score, _name in f_list:
    Sum = Sum + score

In [21]:
# Rank features by their share of the total importance.
f_list2 = sorted(zip(importances / Sum, features), reverse=True)
Sum2 = 0
fs = []
# Take features in ranked order until their cumulative share reaches 0.9.
for share, name in f_list2:
    Sum2 = Sum2 + share
    fs.append(name)
    if Sum2 >= 0.9:
        break

In [22]:
# Display the selected feature names.
fs

['CAN_ID',
 'DATA[1]',
 'DATA[0]',
 'DATA[4]',
 'DATA[2]',
 'DATA[5]',
 'DATA[3]',
 'time_stamp',
 'DATA[7]']

In [23]:
# Display the selected feature columns of the frame.
df[fs]

Unnamed: 0,CAN_ID,DATA[1],DATA[0],DATA[4],DATA[2],DATA[5],DATA[3],time_stamp,DATA[7]
0,-0.967397,-0.836166,-0.646131,-0.482000,-0.685369,-0.287145,-0.761197,-0.340333,0.730458
1,-0.967397,-0.836166,-0.646131,-0.673247,-0.685369,-0.377862,-0.761197,-0.350041,0.855454
2,-0.967397,-0.836166,-0.646131,-0.482000,-0.685369,-0.131631,-0.761197,-0.352785,0.792956
3,-0.967397,-0.836166,-0.646131,-0.673247,-0.685369,-0.377862,-0.761197,-0.350041,0.855454
4,-0.967397,-0.836166,-0.646131,-0.482000,-0.685369,-0.131631,-0.761197,-0.350885,0.792956
...,...,...,...,...,...,...,...,...,...
66260,-0.967397,-0.836166,-0.646131,-0.263432,-0.685369,-0.766649,-0.761197,-0.350041,0.280469
66261,-0.967397,-0.836166,-0.646131,-0.236111,-0.685369,-0.688892,-0.761197,-0.348142,0.155473
66262,-0.967397,-0.836166,-0.646131,-0.290753,-0.685369,-0.624094,-0.761197,-0.350041,0.067975
66263,-0.967397,-0.836166,-0.646131,-0.645926,-0.685369,-0.792569,-0.761197,-0.348986,0.180472


In [24]:
# Selected features as a NumPy array; the last expression displays its shape.
X_fs = df[fs].values
X_fs.shape

(66265, 9)

## KPCA

In [25]:
from sklearn.decomposition import KernelPCA
# Fit a 6-component RBF KernelPCA on the selected features (fit returns the estimator itself),
# then project the same data with the fitted model.
kpca = KernelPCA(n_components=6, kernel='rbf').fit(X_fs, y)
X_kpca = kpca.transform(X_fs)

# from sklearn.decomposition import PCA
# kpca = PCA(n_components = 10)
# kpca.fit(X_fss, y)
# X_kpca = kpca.transform(X_fss)

In [26]:
# Display the KPCA-transformed array.
X_kpca

array([[ 0.45932435, -0.23899496, -0.3236166 , -0.1826117 ,  0.07365631,
         0.01676231],
       [ 0.45292158, -0.24986233, -0.33133074, -0.20045482,  0.07159851,
         0.04994654],
       [ 0.4317891 , -0.25179356, -0.33296943, -0.17421079,  0.06777208,
         0.02246888],
       ...,
       [ 0.55542132, -0.16324918, -0.22090514, -0.12828752,  0.10282733,
        -0.0976589 ],
       [ 0.57336771, -0.18455929, -0.2243492 , -0.14199624,  0.09821739,
        -0.05751834],
       [ 0.52949268, -0.17840469, -0.25932368, -0.1643438 ,  0.09707702,
        -0.05451208]])

In [27]:
# Shape of the KPCA output.
X_kpca.shape

(66265, 6)

In [28]:
import joblib

# Persist the fitted KernelPCA object to 'kpca_model.pkl' so it can be reloaded with joblib.load.
joblib.dump(kpca, "kpca_model.pkl")

['kpca_model.pkl']

# saving dataset after feature engineering

In [29]:
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
import numpy as np
import pickle
import joblib

In [32]:
# Load the data
# Selected feature columns, followed below by the label column.
fs = [
    'CAN_ID',
    'DATA[1]',
    'DATA[0]',
    'DATA[4]',
    'DATA[2]',
    'DATA[5]',
    'DATA[3]',
    'time_stamp',
    'DATA[7]',
]
column = [*fs, 'Label']

# Read only the needed columns from each normalized dataset.
n_dos = pd.read_csv('./norm_dos.csv', usecols=column)
n_fuzzy = pd.read_csv('./norm_fuzzy.csv', usecols=column)
n_gear = pd.read_csv('./norm_gear.csv', usecols=column)
n_rpm = pd.read_csv('./norm_rpm.csv', usecols=column)


In [33]:
# Stack the four normalized datasets row-wise into one frame with a fresh 0..n-1 index.
n_df = pd.concat([n_dos,n_fuzzy,n_gear,n_rpm],axis=0,ignore_index=True)

In [35]:
# using kpca model to transform the data
# NOTE(review): joblib.load unpickles the file, so only load model files from a trusted source.
kpca = joblib.load("kpca_model.pkl")
X_dos = n_dos[fs].values
X_fuzzy = n_fuzzy[fs].values
X_gear = n_gear[fs].values
X_rpm = n_rpm[fs].values
X_df = n_df[fs].values


def kpca_transform_in_batches(kpca, data, batch_size=1000):
    """Apply ``kpca.transform`` to ``data`` in row batches and stack the results.

    KernelPCA.transform evaluates the kernel between every input row and
    every sample the model was fitted on, so transforming a whole dataset in
    one call needs a (rows x fitted samples) matrix in memory. Each row is
    projected independently of the others, so splitting the rows into batches
    bounds peak memory without changing the projection.

    Args:
        kpca: fitted object exposing ``transform``.
        data: 2-D array with one sample per row.
        batch_size: maximum number of rows passed to ``transform`` at once.

    Returns:
        2-D array holding the transformed rows in the original order.

    Raises:
        ValueError: if ``data`` has no rows (nothing to stack).
    """
    transformed_batches = [
        kpca.transform(data[start:start + batch_size])
        for start in range(0, data.shape[0], batch_size)
    ]
    return np.vstack(transformed_batches)


X_dos_kpca = kpca_transform_in_batches(kpca, X_dos)
X_fuzzy_kpca = kpca_transform_in_batches(kpca, X_fuzzy)
X_gear_kpca = kpca_transform_in_batches(kpca, X_gear)
X_rpm_kpca = kpca_transform_in_batches(kpca, X_rpm)
X_df_kpca = kpca_transform_in_batches(kpca, X_df)


# saving the preprocessed dataset
def save_kpca_transformed_data(X_kpca, df, filename):
    """Write KPCA components plus the 'Label' column of ``df`` to a CSV file.

    The components become columns named by their integer position, 'Label'
    is appended as the last column, and the file is written without an index
    column. Nothing is returned.
    """
    components = pd.DataFrame(X_kpca)
    # The label values are attached positionally (.values), ignoring df's index.
    components.assign(Label=df['Label'].values).to_csv(filename, index=False)


# Write each transformed dataset next to the labels of the frame it was derived from.
for transformed, source, path in (
    (X_dos_kpca, n_dos, 'dos_preprocessed.csv'),
    (X_fuzzy_kpca, n_fuzzy, 'fuzzy_preprocessed.csv'),
    (X_gear_kpca, n_gear, 'gear_preprocessed.csv'),
    (X_rpm_kpca, n_rpm, 'rpm_preprocessed.csv'),
    (X_df_kpca, n_df, 'hybrid_preprocessed.csv'),
):
    save_kpca_transformed_data(transformed, source, path)


In [None]:
# Print the combined normalized frame.
print(n_df)