In [1]:
import math
import io
import shutil
import os
import sys
from os import path
import json

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Select the torch device string: the GPU when CUDA is usable, the CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [2]:
# Read the kernel-events CSV from the local 'dataset' directory into a DataFrame.
df = pd.read_csv(
    os.path.join('dataset', 'EVSE-B-HPC-Kernel-Events-cleaned.csv')
)
# Bare last expression so the notebook renders a preview of the loaded frame.
df

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_wake_background,writeback_writeback_write_inode,writeback_writeback_write_inode_start,writeback_writeback_written,State,Attack,Scenario,Label,interface,isDoS
0,5.001477,0,0,0,0,0,693371795,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
1,5.001487,0,0,0,0,0,699964025,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
2,5.001641,0,0,0,0,0,549770341,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
3,5.003762,0,0,0,0,0,571970875,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
4,10.065740,0,0,0,0,0,553199786,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6161,283.646045,0,0,0,0,0,5463807,0,0,0,...,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp,True
6162,288.714173,0,0,0,0,0,10977108,0,0,0,...,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp,True
6163,293.790086,0,0,0,0,0,3683292,0,0,0,...,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp,True
6164,298.861925,0,0,0,0,0,5165840,0,0,0,...,0,0,0,0,Charging,synonymous-ip-flood,DoS,attack,ocpp,True


In [3]:
# Collect the distinct value kinds that pandas infers across all columns of df.
column_types = {pd.api.types.infer_dtype(df[name]) for name in df.columns}
column_types

{'boolean', 'floating', 'integer', 'string'}

In [9]:
# Partition the columns by the value kind pandas infers for each one: anything
# inferred as 'string' is treated as categorical, everything else (the integer,
# floating and boolean columns) goes into the numerical group.
numerical_columns, string_columns = [], []
for col in df.columns:
    inferred_kind = pd.api.types.infer_dtype(df[col])
    target_list = string_columns if inferred_kind == 'string' else numerical_columns
    target_list.append(col)

# Non-string columns, kept in their original order.
numerical_df = df[numerical_columns]

# One-hot encode every string column. get_dummies builds a new frame and does
# not modify its input, so no explicit deep copy of the selection is required.
# NOTE(review): the encoded columns include Attack_*, Scenario_* and Label_*
# indicators; Scenario_DoS in particular appears to mirror isDoS in the
# displayed rows. If this frame is later used as model input for predicting
# isDoS that would be target leakage - confirm whether these should be dropped.
catergorical_df = pd.get_dummies(df[string_columns], columns=string_columns)

In [10]:
# Preview the one-hot encoded frame built from the string-typed columns.
catergorical_df

Unnamed: 0,State_Charging,State_idle,Attack_aggressive-scan,Attack_cryptojacking,Attack_icmp-flood,Attack_icmp-fragmentation,Attack_icmp-fragmentation_old,Attack_none,Attack_os-fingerprinting,Attack_os-scan,...,Scenario_Cryptojacking,Scenario_DoS,Scenario_Recon,Label_attack,Label_benign,interface_any,interface_iso15118,interface_iso15118.1,interface_none,interface_ocpp
0,True,False,False,True,False,False,False,False,False,False,...,True,False,False,True,False,True,False,False,False,False
1,True,False,False,True,False,False,False,False,False,False,...,True,False,False,True,False,True,False,False,False,False
2,True,False,False,True,False,False,False,False,False,False,...,True,False,False,True,False,True,False,False,False,False
3,True,False,False,True,False,False,False,False,False,False,...,True,False,False,True,False,True,False,False,False,False
4,True,False,False,True,False,False,False,False,False,False,...,True,False,False,True,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6161,True,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,False,False,True
6162,True,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,False,False,True
6163,True,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,False,False,True
6164,True,False,False,False,False,False,False,False,False,False,...,False,True,False,True,False,False,False,False,False,True


In [7]:
# Name of the boolean column that every other numerical column is ranked against.
label_column = 'isDoS'

# Build the pairwise correlation matrix of the numerical frame, keep only the
# label's column, and remove the label's own self-correlation entry.
correlation = numerical_df.corr()[label_column].drop(labels=[label_column])

# Rank by correlation strength regardless of sign and keep the ten strongest.
top_features = correlation.abs().sort_values(ascending=False).iloc[:10]

# Names of the selected columns, strongest first.
top_columns = list(top_features.index)

In [8]:
# Show the ten column names ranked highest by absolute correlation with the label.
top_columns

['irq_softirq_exit',
 'irq_softirq_entry',
 'irq_softirq_raise',
 'kmem_kmem_cache_free',
 'kmem_kmem_cache_alloc',
 'net_netif_rx',
 'net_netif_rx_ni_exit',
 'net_netif_rx_ni_entry',
 'rpm_rpm_usage',
 'rpm_rpm_resume']

In [11]:
# Restrict the numerical frame to the selected top-correlated columns, all rows kept.
top_numerical_df = numerical_df.loc[:, top_columns]
top_numerical_df

Unnamed: 0,irq_softirq_exit,irq_softirq_entry,irq_softirq_raise,kmem_kmem_cache_free,kmem_kmem_cache_alloc,net_netif_rx,net_netif_rx_ni_exit,net_netif_rx_ni_entry,rpm_rpm_usage,rpm_rpm_resume
0,5808,5808,5826,3976,4016,0,0,0,4277,4369
1,4791,4791,4808,12217,13581,0,0,0,1355,1391
2,6635,6635,6667,16222,16487,0,0,0,2683,2719
3,9165,9165,9228,15833,17867,0,0,0,4934,4988
4,8405,8405,8431,16182,15720,0,0,0,4736,4778
...,...,...,...,...,...,...,...,...,...,...
6161,95032,95032,95043,109798,109887,44337,44335,44335,80240,80266
6162,96744,96744,96779,98202,98220,37797,37795,37795,69827,69873
6163,99936,99935,99947,105024,104998,41208,41208,41208,75511,75531
6164,96575,96576,96590,99527,99583,38825,38825,38825,71057,71074


In [17]:
# Target series (despite the _df suffix this is a single column). It is looked
# up through label_column so the label name is defined in exactly one place.
label_df = numerical_df[label_column]

# Assemble the final table column-wise: selected numerical features, then the
# one-hot encoded string columns, then the label as the last column.
# NOTE(review): catergorical_df carries Attack_*, Scenario_* and Label_*
# indicators next to the isDoS target; Scenario_DoS appears to match isDoS in
# the displayed rows. Confirm these are excluded before training on this table.
processed_df = pd.concat([top_numerical_df, catergorical_df, label_df], axis=1)
processed_df

Unnamed: 0,irq_softirq_exit,irq_softirq_entry,irq_softirq_raise,kmem_kmem_cache_free,kmem_kmem_cache_alloc,net_netif_rx,net_netif_rx_ni_exit,net_netif_rx_ni_entry,rpm_rpm_usage,rpm_rpm_resume,...,Scenario_DoS,Scenario_Recon,Label_attack,Label_benign,interface_any,interface_iso15118,interface_iso15118.1,interface_none,interface_ocpp,isDoS
0,5808,5808,5826,3976,4016,0,0,0,4277,4369,...,False,False,True,False,True,False,False,False,False,False
1,4791,4791,4808,12217,13581,0,0,0,1355,1391,...,False,False,True,False,True,False,False,False,False,False
2,6635,6635,6667,16222,16487,0,0,0,2683,2719,...,False,False,True,False,True,False,False,False,False,False
3,9165,9165,9228,15833,17867,0,0,0,4934,4988,...,False,False,True,False,True,False,False,False,False,False
4,8405,8405,8431,16182,15720,0,0,0,4736,4778,...,False,False,True,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6161,95032,95032,95043,109798,109887,44337,44335,44335,80240,80266,...,True,False,True,False,False,False,False,False,True,True
6162,96744,96744,96779,98202,98220,37797,37795,37795,69827,69873,...,True,False,True,False,False,False,False,False,True,True
6163,99936,99935,99947,105024,104998,41208,41208,41208,75511,75531,...,True,False,True,False,False,False,False,False,True,True
6164,96575,96576,96590,99527,99583,38825,38825,38825,71057,71074,...,True,False,True,False,False,False,False,False,True,True


In [18]:
# Write the assembled table next to the input file; index=False omits the row index.
processed_df.to_csv(
    os.path.join('dataset', 'EVSE-B-HPC-Kernel-Events-processed.csv'),
    index=False,
)