In [1]:
import math
import io
import shutil
import os
import sys
from os import path
import json

import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import pandas as pd

import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Run on the GPU when PyTorch can see a CUDA device; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [3]:
# Load the combined kernel-events capture.
# low_memory=False asks pandas to parse the file in one pass instead of in
# chunks, so each column gets a single inferred dtype.
combined_csv_path = path.join('dataset', 'EVSE-B-HPC-Kernel-Events-Combined.csv')
df = pd.read_csv(combined_csv_path, low_memory=False)
df

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_written,State,Attack,Scenario,Label,interface,Unnamed: 911,Unnamed: 912,Unnamed: 913,Unnamed: 914
0,5.001476508,0,0,0,0,0,693371795,0,0,0,...,0,Charging,cryptojacking,Cryptojacking,attack,any,,,,
1,5.001487254,0,0,0,0,0,699964025,0,0,0,...,0,Charging,cryptojacking,Cryptojacking,attack,any,,,,
2,5.001640501,0,0,0,0,0,549770341,0,0,0,...,0,Charging,cryptojacking,Cryptojacking,attack,any,,,,
3,5.003761639,0,0,0,0,0,571970875,0,0,0,...,0,Charging,cryptojacking,Cryptojacking,attack,any,,,,
4,10.06573989,0,0,0,0,0,553199786,0,0,0,...,0,Charging,cryptojacking,Cryptojacking,attack,any,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8469,5808.438729,0,0,0,0,0,1333391,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any
8470,5813.510716,0,0,0,0,0,1127358,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any
8471,5818.574812,0,0,0,0,0,6370769,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any
8472,5823.638711,0,0,0,0,0,7294614,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any


In [15]:
# Print the distinct values of the column just before the labels and of each
# label column, so that malformed rows (NaN, stray header text, shifted
# values) stand out.
for column_name in ('writeback_writeback_written', 'State', 'Attack', 'Scenario', 'Label'):
    print(df[column_name].unique())

['0' nan 'writeback:writeback_wait_iff_congested']
['Charging' 'idle' nan 'writeback:writeback_wake_background' '0']
['cryptojacking' 'none' 'aggressive-scan' 'icmp-flood'
 'icmp-fragmentation_old' 'os-fingerprinting' 'port-scan' 'push-ack-flood'
 'serice-detection' 'syn-flood' 'syn-stealth' 'tcp-flood' 'udp-flood'
 'vuln-scan' 'icmp-fragmentation' 'service-detection'
 'synonymous-ip-flood' 'os-scan' nan 'writeback:writeback_write_inode' '0']
['Cryptojacking' 'Benign' 'Recon' 'DoS' nan
 'writeback:writeback_write_inode_start' '0']
['attack' 'benign' nan 'writeback:writeback_written' '0']


In [11]:
# Select the rows whose State cell holds the string '0' instead of a real
# state. In the preview below their label values sit further right, under
# `interface` and the `Unnamed: 911`..`Unnamed: 914` columns.
state_is_zero = df['State'] == '0'
corrupted_df = df.loc[state_is_zero]
corrupted_df.head(5)

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_written,State,Attack,Scenario,Label,interface,Unnamed: 911,Unnamed: 912,Unnamed: 913,Unnamed: 914
6171,5.004938602,0,0,0,0,0,9029495,0,0,0,...,0,0,0,0,0,Charging,backdoor,host-attack,attack,any
6172,10.06490129,0,0,0,0,0,10016259,0,0,0,...,0,0,0,0,0,Charging,backdoor,host-attack,attack,any
6173,15.12084978,0,0,0,0,0,6899022,0,0,0,...,0,0,0,0,0,Charging,backdoor,host-attack,attack,any
6174,20.18085179,0,0,0,0,0,5553413,0,0,0,...,0,0,0,0,0,Charging,backdoor,host-attack,attack,any
6175,25.24072949,0,0,0,0,0,7368832,0,0,0,...,0,0,0,0,0,Charging,backdoor,host-attack,attack,any


In [19]:
# Work on an independent copy so the selection of corrupted rows stays intact.
fixed_df = corrupted_df.copy(deep=True)

# Label column -> column that currently holds its value in the shifted rows.
# The order matters: `State` must read `interface` before `interface` itself
# is overwritten by the last entry.
label_sources = {
    'State': 'interface',
    'Attack': 'Unnamed: 911',
    'Scenario': 'Unnamed: 912',
    'Label': 'Unnamed: 913',
    'interface': 'Unnamed: 914',
}
for target_column, source_column in label_sources.items():
    fixed_df[target_column] = fixed_df[source_column]

# NOTE(review): only the five label columns are realigned here. The stray
# header text printed by the unique() check (e.g.
# 'writeback:writeback_wait_iff_congested' under `writeback_writeback_written`)
# suggests these rows carry four extra fields, so some feature columns may be
# offset as well -- TODO confirm where the extra fields start.
fixed_df.head(5)

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_written,State,Attack,Scenario,Label,interface,Unnamed: 911,Unnamed: 912,Unnamed: 913,Unnamed: 914
6171,5.004938602,0,0,0,0,0,9029495,0,0,0,...,0,Charging,backdoor,host-attack,attack,any,backdoor,host-attack,attack,any
6172,10.06490129,0,0,0,0,0,10016259,0,0,0,...,0,Charging,backdoor,host-attack,attack,any,backdoor,host-attack,attack,any
6173,15.12084978,0,0,0,0,0,6899022,0,0,0,...,0,Charging,backdoor,host-attack,attack,any,backdoor,host-attack,attack,any
6174,20.18085179,0,0,0,0,0,5553413,0,0,0,...,0,Charging,backdoor,host-attack,attack,any,backdoor,host-attack,attack,any
6175,25.24072949,0,0,0,0,0,7368832,0,0,0,...,0,Charging,backdoor,host-attack,attack,any,backdoor,host-attack,attack,any


In [21]:
# Rows whose State already holds one of the two legitimate values; this also
# leaves out rows with a missing State and the stray header text.
has_valid_state = df['State'].isin(['Charging', 'idle'])
valid_df = df[has_valid_state]

# Stack the untouched rows and the repaired rows under a fresh 0..n-1 index,
# then discard the four spill-over columns.
spill_columns = ['Unnamed: 911', 'Unnamed: 912', 'Unnamed: 913', 'Unnamed: 914']
combined_df = (
    pd.concat([valid_df, fixed_df], axis=0, ignore_index=True)
    .drop(columns=spill_columns)
)

combined_df

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_wait_iff_congested,writeback_writeback_wake_background,writeback_writeback_write_inode,writeback_writeback_write_inode_start,writeback_writeback_written,State,Attack,Scenario,Label,interface
0,5.001476508,0,0,0,0,0,693371795,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
1,5.001487254,0,0,0,0,0,699964025,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
2,5.001640501,0,0,0,0,0,549770341,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
3,5.003761639,0,0,0,0,0,571970875,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
4,10.06573989,0,0,0,0,0,553199786,0,0,0,...,0,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,5808.438729,0,0,0,0,0,1333391,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any
8464,5813.510716,0,0,0,0,0,1127358,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any
8465,5818.574812,0,0,0,0,0,6370769,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any
8466,5823.638711,0,0,0,0,0,7294614,0,0,0,...,0,0,0,0,0,idle,backdoor,host-attack,attack,any


In [27]:
# (rows, columns) of the frame built from the valid and the repaired rows.
combined_df.shape

(8468, 911)

In [28]:
# Boolean target column: True where the Scenario label is exactly 'DoS',
# False for every other value.
combined_df['isDoS'] = combined_df['Scenario'].eq('DoS')
combined_df

Unnamed: 0,time,alarmtimer_alarmtimer_cancel,alarmtimer_alarmtimer_fired,alarmtimer_alarmtimer_start,alarmtimer_alarmtimer_suspend,alignment-faults,ase_spec,block_block_bio_backmerge,block_block_bio_bounce,block_block_bio_complete,...,writeback_writeback_wake_background,writeback_writeback_write_inode,writeback_writeback_write_inode_start,writeback_writeback_written,State,Attack,Scenario,Label,interface,isDoS
0,5.001476508,0,0,0,0,0,693371795,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
1,5.001487254,0,0,0,0,0,699964025,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
2,5.001640501,0,0,0,0,0,549770341,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
3,5.003761639,0,0,0,0,0,571970875,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
4,10.06573989,0,0,0,0,0,553199786,0,0,0,...,0,0,0,0,Charging,cryptojacking,Cryptojacking,attack,any,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8463,5808.438729,0,0,0,0,0,1333391,0,0,0,...,0,0,0,0,idle,backdoor,host-attack,attack,any,False
8464,5813.510716,0,0,0,0,0,1127358,0,0,0,...,0,0,0,0,idle,backdoor,host-attack,attack,any,False
8465,5818.574812,0,0,0,0,0,6370769,0,0,0,...,0,0,0,0,idle,backdoor,host-attack,attack,any,False
8466,5823.638711,0,0,0,0,0,7294614,0,0,0,...,0,0,0,0,idle,backdoor,host-attack,attack,any,False


In [30]:
# Keep only the rows that have no missing value in any column.
cleaned_na = combined_df.dropna()

# Make the cost of the drop visible instead of discarding rows silently.
# NOTE(review): the shapes recorded in this notebook go from 8468 rows to
# 6166, i.e. 2302 rows lost -- the same size as the index span of the
# repaired rows in the raw frame (6171..8472). Use the report below to
# confirm the repaired rows are not all being thrown away at this step.
rows_with_missing = combined_df.isna().any(axis=1)
missing_per_column = combined_df.isna().sum()
print(f"dropna() removed {int(rows_with_missing.sum())} of {len(combined_df)} rows")
print(f"columns containing missing values: {list(missing_per_column[missing_per_column > 0].index)}")

# Breakdown of the removed rows by Scenario (NaN counted as its own bucket).
combined_df.loc[rows_with_missing, 'Scenario'].value_counts(dropna=False)

In [31]:
# (rows, columns) remaining after dropna().
cleaned_na.shape

(6166, 912)

In [33]:
# Collect the distinct value kinds pandas infers across all columns.
# NOTE(review): the recorded result lists no numeric kind, so the counter
# columns appear to still be held as strings at this point -- confirm before
# feeding them to a model.
column_types = {
    pd.api.types.infer_dtype(cleaned_na[column_name])
    for column_name in cleaned_na.columns
}
column_types

{'boolean', 'string'}

In [34]:
# Persist the cleaned frame next to the raw file; index=False keeps the row
# index out of the CSV.
cleaned_csv_path = path.join('dataset', 'EVSE-B-HPC-Kernel-Events-cleaned.csv')
cleaned_na.to_csv(cleaned_csv_path, index=False)