In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import csv
import time

%matplotlib inline
plt.rcParams['figure.dpi'] = 100

In [2]:
# Read the four attack captures (DoS, fuzzy, gear spoofing, RPM spoofing),
# one DataFrame per CSV file.
test_dos, test_fuzzy, test_gear, test_rpm = map(
    pd.read_csv,
    (
        'hacker_dataset/DoS_dataset.csv',
        'hacker_dataset/Fuzzy_dataset.csv',
        'hacker_dataset/gear_dataset.csv',
        'hacker_dataset/RPM_dataset.csv',
    ),
)

In [3]:
# Tag every row with the attack scenario of the capture it came from, so the
# origin is still known once the frames are merged.
for frame, attack_label in (
    (test_dos, 'DoS'),
    (test_fuzzy, 'Fuzzy'),
    (test_gear, 'Spoofing_gear'),
    (test_rpm, 'Spoofing_rpm'),
):
    frame['Type'] = attack_label

In [4]:
# Stack the four labelled captures row-wise into a single frame and give it a
# fresh 0..n-1 index instead of the repeated per-file indices.
attack_frames = [test_dos, test_fuzzy, test_gear, test_rpm]
test = pd.concat(attack_frames, axis=0, ignore_index=True)

In [5]:
# Preview the first ten rows of the combined frame.
test.head(n=10)

Unnamed: 0,Timestamp,CAN_ID,DLC,byte0,byte1,byte2,byte3,byte4,byte5,byte6,byte7,Flag,Type
0,1478198000.0,0316,8,05,21,68,09,21,21,00,6f,R,DoS
1,1478198000.0,018f,8,fe,5b,00,00,0,3c,00,00,R,DoS
2,1478198000.0,0260,8,19,21,22,30,8,8e,6d,3a,R,DoS
3,1478198000.0,02a0,8,64,00,9a,1d,97,02,bd,00,R,DoS
4,1478198000.0,0329,8,40,bb,7f,14,11,20,00,14,R,DoS
5,1478198000.0,0545,8,d8,00,00,8a,0,00,00,00,R,DoS
6,1478198000.0,0002,8,00,00,00,00,0,03,0b,11,R,DoS
7,1478198000.0,0153,8,00,21,10,ff,0,ff,00,00,R,DoS
8,1478198000.0,02c0,8,14,00,00,00,0,00,00,00,R,DoS
9,1478198000.0,0130,8,08,80,00,ff,31,80,0b,7f,R,DoS


Some samples have a DLC shorter than 8 bytes. Because the missing byte values are simply skipped, the flag value of such a sample is shifted to the left (into the column right after the last non-NaN byte column). For these cases we therefore need to move the flag value back to its proper place (the `Flag` column).

In [6]:
# List the distinct DLC values that occur in each attack capture
for attack_name, frame in (
    ('DoS', test_dos),
    ('Fuzzy', test_fuzzy),
    ('Gear', test_gear),
    ('RPM', test_rpm),
):
    print(f'{attack_name}: {frame["DLC"].unique()}')

DoS: [8 2]
Fuzzy: [8 5 2 6]
Gear: [8 2]
RPM: [8 2]


In [7]:
# Offset between a row's DLC and the positional index of the column that holds
# its misplaced flag: the flag is read from the column at position `dlc + offset`.
# NOTE(review): 3 presumably counts the columns that precede byte0
# (Timestamp, CAN_ID, DLC) - confirm against test.columns.
offset = 3

for dlc in pd.unique(test.DLC):
    if dlc < 8:
        # Resolve the source column by name once, so rows can be selected with a
        # boolean mask rather than by passing index labels to the positional
        # .iloc (which is only correct while labels and positions coincide).
        shifted_col = test.columns[dlc + offset]
        # Only touch rows whose Flag is still missing. This keeps the cell safe
        # to re-run: once the flag has been moved, the source column is NaN and
        # copying it again would wipe the Flag values.
        needs_shift = (test.DLC == dlc) & test['Flag'].isna()
        test.loc[needs_shift, 'Flag'] = test.loc[needs_shift, shifted_col]
        # Blank the byte column that wrongly held the flag.
        test.loc[needs_shift, shifted_col] = np.nan

In [9]:
# Parse the hexadecimal byte strings into integers; missing bytes (NaN) are
# left untouched.
# NOTE(review): int(x, 16) only accepts strings, so running this cell a second
# time (when the values are already integers) raises TypeError.
byte_columns = ('byte0', 'byte1', 'byte2', 'byte3', 'byte4', 'byte5', 'byte6', 'byte7')
for col in byte_columns:
    present = test[col].notna()
    test.loc[present, col] = test.loc[present, col].map(lambda raw: int(raw, 16))

In [10]:
# Display the combined frame to inspect the result of the hex-to-integer conversion.
test

Unnamed: 0,Timestamp,CAN_ID,DLC,byte0,byte1,byte2,byte3,byte4,byte5,byte6,byte7,Flag,Type
0,1.478198e+09,0316,8,5,33,104,9,33,33,0,111,R,DoS
1,1.478198e+09,018f,8,254,91,0,0,0,60,0,0,R,DoS
2,1.478198e+09,0260,8,25,33,34,48,8,142,109,58,R,DoS
3,1.478198e+09,02a0,8,100,0,154,29,151,2,189,0,R,DoS
4,1.478198e+09,0329,8,64,187,127,20,17,32,0,20,R,DoS
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16569470,1.478201e+09,018f,8,254,89,0,0,0,65,0,0,R,Spoofing_rpm
16569471,1.478201e+09,0260,8,24,33,33,48,8,143,109,25,R,Spoofing_rpm
16569472,1.478201e+09,02a0,8,36,0,154,29,151,2,189,0,R,Spoofing_rpm
16569473,1.478201e+09,0329,8,220,183,127,20,17,32,0,20,R,Spoofing_rpm


In [11]:
# Replace every remaining NaN (e.g. the unused byte slots of short frames)
# with the placeholder string 'noval', modifying `test` in place.
test.fillna(value='noval', inplace=True)

In [12]:
# Sanity check: total number of missing cells left in the whole frame.
nan_total = test.isna().sum().sum()
print(f'NaN values: {nan_total}')

NaN values: 0
