Read processed file

In [10]:
import pandas as pd

# Directory holding the preprocessed input. Named `in_dir` rather than `dir`
# so the builtin dir() stays usable in later cells.
in_dir = 'pro/'

# read the processed dataset into a DataFrame
df = pd.read_csv(in_dir + 'processed.csv')

# preview the first rows
df.head()

Unnamed: 0,id.orig_p,id.resp_p,proto,conn_state,history,target,id.orig_h_1,id.orig_h_2,id.orig_h_3,id.orig_h_4,...,id.resp_h_3,id.resp_h_4,duration,orig_bytes,resp_bytes,missed_bytes,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes
0,dyn,dyn,tcp,S0,S,1,192,168,1,198,...,215,13,3.141696,0,0,0.0,3.0,180.0,0.0,0.0
1,reg,reg,tcp,S0,S,1,192,168,1,198,...,30,168,-,-,-,0.0,1.0,40.0,0.0,0.0
2,reg,reg,tcp,S0,S,1,192,168,1,198,...,164,168,-,-,-,0.0,1.0,40.0,0.0,0.0
3,reg,reg,tcp,S0,S,1,192,168,1,198,...,153,227,-,-,-,0.0,1.0,40.0,0.0,0.0
4,reg,reg,tcp,S0,S,1,192,168,1,198,...,31,187,-,-,-,0.0,1.0,40.0,0.0,0.0


Check a few numeric features before encoding

In [11]:
# fraction of rows whose value is the '-' placeholder in duration, orig_bytes, and resp_bytes
total_rows = len(df)
for feature in ('duration', 'orig_bytes', 'resp_bytes'):
    dash_rows = int((df[feature] == '-').sum())
    print(f'{feature}: ', dash_rows / total_rows)


duration:  0.9982134129132123
orig_bytes:  0.9982134129132123
resp_bytes:  0.9982134129132123


In [12]:
# Drop duration, orig_bytes, and resp_bytes.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['duration', 'orig_bytes', 'resp_bytes'], errors='ignore')

# For each remaining column, print the fraction of rows whose value is '-'.
# Summing the boolean mask avoids materialising a filtered copy of the frame
# for every column.
n_rows = len(df)
for col in df.columns:
    dash_rows = int((df[col] == '-').sum())
    print(col, dash_rows / n_rows)

id.orig_p 0.0
id.resp_p 0.0
proto 0.0
conn_state 0.0
history 0.000295221334725535
target 0.0
id.orig_h_1 0.0
id.orig_h_2 0.0
id.orig_h_3 0.0
id.orig_h_4 0.0
id.resp_h_1 0.0
id.resp_h_2 0.0
id.resp_h_3 0.0
id.resp_h_4 0.0
missed_bytes 0.0
orig_pkts 0.0
orig_ip_bytes 0.0
resp_pkts 0.0
resp_ip_bytes 0.0


In [14]:
# number of missing entries per column of df
df.isna().sum()

id.orig_p        0
id.resp_p        0
proto            0
conn_state       0
history          0
target           0
id.orig_h_1      0
id.orig_h_2      0
id.orig_h_3      0
id.orig_h_4      0
id.resp_h_1      0
id.resp_h_2      0
id.resp_h_3      0
id.resp_h_4      0
missed_bytes     0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
dtype: int64

Encoding

In [15]:
from sklearn.preprocessing import OneHotEncoder

# columns to be one-hot encoded
columns_to_encode = ['proto', 'conn_state', 'history', 'id.orig_p', 'id.resp_p']

# Build an encoder that returns a dense array.
# scikit-learn renamed the `sparse` keyword to `sparse_output` in 1.2 and
# removed the old name in 1.4, so try the new spelling first and fall back
# to the old one on releases that do not know it yet.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

# fit and transform the data
encoded_data = encoder.fit_transform(df[columns_to_encode])

# get feature names (e.g. 'proto_tcp') for the generated indicator columns
encoded_feature_names = encoder.get_feature_names_out(columns_to_encode)

# Create a new DataFrame with the encoded data.
# Reusing df.index matters: pd.concat(axis=1) aligns on index labels, so a
# fresh 0..n-1 index would misalign rows (and introduce NaNs) whenever df's
# index is not the default range, e.g. after rows were filtered out.
df_encoded = pd.DataFrame(encoded_data, columns=encoded_feature_names, index=df.index)

# drop original columns and concatenate the new one-hot encoded columns
df = df.drop(columns_to_encode, axis=1)
df = pd.concat([df, df_encoded], axis=1)

df.head()


Unnamed: 0,target,id.orig_h_1,id.orig_h_2,id.orig_h_3,id.orig_h_4,id.resp_h_1,id.resp_h_2,id.resp_h_3,id.resp_h_4,missed_bytes,...,history_B,history_S,history_d,history_r,id.orig_p_dyn,id.orig_p_reg,id.orig_p_wk,id.resp_p_dyn,id.resp_p_reg,id.resp_p_wk
0,1,192,168,1,198,185,130,215,13,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,1,192,168,1,198,105,49,30,168,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,1,192,168,1,198,18,206,164,168,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1,192,168,1,198,181,237,153,227,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1,192,168,1,198,182,150,31,187,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [17]:
# how many rows carry each label in the target column
target_counts = df['target'].value_counts()
print(target_counts)

1    13642435
2    11378759
0       78618
Name: target, dtype: int64


Write output files for ML

In [18]:
# one frame per target label: 0 -> df_benign, 1 -> df_mirai, 2 -> df_okiru
label = df['target']
df_benign = df.loc[label == 0]
df_mirai = df.loc[label == 1]
df_okiru = df.loc[label == 2]

In [19]:
# save the per-label DataFrames to csv files
import os

out_dir = 'out/'

# DataFrame.to_csv does not create missing directories; make sure the target
# directory exists so a fresh checkout does not fail on the first write.
os.makedirs(out_dir, exist_ok=True)

# index=False: the row index is not written to the csv files
df_benign.to_csv(out_dir + 'benign.csv', index=False)
df_mirai.to_csv(out_dir + 'mirai.csv', index=False)
df_okiru.to_csv(out_dir + 'okiru.csv', index=False)