In [1]:
import os
from glob import glob
from typing import List
from datetime import datetime
from multiprocessing import Pool

import pandas as pd

# Dataset Analysis

### Train Normal Data

In [45]:

train_path = "./data-test/CAShift-CSV/train"
train_file_number = 1000

# Collect, per syscall, the list of its counts in every file where it occurs.
all_frequencies = {}
for i in range(train_file_number):
    train_df = pd.read_csv(f"{train_path}/normal_{i}.csv")
    per_file_counts = train_df['SysCall'].value_counts(ascending=True)
    for name, count in per_file_counts.items():
        all_frequencies.setdefault(name, []).append(count)

# Lift the pandas display limits so printed frames are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# The mean is taken over all files read: a file in which the syscall never
# appears adds nothing to the sum but still counts in the divisor.
average_frequencies = {}
for name, counts in all_frequencies.items():
    average_frequencies[name] = sum(counts) / train_file_number

# One row per syscall, most frequent first.
average_frequencies_df_train = pd.DataFrame(
    list(average_frequencies.items()),
    columns=['syscall', 'average_frequency'],
).sort_values(by='average_frequency', ascending=False)

print(len(average_frequencies_df_train))
print(average_frequencies_df_train.head(10))



61
        syscall  average_frequency
42  epoll_pwait           1122.941
43   newfstatat           1024.087
34         read           1014.409
41    epoll_ctl           1012.722
39        futex            960.275
38       switch            928.119
35    nanosleep            827.794
24        fcntl            628.712
37       openat            575.209
36        close            515.302


### Test Normal Data

In [46]:
test_normal_path = "./data-test/CAShift-CSV/test-normal"
test_normal_file_number = 100

# Collect, per syscall, the list of its counts in every file where it occurs.
all_frequencies = {}
for i in range(test_normal_file_number):
    test_df = pd.read_csv(f"{test_normal_path}/normal_{i}.csv")
    per_file_counts = test_df['SysCall'].value_counts(ascending=True)
    for name, count in per_file_counts.items():
        all_frequencies.setdefault(name, []).append(count)

# Lift the pandas display limits so printed frames are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# The mean is taken over all files read: a file in which the syscall never
# appears adds nothing to the sum but still counts in the divisor.
average_frequencies = {}
for name, counts in all_frequencies.items():
    average_frequencies[name] = sum(counts) / test_normal_file_number

# One row per syscall, most frequent first.
average_frequencies_df_test = pd.DataFrame(
    list(average_frequencies.items()),
    columns=['syscall', 'average_frequency'],
).sort_values(by='average_frequency', ascending=False)

print(len(average_frequencies_df_test))
print(average_frequencies_df_test.head(10))

60
        syscall  average_frequency
22  epoll_pwait            1166.46
19    epoll_ctl            1047.66
20         read            1035.92
25   newfstatat            1025.00
24        futex             984.14
23       switch             948.95
21    nanosleep             843.04
15        fcntl             643.14
18       openat             594.14
17        close             532.66


### Compare Train Normal and Test Normal

In [47]:
# Compare the syscall vocabularies of the train-normal and test-normal sets:
# keep the rows of each frame whose syscall never appears in the other frame.
train_syscalls = average_frequencies_df_train['syscall']
test_syscalls = average_frequencies_df_test['syscall']
unique_to_df1 = average_frequencies_df_train[~train_syscalls.isin(test_syscalls)]
unique_to_df2 = average_frequencies_df_test[~test_syscalls.isin(train_syscalls)]

# Both sides use the same empty-case handling: print "None" rather than the
# raw "Empty DataFrame" repr when there is nothing unique to report.
print("Unique to Train:")
print(unique_to_df1 if len(unique_to_df1) > 0 else "None")
print('\n')
print("Unique to Test:")
print(unique_to_df2 if len(unique_to_df2) > 0 else "None")

Unique to Train:
  syscall  average_frequency
4    dup3              0.002


Unique to Test:
None


### Test Attack Data

In [None]:
attack_id = "CVE_2016-10033"
attack_path = "./data-test/CAShift-CSV/test-attack"
attack_number = 100

# Collect, per syscall, the list of its counts in every file where it occurs.
all_frequencies = {}
for i in range(attack_number):
    test_df = pd.read_csv(f"{attack_path}/{attack_id}_{i}.csv")
    per_file_counts = test_df['SysCall'].value_counts(ascending=True)
    for name, count in per_file_counts.items():
        all_frequencies.setdefault(name, []).append(count)

# Lift the pandas display limits so printed frames are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# The mean is taken over all files read: a file in which the syscall never
# appears adds nothing to the sum but still counts in the divisor.
average_frequencies = {}
for name, counts in all_frequencies.items():
    average_frequencies[name] = sum(counts) / attack_number

# One row per syscall, most frequent first.
average_frequencies_df_test_attack = pd.DataFrame(
    list(average_frequencies.items()),
    columns=['syscall', 'average_frequency'],
).sort_values(by='average_frequency', ascending=False)

print(len(average_frequencies_df_test_attack))
print(average_frequencies_df_test_attack.head(10))

65
        syscall  average_frequency
23   newfstatat            1503.60
22        futex            1088.94
18         read            1026.23
19  epoll_pwait             965.83
17    epoll_ctl             927.18
21       switch             875.30
11        fcntl             726.40
14       openat             671.60
20    nanosleep             620.91
13        close             610.70


### Compare Train Normal and Test Attack

In [49]:
# Compare the syscall vocabularies of the train-normal set and the attack set
# held in average_frequencies_df_test_attack: keep the rows of each frame
# whose syscall never appears in the other frame.
train_syscalls = average_frequencies_df_train['syscall']
attack_syscalls = average_frequencies_df_test_attack['syscall']
unique_to_df1 = average_frequencies_df_train[~train_syscalls.isin(attack_syscalls)]
unique_to_df2 = average_frequencies_df_test_attack[~attack_syscalls.isin(train_syscalls)]

# Both sides use the same empty-case handling: print "None" rather than the
# raw "Empty DataFrame" repr when there is nothing unique to report.
print("Unique to Train:")
print(unique_to_df1 if len(unique_to_df1) > 0 else "None")
print('\n')
print("Unique to Attack:")
print(unique_to_df2 if len(unique_to_df2) > 0 else "None")

Unique to Train:
              syscall  average_frequency
56        sigaltstack             11.872
52             capget              0.958
53      epoll_create1              0.958
51           readlink              0.958
50  sched_getaffinity              0.958
4                dup3              0.002


Unique to Attack:
            syscall  average_frequency
46             rseq               5.44
42            ioctl               3.88
43          fstatfs               3.60
38      setpriority               1.80
36      getpriority               1.80
39           fchdir               1.80
30       epoll_wait               0.08
31  timerfd_settime               0.06
33        ftruncate               0.02
64          recvmsg               0.02


In [29]:
attack_id = "CVE_2019-5736"
attack_path = "./data-test/CAShift-CSV/test-attack"
attack_number = 100

# Collect, per syscall, the list of its counts in every file where it occurs.
all_frequencies = {}
for i in range(attack_number):
    test_df = pd.read_csv(f"{attack_path}/{attack_id}_{i}.csv")
    per_file_counts = test_df['SysCall'].value_counts(ascending=True)
    for name, count in per_file_counts.items():
        all_frequencies.setdefault(name, []).append(count)

# Lift the pandas display limits so printed frames are never truncated.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# The mean is taken over all files read: a file in which the syscall never
# appears adds nothing to the sum but still counts in the divisor.
average_frequencies = {}
for name, counts in all_frequencies.items():
    average_frequencies[name] = sum(counts) / attack_number

# One row per syscall, most frequent first.
average_frequencies_df_test_attack2 = pd.DataFrame(
    list(average_frequencies.items()),
    columns=['syscall', 'average_frequency'],
).sort_values(by='average_frequency', ascending=False)

print(len(average_frequencies_df_test_attack2))
print(average_frequencies_df_test_attack2.head(10))

62
        syscall  average_frequency
17         read            1152.22
16    epoll_ctl            1145.95
15        fcntl            1072.68
13       openat             981.85
10   newfstatat             980.24
12        close             854.10
21        futex             814.97
19  epoll_pwait             808.80
20       switch             718.64
18    nanosleep             452.21


In [43]:
# Compare the syscall vocabularies of the train-normal set and the attack set
# held in average_frequencies_df_test_attack2: keep the rows of each frame
# whose syscall never appears in the other frame.
train_syscalls = average_frequencies_df_train['syscall']
attack_syscalls = average_frequencies_df_test_attack2['syscall']
unique_to_df1 = average_frequencies_df_train[~train_syscalls.isin(attack_syscalls)]
unique_to_df2 = average_frequencies_df_test_attack2[~attack_syscalls.isin(train_syscalls)]

# Both sides use the same empty-case handling: print "None" rather than the
# raw "Empty DataFrame" repr when there is nothing unique to report.
print("Unique to Train:")
print(unique_to_df1 if len(unique_to_df1) > 0 else "None")
print('\n')
print("Unique to Attack:")
print(unique_to_df2 if len(unique_to_df2) > 0 else "None")

Unique to Train:
              syscall  average_frequency
56        sigaltstack              11.28
52             capget               0.92
53      epoll_create1               0.92
51           readlink               0.92
50  sched_getaffinity               0.92
8               flock               0.04
4                dup3               0.02


Unique to Attack:
            syscall  average_frequency
42             rseq              12.00
34          fstatfs               8.00
36            ioctl               8.00
29           fchdir               4.00
27      getpriority               4.00
26      setpriority               4.00
8        epoll_wait               1.40
61  timerfd_settime               0.02
