In [4]:
import os
import time
import pandas as pd
import numpy as np
import tqdm

from pprint import PrettyPrinter
pprint = PrettyPrinter(indent=2).pprint

file_name_class_map = {
    "interactive" : 0,
    "bulk" : 1,
    "video" : 3,
    "web" : 4
}

DATA_ROOT_PATH = "./data"
RAW_DATA_DIR_PATH = os.path.join(DATA_ROOT_PATH, "train")

raw_csv_name_list = list(filter(
    lambda file_name : not os.path.isdir(file_name) and "._" not in file_name,
    sorted(os.listdir(RAW_DATA_DIR_PATH))
))

print(raw_csv_name_list)

file_class_list = list(map(
    lambda file_name : file_name_class_map[file_name.split('_')[0]],
    raw_csv_name_list
))

['bulk_115s_01.csv', 'bulk_130s_01.csv', 'bulk_170s_01.csv', 'bulk_xs_01.csv', 'bulk_xs_02.csv', 'bulk_xs_03.csv', 'bulk_xs_04.csv', 'bulk_xs_05.csv', 'bulk_xs_06.csv', 'bulk_xs_07.csv', 'bulk_xs_08.csv', 'bulk_xs_09.csv', 'bulk_xs_10.csv', 'bulk_xs_11.csv', 'bulk_xs_12.csv', 'bulk_xs_16.csv', 'interactive_01.csv', 'interactive_02.csv', 'interactive_03.csv', 'interactive_04.csv', 'interactive_05.csv', 'interactive_06.csv', 'interactive_07.csv', 'interactive_08.csv', 'interactive_13.csv', 'interactive_14.csv', 'interactive_15.csv', 'interactive_16.csv', 'interactive_17.csv', 'interactive_18.csv', 'interactive_shell_01.csv', 'interactive_shell_02.csv', 'interactive_shell_03.csv', 'interactive_shell_04.csv', 'interactive_shell_05.csv', 'interactive_shell_06.csv', 'interactive_shell_07.csv', 'interactive_shell_08.csv', 'interactive_shell_09.csv', 'interactive_shell_10.csv', 'interactive_shell_11.csv', 'interactive_shell_12.csv', 'interactive_shell_13.csv', 'interactive_shell_14.csv', 'inte

In [76]:
def preprocessCsvFile(
    file_path,
    file_class,
    min_count_thresh = 40,
    window_size = 11
) :

    data = pd.read_csv(file_path)
    data["strftime"] = list(map(
        lambda epoch_time : time.strftime("%Y/%m/%d/ %H:%M:%S", time.localtime(epoch_time)),
        data["time"]
    ))
    #mask = data.groupby(["strftime", "ip_dst"])["ip_dst"].transform("count") > min_count_thresh
    #filtered_data.filter(lambda x : len(x) > min_count_thresh)
    #filtered_data = data[mask]
    #filtered_data_np = filtered_data["data_len"].to_numpy()

    data["hash"] = list(map(
        lambda epoch_time, ip_dst : int(epoch_time) * 1000**4 + sum(list(map(
            lambda val, idx : int(val) * 1000**idx,
            ip_dst.split('.'),
            range(3, -1, -1)
        ))),
        data["time"],
        data["ip_dst"]
    ))
    data_np = data[["hash", "data_len"]].to_numpy()

    unique_hashes, counts = np.unique(data_np[:, 0], return_counts=True)
    data_filtered_np = data_np[
        np.isin(data[:, 0], unique_hashes[counts > min_count_thresh])
    ]
    
    filtered_windowed_np = np.array(list(map(
        lambda idx : data_filtered_np[:, 1][idx:-(window_size-idx)],
        range(window_size)
    ))).T

    filtered_windowed = pd.DataFrame(
        filtered_windowed_np,
        columns = list(map(
            lambda i : f"traffic(t-{i})",
            range(window_size-1, -1, -1)
        ))
    )
    print(file_path, len(data), len(data_filtered_np), len(data) - len(data_filtered_np))
    return filtered_windowed

In [51]:
window_df_list = []
with tqdm.tqdm(total=len(raw_csv_name_list)) as pbar :
    for file_name, file_class in zip(raw_csv_name_list, file_class_list) :
        file_path = os.path.join(RAW_DATA_DIR_PATH, file_name)
        window_df_list.append(preprocessCsvFile(file_path, file_class))
    
        pbar.update(1)
    
preprocessed_df = pd.concat(window_df_list, axis=0)

  1%|          | 1/84 [00:00<00:44,  1.88it/s]

./data/train/bulk_115s_01.csv 324370 324361 9


  2%|▏         | 2/84 [00:01<00:53,  1.54it/s]

./data/train/bulk_130s_01.csv 465190 465143 47


  4%|▎         | 3/84 [00:01<00:55,  1.47it/s]

./data/train/bulk_170s_01.csv 449725 449710 15


  5%|▍         | 4/84 [00:02<00:54,  1.46it/s]

./data/train/bulk_xs_01.csv 435648 435629 19


  6%|▌         | 5/84 [00:03<00:45,  1.75it/s]

./data/train/bulk_xs_02.csv 222544 222527 17


  8%|▊         | 7/84 [00:03<00:34,  2.21it/s]

./data/train/bulk_xs_03.csv 429864 429841 23
./data/train/bulk_xs_04.csv 77338 77296 42


 10%|▉         | 8/84 [00:04<00:27,  2.80it/s]

./data/train/bulk_xs_05.csv 85068 85033 35


 11%|█         | 9/84 [00:04<00:29,  2.56it/s]

./data/train/bulk_xs_06.csv 291521 291506 15


 12%|█▏        | 10/84 [00:05<00:45,  1.63it/s]

./data/train/bulk_xs_07.csv 704629 704605 24


 13%|█▎        | 11/84 [00:06<00:51,  1.42it/s]

./data/train/bulk_xs_08.csv 577735 577719 16


 15%|█▌        | 13/84 [00:06<00:31,  2.23it/s]

./data/train/bulk_xs_09.csv 212347 212163 184
./data/train/bulk_xs_10.csv 61929 61903 26


 18%|█▊        | 15/84 [00:07<00:19,  3.56it/s]

./data/train/bulk_xs_11.csv 71307 71284 23
./data/train/bulk_xs_12.csv 75051 75019 32


 40%|████      | 34/84 [00:07<00:01, 26.12it/s]

./data/train/bulk_xs_16.csv 193000 192849 151
./data/train/interactive_01.csv 2472 0 2472
./data/train/interactive_02.csv 2104 0 2104
./data/train/interactive_03.csv 2913 0 2913
./data/train/interactive_04.csv 2615 0 2615
./data/train/interactive_05.csv 2305 0 2305
./data/train/interactive_06.csv 1886 0 1886
./data/train/interactive_07.csv 1849 48 1801
./data/train/interactive_08.csv 3515 0 3515
./data/train/interactive_13.csv 2540 0 2540
./data/train/interactive_14.csv 2377 0 2377
./data/train/interactive_15.csv 2499 0 2499
./data/train/interactive_16.csv 2285 0 2285
./data/train/interactive_17.csv 2693 0 2693
./data/train/interactive_18.csv 2845 0 2845
./data/train/interactive_shell_01.csv 1675 0 1675
./data/train/interactive_shell_02.csv 414 0 414
./data/train/interactive_shell_03.csv 685 0 685
./data/train/interactive_shell_04.csv 1090 0 1090
./data/train/interactive_shell_05.csv 1108 0 1108
./data/train/interactive_shell_06.csv 974 0 974
./data/train/interactive_shell_07.csv 629 0

 61%|██████    | 51/84 [00:07<00:00, 39.85it/s]

./data/train/video_180s1080p_01.csv 108951 108400 551
./data/train/video_180s1080p_02.csv 53561 52934 627
./data/train/video_180s1080p_03.csv 87324 86902 422
./data/train/video_180s1080p_04.csv 83383 83009 374


 69%|██████▉   | 58/84 [00:08<00:01, 24.86it/s]

./data/train/video_180s1080p_05.csv 80067 79718 349
./data/train/video_180s480p_01.csv 14711 14382 329
./data/train/video_180s480p_02.csv 18184 17730 454
./data/train/video_180s720p_01.csv 28935 28386 549
./data/train/video_180s720p_02.csv 14576 14176 400
./data/train/video_180s720p_03.csv 23125 22743 382
./data/train/video_180s720p_04.csv 9406 9093 313


 83%|████████▎ | 70/84 [00:08<00:00, 29.12it/s]

./data/train/video_180s720p_05.csv 51542 51163 379
./data/train/video_180s720p_06.csv 52798 52368 430
./data/train/video_180s720p_07.csv 19095 18659 436
./data/train/video_210s480p_01.csv 19840 19478 362
./data/train/web_1page_04.csv 4174 3337 837
./data/train/web_1page_05.csv 568 390 178
./data/train/web_1page_06.csv 1940 1439 501
./data/train/web_1page_07.csv 6832 6293 539
./data/train/web_2page_01.csv 1498 1001 497
./data/train/web_2page_02.csv 4041 3330 711
./data/train/web_2page_03.csv 2287 1816 471


100%|██████████| 84/84 [00:08<00:00,  9.34it/s]


./data/train/web_2page_04.csv 3569 2994 575
./data/train/web_2page_05.csv 4156 3492 664
./data/train/web_2page_06.csv 1321 1009 312
./data/train/web_3page_01.csv 2721 1894 827
./data/train/web_3page_02.csv 14502 10278 4224
./data/train/web_3page_03.csv 2306 1656 650
./data/train/web_3page_04.csv 3579 2862 717
./data/train/web_multiple_01.csv 3588 2315 1273
./data/train/web_multiple_02.csv 3356 2653 703
./data/train/web_multiple_04.csv 35107 27638 7469
./data/train/web_multiple_05.csv 14124 10120 4004
./data/train/web_multiple_06.csv 1844 1245 599


In [77]:
window_df_list = []

for file_name, file_class in zip(raw_csv_name_list, file_class_list) :
    file_path = os.path.join(RAW_DATA_DIR_PATH, file_name)
    window_df_list.append(preprocessCsvFile(file_path, file_class))
    
    
preprocessed_df = pd.concat(window_df_list, axis=0)

InvalidIndexError: (slice(None, None, None), 0)

In [36]:
len(preprocessed_df)

5421009

In [37]:
FILE_IDX = 0

min_count = 40
window_size = 11

FILE_NAME = raw_csv_name_list[FILE_IDX]
FILE_PATH = os.path.join(RAW_DATA_DIR_PATH, FILE_NAME)


data = pd.read_csv(FILE_PATH)

data["strftime"] = list(map(
    lambda epoch_time : time.strftime("%Y/%m/%d/ %H:%M:%S", time.localtime(epoch_time)),
    data["time"]
))

filtered_data = data.groupby(["strftime", "ip_dst"]).filter(lambda x : len(x) > min_count)

filtered_data_np = filtered_data["data_len"].to_numpy()

filtered_windowed_np = np.array(list(map(
    lambda idx : filtered_data_np[idx:-(window_size-idx)],
    range(window_size)
))).T

print(filtered_windowed_np.shape)

t_finished = time.time()
print("finished", t_finished - t_started)

(324350, 11)


NameError: name 't_started' is not defined

In [78]:
FILE_IDX = 0

min_count = 40
window_size = 11

FILE_NAME = raw_csv_name_list[FILE_IDX]
FILE_PATH = os.path.join(RAW_DATA_DIR_PATH, FILE_NAME)

print(FILE_PATH)

data = pd.read_csv(FILE_PATH)

data["strftime"] = list(map(
    lambda epoch_time : time.strftime("%Y/%m/%d/ %H:%M:%S", time.localtime(epoch_time)),
    data["time"]
))

data["hash"] = list(map(
    lambda epoch_time, ip_dst : int(epoch_time) * 1000**4 + sum(list(map(
        lambda val, idx : int(val) * 1000**idx,
        ip_dst.split('.'),
        range(3, -1, -1)
    ))),
    data["time"],
    data["ip_dst"]
))
data_np = data[["hash", "data_len"]].to_numpy()

#unique_hashes, counts = np.unique(data_np[:, 0], return_counts=True)
#data_filtered_np = data_np[
#    np.isin(data[:, 0], unique_hashes[counts > min_count_thresh])
#]

print(data_np)

print(set(data_np[:, 0]))



./data/train/bulk_115s_01.csv
[[1551438658192168001001 58]
 [1551438659192168001149 230]
 [1551438659080249099148 603]
 ...
 [1551438767192168001149 1448]
 [1551438767192168001149 1448]
 [1551438767192168001149 1448]]
{1551438740192168001149, 1551438757192168001149, 1551438693192168001149, 1551438725192168001149, 1551438669192168001149, 1551438661192168001149, 1551438665192168001149, 1551438673192168001149, 1551438663192168001149, 1551438697192168001149, 1551438729192168001149, 1551438717192168001149, 1551438659080249099148, 1551438731192168001149, 1551438748192168001149, 1551438765192168001149, 1551438705192168001149, 1551438739192168001149, 1551438756192168001149, 1551438662192168001149, 1551438679192168001149, 1551438696192168001149, 1551438713192168001149, 1551438730192168001149, 1551438747192168001149, 1551438764192168001149, 1551438687192168001149, 1551438658192168001001, 1551438738192168001149, 1551438755192168001149, 1551438666192168001149, 1551438674192168001149, 1551438670192

In [81]:
data_np.astype(np.uint64)

OverflowError: Python int too large to convert to C long

In [61]:
arr = np.array([
    [1, 2, 3],
    [1, 4, 5],
    [2, 6, 7],
    [3, 8, 9],
    [1, 10, 11],
    [2, 12, 13]
])
unique_vals, counts = np.unique(arr[:,0], return_counts=True)

print(unique_vals, counts)

new_arr = arr[np.isin(arr[:,0], unique_vals[counts > 1])]

print(new_arr)

[1 2 3] [3 2 1]
[[ 1  2  3]
 [ 1  4  5]
 [ 2  6  7]
 [ 1 10 11]
 [ 2 12 13]]
