In [1]:
import os
import time
import pandas as pd
import numpy as np
import tqdm

from pprint import PrettyPrinter
pprint = PrettyPrinter(indent=2).pprint

file_name_class_map = {
    "interactive" : 0,
    "bulk" : 1,
    "video" : 3,
    "web" : 4
}


DATA_ROOT_PATH = "./data"
RAW_DATA_DIR_PATH = os.path.join(DATA_ROOT_PATH, "train")

raw_csv_name_list = list(filter(
    lambda file_name : not os.path.isdir(file_name),
    sorted(os.listdir(RAW_DATA_DIR_PATH))
))

file_class_list = list(map(
    lambda file_name : file_name_class_map[file_name.split('_')[0]],
    raw_csv_name_list
))

In [10]:
def preprocessCsvFile(
    file_path,
    file_class,
    min_count_thresh = 40,
    window_size = 11
) :
    data = pd.read_csv(file_path)
    data["strftime"] = list(map(
        lambda epoch_time : time.strftime("%Y/%m/%d/ %H:%M:%S", time.localtime(epoch_time)),
        data["time"]
    ))
    filtered_data = data.groupby(["strftime", "ip_dst"]).filter(lambda x : len(x) > min_count_thresh)
    filtered_data_np = filtered_data["data_len"].to_numpy()
    filtered_windowed_np = np.array(list(map(
        lambda idx : filtered_data_np[idx:-(window_size-idx)],
        range(window_size)
    ))).T

    filtered_windowed = pd.DataFrame(
        filtered_windowed_np,
        columns = list(map(
            lambda i : f"traffic(t-{i})",
            range(window_size-1, -1, -1)
        ))
    )
    
    return filtered_windowed

In [4]:
window_df_list = []
with tqdm.tqdm(total=len(raw_csv_name_list)) as pbar :
    for file_name, file_class in zip(raw_csv_name_list, file_class_list) :
        file_path = os.path.join(RAW_DATA_DIR_PATH, file_name)
        window_df_list.append(preprocessCsvFile(
            file_path, file_class
        ))
    
        pbar.update(1)
    

100%|███████████████████████████████████████████████████████████████████████████████████| 84/84 [00:13<00:00,  6.08it/s]


In [7]:
pd.concat(window_df_list, axis=0)

Unnamed: 0,traffic(t-10),traffic(t-9),traffic(t-8),traffic(t-7),traffic(t-6),traffic(t-5),traffic(t-4),traffic(t-3),traffic(t-2),traffic(t-1),traffic(t-0)
0,230,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448
1,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448
2,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448
3,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448
4,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448,1448
...,...,...,...,...,...,...,...,...,...,...,...
1229,1418,1418,1418,1418,1418,1418,1418,1418,1418,1418,1418
1230,1418,1418,1418,1418,1418,1418,1418,1418,1418,1418,1418
1231,1418,1418,1418,1418,1418,1418,1418,1418,1418,1418,574
1232,1418,1418,1418,1418,1418,1418,1418,1418,1418,574,744


In [15]:
FILE_IDX = 0

min_count = 40
window_size = 11

FILE_NAME = raw_csv_name_list[FILE_IDX]
FILE_PATH = os.path.join(RAW_DATA_DIR_PATH, FILE_NAME)


data = pd.read_csv(FILE_PATH)

data["strftime"] = list(map(
    lambda epoch_time : time.strftime("%Y/%m/%d/ %H:%M:%S", time.localtime(epoch_time)),
    data["time"]
))

filtered_data = data.groupby(["strftime", "ip_dst"]).filter(lambda x : len(x) > min_count)

filtered_data_np = filtered_data["data_len"].to_numpy()

filtered_windowed_np = np.array(list(map(
    lambda idx : filtered_data_np[idx:-(window_size-idx)],
    range(window_size)
))).T

print(filtered_windowed_np.shape)

t_finished = time.time()
print("finished", t_finished - t_started)

(324350, 11)
finished 0.7728850841522217


In [4]:
FILE_IDX = 0

min_count = 40
window_size = 11

FILE_NAME = raw_csv_name_list[FILE_IDX]
FILE_PATH = os.path.join(RAW_DATA_DIR_PATH, FILE_NAME)


data = pd.read_csv(FILE_PATH)


data["strftime"] = list(map(
    lambda epoch_time : time.strftime("%Y/%m/%d/ %H:%M:%S", time.localtime(epoch_time)),
    data["time"]
))


data

Unnamed: 0,time,proto,data_len,ip_src,ip_dst,src_port,dst_port,strftime
0,1.551439e+09,17,58,192.168.1.149,192.168.1.1,52835,53,2019/03/01/ 20:10:58
1,1.551439e+09,17,230,192.168.1.1,192.168.1.149,53,52835,2019/03/01/ 20:10:59
2,1.551439e+09,6,603,192.168.1.149,80.249.99.148,51850,80,2019/03/01/ 20:10:59
3,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:10:59
4,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:10:59
...,...,...,...,...,...,...,...,...
324365,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47
324366,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47
324367,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47
324368,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47


In [5]:

filtered_data = data.groupby(["strftime", "ip_dst"]).filter(lambda x : len(x) > min_count)

filtered_data

Unnamed: 0,time,proto,data_len,ip_src,ip_dst,src_port,dst_port,strftime
1,1.551439e+09,17,230,192.168.1.1,192.168.1.149,53,52835,2019/03/01/ 20:10:59
3,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:10:59
4,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:10:59
5,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:10:59
6,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:10:59
...,...,...,...,...,...,...,...,...
324365,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47
324366,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47
324367,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47
324368,1.551439e+09,6,1448,80.249.99.148,192.168.1.149,80,51850,2019/03/01/ 20:12:47


In [None]:
data.to_csvv("data1.csv")