In [87]:
import os
import pickle as pkl

import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedShuffleSplit

In [88]:
MARCH_PARENT_DIR = "./cicddos_data/CSV-03-11/03-11"
JAN_PARENT_DIR = "./cicddos_data/CSV-01-12/01-12"

OUT_DIR = "./cicddos_data/out"

In [75]:
def get_headers():
  return pkl.load(open(f"./pkls/headers.pkl", "rb"))

In [136]:
def create_draft(parent_dir, out_dir):
    
    # init csv with headers
    col_names = [col.strip() for col in get_headers()]
    cols_df = pd.DataFrame(columns=col_names)
    train_csv_path = f"{OUT_DIR}/train.csv"
    test_csv_path = f"{OUT_DIR}/test.csv"
    cols_df.to_csv(train_csv_path, index=False, mode='w')
    cols_df.to_csv(test_csv_path, index=False, mode='w')
    
    for filename in os.listdir(parent_dir):
        if filename.endswith(".csv") or filename.endswith(".CSV"):
            csv_file_path = os.path.join(parent_dir, filename)
            attack_type = filename.split(".")[0].split("_")[-1]
            print(f"Reading CSV [{attack_type}].")
            
            df = pd.read_csv(csv_file_path, low_memory=False)
#             df = pd.read_csv(os.path.join(parent_dir, "DrDoS_NTP.csv"), low_memory=False)
            print(f"Read Complete [{attack_type}].\nProcessing [{attack_type}].")
            
            df.drop("Unnamed: 0", axis=1, inplace=True)
            
            df["attack_type"] = attack_type
            

            stratified_splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
            for train_index, test_index in stratified_splitter.split(df, df[' Label']):
                train_df = df.loc[train_index]
                test_df = df.loc[test_index]
                
                train_df = train_df[:200000]
                test_df = test_df[:2000]
                print(f"Processing Complete [{attack_type}].")
                
                train_df.to_csv(train_csv_path, mode='a', index=False, header=False)
                test_df.to_csv(test_csv_path, mode='a', index=False, header=False)
                
                print(f"Append Complete [{attack_type}].")
        print()
                
    print("All Done.")

In [None]:
create_draft(JAN_PARENT_DIR, OUT_DIR)

Reading CSV [SSDP].
Read Complete [SSDP].
Processing [SSDP].
Processing Complete [SSDP].
Append Complete [SSDP].

Reading CSV [NetBIOS].
Read Complete [NetBIOS].
Processing [NetBIOS].
Processing Complete [NetBIOS].
Append Complete [NetBIOS].

Reading CSV [UDP].
Read Complete [UDP].
Processing [UDP].
Processing Complete [UDP].
Append Complete [UDP].

Reading CSV [TFTP].
Read Complete [TFTP].
Processing [TFTP].
Processing Complete [TFTP].
Append Complete [TFTP].

Reading CSV [Syn].
Read Complete [Syn].
Processing [Syn].
Processing Complete [Syn].
Append Complete [Syn].

Reading CSV [MSSQL].
Read Complete [MSSQL].
Processing [MSSQL].
Processing Complete [MSSQL].
Append Complete [MSSQL].

Reading CSV [SNMP].
Read Complete [SNMP].
Processing [SNMP].
Processing Complete [SNMP].
Append Complete [SNMP].

Reading CSV [LDAP].
Read Complete [LDAP].
Processing [LDAP].
Processing Complete [LDAP].
Append Complete [LDAP].

Reading CSV [NTP].
Read Complete [NTP].
Processing [NTP].
Processing Complete 

In [115]:
for filename in os.listdir(JAN_PARENT_DIR):
    if filename.endswith(".csv"):
        csv_path = os.path.join(JAN_PARENT_DIR, filename)
        df = pd.read_csv(csv_path, low_memory=False)
        print(df[" Label"].value_counts())
        print()

DrDoS_SSDP    2610611
BENIGN            763
Name:  Label, dtype: int64

DrDoS_NetBIOS    4093279
BENIGN              1707
Name:  Label, dtype: int64

DrDoS_UDP    3134645
BENIGN          2157
Name:  Label, dtype: int64

TFTP      20082580
BENIGN       25247
Name:  Label, dtype: int64

Syn       1582289
BENIGN        392
Name:  Label, dtype: int64

DrDoS_MSSQL    4522492
BENIGN            2006
Name:  Label, dtype: int64

DrDoS_SNMP    5159870
BENIGN           1507
Name:  Label, dtype: int64



ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [116]:
df_test = pd.read_csv(os.path.join(JAN_PARENT_DIR, "DrDoS_NTP.csv"), low_memory=False)

In [133]:
actual_labels = df_test[" Label"].index

In [129]:
req_labels = [label.split("DrDoS_")[-1] for label in df_test[" Label"].value_counts().index.to_list()]

In [131]:
req_labels.remove('BENIGN')
req_labels

['NTP']

In [70]:
test_df = pd.DataFrame(columns=get_headers())

In [71]:
test_df

Unnamed: 0,Flow ID,Source IP,Source Port,Destination IP,Destination Port,Protocol,Timestamp,Flow Duration,Total Fwd Packets,Total Backward Packets,...,Active Max,Active Min,Idle Mean,Idle Std,Idle Max,Idle Min,SimillarHTTP,Inbound,Label,attack_type


In [138]:
get_headers()

array(['Flow ID', ' Source IP', ' Source Port', ' Destination IP',
       ' Destination Port', ' Protocol', ' Timestamp', ' Flow Duration',
       ' Total Fwd Packets', ' Total Backward Packets',
       'Total Length of Fwd Packets', ' Total Length of Bwd Packets',
       ' Fwd Packet Length Max', ' Fwd Packet Length Min',
       ' Fwd Packet Length Mean', ' Fwd Packet Length Std',
       'Bwd Packet Length Max', ' Bwd Packet Length Min',
       ' Bwd Packet Length Mean', ' Bwd Packet Length Std',
       'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean',
       ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total',
       ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min',
       'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max',
       ' Bwd IAT Min', 'Fwd PSH Flags', ' Bwd PSH Flags',
       ' Fwd URG Flags', ' Bwd URG Flags', ' Fwd Header Length',
       ' Bwd Header Length', 'Fwd Packets/s', ' Bwd Packets/s',
       ' Min Packet Length', '