In [1]:
import pandas as pd 
import numpy as np
from scapy.all import * 
from datetime import datetime
from datetime import timezone
from collections import defaultdict

In [2]:
# Timezone passed to datetime.fromtimestamp when epoch timestamps are formatted later on.
UTC = timezone.utc

# Creating Main DataFrame 

Setting TCP Flag Mappings

In [3]:
# TCP flag bit masks: each flag occupies one bit, listed from least to most significant.
# Check here for bitmap: https://www.noction.com/blog/tcp-flags#:~:text=The%20hexadecimal%20number%200x02%20tells,a%20particular%20flag%20is%20set.
FIN = 1 << 0
SYN = 1 << 1
RST = 1 << 2
PSH = 1 << 3
ACK = 1 << 4
URG = 1 << 5
ECE = 1 << 6
CWR = 1 << 7

Here we make an important assumption: the first packet we see in the pcap for a given flow_id defines the "forward" direction of that flow. This may not necessarily be the packet sent by the host that initiated the connection, but it is a fair assumption to make.

This assumption just helps us define a direction of the flow and should not change anything major. If any flow has a lot of packets being sent from one direction (think DDoS), then this imbalance will be captured in the fwd or bwd total packets column. (should not matter which one specifically) 

In [4]:
def create_dataframe(INPUT_FILE):
    """Build a per-flow DataFrame of basic metrics from a pcap file.

    Every packet that carries an IP layer is assigned to a bidirectional flow.
    A flow is identified by its two (ip, port) endpoints regardless of which
    one is the sender, so both directions of a conversation share one row.
    The direction of the first packet seen for a flow is treated as "forward";
    packets travelling the opposite way are counted as "backward".

    Parameters
    ----------
    INPUT_FILE : str
        Path of the .pcap file to read.

    Returns
    -------
    pandas.DataFrame
        One row per flow with the endpoints and protocol of its first packet,
        per-packet size and arrival-time lists (overall, forward, backward),
        first/last timestamps, packet counts and TCP flag counts. Backward
        timestamps are -1 when no backward packet was seen. Ports are None for
        packets that expose no ``sport``/``dport`` attribute.
    """
    print("Reading .pcap file.")
    packets = rdpcap(INPUT_FILE)
    print("Reading .pcap file DONE.")

    print("Creating initial dataframe.")

    # Output column -> TCP flag mask; the order fixes the column order of the frame.
    flag_columns = (
        ("syn_flag_count", SYN),
        ("fin_flag_count", FIN),
        ("rst_flag_count", RST),
        ("psh_flag_count", PSH),
        ("ack_flag_count", ACK),
        ("urg_flag_count", URG),
        ("cwr_flag_count", CWR),
        ("ece_flag_count", ECE),
    )

    all_data = {}          # flow key -> per-flow record
    flow_fwd_states = {}   # flow key -> ordered 4-tuple of the flow's first packet

    for pkt in packets:
        if IP not in pkt:
            continue

        ip_layer = pkt[IP]
        src_ip = ip_layer.src
        dst_ip = ip_layer.dst
        # Port lookups fall through to the transport layer; absent for e.g. ICMP.
        sport = getattr(ip_layer, "sport", None)
        dport = getattr(ip_layer, "dport", None)
        # Read length/protocol from the IP layer itself rather than relying on
        # attribute fall-through from whatever the outermost layer happens to be.
        pkt_size = ip_layer.len
        pkt_time = pkt.time

        # Direction-independent key that keeps each port paired with its own IP.
        # A flat set of the four values would merge A:80->B:1234 with A:1234->B:80.
        # NOTE(review): the protocol is not part of the key, so TCP and UDP traffic
        # between identical endpoints still shares a row; adding it would also
        # require widening the 4-tuple duplicate check performed on the result.
        flow_id = frozenset(((src_ip, sport), (dst_ip, dport)))
        # Ordered form, used to tell forward packets from backward ones.
        flow_id_ordered = (sport, src_ip, dport, dst_ip)

        record = all_data.get(flow_id)
        if record is None:
            # First packet of a new flow: it defines the forward direction.
            record = {
                "sport": sport,
                "src_ip": src_ip,
                "dst_port": dport,
                "dst_ip": dst_ip,
                # Check https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml
                # for Assigned Internet Protocol Numbers
                "protocol": ip_layer.proto,
                "sizes": [pkt_size],
                "first_timestamp": pkt_time,
                "last_timestamp": pkt_time,
                "flow_duration": 0,
                "arrival_times": [pkt_time],
                # Forward packets (the first packet counts as forward)
                "total_fwd_packets": 1,
                "fwd_pkt_sizes": [pkt_size],
                "first_timestamp_fwd": pkt_time,
                "last_timestamp_fwd": pkt_time,
                "arrival_times_fwd": [pkt_time],
                # Backward packets (-1 marks "no backward packet seen yet")
                "total_bwd_packets": 0,
                "bwd_pkt_sizes": [],
                "first_timestamp_bwd": -1,
                "last_timestamp_bwd": -1,
                "arrival_times_bwd": [],
            }
            for column, _ in flag_columns:
                record[column] = 0

            all_data[flow_id] = record
            flow_fwd_states[flow_id] = flow_id_ordered
        else:
            # Packet of a known flow: update the overall features first.
            record["sizes"].append(pkt_size)
            record["first_timestamp"] = min(record["first_timestamp"], pkt_time)
            record["last_timestamp"] = max(record["last_timestamp"], pkt_time)
            record["flow_duration"] = record["last_timestamp"] - record["first_timestamp"]
            record["arrival_times"].append(pkt_time)

            # Compare against this flow's own forward tuple (constant-time lookup).
            if flow_id_ordered == flow_fwd_states[flow_id]:
                record["total_fwd_packets"] += 1
                record["fwd_pkt_sizes"].append(pkt_size)
                record["first_timestamp_fwd"] = min(record["first_timestamp_fwd"], pkt_time)
                record["last_timestamp_fwd"] = max(record["last_timestamp_fwd"], pkt_time)
                record["arrival_times_fwd"].append(pkt_time)
            else:
                record["total_bwd_packets"] += 1
                record["bwd_pkt_sizes"].append(pkt_size)
                if record["first_timestamp_bwd"] == -1:
                    record["first_timestamp_bwd"] = pkt_time
                else:
                    record["first_timestamp_bwd"] = min(record["first_timestamp_bwd"], pkt_time)
                # The -1 sentinel is smaller than any capture time, so max() replaces it.
                record["last_timestamp_bwd"] = max(record["last_timestamp_bwd"], pkt_time)
                record["arrival_times_bwd"].append(pkt_time)

        # Flags are counted for every TCP packet of the flow, in either direction.
        if TCP in ip_layer:
            tcp_flags = ip_layer[TCP].flags
            for column, mask in flag_columns:
                if tcp_flags & mask:
                    record[column] += 1

    df = pd.DataFrame.from_dict(all_data, orient="index")
    # The flow keys are only needed while grouping; replace them with a plain range index.
    df = df.reset_index(drop=True)

    print("Initial data frame created.")

    return df

In [5]:
# Capture to analyse; create_dataframe turns it into one row per flow.
FILE = "WebAttack_Sqli.pcap"
df = create_dataframe(INPUT_FILE=FILE)

Reading .pcap file.
Reading .pcap file DONE.
Creating initial dataframe.
Initial data frame created.


In [6]:
# Preview the first few flows.
df.head()

Unnamed: 0,sport,src_ip,dst_port,dst_ip,protocol,sizes,first_timestamp,last_timestamp,flow_duration,arrival_times,...,last_timestamp_bwd,arrival_times_bwd,syn_flag_count,fin_flag_count,rst_flag_count,psh_flag_count,ack_flag_count,urg_flag_count,cwr_flag_count,ece_flag_count
0,36196,172.16.0.1,80,192.168.10.50,6,"[60, 60, 52, 499, 52, 582, 52, 52, 52, 52, 52]",1499348407.419016,1499348412.425928,5.006912,"[1499348407.419016, 1499348407.419147, 1499348...",...,1499348412.425455,"[1499348407.419147, 1499348407.420554, 1499348...",2,2,0,2,10,0,0,0
1,36198,172.16.0.1,80,192.168.10.50,6,"[60, 60, 52, 512, 52, 1892, 52, 52, 52, 52]",1499348413.192475,1499348418.262971,5.070496,"[1499348413.192475, 1499348413.192603, 1499348...",...,1499348418.262971,"[1499348413.192603, 1499348413.193473, 1499348...",2,2,0,2,9,0,0,0
2,36200,172.16.0.1,80,192.168.10.50,6,"[60, 60, 52, 589, 52, 1933, 52, 52, 52, 52]",1499348422.024349,1499348427.063652,5.039303,"[1499348422.024349, 1499348422.024463, 1499348...",...,1499348427.063652,"[1499348422.024463, 1499348422.025335, 1499348...",2,2,0,2,9,0,0,0
3,36202,172.16.0.1,80,192.168.10.50,6,"[60, 60, 52, 652, 52, 4201, 52, 52, 52, 52, 52]",1499348433.464668,1499348438.551871,5.087203,"[1499348433.464668, 1499348433.464810, 1499348...",...,1499348438.551871,"[1499348433.464810, 1499348433.465657, 1499348...",2,2,0,2,10,0,0,0
4,36204,172.16.0.1,80,192.168.10.50,6,"[60, 60, 52, 651, 52, 2073, 52, 52, 52, 52]",1499348467.295664,1499348472.302394,5.00673,"[1499348467.295664, 1499348467.295837, 1499348...",...,1499348472.302394,"[1499348467.295837, 1499348467.296825, 1499348...",2,2,0,2,9,0,0,0


Running sanity check to make sure there are no duplicates based on flow_id

In [7]:
# Every (source port, source IP, destination port, destination IP) combination
# must appear in at most one row.
subset_cols = ["sport", "src_ip", "dst_port", "dst_ip"]
duplicates = df.duplicated(subset=subset_cols)
duplicate_rows = df.loc[duplicates]

assert len(duplicate_rows) == 0

In [8]:
# Number of rows flagged as duplicates by the check above (expected: 0).
duplicate_rows.shape[0]

0

In [9]:
# Columns produced by create_dataframe, before any derived features are added.
df.columns

Index(['sport', 'src_ip', 'dst_port', 'dst_ip', 'protocol', 'sizes',
       'first_timestamp', 'last_timestamp', 'flow_duration', 'arrival_times',
       'total_fwd_packets', 'fwd_pkt_sizes', 'first_timestamp_fwd',
       'last_timestamp_fwd', 'arrival_times_fwd', 'total_bwd_packets',
       'bwd_pkt_sizes', 'first_timestamp_bwd', 'last_timestamp_bwd',
       'arrival_times_bwd', 'syn_flag_count', 'fin_flag_count',
       'rst_flag_count', 'psh_flag_count', 'ack_flag_count', 'urg_flag_count',
       'cwr_flag_count', 'ece_flag_count'],
      dtype='object')

Some sanity checks

In [10]:
# The -1 sentinel is only ever written to the backward timestamps, so no
# forward first-timestamp should carry it.
has_fwd_sentinel = df["first_timestamp_fwd"] == -1
first_fwd_timestamp = df.loc[has_fwd_sentinel]
assert len(first_fwd_timestamp) == 0

change to NaN?

## Cleanup

### Size features

In [11]:
# For each packet-size list column, derive the sum, mean and standard deviation
# (NumPy defaults), each rounded to 3 decimals.
# Layout: (source list column, total column, mean column, std column)
size_feature_spec = (
    ("sizes", "total_size", "avg_size", "std_size"),                                          # overall
    ("fwd_pkt_sizes", "total_fwd_pkt_size", "avg_fwd_pkt_size", "std_fwd_pkt_size"),          # forward
    ("bwd_pkt_sizes", "total_bwd_pkt_size", "avg_bwd_pkt_size", "std_bwd_pkt_size"),          # backward
)

for source_col, total_col, mean_col, std_col in size_feature_spec:
    pkt_sizes = df[source_col]
    df[total_col] = pkt_sizes.apply(lambda x: np.sum(x)).round(3)
    df[mean_col] = pkt_sizes.apply(lambda x: np.mean(x)).round(3)
    df[std_col] = pkt_sizes.apply(lambda x: np.std(x)).round(3)



In [12]:
# Inspect flows without any backward packet (first_timestamp_bwd == -1).
# For these rows last_timestamp_bwd must be -1 as well.
# Open question: should total_bwd_pkt_size also become NaN in this case?
first_bwd_timestamp = df[df.loc[:, "first_timestamp_bwd"] == -1]
first_bwd_timestamp.loc[:, ["first_timestamp_bwd", "last_timestamp_bwd", "arrival_times_bwd", "total_bwd_pkt_size", "avg_bwd_pkt_size", "std_bwd_pkt_size"]]

Unnamed: 0,first_timestamp_bwd,last_timestamp_bwd,arrival_times_bwd,total_bwd_pkt_size,avg_bwd_pkt_size,std_bwd_pkt_size


### BWD timestamps conversion

Convert -1 values to np.nan for better readability when converting to human-readable formats ahead

In [13]:
# Replace the -1 "no backward packet" sentinel with NaN in both backward timestamp columns.
bwd_cols = ["first_timestamp_bwd", "last_timestamp_bwd"]
df[bwd_cols] = df[bwd_cols].replace(-1, np.nan)

In [14]:
# Per column: does any cell still equal -1?
df.eq(-1).any()

sport                  False
src_ip                 False
dst_port               False
dst_ip                 False
protocol               False
sizes                  False
first_timestamp        False
last_timestamp         False
flow_duration          False
arrival_times          False
total_fwd_packets      False
fwd_pkt_sizes          False
first_timestamp_fwd    False
last_timestamp_fwd     False
arrival_times_fwd      False
total_bwd_packets      False
bwd_pkt_sizes          False
first_timestamp_bwd    False
last_timestamp_bwd     False
arrival_times_bwd      False
syn_flag_count         False
fin_flag_count         False
rst_flag_count         False
psh_flag_count         False
ack_flag_count         False
urg_flag_count         False
cwr_flag_count         False
ece_flag_count         False
total_size             False
avg_size               False
std_size               False
total_fwd_pkt_size     False
avg_fwd_pkt_size       False
std_fwd_pkt_size       False
total_bwd_pkt_

### Time-based Features

Computing flow durations

NOTE: Flow durations are in SECONDS

In [15]:
# Each duration is the last timestamp minus the first timestamp of the same scope.
# Layout: (duration column, suffix of the timestamp columns it is derived from)
duration_spec = (
    ("fwd_flow_duration", "_fwd"),
    ("bwd_flow_duration", "_bwd"),
    ("flow_duration", ""),
)
for duration_col, ts_suffix in duration_spec:
    df[duration_col] = df[f"last_timestamp{ts_suffix}"] - df[f"first_timestamp{ts_suffix}"]

df[["first_timestamp", "last_timestamp", "flow_duration"]]

Unnamed: 0,first_timestamp,last_timestamp,flow_duration
0,1499348407.419016,1499348412.425928,5.006912
1,1499348413.192475,1499348418.262971,5.070496
2,1499348422.024349,1499348427.063652,5.039303
3,1499348433.464668,1499348438.551871,5.087203
4,1499348467.295664,1499348472.302394,5.00673
5,1499348480.992304,1499348486.002003,5.009699
6,1499348494.345596,1499348499.355969,5.010373
7,1499348506.489087,1499348511.497289,5.008202
8,1499348514.064531,1499348519.077716,5.013185
9,1499348532.265347,1499348537.27147,5.006123


Check what happens to bwd_flow_duration when timestamps were -1.
Sanity check to make sure it is also NaN to differentiate from case where there was exactly one bwd packet!

In [16]:
# Wherever the backward first timestamp is missing, the backward duration must be missing too.
no_bwd_packets = df["first_timestamp_bwd"].isna()
assert df.loc[no_bwd_packets, "bwd_flow_duration"].isna().all()

NOTE: flow_duration will be 0 if only one packet was sent (overall, fwd or bwd)

In [17]:
# Rows with a zero forward duration should each hold exactly one forward packet,
# i.e. their forward packet counts must add up to the number of such rows.
zero_fwd_flow_duration = df.loc[df["fwd_flow_duration"] == 0]
assert len(zero_fwd_flow_duration) == zero_fwd_flow_duration["total_fwd_packets"].sum()

NOTE: there are cases where flow_duration is > 0 but both fwd_flow_duration and bwd_flow_duration are 0. These are flows with at most one forward packet and at most one backward packet.

In [18]:
# Select flows whose overall duration is positive while both per-direction durations are zero.
overall_positive = df["flow_duration"] > 0
fwd_is_zero = df["fwd_flow_duration"] == 0
bwd_is_zero = df["bwd_flow_duration"] == 0

mismatched_durations = df.loc[overall_positive & fwd_is_zero & bwd_is_zero]
mismatched_durations[["total_fwd_packets", "total_bwd_packets", "flow_duration", "fwd_flow_duration", "bwd_flow_duration"]]

Unnamed: 0,total_fwd_packets,total_bwd_packets,flow_duration,fwd_flow_duration,bwd_flow_duration


In [19]:
# When such flows exist, neither direction may contain more than one packet.
if len(mismatched_durations) > 0:
    for count_col in ("total_fwd_packets", "total_bwd_packets"):
        assert mismatched_durations[count_col].max() == 1

Converting timestamps to human-readable form

In [20]:
def _epoch_to_utc_string(epoch):
    """Format an epoch timestamp as a UTC 'YYYY-MM-DD HH:MM:SS.ffffff' string."""
    return datetime.fromtimestamp(float(epoch), UTC).strftime("%Y-%m-%d %H:%M:%S.%f")


def _epoch_to_utc_string_or_nan(epoch):
    """Like _epoch_to_utc_string, but pass missing values through as NaN."""
    return _epoch_to_utc_string(epoch) if not pd.isna(epoch) else np.nan


df_test = df.copy()

# Overall and forward timestamps are always present and are overwritten in place.
for ts_col in ("first_timestamp", "last_timestamp", "first_timestamp_fwd", "last_timestamp_fwd"):
    df_test.loc[:, ts_col] = df.loc[:, ts_col].apply(_epoch_to_utc_string)

# Backward timestamps may be NaN; the formatted values go into separate "*_new"
# columns so the numeric originals stay available.
for ts_col in ("first_timestamp_bwd", "last_timestamp_bwd"):
    df_test.loc[:, f"{ts_col}_new"] = df.loc[:, ts_col].apply(_epoch_to_utc_string_or_nan)

Sanity check to make sure timestamp conversion preserves NaN values

In [21]:
# Rows without a backward timestamp: the formatted column should be NaN there as well.
df_test[df_test.loc[:, "first_timestamp_bwd"].isna()].loc[:, ["first_timestamp_bwd_new", "first_timestamp_bwd"]].head()

Unnamed: 0,first_timestamp_bwd_new,first_timestamp_bwd


In [22]:
# Columns of df after the size and duration features were added.
df.columns

Index(['sport', 'src_ip', 'dst_port', 'dst_ip', 'protocol', 'sizes',
       'first_timestamp', 'last_timestamp', 'flow_duration', 'arrival_times',
       'total_fwd_packets', 'fwd_pkt_sizes', 'first_timestamp_fwd',
       'last_timestamp_fwd', 'arrival_times_fwd', 'total_bwd_packets',
       'bwd_pkt_sizes', 'first_timestamp_bwd', 'last_timestamp_bwd',
       'arrival_times_bwd', 'syn_flag_count', 'fin_flag_count',
       'rst_flag_count', 'psh_flag_count', 'ack_flag_count', 'urg_flag_count',
       'cwr_flag_count', 'ece_flag_count', 'total_size', 'avg_size',
       'std_size', 'total_fwd_pkt_size', 'avg_fwd_pkt_size',
       'std_fwd_pkt_size', 'total_bwd_pkt_size', 'avg_bwd_pkt_size',
       'std_bwd_pkt_size', 'fwd_flow_duration', 'bwd_flow_duration'],
      dtype='object')

Computing inter-arrival times and statistics

In [23]:
import warnings

# np.mean / np.std emit RuntimeWarning for empty input (flows with fewer than two
# packets in a direction have no inter-arrival gaps). Silence only that category
# rather than every warning, so unrelated warnings raised later stay visible.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Work on a copy so the inter-arrival columns do not modify df_test.
df_time = df_test.copy()
def find_diff(arrival_times):
    """Return the gaps between consecutive arrival times as a list of floats.

    The result has one element fewer than ``arrival_times``; it is empty when
    fewer than two arrival times are given.
    """
    consecutive_pairs = zip(arrival_times, arrival_times[1:])
    return [float(later - earlier) for earlier, later in consecutive_pairs]

# Inter-arrival gaps plus their mean and standard deviation, computed for the
# overall ("" suffix), forward ("_fwd") and backward ("_bwd") arrival-time lists.
for direction_suffix in ("", "_fwd", "_bwd"):
    gaps_col = f"inter_arrival_times{direction_suffix}"
    df_time[gaps_col] = df_time[f"arrival_times{direction_suffix}"].apply(find_diff)
    df_time[f"inter_arrival_mean{direction_suffix}"] = df_time[gaps_col].apply(lambda x: np.mean(x))
    df_time[f"inter_arrival_std{direction_suffix}"] = df_time[gaps_col].apply(lambda x: np.std(x))

# df_time[df_time.loc[:, "arrival_times"].apply(lambda x: len(x) == 1)].loc[:, ["arrival_times", "inter_arrival_times", "inter_arrival_mean", "inter_arrival_std"]]

In [24]:
# Overall arrival times next to the gaps and statistics derived from them.
df_time.loc[:, ["arrival_times", "inter_arrival_times", "inter_arrival_mean", "inter_arrival_std"]]

Unnamed: 0,arrival_times,inter_arrival_times,inter_arrival_mean,inter_arrival_std
0,"[1499348407.419016, 1499348407.419147, 1499348...","[0.000131, 0.001311, 4e-06, 9.2e-05, 0.003578,...",0.500691,1.499908
1,"[1499348413.192475, 1499348413.192603, 1499348...","[0.000128, 0.000773, 4e-06, 9.3e-05, 0.06727, ...",0.563388,1.569043
2,"[1499348422.024349, 1499348422.024463, 1499348...","[0.000114, 0.0008, 4e-06, 6.8e-05, 0.032476, 0...",0.559923,1.571372
3,"[1499348433.464668, 1499348433.464810, 1499348...","[0.000142, 0.000744, 4e-06, 9.9e-05, 0.080424,...",0.50872,1.498585
4,"[1499348467.295664, 1499348467.295837, 1499348...","[0.000173, 0.00055, 0.00033, 0.000108, 0.00428...",0.556303,1.571102
5,"[1499348480.992304, 1499348480.992428, 1499348...","[0.000124, 0.000791, 4.9e-05, 4.3e-05, 0.00276...",0.556633,1.572624
6,"[1499348494.345596, 1499348494.345725, 1499348...","[0.000129, 0.000792, 4.9e-05, 4.8e-05, 0.0035,...",0.556708,1.572516
7,"[1499348506.489087, 1499348506.489193, 1499348...","[0.000106, 0.000808, 4e-06, 6.6e-05, 0.005116,...",0.556467,1.571265
8,"[1499348514.064531, 1499348514.064644, 1499348...","[0.000113, 0.000813, 3e-06, 6.4e-05, 0.010187,...",0.455744,1.437073
9,"[1499348532.265347, 1499348532.265484, 1499348...","[0.000137, 0.000775, 4e-06, 9.7e-05, 0.003553,...",0.556236,1.571159


## Features to keep 

In [25]:
# Every column available at this point; the feature lists below pick a subset.
df_time.columns

Index(['sport', 'src_ip', 'dst_port', 'dst_ip', 'protocol', 'sizes',
       'first_timestamp', 'last_timestamp', 'flow_duration', 'arrival_times',
       'total_fwd_packets', 'fwd_pkt_sizes', 'first_timestamp_fwd',
       'last_timestamp_fwd', 'arrival_times_fwd', 'total_bwd_packets',
       'bwd_pkt_sizes', 'first_timestamp_bwd', 'last_timestamp_bwd',
       'arrival_times_bwd', 'syn_flag_count', 'fin_flag_count',
       'rst_flag_count', 'psh_flag_count', 'ack_flag_count', 'urg_flag_count',
       'cwr_flag_count', 'ece_flag_count', 'total_size', 'avg_size',
       'std_size', 'total_fwd_pkt_size', 'avg_fwd_pkt_size',
       'std_fwd_pkt_size', 'total_bwd_pkt_size', 'avg_bwd_pkt_size',
       'std_bwd_pkt_size', 'fwd_flow_duration', 'bwd_flow_duration',
       'first_timestamp_bwd_new', 'last_timestamp_bwd_new',
       'inter_arrival_times', 'inter_arrival_mean', 'inter_arrival_std',
       'inter_arrival_times_fwd', 'inter_arrival_mean_fwd',
       'inter_arrival_std_fwd', 'inter_ar

In [26]:
# Flow identity
general_features = ["sport", "src_ip", "dst_port", "dst_ip", "protocol"]

# Whole-flow timing and size statistics
overall_features = [
    "first_timestamp",
    "last_timestamp",
    "flow_duration",
    "total_size",
    "avg_size",
    "std_size",
    "inter_arrival_mean",
    "inter_arrival_std",
]

# Forward-direction statistics
fwd_features = [
    "total_fwd_packets",
    "first_timestamp_fwd",
    "last_timestamp_fwd",
    "fwd_flow_duration",
    "total_fwd_pkt_size",
    "avg_fwd_pkt_size",
    "std_fwd_pkt_size",
    "inter_arrival_mean_fwd",
    "inter_arrival_std_fwd",
]

# Backward-direction statistics
# NOTE(review): these are the numeric backward timestamp columns; the formatted
# strings live in the "*_bwd_new" columns - confirm which form is wanted here.
bwd_features = [
    "total_bwd_packets",
    "first_timestamp_bwd",
    "last_timestamp_bwd",
    "bwd_flow_duration",
    "total_bwd_pkt_size",
    "avg_bwd_pkt_size",
    "std_bwd_pkt_size",
    "inter_arrival_mean_bwd",
    "inter_arrival_std_bwd",
]

# TCP flag counters
flag_features = [
    "syn_flag_count",
    "fin_flag_count",
    "rst_flag_count",
    "psh_flag_count",
    "ack_flag_count",
    "urg_flag_count",
    "cwr_flag_count",
    "ece_flag_count",
]

In [27]:
# Flatten the feature groups into the final column selection, preserving group order.
features_to_keep = [
    *general_features,
    *overall_features,
    *fwd_features,
    *bwd_features,
    *flag_features,
]
print(f"Total features kept: {len(features_to_keep)}")

Total features kept: 39


## Attaching label for model data

In [28]:
# Select the modelling columns first and copy once: the result is an independent
# frame, so adding the label does not write to a slice of df_time (which pandas
# reports as SettingWithCopyWarning) and the list-valued helper columns are never copied.
df_final = df_time[features_to_keep].copy()
df_final["label"] = "webattack_sqli"

## Saving as csv

In [29]:
# Write the labelled flow features to CSV, leaving out the DataFrame index.
df_final.to_csv("webattack_sqli.csv", index=False)