### Preliminary setup

In [18]:
# For data transformation
import pandas as pd            
# For statistical analysis
import numpy as np
import statistics as stats
# For ASN lookup
import pyasn
# Load the IP-to-ASN database from a local data file; every asndb.lookup() call
# below depends on this file being present in the working directory.
# NOTE(review): the file name suggests a 2014-05-13 snapshot — confirm it is recent enough for the captured traffic.
asndb = pyasn.pyasn('ipasn_20140513.dat')

In [9]:
# Per-packet capture export to be turned into flows
input_file = "./benign_flow/yt_test/test10.csv"
column_names = ["DURATION", "SRC_IP", "DST_IP", "SRC_PORT", "DST_PORT", "QUIC_VERSION", "BYTES", "PROTOCOL"]

# Read the file and replace whatever header it carries with the fixed names above
df = pd.read_csv(input_file)
df.columns = column_names

# PROTOCOL is not read by the flow extraction, so discard it
df = df.drop(columns="PROTOCOL")

# Byte counts that cannot be parsed become NaN under 'coerce' and are then counted as 0
bytes_as_number = pd.to_numeric(df["BYTES"], errors='coerce')
df["BYTES"] = bytes_as_number.fillna(0)

# DURATION is kept exactly as read; rebasing it to the first packet is left disabled:
#df["DURATION"] = df["DURATION"] - df["DURATION"].iloc[0]

In [14]:
# set source IP, web service category, and label
# Packets whose SRC_IP equals ipsrc are counted as forward; all other packets as reverse
ipsrc = "10.10.3.10"
# Category string copied into every emitted flow row
cat = "Streaming"
# Label string stored as the last element of every flow row
# (presumably "0" marks benign traffic, given the benign_flow paths — TODO confirm)
label = "0"
# Starting QUIC version value; written to flow rows until the extraction loop replaces it
quic_ver = "1"

# initialize the interpacket variables
# Output of the extraction loop: one feature list per completed flow
flow = []
# Per-packet samples of the port / ASN on the non-ipsrc side and of the QUIC version;
# the extraction loop reports the most common value of each (statistics.mode)
arr_port = []
arr_asn = []
arr_ver = []
def initialize_variables():
    """Reset every per-flow accumulator to its empty/zero state.

    Called once before the extraction loop and again each time a flow has
    been emitted, so the next flow starts from a clean slate. All state
    lives in module-level globals:

    * ``arr_fwd_bytes`` / ``arr_rev_bytes`` -- per-packet byte counts by direction
    * ``arr_fwd_iat`` / ``arr_rev_iat``     -- per-packet timing samples by direction
    * ``arr_port`` / ``arr_asn`` / ``arr_ver`` -- per-packet port, ASN and QUIC
      version samples from which the per-flow mode is taken
    * ``fwd_pkt`` / ``rev_pkt`` / ``fwd_bytes`` / ``rev_bytes`` -- running totals
    * ``init_dur`` / ``dur`` / ``fwd_dur`` / ``rev_dur`` -- timing accumulators
    * ``ctr`` -- number of packets already placed in the current flow

    The output list ``flow`` is deliberately NOT touched here.
    """
    global arr_fwd_bytes, arr_rev_bytes, arr_fwd_iat, arr_rev_iat, arr_port, arr_asn, arr_ver
    global fwd_pkt, fwd_bytes, rev_pkt, rev_bytes, init_dur, dur, fwd_dur, rev_dur, ctr
    arr_fwd_bytes = []
    arr_rev_bytes = []
    arr_fwd_iat = []
    arr_rev_iat = []
    # These three were declared global but never cleared, so the per-flow mode
    # of port / ASN / version was computed over every packet seen so far
    # instead of over the packets of the current flow only.
    arr_port = []
    arr_asn = []
    arr_ver = []
    fwd_pkt = 0
    fwd_bytes = 0.0
    rev_pkt = 0
    rev_bytes = 0.0
    init_dur = 0.0
    dur = 0.0
    fwd_dur = 0.0
    rev_dur = 0.0
    ctr = 0

# ---------------------------------------------------------------------------
# Flow extraction
#
# Walk the packet table in order and cut it into flows. A flow is closed (and
# one feature row appended to `flow`) by the packet that
#   * arrives when `ctr` has reached FLUSH_AT_CTR (i.e. the 30th packet),
#   * is the last row of the table, or
#   * arrives more than MAX_GAP after the previous packet.
# The closing packet is itself counted as part of the flow it closes.
#
# NOTE(review): the "iat" samples are offsets from the first packet of the
# flow, not gaps between consecutive packets, and fwd_dur / rev_dur are sums
# of those offsets. This is kept unchanged so new rows stay comparable with
# rows already written to the dataset — confirm this is the intended feature.
# ---------------------------------------------------------------------------
FLUSH_AT_CTR = 29   # value of `ctr` at which the incoming packet closes the flow
MAX_GAP = 1.0       # DURATION difference to the previous packet that closes the flow


def _summary_stats(values):
    """Return (max, min, mean, std, var) of `values`, or all zeros when empty.

    Zeros (rather than leaving the names unset) keep one-directional flows from
    raising NameError or silently reusing the previous flow's statistics.
    """
    if not values:
        return (0.0, 0.0, 0.0, 0.0, 0.0)
    data = np.asarray(values)
    return (np.max(data), np.min(data), np.mean(data), np.std(data), np.var(data))


def _record_packet(row, is_fwd, with_iat):
    """Add one packet to the module-level accumulators of the current flow.

    row      -- the packet (a row of `df`)
    is_fwd   -- True when the packet was sent by `ipsrc`
    with_iat -- False for the first packet of a flow, which has no timing sample
    """
    global fwd_pkt, fwd_bytes, fwd_dur, rev_pkt, rev_bytes, rev_dur
    offset = row['DURATION'] - init_dur
    if is_fwd:
        if with_iat:
            fwd_dur += offset
            arr_fwd_iat.append(offset)
        # Forward packet: the other endpoint is the destination
        arr_port.append(row['DST_PORT'])
        arr_asn.append(asndb.lookup(row['DST_IP'])[0])
        fwd_pkt += 1
        fwd_bytes += row['BYTES']
        arr_fwd_bytes.append(row['BYTES'])
    else:
        if with_iat:
            rev_dur += offset
            arr_rev_iat.append(offset)
        # Reverse packet: the other endpoint is the source
        arr_port.append(row['SRC_PORT'])
        arr_asn.append(asndb.lookup(row['SRC_IP'])[0])
        rev_pkt += 1
        rev_bytes += row['BYTES']
        arr_rev_bytes.append(row['BYTES'])
    arr_ver.append(row['QUIC_VERSION'])


initialize_variables()
total_rows = len(df)
prev_time = None
# Positions come from enumerate() so the loop does not depend on df having a default RangeIndex
for pos, (_, row) in enumerate(df.iterrows()):
    now = row['DURATION']
    is_fwd = row['SRC_IP'] == ipsrc
    gap_exceeded = prev_time is not None and now - prev_time > MAX_GAP
    prev_time = now
    closes_flow = ctr == FLUSH_AT_CTR or pos == total_rows - 1 or gap_exceeded

    # The first packet of a flow defines its time origin. This also covers a flow
    # that is closed by its very first packet, which previously measured time from 0.0.
    if ctr == 0:
        init_dur = now
    _record_packet(row, is_fwd, with_iat=ctr > 0)

    if closes_flow:
        dst_port = stats.mode(arr_port)
        # asndb.lookup() yields None for addresses without an ASN; ignore those
        asn_samples = [asn for asn in arr_asn if isinstance(asn, int)]
        dst_asn = stats.mode(asn_samples) if asn_samples else None
        new_quic_ver = stats.mode(arr_ver)

        # NOTE(review): kept exactly as before. The int branch stores a one-element
        # list while the tuple branch stores a plain int, and for any other type the
        # previous quic_ver is carried over — confirm which representation is wanted.
        if isinstance(new_quic_ver, tuple):
            quic_ver = max([int(h) for h in new_quic_ver])
        elif isinstance(new_quic_ver, int):
            quic_ver = [int(new_quic_ver)]

        dur += (now - init_dur)
        tot_pkt = fwd_pkt + rev_pkt
        tot_bytes = fwd_bytes + rev_bytes
        # Guard the divisions: a flow may have no reverse packets or zero duration
        ratio = fwd_pkt / rev_pkt if rev_pkt else 0.0
        flow_pkt = tot_pkt / dur if dur else 0.0
        flow_bytes = tot_bytes / dur if dur else 0.0

        all_bytes_stats = _summary_stats(arr_fwd_bytes + arr_rev_bytes)
        fwd_bytes_stats = _summary_stats(arr_fwd_bytes)
        rev_bytes_stats = _summary_stats(arr_rev_bytes)
        all_iat_stats = _summary_stats(arr_fwd_iat + arr_rev_iat)
        fwd_iat_stats = _summary_stats(arr_fwd_iat)
        rev_iat_stats = _summary_stats(arr_rev_iat)

        # Column order must match the dataset the rows are appended to
        flow.append([
            dst_port, dst_asn, quic_ver, dur, cat, ratio, flow_pkt, flow_bytes,
            tot_pkt, tot_bytes, *all_bytes_stats,
            fwd_pkt, fwd_bytes, *fwd_bytes_stats,
            rev_pkt, rev_bytes, *rev_bytes_stats,
            *all_iat_stats,
            fwd_dur, *fwd_iat_stats,
            rev_dur, *rev_iat_stats,
            label,
        ])
        initialize_variables()
    else:
        ctr += 1

    # track progress in percent with respect to total rows
    if pos % 1000 == 0:
        print(f"{pos / total_rows * 100:.2f}%")



0.00%
6.96%
13.92%
20.87%
27.83%
34.79%
41.75%
48.71%
55.66%
62.62%
69.58%
76.54%
83.50%
90.45%
97.41%


In [None]:
# Dump every extracted flow record, one per line, then the number of records
for record in flow:
    print(record)

flow_count = len(flow)
print(flow_count)

In [16]:
# Read the existing CSV file (the cumulative dataset the new flows are added to).
# NOTE: this rebinds `df`, which until now held the per-packet table.
file_path = "./benign_flow/benign.csv"
df = pd.read_csv(file_path)

# Append all extracted flows in one step. Building a single DataFrame and
# concatenating once avoids the row-by-row `df.loc[len(df)] = item` enlargement,
# which copies the frame on every insert and relies on a default RangeIndex.
# As before, a column-count mismatch between a flow row and the file raises.
if flow:
    new_rows = pd.DataFrame(flow, columns=df.columns)
    # Skip the concat when the file had no rows yet so its empty columns do not
    # influence the resulting dtypes
    df = new_rows if df.empty else pd.concat([df, new_rows], ignore_index=True)

# Write the updated DataFrame back to the CSV file.
# Re-running this cell appends the same flows again.
df.to_csv(file_path, index=False)

In [17]:
# Write the current `df` to a second CSV file.
# NOTE(review): despite the file name, no filtering is applied to `df` in the
# visible cells before this write — whatever `df` holds at this point is saved unchanged.
output_file = "./benign_flow/filtered_output.csv"
df.to_csv(output_file, index=False)

### archive

In [None]:
# Archived cell: groups packets by remote endpoint and builds per-packet
# sequences (time deltas, directions, sizes) for chunks of up to 30 packets.
# NOTE(review): not runnable as shown — `sni`, used in the final flow.append,
# is not defined anywhere in the visible notebook. The cell also reads the
# per-packet columns (SRC_IP, DST_IP, DURATION, BYTES, ...) from `df`, so it
# needs `df` to hold the packet table rather than the benign.csv dataset.

# source IP
# The sender of the first packet is taken as the local endpoint
ipsrc = df["SRC_IP"].iloc[0]
portsrc = df["SRC_PORT"].iloc[0]
cat = "Streaming"

flow = []

# For each packet, the address on the side that is not ipsrc
df["true_dest"] = df.apply(lambda row: row["DST_IP"] if row["SRC_IP"] == ipsrc else row["SRC_IP"], axis=1)
# Run id: increases by one every time true_dest differs from the previous row,
# so each group is a run of consecutive packets with the same remote address
df["group"] = (df["true_dest"] != df["true_dest"].shift()).cumsum()

for group, group_df in df.groupby("group"):
    # Split each run into consecutive chunks of at most 30 packets (ceiling division)
    num_subgroups = (len(group_df) + 29) // 30
    subgroups = [group_df.iloc[i * 30:(i+1) * 30] for i in range(num_subgroups)]
    
    # print(f"Group {group}:")
    # print(group_df)

    for  subgroup in subgroups:
        ppi_dir = []
        ipdst = subgroup["true_dest"].iloc[0]  # The unique normalized destination for this subgroup
        # NOTE(review): taken from the first packet's DST_PORT even when that packet
        # was sent by the remote side — confirm this is the intended port.
        portdst = subgroup["DST_PORT"].iloc[0] 

        # Per-packet sequences: the first packet gets time 0, later ones get the
        # DURATION difference to the previous packet scaled by 1000 and truncated to int
        ppi_time = [0]
        ppi_size = [int(subgroup["BYTES"].iloc[0])]
        for i in range(1, len(subgroup)):
            # Calculate the time difference between consecutive packets
            duration = int((subgroup["DURATION"].iloc[i] - subgroup["DURATION"].iloc[i - 1]) * 1000)
            ppi_time.append(duration)
            ppi_size.append(int(subgroup["BYTES"].iloc[i]))



        for _, row in subgroup.iterrows():
            # Direction code: 1 = ipsrc -> ipdst, -1 = ipdst -> ipsrc, 0 = anything else
            if row["SRC_IP"] == ipsrc and row["DST_IP"] == ipdst:
                ppi_dir.append(1)
            elif row["SRC_IP"] == ipdst  and row["DST_IP"] == ipsrc:
                ppi_dir.append(-1)
            else:
                ppi_dir.append(0)

            # NOTE(review): everything from here to the end of this loop body is
            # indented inside the per-row loop, so it is recomputed for every packet;
            # only the values left after the last packet are used by flow.append below.
            # Byte and packet totals per direction over the whole subgroup
            bytes_fromsrc = int(subgroup.loc[(subgroup["SRC_IP"] == ipsrc) & (subgroup["DST_IP"] == ipdst),"BYTES"].sum())
            bytes_rev = int(subgroup.loc[(subgroup["SRC_IP"] == ipdst) & (subgroup["DST_IP"] == ipsrc),"BYTES"].sum())
            packets = int(subgroup.loc[(subgroup["SRC_IP"] == ipsrc) & (subgroup["DST_IP"] == ipdst),"BYTES"].count())
            packets_rev = int(subgroup.loc[(subgroup["SRC_IP"] == ipdst) & (subgroup["DST_IP"] == ipsrc),"BYTES"].count())
            ppi_len = len(ppi_dir)

            # ppi_rtt counter: a -1 opens a pending group if none is open; a 1 opens
            # one if none is open, otherwise it closes the pending group and counts it.
            # Direction code 0 is ignored.
            ppi_rtt = 0
            in_group = False

            # Iterate through the list
            for value in ppi_dir:
                if value == -1:  # Start or continue a group of -1's
                    if not in_group:
                        in_group = True  # Beginning of a group
                elif value == 1:  # Start or continue a group of 1's
                    if not in_group:
                        in_group = True  # Beginning of a group
                    elif in_group:
                        ppi_rtt += 1
                        in_group = False  # Reset for the next group

            # Handle the case where the list ends with a valid pair
            # (a group still pending at the end is counted once more)
            if in_group:
                ppi_rtt += 1

        # print("Subgroup:")
        # print(subgroup)

        # Time span covered by the subgroup, rounded to 6 decimals
        dur = round(subgroup["DURATION"].max() - subgroup["DURATION"].min(), ndigits=6)

        #flow.append([ipsrc, ipdst, asndb.lookup(ipdst)[0], portsrc, portdst, 1, sni, dur, bytes_fromsrc, bytes_rev, packets, packets_rev, ppi_len, ppi_rtt, cat, [ppi_time, ppi_dir, ppi_size]])
        # NOTE(review): `sni` is undefined in the visible source (NameError), and
        # portdst appears twice in the row (first and fourth element).
        flow.append([portdst, asndb.lookup(ipdst)[0], portsrc, portdst, 1, sni, dur, bytes_fromsrc, bytes_rev, packets, packets_rev, ppi_len, ppi_rtt, cat, [ppi_time, ppi_dir, ppi_size]])
        