### Preliminary setup

In [3]:
import pandas as pd             # For data transformation
import pyasn
# Lookup object built from a local pyasn data file; later cells call
# asndb.lookup(ip)[0] to fill the ASN field of each flow record.
# NOTE(review): the file name suggests a 2014-05-13 snapshot — confirm it is recent enough.
asndb = pyasn.pyasn('ipasn_20140513.dat')

In [24]:
# Load one packet capture exported as CSV and keep only its QUIC packets.
input_file = "./benign_flow/test16.csv"
df = pd.read_csv(input_file)

# The header row read from the file is replaced by these canonical names;
# the assignment raises if the file does not have exactly eight columns.
column_names = ["DURATION", "SRC_IP", "DST_IP", "SRC_PORT", "DST_PORT", "QUIC_VERSION", "BYTES", "PROTOCOL"]
df.columns = column_names

# Column to inspect and the substring a row must contain to be kept.
column_name = "PROTOCOL"
protocol_filter = "QUIC"

# Keep only rows whose protocol column mentions QUIC (missing values are dropped).
# .copy() detaches the result from the unfiltered frame so the column
# assignments below do not trigger SettingWithCopyWarning.
df = df[df[column_name].str.contains(protocol_filter, na=False)].copy()

# Fail with a clear message instead of an IndexError from .iloc[0] below.
if df.empty:
    raise ValueError(f"No {protocol_filter} packets found in {input_file}")

# Non-numeric byte counts become 0 so later int() conversions cannot fail.
df["BYTES"] = pd.to_numeric(df["BYTES"], errors='coerce').fillna(0)
# Re-base timestamps so the first kept packet is at t = 0.
df["DURATION"] = df["DURATION"] - df["DURATION"].iloc[0]

FileNotFoundError: [Errno 2] No such file or directory: './benign_flow/test16.csv'

In [21]:
# Build one flow record per run of at most MAX_PACKETS_PER_FLOW consecutive
# packets exchanged with the same remote host.

# Local endpoint: the sender of the first packet in the capture.
ipsrc = df["SRC_IP"].iloc[0]
# Port of that first packet. Kept as a module-level name for other cells;
# flow records use the per-flow port derived inside the loop instead.
portsrc = df["SRC_PORT"].iloc[0]
sni = "youtube.com"
cat = "Streaming"

# Maximum number of packets described by a single flow record.
MAX_PACKETS_PER_FLOW = 30


def count_direction_rounds(directions):
    """Return the ppi_rtt counter for a sequence of packet directions.

    `directions` holds 1 (local -> remote), -1 (remote -> local) or 0 (other).
    A -1 opens a pending round. A 1 closes a pending round (counting it) or,
    if none is pending, opens one. A round still pending at the end of the
    sequence is counted as well. Zeros are ignored.
    """
    rounds = 0
    pending = False
    for direction in directions:
        if direction == -1:
            pending = True
        elif direction == 1:
            if pending:
                rounds += 1
                pending = False
            else:
                pending = True
    if pending:
        rounds += 1
    return rounds


flow = []

# Remote endpoint of every packet, regardless of direction (vectorized).
df["true_dest"] = df["DST_IP"].where(df["SRC_IP"] == ipsrc, df["SRC_IP"])
# A new group starts each time the remote endpoint changes between consecutive packets.
df["group"] = (df["true_dest"] != df["true_dest"].shift()).cumsum()

for _, group_df in df.groupby("group"):
    # Split each run into chunks of at most MAX_PACKETS_PER_FLOW packets.
    for start in range(0, len(group_df), MAX_PACKETS_PER_FLOW):
        subgroup = group_df.iloc[start:start + MAX_PACKETS_PER_FLOW]

        ipdst = subgroup["true_dest"].iloc[0]  # remote endpoint shared by the whole chunk

        # Ports must follow the direction of the first packet: when the chunk
        # starts with a remote -> local packet, DST_PORT is the local port.
        if subgroup["SRC_IP"].iloc[0] == ipsrc:
            flow_portsrc = subgroup["SRC_PORT"].iloc[0]
            portdst = subgroup["DST_PORT"].iloc[0]
        else:
            flow_portsrc = subgroup["DST_PORT"].iloc[0]
            portdst = subgroup["SRC_PORT"].iloc[0]

        # Inter-packet gaps in whole milliseconds (truncated); the first packet has gap 0.
        timestamps = subgroup["DURATION"].tolist()
        ppi_time = [0] + [int((cur - prev) * 1000) for prev, cur in zip(timestamps, timestamps[1:])]
        ppi_size = [int(size) for size in subgroup["BYTES"]]

        # Direction masks, computed once per chunk.
        forward = (subgroup["SRC_IP"] == ipsrc) & (subgroup["DST_IP"] == ipdst)
        reverse = (subgroup["SRC_IP"] == ipdst) & (subgroup["DST_IP"] == ipsrc)
        ppi_dir = [1 if fwd else (-1 if rev else 0) for fwd, rev in zip(forward, reverse)]

        # Per-flow totals, computed once per chunk rather than once per packet.
        bytes_fromsrc = int(subgroup.loc[forward, "BYTES"].sum())
        bytes_rev = int(subgroup.loc[reverse, "BYTES"].sum())
        packets = int(subgroup.loc[forward, "BYTES"].count())
        packets_rev = int(subgroup.loc[reverse, "BYTES"].count())
        ppi_len = len(ppi_dir)
        ppi_rtt = count_direction_rounds(ppi_dir)

        dur = round(subgroup["DURATION"].max() - subgroup["DURATION"].min(), ndigits=6)

        flow.append([ipsrc, ipdst, asndb.lookup(ipdst)[0], flow_portsrc, portdst, 1, sni, dur, bytes_fromsrc, bytes_rev, packets, packets_rev, ppi_len, ppi_rtt, cat, [ppi_time, ppi_dir, ppi_size]])

In [22]:
# Show every extracted flow record, one record per output line.
for record in flow:
    print(record)

['10.10.3.10', '142.250.71.246', 15169, 52442, 443, 1, 'youtube.com', 0.525763, 3860, 8623, 7, 11, 18, 6, 'Streaming', [[0, 0, 166, 0, 0, 0, 1, 11, 0, 0, 0, 1, 151, 1, 164, 0, 0, 25], [1, 1, -1, -1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, -1, -1, 1, -1], [1250, 1250, 40, 1250, 1250, 1250, 1250, 1250, 1250, 1250, 39, 40, 0, 0, 937, 122, 31, 24]]]
['10.10.3.10', '202.90.156.78', 9821, 52442, 443, 1, 'youtube.com', 0.022014, 6016, 11358, 13, 17, 30, 9, 'Streaming', [[0, 0, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0], [1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, -1, -1, -1], [1250, 1250, 40, 1250, 0, 1250, 1250, 0, 39, 39, 37, 41, 83, 72, 530, 116, 1246, 1246, 654, 34, 33, 23, 27, 33, 585, 1246, 1250, 1250, 1250, 1250]]]
['10.10.3.10', '202.90.156.78', 9821, 52442, 49405, 1, 'youtube.com', 0.004181, 202, 30000, 6, 24, 30, 5, 'Streaming', [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0,

In [23]:
# Append the extracted flow records to the accumulated benign-flow table.
# NOTE: re-running this cell appends the same records again.
file_path = "./benign_flow/benign.csv"

# A dedicated name keeps the packet-level `df` intact for later cells.
benign_df = pd.read_csv(file_path)

# Add all new records in one concat instead of growing the frame row by row.
# The DataFrame constructor raises if a record's length differs from the
# number of columns in the existing table.
if flow:
    new_rows = pd.DataFrame(flow, columns=benign_df.columns)
    benign_df = pd.concat([benign_df, new_rows], ignore_index=True)

# Write the updated table back to the same CSV file.
benign_df.to_csv(file_path, index=False)

In [144]:
# Persist the current `df` as CSV, omitting the index column.
# NOTE(review): make sure `df` still refers to the filtered packet frame when
# this cell runs; it writes whatever `df` is bound to at that moment.
output_file = "./benign_flow/filtered_output.csv"
df.to_csv(path_or_buf=output_file, index=False)