### Preliminary setup

In [2]:
# For data transformation
import pandas as pd            
# For statistical analysis
import numpy as np
import statistics as stats
# For ASN lookup
import pyasn
# Load the IP-to-ASN database from a local data file; every asndb.lookup()
# call in this notebook goes through this object.
# NOTE(review): the file name suggests a 2014-05-13 routing snapshot —
# confirm it is recent enough for the captured traffic.
asndb = pyasn.pyasn('ipasn_20140513.dat')

In [5]:
# Packet-level capture export that the flow features are derived from
input_file = "./benign_flow/master_file.csv"

# Column types enforced while parsing the capture
packet_dtypes = {
    "DURATION": float,
    "SRC_IP": str,
    "DST_IP": str,
    "SRC_PORT": int,
    "DST_PORT": int,
}

# Parse, discard the PROTOCOL column, and force BYTES to numeric
# (values that cannot be parsed become 0)
df = (
    pd.read_csv(input_file, sep=",", header=0, dtype=packet_dtypes, low_memory=False)
    .drop(columns=["PROTOCOL"])
    .assign(BYTES=lambda frame: pd.to_numeric(frame["BYTES"], errors="coerce").fillna(0))
)

# Previously exported flow table
output_file = "./benign_flow/benign.csv"
flow_df = pd.read_csv(output_file, sep=",", header=0, low_memory=False)

In [None]:
# Monitored client address (packets sent from it are "forward"), the class
# label written to every flow record, and the starting QUIC version value
ipsrc = "10.10.3.10"
label = "0"
quic_ver = "1"

# One output list per feature; the flow loop appends one value per flow.
# -- flow identity and overall volume
df_dst_port, df_dst_asn = [], []
df_dur, df_ratio = [], []
df_flow_pkt, df_flow_bytes = [], []
df_tot_pkt, df_tot_bytes = [], []
df_max_bytes, df_min_bytes, df_ave_bytes, df_std_bytes, df_var_bytes = [], [], [], [], []

# -- forward direction packet sizes
df_fwd_pkt, df_fwd_bytes = [], []
df_max_fwd_bytes, df_min_fwd_bytes, df_ave_fwd_bytes = [], [], []
df_std_fwd_bytes, df_var_fwd_bytes = [], []

# -- reverse direction packet sizes
df_rev_pkt, df_rev_bytes = [], []
df_max_rev_bytes, df_min_rev_bytes, df_ave_rev_bytes = [], [], []
df_std_rev_bytes, df_var_rev_bytes = [], []

# -- timing, both directions together
df_max_iat, df_min_iat, df_ave_iat, df_std_iat, df_var_iat = [], [], [], [], []

# -- timing, forward direction
df_fwd_dur = []
df_max_fwd_iat, df_min_fwd_iat, df_ave_fwd_iat = [], [], []
df_std_fwd_iat, df_var_fwd_iat = [], []

# -- timing, reverse direction
df_rev_dur = []
df_max_rev_iat, df_min_rev_iat, df_ave_rev_iat = [], [], []
df_std_rev_iat, df_var_rev_iat = [], []

# -- class label
df_label = []

# Per-packet remote port / ASN / QUIC version observations that the flow
# loop appends to
arr_port, arr_asn, arr_ver = [], [], []
def initialize_variables():
    """Reset the module-level accumulators of the flow being assembled.

    Rebinds the four per-direction sample lists to fresh empty lists and
    zeroes the packet/byte counters, the timing values and the packet
    counter ``ctr``. ``arr_port``, ``arr_asn`` and ``arr_ver`` are declared
    global here but are not reassigned, so this function leaves them as
    they are.
    """
    global arr_fwd_bytes, arr_rev_bytes, arr_fwd_iat, arr_rev_iat, arr_port, arr_asn, arr_ver
    global fwd_pkt, fwd_bytes, rev_pkt, rev_bytes, init_dur, dur, fwd_dur, rev_dur, ctr

    # Per-direction sample lists (packet sizes and time offsets)
    arr_fwd_bytes, arr_rev_bytes = [], []
    arr_fwd_iat, arr_rev_iat = [], []

    # Integer counters
    fwd_pkt = rev_pkt = ctr = 0

    # Float accumulators
    fwd_bytes = rev_bytes = 0.0
    init_dur = dur = fwd_dur = rev_dur = 0.0

def get_asn(ip):
    """Return the ASN that ``asndb`` reports for ``ip``.

    If the first lookup raises, the value is assumed to hold several
    comma-separated addresses and the lookup is retried with the first
    one only.

    Args:
        ip: address string taken from the SRC_IP / DST_IP column.

    Returns:
        The first element of the ``asndb.lookup`` result.

    Raises:
        Whatever the second lookup raises if the fallback address is
        also rejected.
    """
    try:
        return asndb.lookup(ip)[0]
    except Exception:
        # `except Exception` (not a bare `except`) so that an interrupt of
        # the long-running flow loop is not swallowed and retried here.
        false_ip = ip.split(",")[0].strip()
        return asndb.lookup(false_ip)[0]

def _five_stats(samples):
    """Return (max, min, mean, std, var) of ``samples``; all 0.0 when empty."""
    if len(samples) == 0:
        return 0.0, 0.0, 0.0, 0.0, 0.0
    values = np.asarray(samples, dtype=float)
    return np.max(values), np.min(values), np.mean(values), np.std(values), np.var(values)


def _most_common(samples):
    """Return the most frequent element of ``samples``, or None when empty."""
    return stats.mode(samples) if len(samples) > 0 else None


# Walk the packet table once and cut it into flows. A flow is closed by the
# packet that (a) is the 30th packet collected (ctr == 29 before it is added),
# (b) is the last row of the table, or (c) arrives more than 1.0 after the
# previous row's DURATION value. The closing packet is still counted as part
# of the flow it closes.
# NOTE(review): under rule (c) the packet after the gap therefore ends the
# *current* flow rather than opening the next one — kept as in the original.
initialize_variables()
last_index = len(df) - 1
prev_ts = None  # DURATION of the previous row, used for the gap rule

for index, row in df.iterrows():
    ts = row['DURATION']
    value = row['SRC_IP']
    is_fwd = value.startswith(ipsrc)  # forward = sent by the monitored client

    ends_flow = (
        ctr == 29
        or index == last_index
        or (prev_ts is not None and abs(ts - prev_ts) > 1.0)
    )
    prev_ts = ts

    # --- timing -----------------------------------------------------------
    if ctr == 0:
        # First packet of the flow defines its time origin. This also covers
        # a flow that is closed by its very first packet, which previously
        # measured its duration from 0.0 instead of from that packet.
        init_dur = ts
    else:
        # Offsets are measured from the first packet of the flow, not from
        # the preceding packet (original definition kept).
        offset = ts - init_dur
        if is_fwd:
            fwd_dur += offset
            arr_fwd_iat.append(offset)
        else:
            rev_dur += offset
            arr_rev_iat.append(offset)

    # --- per-packet bookkeeping ---------------------------------------------
    if is_fwd:
        arr_port.append(row['DST_PORT'])
        arr_asn.append(get_asn(row['DST_IP']))
        fwd_pkt += 1
        fwd_bytes += row['BYTES']
        arr_fwd_bytes.append(row['BYTES'])
    else:
        arr_port.append(row['SRC_PORT'])
        arr_asn.append(get_asn(row['SRC_IP']))
        rev_pkt += 1
        rev_bytes += row['BYTES']
        arr_rev_bytes.append(row['BYTES'])
    arr_ver.append(row['QUIC_VERSION'])
    ctr += 1

    # --- close the flow and emit one feature record ---------------------------
    if ends_flow:
        dst_port = _most_common(arr_port)
        # Lookups that found no ASN are not ints and are ignored; a flow with
        # no resolved ASN at all yields None instead of raising.
        dst_asn = _most_common([x for x in arr_asn if isinstance(x, int)])
        new_quic_ver = _most_common(arr_ver)

        # quic_ver stays a scalar (it used to become a one-element list here).
        if isinstance(new_quic_ver, tuple):
            quic_ver = max(int(h) for h in new_quic_ver)
        elif isinstance(new_quic_ver, int):
            quic_ver = int(new_quic_ver)

        dur += (ts - init_dur)
        ratio = 1 if rev_pkt > fwd_pkt else 0
        tot_pkt = fwd_pkt + rev_pkt
        tot_bytes = fwd_bytes + rev_bytes
        # A flow whose packets share one timestamp has no defined rate.
        flow_pkt = tot_pkt / dur if dur != 0 else 0.0
        flow_bytes = tot_bytes / dur if dur != 0 else 0.0

        # Every statistic is recomputed for every flow, so a flow with an
        # empty direction reports zeros instead of inheriting the previous
        # flow's values (or raising NameError on the first flow).
        max_bytes, min_bytes, ave_bytes, std_bytes, var_bytes = _five_stats(arr_fwd_bytes + arr_rev_bytes)
        max_fwd_bytes, min_fwd_bytes, ave_fwd_bytes, std_fwd_bytes, var_fwd_bytes = _five_stats(arr_fwd_bytes)
        max_rev_bytes, min_rev_bytes, ave_rev_bytes, std_rev_bytes, var_rev_bytes = _five_stats(arr_rev_bytes)
        max_iat, min_iat, ave_iat, std_iat, var_iat = _five_stats(arr_fwd_iat + arr_rev_iat)
        max_fwd_iat, min_fwd_iat, ave_fwd_iat, std_fwd_iat, var_fwd_iat = _five_stats(arr_fwd_iat)
        max_rev_iat, min_rev_iat, ave_rev_iat, std_rev_iat, var_rev_iat = _five_stats(arr_rev_iat)

        for target, feature in (
            (df_dst_port, dst_port),
            (df_dst_asn, dst_asn),
            (df_dur, dur),
            (df_ratio, ratio),
            (df_flow_pkt, flow_pkt),
            (df_flow_bytes, flow_bytes),
            (df_tot_pkt, tot_pkt),
            (df_tot_bytes, tot_bytes),
            (df_max_bytes, max_bytes),
            (df_min_bytes, min_bytes),
            (df_ave_bytes, ave_bytes),
            (df_std_bytes, std_bytes),
            (df_var_bytes, var_bytes),
            (df_fwd_pkt, fwd_pkt),
            (df_fwd_bytes, fwd_bytes),
            (df_max_fwd_bytes, max_fwd_bytes),
            (df_min_fwd_bytes, min_fwd_bytes),
            (df_ave_fwd_bytes, ave_fwd_bytes),
            (df_std_fwd_bytes, std_fwd_bytes),
            (df_var_fwd_bytes, var_fwd_bytes),
            (df_rev_pkt, rev_pkt),
            (df_rev_bytes, rev_bytes),
            (df_max_rev_bytes, max_rev_bytes),
            (df_min_rev_bytes, min_rev_bytes),
            (df_ave_rev_bytes, ave_rev_bytes),
            (df_std_rev_bytes, std_rev_bytes),
            (df_var_rev_bytes, var_rev_bytes),
            (df_max_iat, max_iat),
            (df_min_iat, min_iat),
            (df_ave_iat, ave_iat),
            (df_std_iat, std_iat),
            (df_var_iat, var_iat),
            (df_fwd_dur, fwd_dur),
            (df_max_fwd_iat, max_fwd_iat),
            (df_min_fwd_iat, min_fwd_iat),
            (df_ave_fwd_iat, ave_fwd_iat),
            (df_std_fwd_iat, std_fwd_iat),
            (df_var_fwd_iat, var_fwd_iat),
            (df_rev_dur, rev_dur),
            (df_max_rev_iat, max_rev_iat),
            (df_min_rev_iat, min_rev_iat),
            (df_ave_rev_iat, ave_rev_iat),
            (df_std_rev_iat, std_rev_iat),
            (df_var_rev_iat, var_rev_iat),
            (df_label, label),
        ):
            target.append(feature)

        initialize_variables()
        # The port / ASN / version samples belong to the flow just emitted;
        # without clearing them the modes above would be taken over every
        # packet seen so far and cost more with each flow.
        arr_port.clear()
        arr_asn.clear()
        arr_ver.clear()

    # track progress in percent with respect to total rows
    if index % 1000 == 0:
        print(f"{index / len(df) * 100:.2f}%")



0.00%
0.02%
0.04%
0.05%
0.07%
0.09%
0.11%
0.12%
0.14%
0.16%
0.18%
0.20%
0.21%
0.23%
0.25%
0.27%
0.29%
0.30%
0.32%
0.34%
0.36%
0.37%
0.39%
0.41%
0.43%
0.45%
0.46%
0.48%
0.50%
0.52%
0.53%
0.55%
0.57%
0.59%
0.61%
0.62%
0.64%
0.66%
0.68%
0.69%
0.71%
0.73%
0.75%
0.77%
0.78%
0.80%
0.82%
0.84%
0.86%
0.87%
0.89%
0.91%
0.93%
0.94%
0.96%
0.98%
1.00%
1.02%
1.03%
1.05%
1.07%
1.09%
1.10%
1.12%
1.14%
1.16%
1.18%
1.19%
1.21%
1.23%
1.25%
1.26%
1.28%
1.30%
1.32%
1.34%
1.35%
1.37%
1.39%
1.41%
1.43%
1.44%
1.46%
1.48%
1.50%
1.51%
1.53%
1.55%
1.57%
1.59%
1.60%
1.62%
1.64%
1.66%
1.67%
1.69%
1.71%
1.73%
1.75%
1.76%
1.78%
1.80%
1.82%
1.83%
1.85%
1.87%
1.89%
1.91%
1.92%
1.94%
1.96%
1.98%
2.00%
2.01%
2.03%
2.05%
2.07%
2.08%
2.10%
2.12%
2.14%
2.16%
2.17%
2.19%
2.21%
2.23%
2.24%
2.26%
2.28%
2.30%
2.32%
2.33%
2.35%
2.37%
2.39%
2.40%
2.42%
2.44%
2.46%
2.48%
2.49%
2.51%
2.53%
2.55%
2.57%
2.58%
2.60%
2.62%
2.64%
2.65%
2.67%
2.69%
2.71%
2.73%
2.74%
2.76%
2.78%
2.80%
2.81%
2.83%
2.85%
2.87%
2.89%
2.90%
2.92%
2.94%
2.96

In [None]:
# for item in flow:
#     print(item)

# NOTE(review): new_flow_df is only assigned in a later cell of this
# notebook, so on a fresh kernel run top-to-bottom this line raises
# NameError; run the cell that builds new_flow_df first.
print(new_flow_df)

In [None]:
# Output column name -> per-flow value list, in output column order.
# 'quic_ver' is not per-flow: it is the single current value of `quic_ver`
# and is inserted between 'dst_asn' and 'dur' by _flow_record().
_leading_columns = (
    ('dst_port', df_dst_port),
    ('dst_asn', df_dst_asn),
)
_trailing_columns = (
    ('dur', df_dur),
    ('ratio', df_ratio),
    ('flow_pkt', df_flow_pkt),
    ('flow_bytes', df_flow_bytes),
    ('tot_pkt', df_tot_pkt),
    ('tot_bytes', df_tot_bytes),
    ('max_bytes', df_max_bytes),
    ('min_bytes', df_min_bytes),
    ('ave_bytes', df_ave_bytes),
    ('std_bytes', df_std_bytes),
    ('var_bytes', df_var_bytes),
    ('fwd_pkt', df_fwd_pkt),
    ('fwd_bytes', df_fwd_bytes),
    ('max_fwd_bytes', df_max_fwd_bytes),
    ('min_fwd_bytes', df_min_fwd_bytes),
    ('ave_fwd_bytes', df_ave_fwd_bytes),
    ('std_fwd_bytes', df_std_fwd_bytes),
    ('var_fwd_bytes', df_var_fwd_bytes),
    ('rev_pkt', df_rev_pkt),
    ('rev_bytes', df_rev_bytes),
    ('max_rev_bytes', df_max_rev_bytes),
    ('min_rev_bytes', df_min_rev_bytes),
    ('ave_rev_bytes', df_ave_rev_bytes),
    ('std_rev_bytes', df_std_rev_bytes),
    ('var_rev_bytes', df_var_rev_bytes),
    ('max_iat', df_max_iat),
    ('min_iat', df_min_iat),
    ('ave_iat', df_ave_iat),
    ('std_iat', df_std_iat),
    ('var_iat', df_var_iat),
    ('fwd_dur', df_fwd_dur),
    ('max_fwd_iat', df_max_fwd_iat),
    ('min_fwd_iat', df_min_fwd_iat),
    ('ave_fwd_iat', df_ave_fwd_iat),
    ('std_fwd_iat', df_std_fwd_iat),
    ('var_fwd_iat', df_var_fwd_iat),
    ('rev_dur', df_rev_dur),
    ('max_rev_iat', df_max_rev_iat),
    ('min_rev_iat', df_min_rev_iat),
    ('ave_rev_iat', df_ave_rev_iat),
    ('std_rev_iat', df_std_rev_iat),
    ('var_rev_iat', df_var_rev_iat),
    ('label', df_label),
)


def _flow_record(position):
    """Assemble the feature dict of the flow stored at ``position``."""
    record = {}
    for column, values in _leading_columns:
        record[column] = values[position]
    record['quic_ver'] = quic_ver
    for column, values in _trailing_columns:
        record[column] = values[position]
    return record


# One dict per flow; df_dst_port decides how many flows there are
flow_rows = []
for position in range(len(df_dst_port)):
    flow_rows.append(_flow_record(position))

# Turn the records into the flow-level feature table
new_flow_df = pd.DataFrame(flow_rows)

In [13]:
# Destination of the flow-level feature table
file_path = "./benign_flow/benign1.csv"

# Persist new_flow_df as CSV, leaving out the index column
new_flow_df.to_csv(path_or_buf=file_path, index=False)

In [17]:
# Export the packet-level frame `df` as it currently stands (no index column).
# Note that this rebinds `output_file`.
output_file = "./benign_flow/filtered_output.csv"
df.to_csv(path_or_buf=output_file, index=False)

### Archive

In [None]:
# Diagnostic cell: print an overview of `df` and look for malformed address
# values (comma-joined addresses, or values that merely start with the
# expected source IP).

# Check basic DataFrame info
print(f"DataFrame shape: {df.shape}")
print(f"DataFrame columns: {df.columns.tolist()}")
print("\nSample of first 5 rows:")
print(df.head().to_string())

# Check for commas in IP fields (missing values count as "no comma")
src_ip_with_commas = df[df['SRC_IP'].str.contains(',', na=False)]
dst_ip_with_commas = df[df['DST_IP'].str.contains(',', na=False)]

print(f"\nRows with commas in SRC_IP: {len(src_ip_with_commas)}")
print(f"Rows with commas in DST_IP: {len(dst_ip_with_commas)}")

if len(src_ip_with_commas) > 0:
    print("\nSample of rows with commas in SRC_IP:")
    print(src_ip_with_commas.head().to_string())

    # Count unique values with commas
    unique_problematic_src_ips = src_ip_with_commas['SRC_IP'].unique()
    print(f"\nUnique problematic SRC_IP values ({len(unique_problematic_src_ips)}):")
    for ip in unique_problematic_src_ips[:10]:  # Show first 10
        print(f"  - {ip}")
    if len(unique_problematic_src_ips) > 10:
        print(f"  ... and {len(unique_problematic_src_ips) - 10} more")

# Rows whose SRC_IP starts with the expected address but is not exactly it.
# na=False keeps the mask strictly boolean when SRC_IP has missing values,
# matching the `contains` checks above; without it a single NaN breaks the
# row selection.
ipsrc = "10.10.3.10"
starts_with_ipsrc = df['SRC_IP'].str.startswith(ipsrc, na=False)
problem_rows = df[starts_with_ipsrc & (df['SRC_IP'] != ipsrc)]
print(f"\nRows where SRC_IP starts with '{ipsrc}' but isn't exactly '{ipsrc}': {len(problem_rows)}")

if len(problem_rows) > 0:
    print("\nSample of these problematic rows:")
    print(problem_rows.head().to_string())

    # Compare string values and lengths for the first offending value
    example = problem_rows['SRC_IP'].iloc[0]
    print(f"\nDetailed comparison:")
    print(f"Expected: '{ipsrc}' (length: {len(ipsrc)})")
    print(f"Actual  : '{example}' (length: {len(example)})")
    print(f"Equal?  : {ipsrc == example}")

    # Check character by character
    print("\nCharacter by character comparison:")
    for i in range(max(len(ipsrc), len(example))):
        if i < len(ipsrc) and i < len(example):
            match = ipsrc[i] == example[i]
            print(f"Position {i}: '{ipsrc[i]}' vs '{example[i]}' - Match: {match}")
        elif i < len(ipsrc):
            print(f"Position {i}: '{ipsrc[i]}' vs (no character) - No match")
        else:
            print(f"Position {i}: (no character) vs '{example[i]}' - No match")

In [None]:
# Archived flow builder: groups consecutive packets by remote endpoint, cuts
# each group into chunks of at most 30 packets and records per-chunk totals
# plus the per-packet (time, direction, size) sequences.
# NOTE(review): this cell adds the columns "true_dest" and "group" to `df`
# and rebinds `ipsrc` and `dur`, which other cells also use.

# source IP: taken from the first row of the capture
ipsrc = df["SRC_IP"].iloc[0]
portsrc = df["SRC_PORT"].iloc[0]
cat = "Streaming"

flow = []

# Remote endpoint of every packet: DST_IP when the packet was sent by ipsrc,
# otherwise SRC_IP. A new group starts whenever that endpoint changes between
# consecutive rows.
df["true_dest"] = df["DST_IP"].where(df["SRC_IP"] == ipsrc, df["SRC_IP"])
df["group"] = (df["true_dest"] != df["true_dest"].shift()).cumsum()

for group, group_df in df.groupby("group"):
    num_subgroups = (len(group_df) + 29) // 30
    subgroups = [group_df.iloc[i * 30:(i + 1) * 30] for i in range(num_subgroups)]

    for subgroup in subgroups:
        ipdst = subgroup["true_dest"].iloc[0]  # The unique normalized destination for this subgroup
        portdst = subgroup["DST_PORT"].iloc[0]

        # Per-packet time gaps (in thousandths of a DURATION unit) and sizes
        ppi_time = [0]
        ppi_size = [int(subgroup["BYTES"].iloc[0])]
        for i in range(1, len(subgroup)):
            # Calculate the time difference between consecutive packets
            duration = int((subgroup["DURATION"].iloc[i] - subgroup["DURATION"].iloc[i - 1]) * 1000)
            ppi_time.append(duration)
            ppi_size.append(int(subgroup["BYTES"].iloc[i]))

        # Per-packet direction: 1 = ipsrc -> ipdst, -1 = ipdst -> ipsrc, 0 = other
        ppi_dir = []
        for _, row in subgroup.iterrows():
            if row["SRC_IP"] == ipsrc and row["DST_IP"] == ipdst:
                ppi_dir.append(1)
            elif row["SRC_IP"] == ipdst and row["DST_IP"] == ipsrc:
                ppi_dir.append(-1)
            else:
                ppi_dir.append(0)

        # The totals and the RTT scan depend only on the whole subgroup, so
        # they are computed once here rather than once per packet inside the
        # loop above; the values are the ones the last iteration used to give.
        from_src = (subgroup["SRC_IP"] == ipsrc) & (subgroup["DST_IP"] == ipdst)
        from_dst = (subgroup["SRC_IP"] == ipdst) & (subgroup["DST_IP"] == ipsrc)
        bytes_fromsrc = int(subgroup.loc[from_src, "BYTES"].sum())
        bytes_rev = int(subgroup.loc[from_dst, "BYTES"].sum())
        packets = int(subgroup.loc[from_src, "BYTES"].count())
        packets_rev = int(subgroup.loc[from_dst, "BYTES"].count())
        ppi_len = len(ppi_dir)

        # Scan the directions: a -1 always opens a pending exchange; a 1
        # closes (and counts) a pending exchange, or opens one if none is
        # pending. A still-pending exchange at the end counts once more.
        ppi_rtt = 0
        in_group = False
        for value in ppi_dir:
            if value == -1:
                in_group = True
            elif value == 1:
                if in_group:
                    ppi_rtt += 1
                    in_group = False
                else:
                    in_group = True
        if in_group:
            ppi_rtt += 1

        dur = round(subgroup["DURATION"].max() - subgroup["DURATION"].min(), ndigits=6)

        # NOTE(review): `sni` is not assigned anywhere in the visible notebook,
        # so this line raises NameError unless it is defined beforehand.
        #flow.append([ipsrc, ipdst, asndb.lookup(ipdst)[0], portsrc, portdst, 1, sni, dur, bytes_fromsrc, bytes_rev, packets, packets_rev, ppi_len, ppi_rtt, cat, [ppi_time, ppi_dir, ppi_size]])
        flow.append([portdst, asndb.lookup(ipdst)[0], portsrc, portdst, 1, sni, dur, bytes_fromsrc, bytes_rev, packets, packets_rev, ppi_len, ppi_rtt, cat, [ppi_time, ppi_dir, ppi_size]])
        

In [None]:
import os
import pandas as pd
import glob
from tqdm import tqdm
import time

def merge_csv_files(parent_folder, output_file='master_file.csv'):
    """
    Merge all CSV files from multiple folders into one master file.

    The header of the first input file (in sorted path order) defines the
    columns of the master file. Chunks from files whose columns differ are
    re-aligned to that header by column name before being appended: missing
    columns are left empty and extra columns are dropped, with a warning.
    A master file left over from an earlier run inside ``parent_folder`` is
    not treated as an input.

    Args:
        parent_folder (str): Path to the parent folder containing subfolders with CSV files
        output_file (str): Path where the merged CSV file will be saved

    Returns:
        None. Progress and summary information is printed.
    """
    start_time = time.time()

    # Get all CSV files from all subfolders, except the output file itself:
    # merging a previous master into the new one would duplicate every row.
    output_abspath = os.path.abspath(output_file)
    all_csv_files = sorted(
        path
        for path in glob.glob(os.path.join(parent_folder, '**', '*.csv'), recursive=True)
        if os.path.abspath(path) != output_abspath
    )

    if not all_csv_files:
        print(f"No CSV files found in {parent_folder} or its subfolders.")
        return

    print(f"Found {len(all_csv_files)} CSV files in {len(set(os.path.dirname(f) for f in all_csv_files))} folders.")

    # Only the header row of the first file is needed (nrows=0 reads no data)
    header = pd.read_csv(all_csv_files[0], nrows=0).columns.tolist()

    # Start the master file with just the header
    pd.DataFrame(columns=header).to_csv(output_file, index=False, encoding='utf-8')

    # Process each file and append to master file
    print("Merging files...")
    for file_path in tqdm(all_csv_files):
        try:
            warned = False
            # Read each CSV file in chunks to handle large files efficiently
            for chunk in pd.read_csv(file_path, chunksize=100000):
                if chunk.columns.tolist() != header:
                    if not warned:
                        print(f"Warning: columns of {file_path} differ from the master header; aligning by column name.")
                        warned = True
                    # Rows are appended without a header, so the column
                    # order must match the master file exactly.
                    chunk = chunk.reindex(columns=header)
                # Append to the master file without writing the header again
                chunk.to_csv(output_file, mode='a', header=False, index=False, encoding='utf-8')
        except Exception as e:
            print(f"Error processing file {file_path}: {str(e)}")

    # Final processing time
    elapsed_time = time.time() - start_time
    print(f"Merge completed in {elapsed_time:.2f} seconds.")
    print(f"Master file saved as: {os.path.abspath(output_file)}")

    # Get some basic stats about the merged file
    try:
        file_size_mb = os.path.getsize(output_file) / (1024 * 1024)
        # Count lines with the file closed deterministically afterwards
        with open(output_file, 'r', encoding='utf-8') as f_master:
            row_count = sum(1 for _ in f_master) - 1  # Subtract 1 for header
        print(f"Master file size: {file_size_mb:.2f} MB")
        print(f"Total rows: {row_count:,}")
    except Exception as e:
        print(f"Error getting file stats: {str(e)}")

if __name__ == "__main__":
    # Ask for the folder whose subfolders hold the CSV files
    parent_folder = input("Enter the path to the parent folder: ").strip()

    # Ask for the destination; an empty answer selects the default name
    output_file = (
        input("Enter the path for the master file (or press Enter for 'master_file.csv'): ").strip()
        or "master_file.csv"
    )

    # Merge everything into the chosen master file
    merge_csv_files(parent_folder, output_file)

In [None]:
# Archived check: resolve the ASN of the remote endpoint of every packet and
# report each address that only resolves after trimming it at the first comma.
ipsrc = "10.10.3.10"
lst_asn = []

for index, row in df.iterrows():
    value = row['SRC_IP']
    # Remote endpoint: the destination for packets sent by ipsrc, else the source
    peer_ip = row['DST_IP'] if value.startswith(ipsrc) else row['SRC_IP']

    try:
        lst_asn.append(asndb.lookup(peer_ip)[0])
    except Exception:
        # `except Exception` instead of a bare `except`, so interrupting this
        # long loop is not swallowed. The fallback lookup is done once and
        # its result reused for the message.
        false_ip = peer_ip.split(",")[0]
        fallback_asn = asndb.lookup(false_ip)[0]
        lst_asn.append(fallback_asn)
        print(f"ip: {false_ip} -> asn: {fallback_asn}")

    # track progress in percent with respect to total rows
    if index % 1000 == 0:
        print(f"{index / len(df) * 100:.2f}%")
        