In [1]:
import os
import pickle
import random
from collections import Counter

import dask.dataframe as dd
import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [2]:
# file_name = "/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/BotNet/Botnet-attack.csv"

In [3]:
## helper functions
# Define a function to modify 'ServePort' based on the condition
def modify_serve_port(df):
    """Collapse every ``ServePort`` value above 1024 into the single value 1025.

    Values of 1024 or less are kept as they are. The column is overwritten on
    ``df`` itself and that same frame is returned, which is the shape
    ``map_partitions`` expects from a per-partition function.
    """
    serve_port = df['ServePort']
    is_high_port = serve_port > 1024
    df['ServePort'] = np.where(is_high_port, 1025, serve_port)
    return df

def assign_flow_id(df):
    """Add a direction-independent flow key and an integer flow id to ``df``.

    Two columns are written onto ``df`` (which is also returned):

    * ``flow_identifier`` - ``"<ip_lo>_<ip_hi>_<port_lo>_<port_hi>_<protocol>"``
      built from ``IP_source``/``IP_destination``, ``Port_source``/
      ``Port_destination`` and ``Layer_4_protocol``.
    * ``flow_id`` - the ``pd.factorize`` code of ``flow_identifier`` (numbered
      in order of first appearance).

    NOTE(review): the two IPs and the two ports are sorted independently of
    each other, so the key does not record which port belonged to which IP.
    """
    def _flow_key(row):
        # Sorting makes A->B and B->A packets produce the same key.
        ip_lo, ip_hi = sorted((row['IP_source'], row['IP_destination']))
        port_lo, port_hi = sorted((row['Port_source'], row['Port_destination']))
        protocol = row['Layer_4_protocol']
        return f"{ip_lo}_{ip_hi}_{port_lo}_{port_hi}_{protocol}"

    df['flow_identifier'] = df.apply(_flow_key, axis=1)

    flow_codes, _unique_keys = pd.factorize(df['flow_identifier'])
    df['flow_id'] = flow_codes

    return df


def create_flow_summary(df):
    """Aggregate a packet table into one summary row per ``flow_id``.

    The input frame is only read; nothing is written back to it. (Writing a
    helper column onto the caller's frame raised ``SettingWithCopyWarning``
    whenever the caller passed a filtered slice.)

    Args:
        df: packet-level frame with the columns ``flow_id``, ``TIME``,
            ``Size``, ``IAT``, ``Layer_4_protocol``, ``IP_source``,
            ``IP_destination`` and ``ServePort``.

    Returns:
        A frame with a fresh integer index and the columns, in this order:
        ``flow_id``, ``serve_ip``, ``start_time``, ``end_time``,
        ``average_packet_size``, ``average_IAT``, ``num_packets``,
        ``l4_protocol``, ``srcIP``, ``dstIP``, ``ServePort``, ``duration``,
        ``direction``.

        * ``serve_ip`` is the smaller of source/destination IP of the flow's
          first packet (plain value comparison, i.e. lexicographic for strings).
        * ``duration`` is ``end_time - start_time``.
        * ``direction`` is 1 when the flow contains more than one distinct
          ``IP_source`` value, else 0.
    """
    # Per-packet min(source, destination), computed on raw arrays so it is
    # independent of the frame's index (which may contain duplicates).
    src_ip = df['IP_source'].to_numpy()
    dst_ip = df['IP_destination'].to_numpy()
    serve_ip = pd.Series(np.where(src_ip <= dst_ip, src_ip, dst_ip), index=df.index)

    by_flow = df.groupby('flow_id')

    flow_summary = by_flow.agg(
        start_time=pd.NamedAgg(column='TIME', aggfunc='min'),
        end_time=pd.NamedAgg(column='TIME', aggfunc='max'),
        average_packet_size=pd.NamedAgg(column='Size', aggfunc='mean'),
        average_IAT=pd.NamedAgg(column='IAT', aggfunc='mean'),
        num_packets=pd.NamedAgg(column='TIME', aggfunc='count'),
        l4_protocol=pd.NamedAgg(column='Layer_4_protocol', aggfunc='first'),
        srcIP=pd.NamedAgg(column='IP_source', aggfunc='first'),
        dstIP=pd.NamedAgg(column='IP_destination', aggfunc='first'),
        ServePort=pd.NamedAgg(column='ServePort', aggfunc='first'),
    )

    # serve_ip goes first so the column layout matches what callers select from.
    serve_ip_per_flow = serve_ip.groupby(df['flow_id'].to_numpy()).first()
    flow_summary.insert(0, 'serve_ip', serve_ip_per_flow)

    flow_summary['duration'] = flow_summary['end_time'] - flow_summary['start_time']

    # More than one distinct sender inside a flow means traffic went both ways.
    # dropna=False keeps missing senders counted as a value of their own.
    distinct_senders = by_flow['IP_source'].nunique(dropna=False)
    flow_summary['direction'] = (distinct_senders > 1).astype(int)

    return flow_summary.reset_index()

# Module-level bookkeeping for assign_session_id.
# Both objects live for the lifetime of the kernel and are never reset in this
# block, so ids keep counting up and remembered end times carry over between
# successive uses of assign_session_id.
last_end_time = {}        # serve_ip -> end_time of the most recent flow seen for it
global_session_id = 0     # last session id handed out


def assign_session_id(row):
    """Return the session id for one flow row, updating the module-level state.

    ``row`` must provide ``serve_ip``, ``start_time`` and ``end_time``; the two
    time values must support subtraction yielding an object with
    ``total_seconds()``.

    A new id is issued when ``serve_ip`` has not been seen before, or when the
    flow starts more than 30 seconds after the end of the previous flow
    recorded for that ``serve_ip``. Otherwise the current global id is reused.
    Rows are expected to arrive ordered by ``serve_ip`` and ``start_time``.
    """
    global global_session_id

    ip = row['serve_ip']
    flow_start = row['start_time']

    if ip in last_end_time:
        idle_seconds = (flow_start - last_end_time[ip]).total_seconds()
        opens_new_session = idle_seconds > 30
    else:
        # First flow ever recorded for this address.
        opens_new_session = True

    if opens_new_session:
        global_session_id += 1

    # Remember where this flow ended for the next gap computation.
    last_end_time[ip] = row['end_time']
    return global_session_id

In [4]:
## helper functions
### Tokenization Binning
# process for the Packet
def equal_width_binning_packet(df, n_bins=1026, strategy='uniform',
                               subsample_size=200000, random_state=0):
    """Replace the packet-level numeric features by ordinal bin indices.

    ``IAT``, ``Size`` and ``Payload_Size`` are each fitted with their own
    ``KBinsDiscretizer(encode='ordinal')`` and overwritten with the integer
    bin index. The input frame is left untouched; a modified copy is returned.

    The bin edges are learned from the frame passed in, so frames binned in
    separate calls do not share a common token scale.

    Args:
        df: packet table containing the three columns named above.
        n_bins: number of bins per feature.
        strategy: ``'uniform'`` (equal width), ``'quantile'`` (equal
            frequency) or ``'kmeans'``.
        subsample_size: maximum number of rows used to fit the bin edges;
            ``None`` disables subsampling.
        random_state: seed for the subsampling step. A fixed default keeps the
            bin edges identical between runs on the same data.

    Returns:
        A copy of ``df`` with the three feature columns converted to ``int``
        bin indices. An empty frame is returned as an unchanged copy.
    """
    features = ('IAT', 'Size', 'Payload_Size')

    df_copy = df.copy()
    if len(df_copy) == 0:
        # Nothing to fit on; the discretizer rejects zero-row input.
        return df_copy

    for feature in features:
        discretizer = KBinsDiscretizer(
            n_bins=n_bins,
            encode='ordinal',
            strategy=strategy,
            subsample=subsample_size,
            random_state=random_state,
        )
        binned = discretizer.fit_transform(df_copy[[feature]])
        # fit_transform returns an (n, 1) array; flatten it to a plain column.
        df_copy[feature] = binned.astype(int).ravel()
    return df_copy

# process for the flow
def equal_width_binning_flow(df, n_bins=1026, strategy='uniform',
                             subsample_size=200000, random_state=0):
    """Replace the flow-level numeric features by ordinal bin indices.

    ``duration``, ``average_packet_size``, ``average_IAT`` and ``num_packets``
    are each fitted with their own ``KBinsDiscretizer(encode='ordinal')`` and
    overwritten with the integer bin index. The input frame is left untouched;
    a modified copy is returned.

    The bin edges are learned from the frame passed in, so frames binned in
    separate calls do not share a common token scale.

    Args:
        df: flow summary table containing the four columns named above.
        n_bins: number of bins per feature.
        strategy: ``'uniform'`` (equal width), ``'quantile'`` (equal
            frequency) or ``'kmeans'``.
        subsample_size: maximum number of rows used to fit the bin edges;
            ``None`` disables subsampling.
        random_state: seed for the subsampling step. A fixed default keeps the
            bin edges identical between runs on the same data.

    Returns:
        A copy of ``df`` with the four feature columns converted to ``int``
        bin indices. An empty frame is returned as an unchanged copy.
    """
    columns_to_bin = ('duration', 'average_packet_size', 'average_IAT', 'num_packets')

    df_copy = df.copy()
    if len(df_copy) == 0:
        # Nothing to fit on; the discretizer rejects zero-row input.
        return df_copy

    for col in columns_to_bin:
        discretizer = KBinsDiscretizer(
            n_bins=n_bins,
            encode='ordinal',
            strategy=strategy,
            subsample=subsample_size,
            random_state=random_state,
        )
        binned = discretizer.fit_transform(df_copy[[col]])
        # fit_transform returns an (n, 1) array; flatten it to a plain column.
        df_copy[col] = binned.astype(int).ravel()
    return df_copy

In [5]:
def processing_file(file_name):
    """Turn one packet-capture CSV into tokenized packet and flow tables.

    Steps:
      1. Read the CSV with dask, derive ``Direction`` (1 when ``IP_source`` <
         ``IP_destination``) and ``ServePort`` (smaller of the two ports, with
         values above 1024 collapsed by ``modify_serve_port``), then
         materialise as a pandas frame.
      2. Assign flow ids, order packets by flow and ``TIME``, and compute the
         per-flow inter-arrival time ``IAT`` (0 for a flow's first packet).
      3. Drop packets whose ``Size`` or ``Payload_Size`` exceeds 5000.
      4. Build the per-flow summary and group flows into sessions: for each
         ``serve_ip`` a new session starts at its first flow and whenever a
         flow starts more than 30 seconds after the previous flow of that
         ``serve_ip`` ended.
      5. Bin the numeric packet and flow features.

    Session ids are computed from this file's flows only and start at 1 on
    every call; no state is shared between calls.

    Args:
        file_name: path (or glob) accepted by ``dask.dataframe.read_csv``. The
            CSV must provide ``IP_source``, ``IP_destination``,
            ``Port_source``, ``Port_destination``, ``Layer_4_protocol``,
            ``TIME``, ``Size`` and ``Payload_Size``.

    Returns:
        ``(df_packet_token, df_flow_token)`` - the binned packet table
        (``session_id``, ``flow_id``, ``Direction``, ``Layer_4_protocol``,
        ``Size``, ``Payload_Size``, ``IAT``, ``ServePort``) and the binned flow
        table (``session_id``, ``flow_id``, ``direction``, ``duration``,
        ``l4_protocol``, ``average_packet_size``, ``average_IAT``,
        ``num_packets``, ``ServePort``).
    """
    max_packet_bytes = 5000     # packets larger than this are discarded
    session_idle_seconds = 30   # idle gap that separates two sessions

    # ---- packet level ---------------------------------------------------
    dask_df = dd.read_csv(file_name)
    dask_df['Direction'] = (dask_df['IP_source'] < dask_df['IP_destination']).astype(int)
    dask_df['ServePort'] = dask_df[['Port_source', 'Port_destination']].min(axis=1)
    dask_df = dask_df.map_partitions(modify_serve_port, meta=dask_df)

    # From here on everything runs on an in-memory pandas frame.
    df = dask_df.compute()
    df = assign_flow_id(df)

    # Packets of a flow must be time-ordered before differencing TIME.
    df = df.sort_values(by=['flow_id', 'TIME'])
    df['IAT'] = df.groupby('flow_id')['TIME'].diff().fillna(0)

    within_limits = (df['Size'] <= max_packet_bytes) & (df['Payload_Size'] <= max_packet_bytes)
    df = df[within_limits]
    print("Packet Processing Done")

    # ---- flow level -----------------------------------------------------
    flow_summary_df = create_flow_summary(df)
    print(flow_summary_df.shape)

    # NOTE(review): if TIME is a numeric epoch value, pd.to_datetime reads it
    # as nanoseconds unless `unit=` is given, which would make the idle-gap
    # test below almost never fire - confirm the unit of TIME.
    flow_summary_df['start_time'] = pd.to_datetime(flow_summary_df['start_time'])
    flow_summary_df['end_time'] = pd.to_datetime(flow_summary_df['end_time'])

    flow_summary_df = flow_summary_df.sort_values(by=['serve_ip', 'start_time'])

    # ---- sessions -------------------------------------------------------
    # Computed from this frame alone so that flows from a previously processed
    # file can never influence (or be merged into) this file's sessions.
    flows_by_ip = flow_summary_df.groupby('serve_ip')
    first_flow_of_ip = flows_by_ip.cumcount() == 0
    previous_end = flows_by_ip['end_time'].shift()
    idle_seconds = (flow_summary_df['start_time'] - previous_end).dt.total_seconds()
    opens_session = first_flow_of_ip | (idle_seconds > session_idle_seconds)
    # Running count of session starts, in row order, is the session id.
    flow_summary_df['session_id'] = opens_session.cumsum()

    df_with_sessions = pd.merge(df, flow_summary_df[['flow_id', 'session_id']], on='flow_id', how='left')
    # A stable sort keeps the flow/time order established above inside each
    # session; the default quicksort gives no such guarantee.
    df_with_sessions = df_with_sessions.sort_values(by=['session_id'], kind='mergesort')

    ## final flow representation
    final_flow = flow_summary_df[['session_id','flow_id','direction', 'duration', 'l4_protocol', 'average_packet_size','average_IAT','num_packets','ServePort']].copy()
    ## final packet representation
    final_packet = df_with_sessions[['session_id','flow_id','Direction', 'Layer_4_protocol','Size','Payload_Size','IAT', 'ServePort']].copy()

    print("Tokenization: Binning")
    df_packet_token = equal_width_binning_packet(final_packet)
    df_flow_token = equal_width_binning_flow(final_flow)

    return df_packet_token, df_flow_token

In [6]:
# df_packet_token, df_flow_token = processing_file(file_name)

In [7]:
## part 2 input generation 
def create_embeddings(length, embedding_value):
    """Return a 1-D array with ``length`` entries, all equal to ``embedding_value``."""
    segment_ids = np.full(shape=length, fill_value=embedding_value)
    return segment_ids

def input_generation(df_flow_token, df_packet_token, def_label,
                     max_len=2000, padding_value=0, max_packets_per_flow=9):
    """Flatten tokenized flows and packets into fixed-length model inputs.

    For every session the token stream is built flow by flow: first the
    flow-level features of a flow (segment id 0), then the features of its
    first ``max_packets_per_flow`` packets (segment id 1). The stream is then
    cut into consecutive pieces of ``max_len`` tokens; the last piece is
    right-padded with ``padding_value``.

    Feature columns are taken to be every column other than ``flow_id`` and
    ``session_id``, selected explicitly so the result does not depend on
    whether ``groupby`` hands the grouping column to the per-group function.

    NOTE(review): ``padding_value`` 0 is also a value ordinal binning produces
    (bin index 0), while ``NetformerDatasetDownstream`` declares PAD as 1027 -
    confirm which padding id the model expects.

    Args:
        df_flow_token: flow table with ``session_id``, ``flow_id`` and the
            flow feature columns (one row per flow).
        df_packet_token: packet table with ``session_id``, ``flow_id`` and the
            packet feature columns, rows of a flow in the desired order.
        def_label: label assigned to every produced piece.
        max_len: length of every produced piece.
        padding_value: filler for the tail of the last piece of a session.
        max_packets_per_flow: number of leading packets kept per flow.

    Returns:
        ``(input_sequences, input_segments, labels)`` - two equally long lists
        of 1-D arrays of length ``max_len`` and a float array holding
        ``def_label`` once per piece.

    Raises:
        KeyError: if a flow listed in ``df_flow_token`` has no rows in
            ``df_packet_token``.
    """
    id_columns = ('flow_id', 'session_id')
    flow_columns = [c for c in df_flow_token.columns if c not in id_columns]
    packet_columns = [c for c in df_packet_token.columns if c not in id_columns]

    # session id -> flow ids in table order
    session_dict = df_flow_token.groupby('session_id')['flow_id'].apply(list).to_dict()

    # flow id -> flattened (row-major) tokens of its leading packets
    flow_id_dict = {
        flow_id: packets[packet_columns].iloc[:max_packets_per_flow].to_numpy().ravel().tolist()
        for flow_id, packets in df_packet_token.groupby('flow_id')
    }
    # flow id -> flattened flow-level tokens
    flow_feature_dict = {
        flow_id: rows[flow_columns].to_numpy().ravel().tolist()
        for flow_id, rows in df_flow_token.groupby('flow_id')
    }

    session_feature_dict = {}
    segment_embedding_dict = {}

    for session_id in tqdm(session_dict.keys(), desc="Processing sessions"):
        session_features = []
        session_embeddings = []
        for flow_id in session_dict[session_id]:
            flow_feat = flow_feature_dict[flow_id]
            session_features.extend(flow_feat)
            session_embeddings.extend(create_embeddings(len(flow_feat), 0))  # 0 = flow token

            packet_feat = flow_id_dict[flow_id]
            session_features.extend(packet_feat)
            session_embeddings.extend(create_embeddings(len(packet_feat), 1))  # 1 = packet token

        session_feature_dict[session_id] = session_features
        segment_embedding_dict[session_id] = session_embeddings

    print("Processing complete.")

    def pad_sequence(seq, target_len, pad_value=0):
        # Right-pad up to target_len; sequences already long enough are unchanged.
        return np.pad(seq, (0, max(target_len - len(seq), 0)), mode='constant', constant_values=pad_value)

    input_sequences = []
    input_segments = []

    for s_id in tqdm(session_feature_dict.keys(), desc="Processing sessions"):
        ids = np.array(session_feature_dict[s_id])
        segs = np.array(segment_embedding_dict[s_id])

        # Cut the session into max_len-sized pieces, padding the last one.
        for start in range(0, len(ids), max_len):
            stop = min(start + max_len, len(ids))
            input_sequences.append(pad_sequence(ids[start:stop], max_len, padding_value))
            input_segments.append(pad_sequence(segs[start:stop], max_len, padding_value))

    print("Processing complete.")

    labels = def_label * np.ones((len(input_sequences)))

    return input_sequences, input_segments, labels


In [8]:
# input_sequences,input_segments,labels = input_generation(df_flow_token, df_packet_token, 1)
# len(input_sequences)

In [12]:
class NetformerDatasetDownstream(Dataset):
    """Map-style dataset over pre-built token sequences for downstream training.

    Item ``i`` is built from ``input_sequences[i]``, ``input_segments[i]`` and
    ``input_labels[i]``. Tokens are optionally masked with probability
    ``mask_ratio`` (0 by default, i.e. sequences are returned unchanged).

    Relies on the standard-library ``random`` module being imported at module
    level.

    Args:
        input_sequences: indexable collection of token sequences.
        input_segments: indexable collection of segment-id sequences, aligned
            with ``input_sequences``.
        input_labels: indexable collection with one label per sequence.
        seq_len: stored as ``self.seq_len``; not otherwise used by this class.
        mask_ratio: per-token probability of being selected for masking.

    Raises:
        ValueError: if the three inputs do not have the same length.
    """

    def __init__(self, input_sequences, input_segments, input_labels, seq_len = 2000, mask_ratio = 0):
        if not (len(input_sequences) == len(input_segments) == len(input_labels)):
            raise ValueError(
                "input_sequences, input_segments and input_labels must have the same length, got "
                f"{len(input_sequences)}, {len(input_segments)} and {len(input_labels)}"
            )
        self.seq_len = seq_len
        self.session_flows = len(input_sequences)
        self.sessions = input_sequences
        self.segments = input_segments
        self.labels = input_labels
        self.special_token_dict =  {'PAD': 1027, 'MASK': 1028}
        self.mask_ratio = mask_ratio


    def __len__(self):
        return self.session_flows

    def __getitem__(self,item):
        """Return a dict of float32 tensors for sample ``item``.

        Keys: ``netformer_input`` (possibly masked tokens), ``netformer_label``
        (original token at masked positions, 0 elsewhere), ``netformer_idx``
        (1 at masked positions, 0 elsewhere), ``segment_label`` and
        ``sequence_label``.
        """
        # Step 1: fetch the stored sequence, its segment ids and its label.
        s1, seg1, seq_label = self.get_session_flow(item)

        # Step 2: apply (optional) token masking.
        s1_random, s1_label, s1_idx = self.random_word(s1)

        output = {"netformer_input": s1_random,
                  "netformer_label": s1_label,
                  "netformer_idx": s1_idx,
                  "segment_label": seg1,
                  "sequence_label": seq_label}

        # NOTE(review): every entry, token ids included, is emitted as float32;
        # consumers that need integer ids have to cast.
        return {key: torch.tensor(value,dtype=torch.float32) for key, value in output.items()}


    def random_word(self, sentence):
        """Mask tokens of ``sentence`` and return ``(tokens, labels, mask_flags)``.

        Each token is selected with probability ``self.mask_ratio``. A selected
        token is replaced by MASK (80%), by a random token drawn from the
        stored sequences (10%) or kept (10%); its label is the original token
        and its flag is 1. Unselected tokens keep their value with label 0 and
        flag 0.
        """
        if self.mask_ratio <= 0:
            # Masking disabled: no token can be selected, so skip the
            # per-token random draws entirely.
            tokens = list(sentence)
            return tokens, [0] * len(tokens), [0] * len(tokens)

        output_label = []
        output = []
        output_idx = []

        for token in sentence:
            prob = random.random()

            if prob < self.mask_ratio:
                # Rescale to [0, 1) to pick one of the three replacement modes.
                prob /= self.mask_ratio

                if prob < 0.8:
                    output.append(self.special_token_dict['MASK'])
                elif prob < 0.9:
                    output.append(self.random_selection(self.sessions))
                else:
                    output.append(token)

                output_label.append(token)
                output_idx.append(1)

            else:
                output.append(token)
                output_label.append(0)
                output_idx.append(0)

        assert len(output) == len(output_label)
        return output, output_label, output_idx


    def random_selection(self, input_sequences):
        """Return one token drawn uniformly from a uniformly chosen sequence."""
        rand_session = random.randrange(len(input_sequences))
        rand_flow = random.randrange(len(input_sequences[rand_session]))
        return input_sequences[rand_session][rand_flow]


    def get_session_flow(self, item):
        '''Return the token sequence, segment ids and label stored at ``item``.'''
        return self.sessions[item], self.segments[item],self.labels[item]

In [13]:
def generate_the_final_input_data(input_sequences,input_segments, input_labels):
    """Wrap the generated inputs in a ``NetformerDatasetDownstream``.

    Args:
        input_sequences: token sequences, one per sample.
        input_segments: segment-id sequences aligned with ``input_sequences``.
        input_labels: one label per sample.

    Returns:
        A ``NetformerDatasetDownstream`` built with ``seq_len=2000``.
    """
    # Length of each piece passed to the dataset as seq_len.
    MAX_LEN = 2000

    train_data = NetformerDatasetDownstream(input_sequences,input_segments, input_labels,seq_len=MAX_LEN)
    return train_data


In [11]:
# Labeling function for the given dataset
def label_data(base_path, attack_labels):
    """List the attack and benign CSV of every attack folder with its label.

    For each key ``folder`` of ``attack_labels`` the files
    ``{base_path}/{folder}/{folder}-attack.csv`` (labelled with
    ``attack_labels[folder]``) and ``{base_path}/{folder}/{folder}-benign.csv``
    (labelled 0) are listed, in that order.

    The CSVs are only checked for existence, not loaded: the result consists
    of paths and labels alone, so parsing the files here would cost time and
    memory for nothing.

    Args:
        base_path: directory that contains one sub-directory per attack type.
        attack_labels: mapping of folder name to integer label.

    Returns:
        A list of ``(file_path, label)`` tuples.

    Raises:
        FileNotFoundError: if one of the expected CSV files is missing.
    """
    file_list = []
    for folder, attack_label in attack_labels.items():
        attack_file = f"{base_path}/{folder}/{folder}-attack.csv"
        benign_file = f"{base_path}/{folder}/{folder}-benign.csv"

        for csv_file, label in ((attack_file, attack_label), (benign_file, 0)):
            # Fail here rather than hours later in the middle of processing.
            if not os.path.isfile(csv_file):
                raise FileNotFoundError(csv_file)
            file_list.append((csv_file, label))

    return file_list

# Define attack labels
# Folder name -> class label of its attack traffic (label_data uses 0 for benign).
attack_labels = dict(
    Botnet=1,
    BruteForce=2,
    DDoS=3,
    DoS=4,
    Infiltration=5,
)

# Root directory that holds one sub-directory per attack type.
base_path = '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018'

# Build and show the (csv path, label) list for every attack/benign file.
file_name_list = label_data(base_path=base_path, attack_labels=attack_labels)
print(file_name_list)

[('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Botnet/Botnet-attack.csv', 1), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Botnet/Botnet-benign.csv', 0), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/BruteForce/BruteForce-attack.csv', 2), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/BruteForce/BruteForce-benign.csv', 0), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DDoS/DDoS-attack.csv', 3), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DDoS/DDoS-benign.csv', 0), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DoS/DoS-attack.csv', 4), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DoS/DoS-benign.csv', 0), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Infiltration/Infiltration-attack.csv', 5), ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Infiltration/Infiltration-benign.csv', 0)]


In [11]:
# (csv path, label) for every capture file; 0 marks benign traffic.
file_name_list = [
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Botnet/Botnet-attack.csv', 1),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Botnet/Botnet-benign.csv', 0),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/BruteForce/BruteForce-attack.csv', 2),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/BruteForce/BruteForce-benign.csv', 0),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DDoS/DDoS-attack.csv', 3),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DDoS/DDoS-benign.csv', 0),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DoS/DoS-attack.csv', 4),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/DoS/DoS-benign.csv', 0),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Infiltration/Infiltration-attack.csv', 5),
    ('/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Infiltration/Infiltration-benign.csv', 0),
]

In [12]:
# Processing function, refined and completed
def process_and_save(file_name, label, output_path):
    """Process one capture CSV and pickle the generated model inputs.

    Runs ``processing_file`` and ``input_generation`` on ``file_name`` and
    writes the tuple ``(input_sequences, input_segments, input_labels)`` to
    ``output_path`` with ``pickle``.

    Args:
        file_name: path of the CSV to process.
        label: label given to every sequence generated from this file.
        output_path: destination pickle file; its parent directory is created
            when missing.

    Returns:
        ``output_path``.
    """
    print(f"Processing file: {file_name} with label: {label}")

    # Create the target directory up front so a missing folder is not
    # discovered only after the expensive processing has finished.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_packet_token, df_flow_token = processing_file(file_name)
    input_sequences, input_segments, input_labels = input_generation(df_flow_token, df_packet_token, label)

    # Save the processed data to an individual file
    with open(output_path, 'wb') as f:
        pickle.dump((input_sequences, input_segments, input_labels), f)
    return output_path


In [14]:
# Base path of your dataset
base_path = '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018'
output_path = '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output'
final_output_file = '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/CIC2018-dataset-all.pkl'

# Ensure the output directory exists; without it every per-file pickle write
# below fails. exist_ok makes the cell safe to re-run.
os.makedirs(output_path, exist_ok=True)

# Process and save each file individually (one .pkl per input .csv)
processed_files = []
for file_name, label in file_name_list:
    output_file = os.path.join(output_path, os.path.basename(file_name).replace('.csv', '.pkl'))
    processed_files.append(process_and_save(file_name, label, output_file))


Processing file: /home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Botnet/Botnet-attack.csv with label: 1
Packet Processing Done
(138713, 13)
Tokenization: Binning


Processing sessions: 100%|██████████████████████| 10/10 [00:00<00:00, 20.63it/s]


Processing complete.


Processing sessions: 100%|██████████████████████| 10/10 [00:00<00:00, 19.93it/s]

Processing complete.





In [13]:
# Destination of the combined two-class dataset and location of the per-file pickles.
final_output_file = '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/CIC2018-dataset-2-classes.pkl'
output_path = '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output'

# Rebuild the list of per-file pickle paths from the CSV names (nothing is processed here).
processed_files = []
for file_name, label in file_name_list:
    output_file = os.path.basename(file_name).replace('.csv', '.pkl')
    output_file = os.path.join(output_path, output_file)
    processed_files.append(output_file)
print(processed_files)
# Combining function to concatenate saved data
def combine_saved_data(file_paths, output_file):
    """Merge per-file pickles into one two-class dataset and pickle it.

    Each input pickle must hold the tuple
    ``(input_sequences, input_segments, input_labels)``. The parts are
    concatenated in the order of ``file_paths``, labels are collapsed to
    binary (0 stays 0, everything else becomes 1), and the result is wrapped
    in a ``NetformerDatasetDownstream`` (``seq_len=2000``) that is pickled to
    ``output_file``. The distinct label values before and after the collapse
    are printed.

    Files that contain no sequences are skipped, and all parts are
    concatenated once at the end rather than re-copied for every file.

    Args:
        file_paths: paths of the per-file pickles.
        output_file: destination of the combined dataset pickle.

    Returns:
        ``output_file``.
    """
    sequence_parts, segment_parts, label_parts = [], [], []

    for file_path in file_paths:
        # NOTE: pickle.load can execute arbitrary code; only feed it files
        # written by this notebook.
        with open(file_path, 'rb') as f:
            input_sequences, input_segments, input_labels = pickle.load(f)

        if len(input_sequences) == 0:
            # An empty part has no second dimension and would break concatenation.
            continue

        sequence_parts.append(np.asarray(input_sequences))
        segment_parts.append(np.asarray(input_segments))
        label_parts.append(np.asarray(input_labels))

    if sequence_parts:
        all_input_sequences = np.concatenate(sequence_parts, axis=0)
        all_input_segments = np.concatenate(segment_parts, axis=0)
        all_input_labels = np.concatenate(label_parts, axis=0)
    else:
        # No data at all: produce an empty but well-formed dataset.
        all_input_sequences = np.empty((0, 2000))
        all_input_segments = np.empty((0, 2000))
        all_input_labels = np.empty((0,))

    print(np.unique(all_input_labels))
    # Collapse the attack classes into a single positive class.
    all_input_labels = np.where(all_input_labels == 0, 0, 1)
    print(np.unique(all_input_labels))

    train_data = NetformerDatasetDownstream(all_input_sequences, all_input_segments, all_input_labels, seq_len=2000)
    with open(output_file, 'wb') as f:
        pickle.dump(train_data, f)

    return output_file



['/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/Botnet-attack.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/Botnet-benign.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/BruteForce-attack.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/BruteForce-benign.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/DDoS-attack.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/DDoS-benign.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/DoS-attack.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/DoS-benign.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/Infiltration-attack.pkl', '/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/output/Infiltration-benign.pkl']


In [14]:
# Merge every per-file pickle into the single two-class dataset file;
# the cell displays the path that was written.
combine_saved_data(file_paths=processed_files, output_file=final_output_file)

[0. 1. 2. 3. 4. 5.]
[0 1]


'/home/binghui/NDSS2025/Intrution-detection/CICdataset2018/CIC2018-dataset-2-classes.pkl'

In [10]:
def Model_input(file_name_list, output_file='CIC2018-dataset-all.pkl'):
    """Process every listed CSV and pickle one multi-class dataset.

    Each ``(file_name, label)`` pair is run through ``processing_file`` and
    ``input_generation``; the generated pieces of all files are concatenated
    in list order, wrapped in a ``NetformerDatasetDownstream``
    (``seq_len=2000``) and pickled to ``output_file``. Labels are kept as
    given (no collapsing to binary).

    Files that yield no sequences are skipped, and all parts are concatenated
    once at the end rather than re-copied for every file.

    Args:
        file_name_list: iterable of ``(csv path, label)`` pairs.
        output_file: destination of the dataset pickle; the default is a file
            in the current working directory.

    Returns:
        The integer 1.
    """
    MAX_LEN = 2000

    sequence_parts, segment_parts, label_parts = [], [], []
    for file_name, label in file_name_list:
        print(f"Processing file: {file_name} with label: {label}")
        df_packet_token, df_flow_token = processing_file(file_name)
        input_sequences, input_segments, input_labels = input_generation(df_flow_token, df_packet_token, label)

        if len(input_sequences) == 0:
            # An empty part has no second dimension and would break concatenation.
            continue

        sequence_parts.append(np.asarray(input_sequences))
        segment_parts.append(np.asarray(input_segments))
        label_parts.append(np.asarray(input_labels))

    if sequence_parts:
        all_input_sequences = np.concatenate(sequence_parts, axis=0)
        all_input_segments = np.concatenate(segment_parts, axis=0)
        all_input_labels = np.concatenate(label_parts, axis=0)
    else:
        # No data at all: produce an empty but well-formed dataset.
        all_input_sequences = np.empty((0, MAX_LEN))
        all_input_segments = np.empty((0, MAX_LEN))
        all_input_labels = np.empty((0,))

    train_data = NetformerDatasetDownstream(all_input_sequences, all_input_segments, all_input_labels, seq_len=MAX_LEN)
    with open(output_file, 'wb') as f:
        pickle.dump(train_data, f)
    return 1
    

In [None]:
# Build the multi-class dataset from every listed capture file.
Model_input(file_name_list=file_name_list)

Processing file: /home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Botnet/Botnet-attack.csv with label: 1
Packet Processing Done
(138713, 13)
Tokenization: Binning


Processing sessions: 100%|██████████████████████| 10/10 [00:00<00:00, 20.69it/s]


Processing complete.


Processing sessions: 100%|██████████████████████| 10/10 [00:00<00:00, 22.53it/s]


Processing complete.
Processing file: /home/binghui/NDSS2025/Intrution-detection/CICdataset2018/Botnet/Botnet-benign.csv with label: 0
Packet Processing Done
(56507, 13)
Tokenization: Binning


Processing sessions: 100%|████████████████| 1216/1216 [00:00<00:00, 7401.99it/s]


Processing complete.


Processing sessions: 100%|████████████████| 1216/1216 [00:00<00:00, 7687.04it/s]


Processing complete.
Processing file: /home/binghui/NDSS2025/Intrution-detection/CICdataset2018/BruteForce/BruteForce-attack.csv with label: 2
Packet Processing Done
(28262, 13)
Tokenization: Binning
