In [1]:
import pandas as pd
import numpy as np
from google.colab import drive
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils import resample
from collections import defaultdict
import gc

# Mount Google Drive
drive.mount('/content/drive')

def print_summary(df, stage):
    print(f"\n--- Summary at {stage} ---")
    print(f"Shape: {df.shape}")
    print("\nColumns:")
    print(df.columns.tolist())
    print("\nValue counts of 'Label' column:")
    print(df['Label'].value_counts(normalize=True))
    if 'attack_cat' in df.columns:
        print("\nValue counts of 'attack_cat' column:")
        print(df['attack_cat'].value_counts(normalize=True))
    print("-----------------------------\n")


Mounted at /content/drive


In [2]:
# Step 1: Combine Dataset Files (with chunking)
def combine_datasets(chunk_size=100000):
    desired_features = [
        'Stime', 'Ltime', 'dur', 'proto', 'service', 'state', 'sbytes', 'dbytes', 'sttl', 'dttl',
        'sloss', 'dloss', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin',
        'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len',
        'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm',
        'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label'
    ]

    # Read headers from the first file
    first_file = '/content/drive/MyDrive/UNSW-NB15_1.csv'
    available_columns = pd.read_csv(first_file, nrows=0).columns.tolist()

    # Use only the columns that are actually present in the file
    usecols = [col for col in desired_features if col in available_columns]

    print(f"Using the following columns: {usecols}")

    for i in range(1, 5):
        file_path = f'/content/drive/MyDrive/UNSW-NB15_{i}.csv'
        for chunk in pd.read_csv(file_path, usecols=usecols, chunksize=chunk_size, low_memory=False):
            yield process_chunk(chunk)

def process_chunk(chunk):
    if 'attack_cat' in chunk.columns:
        chunk['attack_cat'] = chunk['attack_cat'].fillna('Normal')
    if 'Label' in chunk.columns:
        chunk.loc[chunk['Label'] == 0, 'attack_cat'] = 'Normal'
    return chunk

In [3]:
# Step 2 & 3: Ground Truth Alignment and Attack Category Assignment
def align_ground_truth(df, gt):
    df['Stime'] = pd.to_datetime(df['Stime'], unit='s')
    df['Ltime'] = pd.to_datetime(df['Ltime'], unit='s')

    def find_attack_cat(row):
        if row['Label'] == 0:
            return 'Normal'
        mask = (gt['Start time'] <= row['Stime']) & (gt['Last time'] >= row['Ltime'])
        matching = gt[mask]
        if not matching.empty:
            return matching.iloc[0]['Attack category']
        return row['attack_cat']

    df['attack_cat'] = df.apply(find_attack_cat, axis=1)
    return df

In [12]:
# Step 4: Class Balancing
def balance_classes(df):
    print("Initial class distribution:")
    print(df['attack_cat'].value_counts())

    # Identify the majority class (assumed to be 'normal')
    majority_class = df['attack_cat'].mode().iloc[0]

    df_normal = df[df['attack_cat'] == majority_class]
    df_attack = df[df['attack_cat'] != majority_class]

    n_normal = len(df_normal)
    n_attack = len(df_attack)

    if n_normal == 0 or n_attack == 0:
        print("Unable to balance classes: one or both classes have zero samples.")
        return df

    attack_counts = df_attack['attack_cat'].value_counts()

    # Use the median of non-zero counts
    non_zero_counts = attack_counts[attack_counts > 0]
    if len(non_zero_counts) > 0:
        median_attack_count = int(non_zero_counts.median())
    else:
        median_attack_count = 1

    # Ensure we have at least 1 sample per category
    target_samples = max(1, median_attack_count)

    # Determine the number of samples for the normal class
    n_normal_samples = min(n_attack, n_normal, target_samples * len(attack_counts))

    if n_normal_samples == 0:
        print("After balancing, no normal samples would remain. Skipping balancing.")
        return df

    df_normal_downsampled = resample(df_normal,
                                     replace=False,
                                     n_samples=n_normal_samples,
                                     random_state=42)

    balanced_attacks = []
    for category in attack_counts.index:
        df_category = df_attack[df_attack['attack_cat'] == category]
        n_category = len(df_category)
        if n_category == 0:
            continue  # Skip empty categories
        elif n_category > target_samples:
            df_category = resample(df_category,
                                   replace=False,
                                   n_samples=target_samples,
                                   random_state=42)
        elif n_category < target_samples:
            df_category = resample(df_category,
                                   replace=True,
                                   n_samples=target_samples,
                                   random_state=42)
        balanced_attacks.append(df_category)

    if not balanced_attacks:
        print("No attack samples after balancing. Returning original dataset.")
        return df

    df_balanced = pd.concat([df_normal_downsampled] + balanced_attacks)
    df_balanced = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

    print("Final class distribution:")
    print(df_balanced['attack_cat'].value_counts())

    return df_balanced

In [5]:
# Step 5: Feature Engineering
def engineer_features(df):
    df['time_diff'] = (df['Ltime'] - df['Stime']).dt.total_seconds()
    df['hour'] = df['Stime'].dt.hour
    df['day_of_week'] = df['Stime'].dt.dayofweek
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    return df

In [6]:
# Step 6: Categorical Feature Encoding
def encode_categorical_features(df, encoders=None):
    if encoders is None:
        encoders = {}

    categorical_columns = ['proto', 'service', 'state', 'attack_cat']
    for col in categorical_columns:
        if col not in encoders:
            encoders[col] = LabelEncoder()
            encoders[col].fit(df[col])
        else:
            # Combine known and unknown labels
            labels = np.unique(np.concatenate((encoders[col].classes_, df[col].unique())))
            encoders[col].classes_ = labels

        # Transform the data, handling unknown labels
        df[col] = df[col].map(lambda x: x if x in encoders[col].classes_ else 'Unknown')
        df[col] = encoders[col].transform(df[col])

    return df, encoders


In [7]:
# Step 7: Normalization & Scaling
def normalize_features(df):
    scaler = StandardScaler()
    numerical_columns = df.select_dtypes(include=[np.number]).columns.drop(['Label', 'hour', 'day_of_week', 'is_weekend'])
    df[numerical_columns] = scaler.fit_transform(df[numerical_columns])
    return df


In [13]:
# Step 9: Normalization & Scaling (performed after processing all data)
def preprocess_unsw_nb15(output_file, chunk_size=100000):
    print("Loading ground truth data...")
    gt = pd.read_csv('/content/drive/MyDrive/UNSW-NB15_GT.csv')
    gt['Start time'] = pd.to_datetime(gt['Start time'])
    gt['Last time'] = pd.to_datetime(gt['Last time'])

    if 'Attack category' not in gt.columns:
        raise ValueError("'Attack category' column not found in ground truth file")

    print("Processing data in chunks...")
    encoders = None
    processed_chunks = []
    for i, chunk in enumerate(combine_datasets(chunk_size)):
        print(f"Processing chunk {i+1}")

        if 'Stime' in chunk.columns and 'Ltime' in chunk.columns:
            chunk = align_ground_truth(chunk, gt)

        if 'Stime' in chunk.columns:
            chunk = engineer_features(chunk)

        chunk, encoders = encode_categorical_features(chunk, encoders)

        print_summary(chunk, f"End of Chunk {i+1}")

        processed_chunks.append(chunk)

        del chunk
        gc.collect()

    print("Combining processed chunks...")
    df = pd.concat(processed_chunks, ignore_index=True)
    del processed_chunks
    gc.collect()

    print_summary(df, "After Combining All Chunks")

    if 'Label' in df.columns and 'attack_cat' in df.columns:
        print("Balancing classes...")
        df = balance_classes(df)
    else:
        print("'Label' or 'attack_cat' column missing. Skipping class balancing.")

    print("Normalizing features...")
    df = normalize_features(df)

    print("Saving preprocessed data...")
    df.to_csv(output_file, index=False)
    print(f"Preprocessed data saved to {output_file}")

    if 'attack_cat' in df.columns:
        print("\nFinal Distribution of attack categories:")
        print(df['attack_cat'].value_counts(normalize=True))

    return df


In [14]:

# Run the preprocessing
output_file = '/content/drive/MyDrive/preprocessed_unsw_nb15.csv'
preprocessed_df = preprocess_unsw_nb15(output_file)



Loading ground truth data...
Processing data in chunks...
Using the following columns: ['Stime', 'Ltime', 'dur', 'proto', 'service', 'state', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label']
Processing chunk 1

--- Summary at End of Chunk 1 ---
Shape: (100000, 32)

Columns:
['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Stime', 'Ltime', 'ct_state_ttl', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label', 'time_diff', 'hour', 'day_of_week', 'is_weekend']

Value counts of 'Label' column:
Label
0    0.8857
1    0.1143
Name: proportion, dtype: float64

Value counts of 'attack_cat' column:
attac

NameError: name 'print_df_info' is not defined

In [15]:
# Final display of preprocessed data
print_summary(preprocessed_df, "Final Preprocessed Data")


--- Summary at Final Preprocessed Data ---
Shape: (280764, 32)

Columns:
['proto', 'state', 'dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'Stime', 'Ltime', 'ct_state_ttl', 'ct_srv_src', 'ct_dst_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat', 'Label', 'time_diff', 'hour', 'day_of_week', 'is_weekend']

Value counts of 'Label' column:
Label
0    0.545455
1    0.454545
Name: proportion, dtype: float64

Value counts of 'attack_cat' column:
attack_cat
 0.673340    0.500000
-0.751033    0.045455
-0.466159    0.045455
-1.890532    0.045455
-1.320783    0.045455
 0.388465    0.045455
-1.035908    0.045455
-0.181284    0.045455
 0.103591    0.045455
 1.527964    0.045455
-1.605657    0.045455
-2.175407    0.045455
Name: proportion, dtype: float64
-----------------------------

