In [1]:
import pandas as pd
import numpy as np
from ast import literal_eval
import os

from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import confusion_matrix, f1_score

In [2]:
def load_meta_pkt_df(input_dir, output_dir=None, save=False):
    print(f"[*] Loading Dataset From {input_dir}")
    pkt_df = pd.DataFrame()
    meta_df = pd.DataFrame()
    temp_df = pd.DataFrame()
    for sub_dir in os.listdir(input_dir):
        d = os.path.join(input_dir, sub_dir)
        if os.path.isdir(d):
            for f in os.listdir(d):
                if "csv" in f:
                    name = f.split('.')[0].split('-')[-1]
                    if name == 'metadata':
                        temp_df = pd.read_csv(os.path.join(d, f), sep=',', header=None)
                        temp_df.columns = ['Timestamp', 'Unix Timestamp', 'Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Protocol']
                        meta_df = pd.concat([meta_df, temp_df])
                    else:
                        temp_df = pd.read_csv(os.path.join(d, f), delimiter='\t', header=None)
                        temp_df.columns = [name]
                        pkt_df = pd.concat([pkt_df, temp_df], axis=1)
    if save:
            meta_df.to_csv(f'./{output_dir}/metadata.csv', index=False)
            pkt_df.to_csv(f'./{output_dir}/pkt.csv', index=False)

    return meta_df, pkt_df

def load_nested_meta_pkt_df(input_dir, output_dir=None, save=False):
    print(f"[*] Loading Dataset From {input_dir}")
    pkt_df = pd.DataFrame()
    meta_df = pd.DataFrame()
    temp_df = pd.DataFrame()
    for sub_dir in os.listdir(input_dir):
        d1 = os.path.join(input_dir, sub_dir)
        for sub_sub_dir in os.listdir(d1):
            d2 = os.path.join(d1, sub_sub_dir)
            temp_pkt_df = pd.DataFrame()
            if os.path.isdir(d2):
                for f in os.listdir(d2):
                    if "csv" in f:
                        name = f.split('.')[0].split('-')[-1]
                        if name == 'metadata':
                            temp_df = pd.read_csv(os.path.join(d2, f), sep=',', header=None)
                            temp_df.columns = ['Timestamp', 'Unix Timestamp', 'Source IP', 'Source Port', 'Destination IP', 'Destination Port', 'Protocol']
                            meta_df = pd.concat([meta_df, temp_df])
                        else:
                            temp_df = pd.read_csv(os.path.join(d2, f), delimiter='\t', header=None)
                            temp_df.columns = [name]
                            temp_pkt_df = pd.concat([temp_pkt_df, temp_df], axis=1)
            pkt_df = pd.concat([pkt_df, temp_pkt_df])
    if save:
        meta_df.to_csv(f'./{output_dir}/metadata.csv', index=False)
        pkt_df.to_csv(f'./{output_dir}/pkt.csv', index=False)

    return meta_df, pkt_df

# Merge the columns of the metadata and packet dataframes
def combine_meta_pkt_dfs(meta_df, pkt_df):
    return pd.concat([meta_df, pkt_df], axis=1)

In [3]:
def preprocess(df):
    print(f"\t[*] Preprocessing . . .")
    # Sort the DataFrame by 'Source IP', 'Destination IP', and 'Unix Timestamp'
    df.sort_values(by=['Source IP', 'Destination IP', 'Unix Timestamp'], inplace=True)
    # Replace 6 with 0 and 17 with 1 in the specified column
    df['Protocol'] = df['Protocol'].replace({6: 0, 17: 1})
    # Convert strings to lists
    df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]] = df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]].applymap(literal_eval)
    # Drop unwanted columns
    df = df[['Protocol', 'Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]]

    return df

def get_datasets():

    pos_dir = './data/pos_wednesday.csv'
    pos_df = pd.read_csv(pos_dir)
    mon_dir = './data/pos_monday.csv'
    mon_df = pd.read_csv(mon_dir)

    pos_df = pd.concat([pos_df, mon_df])

    neg_dir = './data/neg_wednesday.csv' # All DoS Attacks
    neg_df = pd.read_csv(neg_dir)

    return pos_df, neg_df

def load_datasets(resample=None):
    pos_df, neg_df = get_datasets()

    pos_train_df = pos_df.sample(frac=0.8, random_state=42)
    neg_train_df = neg_df.sample(frac=0.8, random_state=42)

    pos_test_df = pos_df[~pos_df.index.isin(pos_train_df.index)]
    neg_test_df = neg_df[~neg_df.index.isin(neg_train_df.index)]

    print("\t[*] Rebalancing data . . .")
    # Drop unwanted rows
    if resample != None:
        neg_test_df = neg_test_df.sample(frac=resample)
    neg_test_df = neg_test_df.reset_index(drop=True)

    # Randomly sample from the negative dataframe to match the size of the positive dataframe
    # This probably isn't best practise but it's also what LUCID does, so let's leave it in
    pos_train_df = pos_train_df.sample(n=len(neg_train_df.index), random_state=42)
    pos_test_df = pos_test_df.sample(n=len(neg_test_df.index), random_state=42)

    pos_train_df = preprocess(pos_train_df)
    neg_train_df = preprocess(neg_train_df)

    pos_test_df = preprocess(pos_test_df)
    neg_test_df = preprocess(neg_test_df)

    return pos_train_df, neg_train_df, pos_test_df, neg_test_df

# Function to transform a row with varying length to a row with fixed length (m)
def transform_row_from_tuple(row, m):
    if len(row) > m:
        # Truncate values
        row = row[:m - 1]
    elif len(row) < m:
        # Fill with zeros until the desired length is reached
        row += [0] * (m - len(row))
    return row


def _feature_engineer(df, m=4, columns_to_transform=['Pkt_Direction', 'Pkt_IATs', 'Pkt_Sizes', 'Pkt_Flags', "Pkt_Header_Sizes"], save=False):
    print(f"\t[*] Feature engineering with packet depth {m}")
    for col in columns_to_transform:
        df[col] = df[col].apply(lambda x: transform_row_from_tuple(list(x), m) if isinstance(x, (list, tuple)) else transform_row_from_tuple([x], m))
        
    # Custom function to flatten lists and lists of lists
    def flatten_element(cell):
        if isinstance(cell, list):
            return [item for sublist in cell for item in sublist] if any(isinstance(item, list) for item in cell) else cell
        return cell

    df = df.applymap(flatten_element)
    return df


def feature_engineer(pos_train_df, neg_train_df, pos_test_df, neg_test_df, m):
    pos_train_df = _feature_engineer(pos_train_df, m=m)
    neg_train_df = _feature_engineer(neg_train_df, m=m)
    pos_test_df = _feature_engineer(pos_test_df, m=m)
    neg_test_df = _feature_engineer(neg_test_df, m=m)

    return pos_train_df, neg_train_df, pos_test_df, neg_test_df

In [4]:
train_pos, train_neg, test_pos, test_neg = load_datasets()

	[*] Rebalancing data . . .
	[*] Preprocessing . . .


  df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]] = df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]].applymap(literal_eval)


	[*] Preprocessing . . .


  df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]] = df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]].applymap(literal_eval)


	[*] Preprocessing . . .


  df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]] = df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]].applymap(literal_eval)


	[*] Preprocessing . . .


  df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]] = df[['Pkt_Direction', 'Pkt_Flags', 'Pkt_IATs', 'Pkt_Sizes', "Pkt_Header_Sizes"]].applymap(literal_eval)


In [5]:
train_pos, train_neg, test_pos, test_neg = feature_engineer(train_pos, train_neg, test_pos, test_neg, m = 10)

	[*] Feature engineering with packet depth 10


  df = df.applymap(flatten_element)


	[*] Feature engineering with packet depth 10
	[*] Feature engineering with packet depth 10
	[*] Feature engineering with packet depth 10


In [6]:
# Scale function
def scale_column(row, min, max):
    if isinstance(row, list):
        return [(r - min) / (max - min) if r < max else 1 for r in row ]
    else:
        return (row - min) / (max - min) if row < max else 1

In [7]:
def _to_numpy(pos_df, neg_df, save=False):
    print("\t[*] Converting to numpy format")
    # Add the labels column
    pos_df['label'] = 0
    neg_df['label'] = 1

    # Concatenate the 2 dataframes
    data_df = pd.concat([pos_df, neg_df])

    # Save the dataframe
    if save:
        data_df.to_csv(f'./data/data.csv', index=True)

    # Convert 'Pkt_Direction,' 'Pkt_Flags,' 'Pkt_IATs,' and 'Pkt_Sizes' to lists
    data_df['Pkt_Direction'] = data_df['Pkt_Direction'].apply(list)
    data_df['Pkt_Direction'] = data_df['Pkt_Direction'].apply(sum)

    data_df['Pkt_Flags'] = data_df['Pkt_Flags'].apply(list)
    #data_df['Pkt_Flags'] = data_df['Pkt_Flags'].apply(sum)

    data_df['Pkt_IATs'] = data_df['Pkt_IATs'].apply(list)
    data_df['Pkt_IATs'] = data_df['Pkt_IATs'].apply(sum)

    data_df['Pkt_Sizes'] = data_df['Pkt_Sizes'].apply(list)
    data_df['Pkt_Sizes'] = data_df['Pkt_Sizes'].apply(sum) 

    data_df["Pkt_Header_Sizes"] = data_df["Pkt_Header_Sizes"].apply(list)
    data_df["Pkt_Header_Sizes"] = data_df["Pkt_Header_Sizes"].apply(sum)

    # Ensure consistent data types )(float32 for floating-point values, int32 for integers)                                           # This makes it negative, for some reason
    data_df['Protocol'] = data_df['Protocol'].astype('int32')
    data_df['label'] = data_df['label'].astype('int32')
        
    # Normalise columns
    data_df['Pkt_Sizes'] = data_df['Pkt_Sizes'].apply(scale_column, min=0, max=100000)
    data_df["Pkt_Header_Sizes"] = data_df["Pkt_Header_Sizes"].apply(scale_column, min=0, max=10000)
    data_df['Pkt_IATs'] = data_df['Pkt_IATs'].apply(scale_column, min=0, max=750000000)                   
    data_df['Pkt_Flags'] = data_df['Pkt_Flags'].apply(scale_column, min=0, max=256)                       

    if save:
        # Save the dataframe
        data_df.to_csv(f'./data/data.csv', index=True)

    if save:
        # Save the dataframe
        data_df.to_csv(f'./data/data-2.csv', index=True)

    return data_df


def to_numpy(pos_train_df, neg_train_df, save=False):
    data_df = _to_numpy(pos_train_df,
                                         neg_train_df,
                                         save)

    return data_df  

In [8]:
train_df = to_numpy(train_pos, train_neg)

	[*] Converting to numpy format


In [9]:
test_df = to_numpy(test_pos, test_neg)

	[*] Converting to numpy format


In [10]:
def ids(mode=None, train_features=None, train_labels=None, test_features=None, test_labels=None, random_state=42):
    if mode == 'Forest':
        classifier = RandomForestClassifier(n_estimators = 1000, random_state = random_state)
        print("Classifier Initialised")
        classifier.fit(train_features, train_labels)
        print("Classifier Trained")
        predictions  = classifier.predict(test_features)
    elif mode == 'Logistic':
        classifier = LogisticRegression(random_state=100, max_iter=1000)
        print("Classifier Initialised")
        classifier.fit(train_features, train_labels)
        print("Classifier Trained")
        predictions = classifier.predict(test_features)
    elif mode == 'MLP':
        classifier = MLPClassifier(random_state = 100, hidden_layer_sizes=(150,100,50), max_iter=300, activation='relu', solver='adam')
        print("Classifier Initialised")
        classifier.fit(train_features, train_labels)
        print("Classifier Trained")
        predictions = classifier.predict(test_features)
    else:
        print('Choose valid mode.')
    test_score = np.mean(test_labels == predictions)
    
    return predictions, test_score, classifier

def results(test_score=None, predictions=None, test_labels=None):
    print("Test Score: ", test_score)
    print(confusion_matrix(test_labels.values, predictions))
    
    print("F1 Score:", f1_score(test_labels.values, predictions, average='macro'))

    return f1_score(test_labels.values, predictions, average='macro')


In [11]:
corrections = False # Make corrections to the dataset by discarding malformed flows from the data. 
# When corrections is false, F1 score can fluctuate from approx. 0.992 to 0.997, depending on dataset split
if corrections:
    train_df = train_df.drop(train_df[(train_df["Pkt_Sizes"] < 0.1) & (train_df["label"] == 1)].index) # drop positive flows which consist of flow appendices, based on outliers in size
    test_df = test_df.drop(test_df[(test_df["Pkt_Header_Sizes"] == 0) | (test_df["Pkt_Header_Sizes"] == 1)].index) # drop packets with unusual header sizes
    test_df = test_df.drop(test_df[(test_df["Pkt_IATs"] == 0) | (test_df["Pkt_IATs"] == 1)].index) # drop test flows with one packet or which timeout
    test_df = test_df.drop(test_df[(test_df["Pkt_Sizes"] < 0.1) & (test_df["label"] == 1)].index) # drop positive flows which consist of flow appendices, based on outliers in size
    test_df = test_df.drop(test_df[(test_df["Pkt_IATs"] > 0.9)].index) # Drop packets with outlier IATs
    test_df = test_df.drop(test_df[(test_df["Pkt_Direction"] < 2)].index) # drop test flows with malformed handshakes (< 2 packets in positive direction)

In [12]:
preds, score, classifier = ids("Forest", 
            train_df[["Pkt_IATs", "Pkt_Sizes", "Pkt_Header_Sizes"]], 
            train_df["label"], test_df[["Pkt_IATs", "Pkt_Sizes", "Pkt_Header_Sizes"]],
            test_df["label"],
            random_state=42)
results(score, preds, test_df["label"])

Classifier Initialised
Classifier Trained
Test Score:  0.9930033280050313
[[38048   113]
 [  421 37740]]
F1 Score: 0.9930032140586504


0.9930032140586504