In [None]:
!pip install -r requirements.txt

In [1]:
from pandas import set_option

def append_suffix(name: str, suffix: str) -> str:
    """Insert ``suffix`` just before the ``.parquet`` extension of ``name``.

    A trailing ``.parquet`` on ``name`` is stripped (at most once) before the
    suffix is added; the result always ends with ``.parquet``.

    Args:
        name: File name, with or without a trailing ``.parquet``.
        suffix: Text to place between the base name and the extension.

    Returns:
        The new file name, e.g. ``"day.parquet"`` + ``"_pruned"`` ->
        ``"day_pruned.parquet"``.
    """
    extension = '.parquet'
    if name.endswith(extension):
        base = name[:len(name) - len(extension)]
    else:
        base = name
    return base + suffix + extension

# Lift pandas' display limit on columns so printed frames show every column
# instead of eliding the middle ones.
set_option('display.max_columns', None)

## Remove unnecessary data

- remove non-TCP flows (https://en.wikipedia.org/wiki/List_of_IP_protocol_numbers)
- remove rows containing infinity/NaN values in numeric columns
- remove columns that contain a low number of unique values
- remove columns with high correlation

In [11]:
from pandas import read_parquet
from collections import Counter
from numpy import isfinite


files = ['Wednesday-14-02-2018.parquet', 'Thursday-15-02-2018.parquet', 'Friday-16-02-2018.parquet']

# Pruning parameters, defined once instead of being re-assigned on every loop iteration.
# Columns with fewer distinct values than this are dropped (the 'Label' column is always kept).
UNIQUE_VALUES_THRESHOLD = 10
# Absolute correlation above which two feature columns are treated as redundant.
threshold = 0.95


def _value_counts(series) -> Counter:
    """Return a Counter of the values in a pandas Series.

    Counting is delegated to ``value_counts`` so it is vectorised and so that
    all NaN entries are reported under a single key. Entries with a zero count
    are left out of the result.
    """
    counts = series.value_counts(dropna=False)
    return Counter(counts[counts > 0].to_dict())


# For every input file: keep TCP flows only, drop low-cardinality columns,
# drop rows with non-finite feature values, drop highly correlated columns,
# and write the result next to the input with a '_pruned' suffix.
for file in files:
    print(f"Reading file {file}")
    # Drop old index column, as it was not in original dataset.
    # It was added due to ORC file format requirements
    df = read_parquet(file).reset_index(drop=True).drop(columns=['index'])
    print(df.columns)

    print(f"Starting dataframe shape: {df.shape}")
    print(f"Counted rows by protocol: {_value_counts(df['Protocol'])}")
    print("Removing non-TCP rows...")
    # Protocol number 6 is TCP. Filtering and dropping without inplace=True
    # avoids mutating a slice of the original frame.
    df = df[df['Protocol'] == 6].drop(columns=['Protocol'])

    cols_to_remove = []

    for column in df:
        # nunique() instead of len(set(...)): a Python set treats every NaN as a
        # distinct element (NaN != NaN), which inflates the count for columns
        # containing NaN. dropna=False still counts NaN, but only once.
        unique_vals = df[column].nunique(dropna=False)
        print(f"column: '{column}', unique values: {unique_vals}")
        if unique_vals < UNIQUE_VALUES_THRESHOLD and column != 'Label':
            cols_to_remove.append(column)
            print(f"Removing column {column}")
            print(f"Column values: {_value_counts(df[column])}")

    df = df.drop(columns=cols_to_remove)

    print("Correlation matrix")
    df_features = df.drop(columns=['Label'])

    # Remove rows containing Inf/NaN values in any feature column.
    finite_indexes = isfinite(df_features).all(axis=1)
    df_features = df_features[finite_indexes]
    df = df[finite_indexes]

    correlation_matrix = df_features.corr()

    # Long-format table of (column, column, correlation) for entries above the
    # threshold. The boolean mask turns every other entry into NaN; the explicit
    # dropna() discards them regardless of whether stack() drops NaN on its own
    # (assumed to differ between pandas versions - TODO confirm).
    high_correlation_rows = (
        correlation_matrix[abs(correlation_matrix) > threshold]
        .stack()
        .dropna()
        .reset_index()
    )
    high_correlation_rows.columns = ['Column 1', 'Column 2', 'Correlation']
    # Discard the diagonal (a column paired with itself) and keep one row per
    # 'Column 1'.
    # NOTE(review): only the first correlated partner of each column is examined,
    # so some pairs above the threshold can survive this step. Left unchanged so
    # the set of columns written to the output files stays the same.
    is_off_diagonal = high_correlation_rows['Column 1'] != high_correlation_rows['Column 2']
    high_correlation_rows = (
        high_correlation_rows[is_off_diagonal]
        .drop_duplicates(subset='Column 1')
        .reset_index()
    )

    save_columns = []
    drop_columns = []
    for col1, col2, correlation in zip(
        high_correlation_rows['Column 1'],
        high_correlation_rows['Column 2'],
        high_correlation_rows['Correlation'],
    ):
        # A column already chosen as the survivor of an earlier pair is never dropped.
        if col2 not in save_columns:
            drop_columns.append(col2)
            save_columns.append(col1)
            print(f"Removing column {col2} as its corellation to {col1} is {correlation}")

    # The same column can be scheduled for removal more than once; de-duplicate
    # (order-preserving) before dropping.
    df = df.drop(columns=list(dict.fromkeys(drop_columns))).reset_index(drop=True)
    print(f"""

###################################
Final data state for file {file}
Shape (rows, columns): {df.shape}
Label counts: {_value_counts(df['Label'])}
###################################

    """)
    df.to_parquet(append_suffix(file, '_pruned'))

Reading file Wednesday-14-02-2018.parquet
Index(['Dst Port', 'Protocol', 'Timestamp', 'Flow Duration', 'Tot Fwd Pkts',
       'Tot Bwd Pkts', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Pkt Len Max',
       'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std',
       'Bwd Pkt Len Max', 'Bwd Pkt Len Min', 'Bwd Pkt Len Mean',
       'Bwd Pkt Len Std', 'Flow Byts/s', 'Flow Pkts/s', 'Flow IAT Mean',
       'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min', 'Fwd IAT Tot',
       'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
       'Bwd IAT Tot', 'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max',
       'Bwd IAT Min', 'Fwd PSH Flags', 'Bwd PSH Flags', 'Fwd URG Flags',
       'Bwd URG Flags', 'Fwd Header Len', 'Bwd Header Len', 'Fwd Pkts/s',
       'Bwd Pkts/s', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
       'Pkt Len Std', 'Pkt Len Var', 'FIN Flag Cnt', 'SYN Flag Cnt',
       'RST Flag Cnt', 'PSH Flag Cnt', 'ACK Flag Cnt', 'URG Flag Cnt',
       'CWE Flag Count', 'ECE Flag Cnt

## Normalization

An additional step whose results are saved in separate files, so that we can compare whether normalization improves efficiency.

In [12]:
from sklearn.preprocessing import MinMaxScaler
from pandas import DataFrame, concat
from numpy import isfinite

files = ['Wednesday-14-02-2018_pruned.parquet', 'Thursday-15-02-2018_pruned.parquet', 'Friday-16-02-2018_pruned.parquet']

# For every pruned file: min-max scale all feature columns, re-attach the
# untouched 'Label' column and save the result with a '_normalized' suffix.
for file in files:
    print(f"Reading file {file}")
    df = read_parquet(file)
    df_features = df.drop(columns=['Label'])

    # A new scaler is fitted for each file, so every file is scaled using its
    # own per-column minimum and maximum.
    scaler = MinMaxScaler()

    # fit_transform returns a bare array; rebuild the frame with the original
    # column names AND the original index. Without the index the new frame would
    # get a fresh 0..n-1 index and the concat below (which aligns on index)
    # could pair features with the wrong labels or introduce NaN rows.
    normalized_features = DataFrame(
        scaler.fit_transform(df_features),
        columns=df_features.columns,
        index=df_features.index,
    )

    # Combine the normalized feature columns with the label column.
    df = concat([normalized_features, df['Label']], axis=1, ignore_index=False)
    print("Example row after normalization:")
    print(df.head(1))
    print(df.shape)
    print("dtypes:")
    print(df.dtypes)
    df.to_parquet(append_suffix(file, '_normalized'))

Reading file Wednesday-14-02-2018_pruned.parquet
Example row after normalization:
   Dst Port  Timestamp  Flow Duration  Tot Fwd Pkts  Tot Bwd Pkts  \
0  0.000153   0.639205       0.053783      0.002738      0.001087   

   TotLen Fwd Pkts  Fwd Pkt Len Max  Fwd Pkt Len Min  Fwd Pkt Len Mean  \
0         0.000144         0.011546              0.0          0.007364   

   Bwd Pkt Len Max  Bwd Pkt Len Min  Bwd Pkt Len Mean  Flow Byts/s  \
0         0.668493              0.0          0.155766     0.000001   

    Flow Pkts/s  Flow IAT Mean  Flow IAT Std  Flow IAT Max  Fwd IAT Mean  \
0  9.641875e-07       0.002264      0.002939      0.005633      0.003881   

   Fwd IAT Std  Fwd IAT Min  Bwd IAT Tot  Bwd IAT Mean  Bwd IAT Std  \
0     0.001462     0.001934     0.046983      0.006496     0.005395   

   Bwd IAT Max  Bwd IAT Min  Bwd Header Len    Bwd Pkts/s  Pkt Len Min  \
0     0.009763     0.000006        0.001783  7.747174e-07          0.0   

   Pkt Len Max  Pkt Len Std  Pkt Len Var  Do