# 1. Import libraries and data 

In [16]:
import sys
import os
import pandas as pd
import numpy as np

In [None]:
# Import the project's own helper functions (preprocessing and plotting).

# NOTE(review): the project root is hardcoded to one machine's absolute path.
# Adjust it when running elsewhere, or derive it from the notebook location
# (e.g. starting from os.path.abspath("")).
project_path = 'd:\\PYTHON\\CS_Bootcamp\\programs\\cs-intrusion-detection-system'

# Put the scripts folder first on sys.path so the .py modules in it can be
# imported below. os.path.join is used instead of string concatenation because
# the former literal '\scripts' contained the invalid escape sequence '\s'.
sys.path.insert(0, os.path.join(project_path, 'scripts'))

from preprocessing import *
from plotting import *

In [None]:
# File names of the two data splits; both are read from the ../data/ folder.
file_name_train_data = "KDDTrain+.txt"
file_name_test_tata = "KDDTest+.txt"

# Column names assigned to both files via `names=`, in column order.
# The last two entries are the label column and the difficulty level.
column_names = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "hot", "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root", "num_file_creations",
    "num_shells", "num_access_files", "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "attack_type",
    "difficulty_level",
]

# Read both splits into DataFrames (path is relative to the working directory).
data_dir = "../data/"
train_data = pd.read_csv(f"{data_dir}{file_name_train_data}", names=column_names)
test_data = pd.read_csv(f"{data_dir}{file_name_test_tata}", names=column_names)

# Report (rows, columns) of the train frame, then of the test frame.
for frame in (train_data, test_data):
    print(frame.shape)

# Preprocessing
1. Convert categorical features to the `category` dtype
2. Recode selected numerical features to binary categories (based on EDA)
3. Recode selected numerical features to three categories (based on EDA)
4. Create a binary target variable (attack: 0, 1)

- The remaining numerical features are not processed further (**NUM_FEATURES**).
- All processed categorical features are added to **CAT_FEATURES**.


In [None]:
# Numerical features that are used as they are (no recoding in this notebook).
NUM_FEATURES = {
    'count',
    'diff_srv_rate',
    'dst_bytes',
    'dst_host_count',
    'dst_host_diff_srv_rate',
    'dst_host_rerror_rate',
    'dst_host_same_src_port_rate',
    'dst_host_same_srv_rate',
    'dst_host_serror_rate',
    'dst_host_srv_count',
    'dst_host_srv_diff_host_rate',
    'dst_host_srv_rerror_rate',
    'dst_host_srv_serror_rate',
    'duration',
    'rerror_rate',
    'same_srv_rate',
    'serror_rate',
    'src_bytes',
    'srv_count',
    'srv_diff_host_rate',
    'srv_rerror_rate',
    'srv_serror_rate',
}

# Features treated as categorical from the start; the recoded "*_cat"
# features are added to this set by the processing steps that follow.
CAT_FEATURES = {
    'logged_in',
    'root_shell',
    'is_guest_login',
    'land',
    'flag',
    'difficulty_level',
    'protocol_type',
    'service',
}

# Numerical features to be recoded into a binary categorical feature.
to_recode_2_cat = [
    'num_shells',
    'urgent',
    'num_root',            # --> works with threshold .99
    'num_file_creations',
    'num_failed_logins',
    'su_attempted',
    'num_access_files',
    'wrong_fragment',      # --> works with threshold .99
]

# Numerical features to be recoded into three categories, mapped to the
# boundary value used for the split (unfortunately hardcoded for now).
to_recode_3_cat = {
    'num_compromised': 10,
    'hot': 5,
}
condition_labels = ["none", "low", "high"]

# Column from which the binary target (attack vs no attack) is derived.
target_feature = "attack_type"

In [None]:
# 1. Cast every feature listed in CAT_FEATURES to 'category', applying the
#    same conversion to the train frame first and then to the test frame.
for feature in CAT_FEATURES:
    train_data, test_data = (
        convert_column_type(frame, feature, 'category')
        for frame in (train_data, test_data)
    )


In [None]:
# 2. For each feature in to_recode_2_cat, create a binary categorical
#    "<feature>_cat" column in both frames.
for feature in to_recode_2_cat:
    new_feature_name = f"{feature}_cat"

    # Train relies on the helper's default threshold; test passes threshold=0.
    # NOTE(review): unless the default is also 0, the two splits are recoded
    # with different settings — confirm this is intended.
    recode_to_binary_feature(train_data, feature, new_feature_name)
    recode_to_binary_feature(test_data, feature, new_feature_name, threshold=0)

    # Register the new feature only if it came out categorical in both frames.
    train_ok = is_categorical_dtype(train_data, new_feature_name)
    test_ok = is_categorical_dtype(test_data, new_feature_name)
    if not (train_ok & test_ok):
        print(f"Feature {new_feature_name} not added, check processing.")
        continue

    CAT_FEATURES.add(new_feature_name)

In [None]:
# 3. For each feature in to_recode_3_cat, create a three-level
#    "<feature>_cat" column (labels from condition_labels) in both frames.
for feature, bound in to_recode_3_cat.items():
    new_feature_name = f"{feature}_cat"

    # Build the conditions for both frames first, then recode train and test.
    frames_with_conditions = [
        (frame, get_conditions(frame, feature, bound))
        for frame in (train_data, test_data)
    ]
    for frame, conditions in frames_with_conditions:
        recode_to_categories(frame, new_feature_name, conditions, condition_labels)

    # Register the new feature only if it came out categorical in both frames.
    train_ok = is_categorical_dtype(train_data, new_feature_name)
    test_ok = is_categorical_dtype(test_data, new_feature_name)
    if not (train_ok & test_ok):
        print(f"Feature {new_feature_name} not added, check processing.")
        continue

    CAT_FEATURES.add(new_feature_name)


In [None]:
# 4. Derive the target column "attack" from target_feature in both frames.
for frame in (train_data, test_data):
    recode_binary_target_feature(frame, target_feature, "attack")

# Report whether "attack" ended up categorical in the train and the test frame.
target_ok = is_categorical_dtype(train_data, "attack") & is_categorical_dtype(test_data, "attack")
print(
    "New target variable created succsessfully."
    if target_ok
    else "Something went wrong with the target variable!"
)



New target variable created succsessfully.


In [29]:
# All features are preprocessed at this point: snapshot both feature sets
# as lists and report how many features each one holds.
numerical_features = [*NUM_FEATURES]
categorical_features = [*CAT_FEATURES]

print(
    f"There are {len(numerical_features)} numerical and {len(categorical_features)} categorical features."
)

There are 22 numerical and 18 categorical features.
