In [67]:
from google.colab import drive
# Mount Google Drive into the Colab filesystem; every path built from BASE
# below lives under this mount point.
drive.mount('/content/drive')

# NOTE(review): RANDOM_SEED is not referenced by any cell visible here --
# presumably consumed by later modelling code; confirm or remove.
RANDOM_SEED = 42
# Directory prefix (with trailing slash) that is string-concatenated with
# file names for all dataset reads and artifact writes in this notebook.
BASE = '/content/drive/MyDrive/Univerui/Bakalauras/'

import pandas as pd
import numpy as np

# Names assigned, in order, to the columns of the train/test files when they
# are read. The last two entries are the target label ('attack') and 'level'.
Columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot',
    'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations',
    'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login',
    'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'attack', 'level',
]

# Both files are parsed with identical options: comma-separated, UTF-8, and
# column names supplied explicitly through `Columns`.
_read_options = dict(sep=',', encoding='utf-8', names=Columns)

train_df = pd.read_csv(BASE+'KDDTrain+.txt', **_read_options)
test_df = pd.read_csv(BASE+'KDDTest+.txt', **_read_options)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [68]:
# Columns removed from both train and test frames before any transformation.
features_to_drop = [
    'land',
    'root_shell',
    'is_host_login',
    'num_outbound_cmds',
    'level',
]

# Features that receive the log(x + 1) transform further down.
# NOTE(review): presumably flagged as right-skewed during earlier exploration
# that is not part of this notebook -- confirm.
skew_pos = [
    'duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent',
    'hot', 'num_failed_logins', 'is_guest_login', 'count', 'srv_count',
    'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]

# Transformed together with skew_pos (the two lists are concatenated).
# NOTE(review): presumably the left-skewed features -- confirm.
skew_neg = ['same_srv_rate', 'dst_host_count']

# Converting the multi-class target to a binary target

In [69]:
def attack_binarization(row):
  """Collapse a record's 'attack' label into a two-class label.

  Args:
    row: mapping-like record (e.g. a DataFrame row) with an 'attack' key.

  Returns:
    'normal' when the label is exactly 'normal'; 'attack' for any other value.
  """
  return 'normal' if row['attack'] == 'normal' else 'attack'

# Overwrite the 'attack' column of each split with its binary label, computed
# row by row with attack_binarization.
for labelled_df in (train_df, test_df):
  labelled_df['attack'] = labelled_df.apply(attack_binarization, axis=1)

# Dropping columns

In [70]:
# Remove the unwanted columns from both splits, mutating the frames in place.
for split_df in (train_df, test_df):
  split_df.drop(columns=features_to_drop, inplace=True)

# Removing Skewness

In [71]:
# Apply log(1 + x) to every feature listed in skew_pos and skew_neg, on both
# splits. np.log1p is used instead of np.log(x + 1) because it stays accurate
# when x is close to zero.
# NOTE: this cell overwrites the columns, so re-running it without reloading
# the data applies the transform a second time.
for split_df in (train_df, test_df):
  for feature in skew_pos + skew_neg:
    split_df[feature] = np.log1p(split_df[feature])

# Splitting the target from the rest of the dataset

In [72]:
# For each split: the 'attack' column becomes the target vector and every
# remaining column stays in the feature frame.
y_train = train_df['attack']
X_train = train_df.drop(columns='attack')

y_test = test_df['attack']
X_test = test_df.drop(columns='attack')

# One-Hot Encoding categorical features

In [73]:
# The three categorical columns are handled separately from the other features.
categories = ['protocol_type', 'service', 'flag']

# Take independent (deep) copies of the categorical columns ...
train_categories = X_train.loc[:, categories].copy()
test_categories = X_test.loc[:, categories].copy()

# ... then strip those columns from the feature frames in place.
for feature_df in (X_train, X_test):
  feature_df.drop(columns=categories, inplace=True)

## Reducing the service column to the top 5 services, with all remaining services grouped as `other`

In [74]:
# Service values kept as their own category; any other value is relabelled
# 'other' by service_top5.
top_5 = [
    'http',
    'private',
    'telnet',
    'ftp_data',
    'smtp',
]

In [75]:
def service_top5(row, top_5):
  """Keep a record's service if it is one of the retained services.

  Args:
    row: mapping-like record (e.g. a DataFrame row) with a 'service' key.
    top_5: container of service names to keep unchanged.

  Returns:
    The record's 'service' value when it is in `top_5`, otherwise 'other'.
  """
  service = row['service']
  return service if service in top_5 else 'other'

# Rewrite the 'service' column of each categorical frame row by row; top_5 is
# forwarded to service_top5 as its second positional argument.
for categorical_df in (train_categories, test_categories):
  categorical_df['service'] = categorical_df.apply(service_top5, axis=1, args=(top_5,))

In [76]:
# One-hot encode the categorical columns of the training split; its dummy
# columns define the feature layout.
train_categories = pd.get_dummies(train_categories)

# Encoding the test split on its own can produce a different column set or
# order (a category missing from, or only present in, the test data). Align it
# to the training layout: columns absent from test are added as all-zero,
# columns unseen in training are dropped, and the order follows train.
test_categories = (
    pd.get_dummies(test_categories)
    .reindex(columns=train_categories.columns, fill_value=0)
    # keep dummy dtypes identical to the training frame after zero-filling
    .astype(train_categories.dtypes.to_dict())
)

# Scaling numeric features (scaler fitted on train data, applied to train and test)

In [77]:
from sklearn.preprocessing import MinMaxScaler

In [78]:
# Learn the per-feature min/max from the training features only, then apply
# the same scaling to both splits. The transformed results are arrays, not
# DataFrames.
scaler = MinMaxScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [79]:
# Ask the fitted scaler for its output feature names; they are used below to
# label the columns when the scaled arrays are wrapped back into DataFrames.
columns = scaler.get_feature_names_out()

In [80]:
# Wrap the scaled arrays back into DataFrames carrying the scaler's feature
# names as column labels.
X_train, X_test = (
    pd.DataFrame(scaled, columns=columns) for scaled in (X_train, X_test)
)

# Combining numeric and categorical features

In [81]:
# Put the one-hot columns to the right of the scaled numeric columns.
# Rows are matched on index labels.
X_train = pd.concat((X_train, train_categories), axis='columns')
X_test = pd.concat((X_test, test_categories), axis='columns')

In [82]:
# Re-attach the target as the last column so that each saved file holds the
# features and the label together.
X_train = pd.concat((X_train, y_train), axis='columns')
X_test = pd.concat((X_test, y_test), axis='columns')

# Label encoding target

In [86]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only, then replace the string labels
# in both splits with the integer codes it learned.
encoder = LabelEncoder()
encoder.fit(X_train['attack'])

for labelled_df in (X_train, X_test):
  labelled_df['attack'] = encoder.transform(labelled_df['attack'])

In [92]:
# Display the labels in the order the encoder stores them; a label's position
# in this array is its integer code (per the output: 'attack' -> 0,
# 'normal' -> 1).
encoder.classes_

array(['attack', 'normal'], dtype=object)

# Saving MinMaxScaler and LabelEncoder

In [93]:
import pickle

# Persist the fitted preprocessing objects so the same label mapping and
# feature scaling can be reapplied later. The `with` blocks guarantee each
# file is closed (and flushed) even if pickling raises.
with open(BASE+'label_encoder.pkl', 'wb') as output_le:
  pickle.dump(encoder, output_le)

with open(BASE+'minmaxscaler.pkl', 'wb') as output_scaler:
  pickle.dump(scaler, output_scaler)

# Saving processed data

In [94]:
# Write each processed split (features plus encoded target) to a CSV file
# under BASE, without the index column.
for processed_df, file_name in ((X_train, 'train_data.csv'), (X_test, 'test_data.csv')):
  processed_df.to_csv(BASE+file_name, index=False)