## Importing the libraries

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from google.colab import drive
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph
from sklearn.model_selection import train_test_split
import pickle

In [2]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Extracting the features

In [3]:
featuresPath = "/content/drive/MyDrive/Colab Notebooks/Anomaly Detection/features.txt"
with open(featuresPath, "r") as file:
  # splitting at ':' and taking the first word
  features = [line.split(':')[0] for line in file]
# dropping the first 2 lines as they are unnecessary
features.pop(0), features.pop(0)
features.append('attack_type')

In [4]:
len(features)

41

## Reading the dataset

In [5]:
path = "/content/drive/MyDrive/Colab Notebooks/Anomaly Detection/kddcup.data_10_percent.gz"
df = pd.read_csv(path, compression='gzip', names=features, header=None)
df.head()

Unnamed: 0,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,tcp,http,SF,181,5450,0,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,239,486,0,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,235,1337,0,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,219,1337,0,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
0,tcp,http,SF,217,2032,0,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


## Changing the categorical features to numerical

In [6]:
# getting all categorical columns
cat_columns = df.select_dtypes(['object']).columns
# converting all categorical columns to numeric
df[cat_columns] = df[cat_columns].apply(lambda x: pd.factorize(x)[0])
df.head()

Unnamed: 0,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,num_failed_logins,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,attack_type
0,0,0,0,181,5450,0,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,239,486,0,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,235,1337,0,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,219,1337,0,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,0
0,0,0,0,217,2032,0,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0


In [7]:
y = df.iloc[:,-1:].to_numpy()
x = df.iloc[:,:-1].to_numpy()
print(y.shape)
print(x.shape)

(494021, 1)
(494021, 40)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(x, y, train_size=0.005, random_state=42, stratify=y)

In [9]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2470, 40)
(491551, 40)
(2470, 1)
(491551, 1)


In [10]:
file_name = 'X_train.pkl'
with open(file_name, 'wb') as file:
    pickle.dump(X_train, file)
    print(f'Object successfully saved to "{file_name}"')

Object successfully saved to "X_train.pkl"


In [11]:
file_name = 'X_test.pkl'
with open(file_name, 'wb') as file:
    pickle.dump(X_test, file)
    print(f'Object successfully saved to "{file_name}"')

Object successfully saved to "X_test.pkl"


In [12]:
file_name = 'y_train.pkl'
with open(file_name, 'wb') as file:
    pickle.dump(y_train, file)
    print(f'Object successfully saved to "{file_name}"')

Object successfully saved to "y_train.pkl"


In [13]:
file_name = 'y_test.pkl'
with open(file_name, 'wb') as file:
    pickle.dump(y_test, file)
    print(f'Object successfully saved to "{file_name}"')

Object successfully saved to "y_test.pkl"


In [14]:
file_name = 'X_train.pkl'
with open(file_name, 'rb') as file:
    X_train = pickle.load(file)
    print(f'Object successfully loaded from "{file_name}"')

Object successfully loaded from "X_train.pkl"


In [15]:
file_name = 'X_test.pkl'
with open(file_name, 'rb') as file:
    X_test = pickle.load(file)
    print(f'Object successfully loaded from "{file_name}"')

Object successfully loaded from "X_test.pkl"


In [16]:
file_name = 'y_train.pkl'
with open(file_name, 'rb') as file:
    y_train = pickle.load(file)
    print(f'Object successfully loaded from "{file_name}"')

Object successfully loaded from "y_train.pkl"


In [17]:
file_name = 'y_test.pkl'
with open(file_name, 'rb') as file:
    y_test = pickle.load(file)
    print(f'Object successfully loaded from "{file_name}"')

Object successfully loaded from "y_test.pkl"


In [18]:
def normcut(F,k,gamma,kernel = True):
    if kernel == True:
        kern = rbf_kernel(F, gamma=gamma)
    else:
        kern = kneighbors_graph(F,k).toarray()
    kern = np.array(kern)
    degree = np.zeros(len(kern))
    c_sum = kern.sum(axis = 0)
    r_sum = kern.sum(axis = 1)
    for i in range(c_sum.shape[0]):
        degree[i] = (c_sum[i] + r_sum[i])/2
    degree = np.diag(degree)
    kern = degree - kern
    degree = np.linalg.inv(degree)
    kern = np.matmul(degree,kern)
    (evalues,evectors) = np.linalg.eig(kern)
    idx = evalues.argsort()
    evalues = [idx]
    evectors = evectors[ : , idx]
    evectors = evectors.T[:k]
    normFactor = np.sqrt(np.sum(np.power(evectors,2),axis=1))
    normFactor = normFactor.reshape((len(evectors),1))
    for i in range(len(normFactor)):
        if normFactor[i] == 0:
            normFactor[i] = 1
    Y = np.real(evectors/normFactor).T
    return Y, KMeans(n_clusters=k).fit(Y).labels_

In [19]:
(Y_h , labels_h) = normcut(X_train,23,0.01,kernel = False)



In [20]:
Y_h.shape

(2470, 23)

In [21]:
labels_h.shape

(2470,)