In [None]:
'''
Import necessary libraries
'''

import pandas as pd
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
%matplotlib inline

# Download the 10% KDD Cup 99 sample straight from the UCI archive.
# The file is read without a header row, so column names are attached below.
# NOTE: nothing is written to disk here; the data only lives in `df`.
url = 'http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz'
df = pd.read_csv(url, header=None)

# Names for the 41 feature columns followed by the label column ('outcome').
df.columns = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in',
    'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds',
    'is_host_login', 'is_guest_login', 'cnt', 'srv_count', 'serror_rate',
    'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
    'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
    'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate', 'outcome',
]

# Summary statistics; print() is required because this is not the last
# expression of the cell and would otherwise be discarded silently.
print(df.describe())

# Categorical / flag columns that are removed before PCA, which is applied to
# the remaining numeric features only.
columns_drop = ['is_host_login', 'protocol_type', 'service', 'flag', 'land',
                'logged_in', 'is_guest_login']


In [None]:
# Remove the categorical columns.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run: columns that are already gone are skipped
# instead of raising KeyError.
df = df.drop(columns=columns_drop, errors='ignore')

# Check the remaining features after the removal
df.head()

In [None]:
# Split the frame by label: rows tagged 'normal.' are regular traffic,
# every other label is treated as abnormal.
is_normal = df['outcome'] == 'normal.'

# All normal rows
df_normal = df[is_normal]
df_normal.head()

# All abnormal rows (shown as the cell output)
df_abnormal = df[~is_normal]
df_abnormal.head()

In [None]:
# Summary statistics of the abnormal rows
print(df_abnormal.describe())

In [None]:
# Preview the first normal rows
df_normal.head()

In [None]:
# Summary statistics of the normal rows
print(df_normal.describe())

In [None]:
# Reconstruction-error score used for abnormal detection
def anomalyScores(originalDF, reducedDF):
  """Return the squared reconstruction error of every row.

  Both inputs are converted to NumPy arrays first, so index and column
  labels are ignored and entries are matched purely by position.

  Args:
    originalDF: 2-D array-like holding the original feature values.
    reducedDF: 2-D array-like of the same shape holding the reconstruction.

  Returns:
    1-D NumPy array with one value per row: the sum over columns of the
    squared differences. The scores are returned raw; any scaling is left
    to the caller.
  """
  original = np.array(originalDF)
  reconstructed = np.array(reducedDF)
  residual = original - reconstructed
  return (residual ** 2).sum(axis=1)

In [None]:
# Define the PCA
from pandas.core.common import random_state
from sklearn.decomposition import PCA

def perform_pca(dataX, n_components=0.8):
  """Fit PCA on ``dataX`` and score every row by its reconstruction error.

  Args:
    dataX: 2-D numeric data (DataFrame or array) used both to fit the PCA
      and to compute the scores.
    n_components: forwarded unchanged to ``PCA``. Defaults to 0.8, the value
      that used to be hard-coded here.

  Returns:
    Tuple ``(pca, anomalyScoresPCA)``: the fitted PCA object and a 1-D NumPy
    array with the squared reconstruction error of each row of ``dataX``.
  """
  pca = PCA(n_components)

  # Project onto the retained components, then map back to feature space.
  projected = pca.fit_transform(dataX)
  reconstructed = pca.inverse_transform(projected)

  # anomalyScores compares by position, so no DataFrame wrapping is needed.
  anomalyScoresPCA = anomalyScores(dataX, reconstructed)
  return pca, anomalyScoresPCA

# PCA transform (manual version of the projection step)
def self_pca_transform(X_train, pca):
  """Centre ``X_train`` with ``pca.mean_`` and project it onto the components.

  Args:
    X_train: object supporting ``-`` with an array and a ``.dot`` method
      (e.g. a DataFrame or NumPy array) with one column per feature.
    pca: object exposing ``mean_`` and ``components_``.

  Returns:
    Whatever ``X_train.dot`` returns: the centred data multiplied by
    ``pca.components_.T``.
  """
  centered = X_train - pca.mean_
  loadings = pca.components_.T
  return centered.dot(loadings)
  
# PCA inverse transform (manual version of the reconstruction step)
def self_inverse_transform(X_pca, pca):
  """Map projected data back to feature space and add the mean back.

  Args:
    X_pca: projected data with a ``.dot`` method (DataFrame or NumPy array),
      one column per component.
    pca: object exposing ``mean_`` and ``components_``.

  Returns:
    ``X_pca.dot(pca.components_)`` shifted by ``pca.mean_``.
  """
  back_projected = X_pca.dot(pca.components_)
  return back_projected + pca.mean_

# PCA transform with zero mean (no centring step)
# NOTE: a later cell redefines this name so that the second argument is the
# projection matrix itself rather than a fitted PCA object.
def self_pca_transform_with_zero_mean(X_train, pca):
  """Project ``X_train`` onto ``pca.components_`` without subtracting a mean.

  Args:
    X_train: data with a ``.dot`` method, one column per feature.
    pca: object exposing ``components_``.

  Returns:
    ``X_train.dot(pca.components_.T)``.
  """
  loadings = pca.components_.T
  return X_train.dot(loadings)
  
# PCA inverse transform with zero mean (no mean is added back)
# NOTE: a later cell redefines this name so that the second argument is the
# projection matrix itself rather than a fitted PCA object.
def self_inverse_transform_with_zero_mean(X_pca, pca):
  """Map projected data back to feature space using ``pca.components_`` only.

  Args:
    X_pca: projected data with a ``.dot`` method, one column per component.
    pca: object exposing ``components_``.

  Returns:
    ``X_pca.dot(pca.components_)``.
  """
  basis = pca.components_
  return X_pca.dot(basis)

In [None]:
# Training set: the 10000:90000 slice of the normal traffic, label removed
train_block = df_normal[10000:90000].copy()
print(train_block.shape)
dataX = train_block.drop(columns='outcome')

# Fit PCA on the normal training rows and show the shape of the learned
# component matrix (components x features)
pca = PCA(0.9)
pca.fit_transform(dataX)
pca.components_.shape

(80000, 35)


(2, 34)

In [None]:
# Held-out normal traffic: the first 10000 normal rows, label removed
dataX_test = df_normal[:10000].drop(columns='outcome')

# Project onto the fitted components and map back to feature space
dataX_test_trans = self_pca_transform(dataX_test, pca)
dataX_test_inv = self_inverse_transform(dataX_test_trans, pca)

# Raw reconstruction errors and their range on normal data; the range is
# reused by the abnormal-data cell to put both score sets on the same scale
anomalyScoresPCA = anomalyScores(dataX_test, dataX_test_inv)
normal_min = anomalyScoresPCA.min()
normal_max = anomalyScoresPCA.max()

# Min-max scale the scores, then count the normal rows above the cut-off
score_span = normal_max - normal_min
anomalyScoresPCA = (anomalyScoresPCA - normal_min) / score_span
index = anomalyScoresPCA > 0.003
int(index.sum()), normal_min, normal_max, anomalyScoresPCA

(2,
 7162.746870887269,
 149140353.90830803,
 array([0.0007186 , 0.00067385, 0.00063381, ..., 0.0004471 , 0.00045602,
        0.00045593]))

In [None]:
# Abnormal traffic sample: the 100000:200000 slice of the abnormal rows
dataX_test = df_abnormal[100000:200000].drop(columns='outcome')

# Same projection / reconstruction as for the normal rows
dataX_test_trans = self_pca_transform(dataX_test, pca)
dataX_test_inv = self_inverse_transform(dataX_test_trans, pca)

anomalyScoresPCA = anomalyScores(dataX_test, dataX_test_inv)

# Range of the raw abnormal scores (reported only)
abnormal_min = anomalyScoresPCA.min()
abnormal_max = anomalyScoresPCA.max()

# Scale with the range measured on NORMAL data so both score sets share one
# scale, then count the abnormal rows that stay below the cut-off
score_span = normal_max - normal_min
anomalyScoresPCA = (anomalyScoresPCA - normal_min) / score_span
index = anomalyScoresPCA < 0.003

int(index.sum()), abnormal_min, abnormal_max, anomalyScoresPCA

(49,
 72261.57633794047,
 575134.0312532532,
 array([0.00380848, 0.00380848, 0.00380848, ..., 0.00380848, 0.00380848,
        0.00380848]))

In [None]:
# Inspect part of the abnormal slice that was scored above
df_abnormal[100000:140000]

In [None]:
from google.colab import drive

# Mount point for Google Drive inside the Colab runtime
ROOT = "/content/drive"
print(ROOT)
# Mount the drive so that files stored there can be read by later cells
drive.mount(ROOT)

/content/drive
Mounted at /content/drive


In [None]:
# Switch to the project folder on the mounted drive. The path is relative,
# so this only works from the directory that contains `drive`.
%cd drive/MyDrive/USyd/Lab/Projects/Grassmann-ADMM/

/content/drive/MyDrive/USyd/Lab/Projects/Grassmann-ADMM


In [None]:
import numpy as np
# Load a precomputed projection matrix from the current working directory
# (set by the preceding %cd cell). The commented lines are alternative result
# files that can be loaded instead.
V_k = np.load('Grassman_Abnormaldetection_KDD_dim_9_std_client_20_iter_1000_lr_0.0001_sub_0.1_localEpochs_30.npy')
# V_k = np.load('Abnormaldetection_KDD_dim_9_std_client_20_iter_100_learningrate_1e-05.npy')
# V_k = np.load('Grassman_Abnormaldetection_KDD_dim_9_std_client_20_iter_1000_lr_0.0001_sub_0.1.npy')
# Shape check: rows should match the number of feature columns
V_k.shape

(34, 9)

In [None]:
# PCA transform with zero mean, driven by an explicit projection matrix
# NOTE: this replaces the earlier definition of the same name, which took a
# fitted PCA object as its second argument.
def self_pca_transform_with_zero_mean(X_train, V_k):
  """Project ``X_train`` with the matrix ``V_k``; no mean is subtracted.

  Args:
    X_train: data with a ``.dot`` method, one column per feature.
    V_k: matrix whose rows line up with the columns of ``X_train``.

  Returns:
    ``X_train.dot(V_k)``.
  """
  projected = X_train.dot(V_k)
  return projected
  
# PCA inverse transform with zero mean, driven by an explicit projection matrix
# NOTE: this replaces the earlier definition of the same name, which took a
# fitted PCA object as its second argument.
def self_inverse_transform_with_zero_mean(X_pca, V_k):
  """Map projected data back to feature space with ``V_k.T``; no mean is added.

  Args:
    X_pca: projected data with a ``.dot`` method, one column per component.
    V_k: the same matrix that was used for the forward projection.

  Returns:
    ``X_pca.dot(V_k.T)``.
  """
  basis_t = V_k.T
  return X_pca.dot(basis_t)

In [None]:
# Reproducible sample of 8000 rows drawn from the first 10000 normal rows;
# the labelled sample is kept because a later cell concatenates it
dataX_test_n = df_normal[:10000].sample(n=8000, random_state=1)
dataX_test = dataX_test_n.drop(columns='outcome')

# Project with the loaded matrix and reconstruct (no mean centring)
dataX_test_trans = self_pca_transform_with_zero_mean(dataX_test, V_k)
dataX_test_inv = self_inverse_transform_with_zero_mean(dataX_test_trans, V_k)

# Raw reconstruction errors and their range on the normal sample
anomalyScoresPCA = anomalyScores(dataX_test, dataX_test_inv)
normal_min = anomalyScoresPCA.min()
normal_max = anomalyScoresPCA.max()

# Scores are left unscaled here; count the normal rows above the cut-off
index = anomalyScoresPCA > 1200000
int(index.sum()), normal_min, normal_max, anomalyScoresPCA

(4823,
 6.147548972309307,
 43985535971.74373,
 array([ 4031465.60877557,  5842610.83088365,   298386.12596988, ...,
          190372.09165613,  2992664.14263593, 17683745.37151133]))

In [None]:
# Reproducible sample of 3000 abnormal rows; the labelled sample is kept
# because a later cell concatenates it
dataX_test_ab = df_abnormal.sample(n=3000, random_state=1)
dataX_test = dataX_test_ab.drop(columns='outcome')

# Project with the loaded matrix and reconstruct (no mean centring)
dataX_test_trans = self_pca_transform_with_zero_mean(dataX_test, V_k)
dataX_test_inv = self_inverse_transform_with_zero_mean(dataX_test_trans, V_k)

anomalyScoresPCA = anomalyScores(dataX_test, dataX_test_inv)

# Range of the raw abnormal scores
abnormal_min = anomalyScoresPCA.min()
abnormal_max = anomalyScoresPCA.max()

# Scores are left unscaled here; count the abnormal rows below the cut-off
index = anomalyScoresPCA < 1200000

int(index.sum()), abnormal_min, abnormal_max, anomalyScoresPCA

(3722,
 16.27139014466471,
 21094972553763.91,
 array([1213730.26428266, 1213730.26428266,   41243.00984857, ...,
        1214950.7801091 ,   39105.33579555, 1214950.7801091 ]))

In [None]:
# Stack the normal sample on top of the abnormal sample. Later cells split
# the scores by position, so this order (normal first) matters.
dataX_test = pd.concat([dataX_test_n, dataX_test_ab])
dataX_test.shape
# dataX_test.head()

(17000, 35)

In [None]:
def prep_data(dataX):
  """Return a standardised copy of ``dataX``.

  Every column is scaled with a ``StandardScaler`` fitted on ``dataX``
  itself. The result is built as a new DataFrame instead of being assigned
  into a copy through ``.loc``, so integer columns never have to be upcast
  in place to hold the scaled float values.

  Args:
    dataX: DataFrame containing only numeric columns.

  Returns:
    New DataFrame with the same index and column labels as ``dataX`` holding
    the scaled values; ``dataX`` is left untouched.
  """
  sX = StandardScaler(copy=True)
  scaled_values = sX.fit_transform(dataX)
  return pd.DataFrame(scaled_values, index=dataX.index, columns=dataX.columns)

In [None]:
# Build the scaled evaluation matrix from the two labelled samples (normal
# rows first, abnormal rows second). Starting from the samples, rather than
# dropping 'outcome' from `dataX_test` itself, keeps this cell re-runnable:
# a second run does not fail on an already-removed column.
# NOTE: the scaler inside prep_data is fitted on this evaluation data itself.
features_raw = pd.concat([dataX_test_n, dataX_test_ab]).drop(columns='outcome')
dataX_test = prep_data(features_raw)
dataX_test.head()

Unnamed: 0,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,num_failed_logins,num_compromised,root_shell,su_attempted,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
13654,-0.01887,-0.019336,0.040422,-0.037604,0.0,-0.047654,-0.010847,-0.054313,-0.01879,0.0,...,0.651293,0.490569,0.446578,-0.238429,-0.870553,-0.347116,-0.364891,-0.363276,-0.171708,-0.169816
3852,-0.01887,-0.018117,0.096636,-0.037604,0.0,-0.047654,-0.010847,-0.054313,-0.01879,0.0,...,-1.853904,0.490569,0.446578,-0.238429,-0.702312,0.544089,-0.364891,-0.363276,-0.171708,-0.169816
4966,-0.01887,-0.017974,-0.186674,-0.037604,0.0,-0.047654,-0.010847,-0.054313,-0.01879,0.0,...,0.651293,0.490569,0.446578,-0.238429,-0.870553,-0.347116,-0.364891,-0.363276,-0.171708,-0.169816
3888,-0.01887,-0.02,0.466008,-0.037604,0.0,-0.047654,-0.010847,-0.054313,-0.01879,0.0,...,0.651293,0.490569,0.446578,-0.238429,-0.870553,-0.347116,-0.364891,-0.363276,-0.171708,-0.169816
5441,-0.01887,-0.017956,-0.197618,-0.037604,0.0,-0.047654,-0.010847,-0.054313,-0.01879,0.0,...,-1.967308,0.490569,0.446578,-0.238429,1.232459,1.138226,-0.364891,-0.363276,-0.171708,-0.169816


In [None]:
# Reconstruct the scaled evaluation rows through the loaded projection matrix
dataX_test_trans = self_pca_transform_with_zero_mean(dataX_test, V_k)
dataX_test_inv = self_inverse_transform_with_zero_mean(dataX_test_trans, V_k)

anomalyScoresPCA = anomalyScores(dataX_test, dataX_test_inv)

# The normal sample was concatenated first, so its scores are the leading
# len(dataX_test_n) entries. Deriving the split point from the sample keeps
# this cell correct when the sample size is changed.
n_normal = len(dataX_test_n)
normal_score = anomalyScoresPCA[:n_normal]
normal_max = np.amax(normal_score)
normal_min = np.amin(normal_score)

# Normal rows scoring above the threshold are flagged as abnormal.
# NOTE: with "abnormal" as the positive class these are false alarms (false
# positives); the variable is still called FN because later cells read it
# under that name.
index = normal_score > 3
FN = len(normal_score[index])
normal_min, normal_max, FN

(1.2250820415201964, 11574.386400579338, 4350)

In [None]:
# The abnormal sample was concatenated after the normal one, so its scores
# start right after the len(dataX_test_n) normal entries. Deriving the split
# point from the sample keeps this cell correct when the sample size changes.
abnormal_score = anomalyScoresPCA[len(dataX_test_n):]
abnormal_max = np.amax(abnormal_score)
abnormal_min = np.amin(abnormal_score)

# Thresholds recorded from earlier experiments (presumably keyed by the
# number of normal rows sampled -- confirm before relying on them):
# threshold for 8000: 3.5
# threshold for 7000: 4
# threshold for 6000: 4
# threshold for 5000: 5
# threshold for 4000: 6
# threshold for 3000: 8
# threshold for 2000: 11

# Abnormal rows scoring below the threshold are missed by the detector.
# NOTE: with "abnormal" as the positive class these are false negatives; the
# variable is still called FP because later cells read it under that name.
index = abnormal_score < 3
FP = len(abnormal_score[index])
abnormal_min, abnormal_max, FP

(1.2265347529568156, 6582.496372720061, 297)

In [None]:
# Number of scored rows in each class
abnormal_total_samples = len(abnormal_score)
normal_total_samples = len(normal_score)
(abnormal_total_samples, normal_total_samples)

(9000, 8000)

In [None]:
# The detector treats "abnormal" as the positive class (TP is derived from the
# abnormal rows). Under that convention the counters produced by the scoring
# cells are named the wrong way round:
#   FN = normal rows scored above the threshold   -> really false positives
#   FP = abnormal rows scored below the threshold -> really false negatives
# Map them to their real meaning first; using them as named reports precision
# and recall swapped (accuracy and F1 are symmetric and are not affected).
false_positives = FN
false_negatives = FP

TN = normal_total_samples - false_positives
TP = abnormal_total_samples - false_negatives

precision_score = TP / (TP + false_positives)
recall_score = TP / (TP + false_negatives)
accuracy_score = (TP + TN) / (TP + TN + false_positives + false_negatives)
f1_score = 2 * precision_score * recall_score / (precision_score + recall_score)

print(f"Precision: {precision_score * 100.0}")
print(f"Recall: {recall_score * 100.0}")
print(f"Accuracy score: {accuracy_score * 100.0}")
print(f"F1 score: {f1_score * 100.0}")

Precision: 96.7
Recall: 66.67432774074925
Accuracy score: 72.66470588235295
F1 score: 78.92803700176847


In [None]:
# Value range (max - min) of every feature over the normal traffic
normal_data = df_normal.drop(columns='outcome')
normal_columns = normal_data.columns

for column in normal_columns:
  feature = normal_data[column]
  print(column)
  print(f"{feature.max() - feature.min()}")

duration
58329
src_bytes
2194619
dst_bytes
5134218
wrong_fragment
0
urgent
3
hot
30
num_failed_logins
4
num_compromised
884
root_shell
1
su_attempted
2
num_root
993
num_file_creations
28
num_shells
1
num_access_files
8
num_outbound_cmds
0
cnt
511
srv_count
510
serror_rate
1.0
srv_serror_rate
1.0
rerror_rate
1.0
srv_rerror_rate
1.0
same_srv_rate
1.0
diff_srv_rate
1.0
srv_diff_host_rate
1.0
dst_host_count
255
dst_host_srv_count
255
dst_host_same_srv_rate
1.0
dst_host_diff_srv_rate
1.0
dst_host_same_src_port_rate
1.0
dst_host_srv_diff_host_rate
1.0
dst_host_serror_rate
1.0
dst_host_srv_serror_rate
1.0
dst_host_rerror_rate
1.0
dst_host_srv_rerror_rate
1.0


In [None]:
# Value range (max - min) of every feature over the abnormal traffic
abnormal_data = df_abnormal.drop(columns='outcome')
abnormal_columns = abnormal_data.columns
print(abnormal_data.shape)

for column in abnormal_columns:
  feature = abnormal_data[column]
  print(column)
  print(f"{feature.max() - feature.min()}")
