# Network Anomaly Detection

## Contributors

#### Amr Yasser 6772
#### Elhussein Sabri 6716
#### Marwan Khaled 7020


## References 
[Detailed Analysis of the KDD CUP 99 Data Set — Mahbod Tavallaee, Ebrahim Bagheri, Wei Lu, and Ali A. Ghorbani](https://www.ecb.torontomu.ca/~bagheri/papers/cisda.pdf)
https://www.kdnuggets.com/2020/04/dbscan-clustering-algorithm-machine-learning.html

## Download Dataset

You can download the dataset from
* [KDD CUP 1999 DATA](https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html)

or download it from Kaggle.

### Imports

In [3]:
import os
from time import time

# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gzip
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import kneighbors_graph as knn
from sklearn.metrics.pairwise import rbf_kernel as rbf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.metrics.cluster import contingency_matrix
from sklearn.preprocessing import LabelEncoder
from scipy.stats import entropy
from tabulate import tabulate
from sklearn.cluster import kmeans_plusplus

## CoLab

In [2]:
from google.colab import drive

# Expose Google Drive under /content/drive so the dataset archives can be read.
drive.mount('/content/drive')

# Locations of the three KDD Cup 1999 archives inside the mounted Drive:
# the 10% training subset, the labelled test set, and the full training set.
path_train, path_test, path_all = (
    '/content/drive/MyDrive/Network Anomaly Detection/kddcup.data_10_percent.gz',
    '/content/drive/MyDrive/Network Anomaly Detection/corrected.gz',
    '/content/drive/MyDrive/Network Anomaly Detection/kddcup.data.gz',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Kaggle

In [4]:
# Locations of the three KDD Cup 1999 archives in the Kaggle input directory:
# the 10% training subset, the labelled test set, and the full training set.
path_train, path_test, path_all = (
    '/kaggle/input/kdd-cup-1999-data/kddcup.data_10_percent.gz',
    '/kaggle/input/kdd-cup-1999-data/corrected.gz',
    '/kaggle/input/kdd-cup-1999-data/kddcup.data.gz',
)

### Utils

In [5]:
def plot(x,y,title):
    """Display a bar chart of ``y`` against ``x`` with the given title.

    The axes are labelled 'Clusters' (x) and 'Counts' (y); the figure is
    8x6 inches and is shown immediately.
    """
    _, axes = plt.subplots(figsize=(8, 6))
    axes.bar(x, y)
    axes.set_title(title)
    axes.set_xlabel('Clusters')
    axes.set_ylabel('Counts')
    plt.show()

def analyze(model,ground_truth,pred,k):
    """Print and plot a summary of one clustering run.

    Parameters
    ----------
    model : fitted clustering model
        Must expose ``n_clusters``, ``execution_time``, ``labels`` and
        ``iterations``.
    ground_truth : array-like
        True class label of every evaluated sample.
    pred : array-like
        Cluster index assigned to every evaluated sample.
    k : int
        Number of clusters requested; used as the denominator of the
        averaged F1 score.
    """
    print(f'K-Means at k = {model.n_clusters} after {model.execution_time}s:\n')
    fit_labels, fit_counts = np.unique(model.labels, return_counts=True)
    plot(fit_labels, fit_counts, f'Clustering counts at k = {model.n_clusters} after {model.iterations} iterations')

    pred_labels, pred_counts = np.unique(pred, return_counts=True)
    precisions, recalls, fscores = accuracy_scores(ground_truth, pred)

    # Iterate over the clusters that actually occur in `pred`. A cluster that
    # received no sample has no entry in `pred_counts` or in the per-cluster
    # scores, so indexing by range(k) would raise IndexError or mislabel rows.
    dist = [
        [int(label), count]
        for label, count in zip(pred_labels, pred_counts)
    ]
    data = [
        [int(label), precision, recall, fscore]
        for label, precision, recall, fscore
        in zip(pred_labels, precisions, recalls, fscores)
    ]

    print("\nClusters distribution")
    print(f'{tabulate(dist, headers=["Cluster", "Samples"],tablefmt="psql")}')

    print("\nAccuracy measures")
    print(f'{tabulate(data, headers=["Cluster", "Precision", "Recall", "F-Score"],tablefmt="psql")}')
    # Empty clusters contribute nothing to the sum but still count in k.
    print(f'\nF1-Score: {np.sum(fscores/k)}')
    print(f'Clustering conditional entropy: {conditional_entropy(ground_truth,pred)}')


def cluster(k,init='++',mode='kmeans',sim='nn',nn=10,gamma=1.0):
    """Run one clustering experiment on the notebook's global data and report it.

    Parameters
    ----------
    k : int
        Number of clusters.
    init : str
        Centroid initialisation passed to ``KMeans`` (``mode='kmeans'`` only).
    mode : {'kmeans', 'spectral'}
        ``'kmeans'`` fits on the global ``train`` and evaluates predictions
        for the global ``test`` against ``ground_truth``. ``'spectral'`` runs
        ``NCut`` on the global ``trainN`` and evaluates its labels against
        ``ground_truthN``.
    sim, nn, gamma
        Similarity settings forwarded to ``NCut`` (``mode='spectral'`` only).

    Raises
    ------
    ValueError
        If ``mode`` is not one of the supported values (previously the call
        silently did nothing).
    """
    if mode == 'kmeans':
        model = KMeans(n_clusters=k, init=init).fit(train)
        pred = model.predict(test)
        analyze(model, ground_truth, pred, k)
    elif mode == 'spectral':
        model = NCut(trainN, k, similarity=sim, nn=nn, gamma=gamma)
        analyze(model, ground_truthN, model.labels, k)
    else:
        raise ValueError(
            f"Unknown clustering mode {mode!r}; expected 'kmeans' or 'spectral'"
        )

# K-Means Implementation

In [7]:
# Define the K-Means algorithm
class KMeans:
    """K-means clustering (Lloyd iterations) on a dense 2-D feature matrix.

    Parameters
    ----------
    n_clusters : int
        Number of clusters.
    init : {'++', 'random'}
        How initial centroids are chosen when none are supplied: ``'++'``
        uses ``kmeans_plusplus``; ``'random'`` picks distinct rows of the
        training data.
    max_iter : int
        Maximum number of assignment/update iterations per ``fit`` call.
    centroids : array-like of shape (n_clusters, n_features), optional
        Explicit starting centroids. When given, ``init`` is not used.

    Attributes
    ----------
    centroids : ndarray of float
        Current centroids (empty until fitted or supplied).
    labels : ndarray of int
        Cluster index of every training sample, set by ``fit``.
    iterations : int
        Number of iterations performed by the last ``fit`` call.
    execution_time : float
        Wall-clock seconds taken by the last ``fit`` call.
    """

    # Cap on the number of float64 elements in the temporary
    # (n_clusters, rows, n_features) difference array built per chunk
    # (2**23 elements = 64 MiB). Without chunking that array covers the whole
    # data set at once and can reach several GB.
    _MAX_CHUNK_ELEMENTS = 1 << 23

    def __init__(self, n_clusters=3, init='++', max_iter=300, centroids=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.init = init
        # Stored as float so that writing a mean into a centroid row never
        # truncates (integer centroids used to round every update down).
        self.centroids = np.array(
            [] if centroids is None else centroids, dtype=float
        )
        self.iterations = 0

    def _closest_centroid(self, X):
        """Return, for every row of ``X``, the index of the nearest centroid."""
        X = np.atleast_2d(np.asarray(X, dtype=float))
        n_samples = X.shape[0]
        nearest = np.empty(n_samples, dtype=np.intp)
        rows = max(1, self._MAX_CHUNK_ELEMENTS // max(1, self.centroids.size))
        for start in range(0, n_samples, rows):
            block = X[start:start + rows]
            diff = block[np.newaxis, :, :] - self.centroids[:, np.newaxis, :]
            # Squared distances are enough: the square root is monotonic and
            # therefore does not change which centroid is the closest.
            nearest[start:start + rows] = np.argmin(
                (diff ** 2).sum(axis=2), axis=0
            )
        return nearest

    def fit(self, train, random_state=None):
        """Cluster ``train`` and return ``self``.

        Parameters
        ----------
        train : array-like of shape (n_samples, n_features)
            Training data; converted to a float array.
        random_state : int, numpy RandomState or None
            Seed for centroid initialisation. ``None`` keeps using NumPy's
            global random state for ``'random'`` initialisation.

        Raises
        ------
        ValueError
            If centroids must be initialised and ``init`` is not supported.
        """
        start = time()
        train = np.asarray(train, dtype=float)

        # Initialise centroids unless they were supplied or already fitted.
        if len(self.centroids) == 0:
            if self.init == '++':
                centers, _ = kmeans_plusplus(
                    train, n_clusters=self.n_clusters, random_state=random_state
                )
                self.centroids = np.array(centers, dtype=float)
            elif self.init == 'random':
                if random_state is None:
                    rng = np.random
                elif isinstance(random_state, np.random.RandomState):
                    rng = random_state
                else:
                    rng = np.random.RandomState(random_state)
                chosen = rng.choice(
                    train.shape[0], self.n_clusters, replace=False
                )
                self.centroids = train[chosen]
            else:
                raise ValueError(
                    f"Init method {self.init!r} not applicable; "
                    "expected '++' or 'random'"
                )

        self.iterations = 0
        self.labels = np.zeros(len(train), dtype=int)

        # Convergence is detected by comparing against the previous
        # iteration's centroids, so all-zero starting centroids are still
        # refined (they used to compare equal to a zero placeholder).
        while self.iterations < self.max_iter:
            prev_centroids = self.centroids.copy()

            # Assign each point to the closest centroid
            self.labels = self._closest_centroid(train)

            # Update centroids; a cluster without points keeps its centroid
            for i in range(self.n_clusters):
                points = train[self.labels == i]
                if len(points) > 0:
                    self.centroids[i] = points.mean(axis=0)

            # Restore any centroid whose update produced NaN
            invalid = np.isnan(self.centroids).any(axis=1)
            self.centroids[invalid] = prev_centroids[invalid]

            self.iterations += 1
            if np.array_equal(self.centroids, prev_centroids):
                break

        self.execution_time = time() - start

        return self

    def predict(self, X):
        """Return the index of the nearest centroid for every row of ``X``."""
        return self._closest_centroid(X)

# Normalized Cuts Implementation

In [8]:
# Define the Normalized Cuts algorithm
def NCut(train, k, nn=10, similarity= 'nn', gamma=1.0):
    """Spectral clustering via the normalized-cut relaxation.

    Parameters
    ----------
    train : array-like of shape (n_samples, n_features)
        Samples to cluster.
    k : int
        Number of clusters, and number of eigenvectors kept.
    nn : int
        Neighbours per sample for ``similarity='nn'``.
    similarity : {'nn', 'rbf'}
        ``'nn'`` builds a symmetrised k-nearest-neighbour connectivity
        graph; ``'rbf'`` uses the RBF kernel matrix.
    gamma : float
        RBF kernel coefficient for ``similarity='rbf'``.

    Returns
    -------
    KMeans
        The K-Means model fitted on the row-normalised spectral embedding;
        its ``labels`` attribute holds the cluster of every sample.

    Raises
    ------
    ValueError
        If ``similarity`` is not supported, or a sample has zero degree.
    """
    # Compute the (symmetric) affinity matrix
    if similarity == 'nn':
        sim = knn(train, nn, mode='connectivity').toarray()
        A = sim + sim.T
    elif similarity == 'rbf':
        A = rbf(train, train, gamma)
    else:
        raise ValueError(
            f"Similarity method {similarity!r} not applicable; "
            "expected 'nn' or 'rbf'"
        )

    # Degrees come from the affinity matrix that defines the Laplacian, not
    # from the unsymmetrised neighbour graph.
    degrees = A.sum(axis=1)
    if np.any(degrees <= 0):
        raise ValueError('Affinity matrix contains a sample with zero degree')
    inv_sqrt_deg = 1.0 / np.sqrt(degrees)

    # Unnormalised Laplacian L = D - A, scaled in place to the symmetric
    # normalised Laplacian D^-1/2 L D^-1/2. np.linalg.eigh requires a
    # symmetric matrix, which D^-1 L is not in general.
    L = np.diag(degrees) - A
    L *= inv_sqrt_deg[:, np.newaxis]
    L *= inv_sqrt_deg[np.newaxis, :]

    # eigh returns eigenvalues in ascending order, so the first k columns
    # belong to the k smallest eigenvalues.
    eigenvalues, eigenvectors = np.linalg.eigh(L)

    # If v is an eigenvector of the symmetric form, D^-1/2 v is an
    # eigenvector of D^-1 L for the same eigenvalue.
    U = inv_sqrt_deg[:, np.newaxis] * eigenvectors[:, :k]

    Y = Normalizer().fit_transform(U)

    # Cluster the normalized eigenvectors using K-Means
    model = KMeans(n_clusters = k).fit(Y)

    return model

# Evaluation Implementation

In [9]:
# Conditional Entropy
def conditional_entropy(ground_truth,clusters):
    """Return the conditional entropy (in bits) of the true labels given the clusters.

    Every cluster contributes the base-2 entropy of its true-label
    distribution, weighted by the fraction of all samples it contains.
    """
    table = contingency_matrix(ground_truth, clusters)
    n_samples = len(ground_truth)

    # Columns of the contingency matrix hold the per-class counts of one cluster.
    weighted_entropies = [
        (np.sum(column) / n_samples) * entropy(column / np.sum(column), base=2)
        for column in table.T
    ]

    return np.sum(weighted_entropies)

# Precision, Recall, and F1-Score
def accuracy_scores(ground_truth,clusters):
    """Return per-cluster precision, recall and F-score arrays.

    Each cluster is matched to its most frequent true class. Precision is
    that class's share of the cluster; recall is the share of that class
    captured by the cluster; the F-score is their harmonic mean.
    """
    table = contingency_matrix(ground_truth, clusters)
    _, class_sizes = np.unique(ground_truth, return_counts=True)

    # Row index and count of the dominant true class of every cluster (column).
    dominant_class = table.argmax(axis=0)
    dominant_count = table.max(axis=0)

    precision = dominant_count / table.sum(axis=0)
    recall = dominant_count / class_sizes[dominant_class]

    return precision,recall,(2*precision*recall)/(precision+recall)

# Extracting Data and formatting

In [10]:
# Read the three gzip-compressed CSV files; none of them has a header row.
with gzip.open(path_train, 'rb') as f:
    train_data = pd.read_csv(f, header=None)

with gzip.open(path_test, 'rb') as f:
    test_data = pd.read_csv(f, header=None)

with gzip.open(path_all, 'rb') as f:
    train_all_data = pd.read_csv(f, header=None)

# Column names for the DataFrames: the features followed by the label.
cols = [
    "duration", "protocol_type", "service", "flag", "src_bytes",
    "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised", "root_shell",
    "su_attempted", "num_root", "num_file_creations", "num_shells",
    "num_access_files", "num_outbound_cmds", "is_host_login",
    "is_guest_login", "count", "srv_count", "serror_rate",
    "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
    "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
    "dst_host_srv_count", "dst_host_same_srv_rate",
    "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
    "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
    "dst_host_srv_serror_rate", "dst_host_rerror_rate",
    "dst_host_srv_rerror_rate", "label"
]

train_data.columns = cols
test_data.columns = cols
train_all_data.columns = cols

# removing the 2 samples with service = icmp as they do not match in train data
# .copy() makes the filtered frame independent of the original, because it is
# modified in place below (drop) and by later column assignments.
test_data = test_data[test_data.service != 'icmp'].copy()

# Integer-encoded true labels of the test samples (K-Means evaluation)
ground_truth = LabelEncoder().fit_transform(test_data['label'])

train_data.drop('label', axis=1, inplace=True)
test_data.drop('label', axis=1, inplace=True)

train_all_y = train_all_data['label']
train_all_data.drop('label', axis=1, inplace=True)

# Stratified 0.25% sample of the full training set, used for spectral
# clustering, together with its integer-encoded true labels.
trainN, _, trainN_y, _ = train_test_split(
    train_all_data, train_all_y,
    train_size=0.0025, random_state=42, stratify=train_all_y,
)
ground_truthN = LabelEncoder().fit_transform(trainN_y)


### Pre Processing


In [12]:
# Convert categorical features to numerical features
train_categorical_columns = train_data.select_dtypes(include=['object']).columns

# test_data and trainN were produced by row selection from larger frames;
# take explicit copies before assigning columns into them.
test_data = test_data.copy()
trainN = trainN.copy()

for col in train_categorical_columns:
    # The encoder fitted on the training column is reused for the test
    # column, so identical categories get identical codes in both.
    le = LabelEncoder().fit(train_data[col])

    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])
    # trainN is a separate sample and gets its own encoder.
    trainN[col] = LabelEncoder().fit_transform(trainN[col])


# Scale every feature to [0, 1] using the ranges seen in the training data
scaler = MinMaxScaler().fit(train_data)

train = scaler.transform(train_data)
test = scaler.transform(test_data)

# Scale the spectral-clustering sample as well. Its k-NN graph and RBF kernel
# are distance-based, so unscaled features with large ranges would otherwise
# dominate the similarity. It stays a DataFrame with the same columns and index.
trainN = pd.DataFrame(
    MinMaxScaler().fit_transform(trainN),
    columns=trainN.columns,
    index=trainN.index,
)


# Clustering Using K-Means

In [13]:
# K-Means (default '++' initialisation) with 7 clusters
cluster(k=7)

KeyboardInterrupt: 

In [None]:
# K-Means (default '++' initialisation) with 15 clusters
cluster(k=15)

In [None]:
# K-Means (default '++' initialisation) with 23 clusters
cluster(k=23)

In [None]:
# K-Means (default '++' initialisation) with 31 clusters
cluster(k=31)

In [None]:
# K-Means (default '++' initialisation) with 45 clusters
cluster(k=45)

# Clustering Using Normalized cut

## Using K-NN

In [None]:
# Normalized cut with 23 clusters on a 10-nearest-neighbour similarity graph
cluster(k=23, mode='spectral', sim='nn', nn=10)

## Using RBF Kernel

In [None]:
# Normalized cut with 23 clusters on an RBF-kernel similarity (gamma = 1.0)
cluster(k=23, mode='spectral', sim='rbf', gamma=1.0)

## New Clustering Algorithm: DBSCAN

In [None]:
# DBSCAN Clustering
# Imported in this cell because the notebook's import cell does not provide
# DBSCAN or silhouette_score; without them the cell fails with NameError.
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan.fit(train)
labels_db = dbscan.labels_

# silhouette_score needs between 2 and n_samples - 1 distinct labels and
# raises ValueError otherwise (e.g. when everything lands in one cluster).
# NOTE(review): the score is computed over all samples, which is quadratic in
# the number of rows; consider silhouette_score's sample_size argument.
n_labels_db = len(np.unique(labels_db))
if 2 <= n_labels_db <= len(labels_db) - 1:
    silhouette_db = silhouette_score(train, labels_db)
    print("Silhouette Score for DBSCAN Clustering:", silhouette_db)
else:
    silhouette_db = float('nan')
    print(f"Silhouette score undefined: DBSCAN produced {n_labels_db} distinct label(s)")