<a href="https://colab.research.google.com/github/Basel-byte/Network-Anomaly-Detection/blob/main/Pr_Assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# !gzip -d /content/kddcup.data.gz

# Practice Lab: Clustering

In this exercise, we will see how the K-Means and Normalized Cut algorithms can be used for network anomaly detection.


# Outline
- [ 1 - Packages ](#1)
- [ 2 - Read data and Change the categorical features to numerical](#2)
- [ 3 - Kmeans](#3)
- [ 4 - Neural Networks](#4)
  - [ 4.1 Problem Statement](#4.1)
  - [ 4.2 Dataset](#4.2)
  - [ 4.3 Model representation](#4.3)
  - [ 4.4 Tensorflow Model Implementation](#4.4)
  - [ 4.5 Softmax placement](#4.5)
    - [ Exercise 2](#ex02)

<a name="1"></a>
## 1 - Packages 

First, let's run the cell below to import all the packages that you will need during this assignment.
- [numpy](https://numpy.org/) is the fundamental package for scientific computing with Python.
<!-- - [matplotlib](http://matplotlib.org) is a popular library to plot graphs in Python. -->
<!-- - [tensorflow](https://www.tensorflow.org/) a popular platform for machine learning. -->
- [pandas](https://pandas.pydata.org/) is open source data analysis and manipulation tool.

In [None]:
import numpy as np
import pandas as pd
import os
import requests
from enum import Enum
import re

<a name="2"></a>
## 2 - Reading data

### Read data from csv file

#### Read the column names

In [None]:

# Text file expected to hold one "<column_name>: <data_type>." entry per line.
DATASET_COLUMNS_FILE = "/content/kddcup1999_columns.txt"
# Despite its name, this list collects the column *names* (in file order).
column_types = []

with open(DATASET_COLUMNS_FILE, 'r') as file:
    column_labels: str = file.read()

# Only the `column_name` group is used below; `data_type` is matched but ignored.
column_regex: re.Pattern = re.compile(r"^(?P<column_name>\w+): (?P<data_type>\w+)\.$")
for line_number, column_type in enumerate(column_labels.splitlines(), start=1):
    column_type = column_type.strip()
    if not column_type:
        # Blank lines carry no column definition; skip them instead of crashing.
        continue
    match = column_regex.match(column_type)
    if match is None:
        # Fail with a message that points at the bad line, rather than the
        # AttributeError that calling `.group` on `None` would produce.
        raise ValueError(
            f"{DATASET_COLUMNS_FILE}:{line_number}: expected "
            f"'<column_name>: <data_type>.', got {column_type!r}"
        )
    column_types.append(match.group("column_name"))

#### Set the column names

In [None]:
# The CSV is read with header=None, i.e. its first row is treated as data and
# pandas assigns positional integer column labels.
data = pd.read_csv("/content/kddcup_data.csv", header=None)
# Replace the positional labels with the names parsed from the columns file.
# pandas raises if the number of names does not match the number of columns.
data.columns = column_types


#### Remove the class label so the data can be clustered

In [None]:
# Build the feature-only frame by dropping the "class" column; `drop` returns
# a new frame, so `data` itself keeps the column.
data_without_lables = data.drop(labels="class", axis="columns")
# Show the resulting frame as the cell output.
data_without_lables

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,tcp,http,SF,215,45076,0,0,0,0,...,0,0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0
1,0,tcp,http,SF,162,4528,0,0,0,0,...,1,1,1.0,0.0,1.00,0.00,0.0,0.00,0.0,0.0
2,0,tcp,http,SF,236,1228,0,0,0,0,...,2,2,1.0,0.0,0.50,0.00,0.0,0.00,0.0,0.0
3,0,tcp,http,SF,233,2032,0,0,0,0,...,3,3,1.0,0.0,0.33,0.00,0.0,0.00,0.0,0.0
4,0,tcp,http,SF,239,486,0,0,0,0,...,4,4,1.0,0.0,0.25,0.00,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4898426,0,tcp,http,SF,212,2288,0,0,0,0,...,3,255,1.0,0.0,0.33,0.05,0.0,0.01,0.0,0.0
4898427,0,tcp,http,SF,219,236,0,0,0,0,...,4,255,1.0,0.0,0.25,0.05,0.0,0.01,0.0,0.0
4898428,0,tcp,http,SF,218,3610,0,0,0,0,...,5,255,1.0,0.0,0.20,0.05,0.0,0.01,0.0,0.0
4898429,0,tcp,http,SF,219,1234,0,0,0,0,...,6,255,1.0,0.0,0.17,0.05,0.0,0.01,0.0,0.0


### Change the categorical features to numerical

In [None]:
def convert_string_to_numeric(data_frame):
  """Replace every object- or string-dtype column with integer codes.

  Within each encoded column, distinct values are numbered 0, 1, 2, ... in
  order of first appearance. Columns of any other dtype are left unchanged.

  The frame is modified in place and also returned, so callers that ignore
  the return value still see the encoded columns.

  Args:
    data_frame: pandas DataFrame whose text columns should be encoded.

  Returns:
    The same ``data_frame`` object that was passed in.

  Note:
    Missing values (NaN/None) in an encoded column receive the code -1.
  """
  for col in list(data_frame.columns):
    column = data_frame[col]
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
      # Write the codes back through the frame itself. Calling
      # `data_frame[col].replace(..., inplace=True)` acts on a temporary
      # Series, which pandas does not guarantee to propagate to the parent
      # frame (and never does under Copy-on-Write).
      codes, _ = pd.factorize(column)
      data_frame[col] = codes
  return data_frame

In [None]:
# Encode the text columns of the feature frame as integers. The return value is
# not stored: the cell relies on the helper updating `data_without_lables` in
# place, and the returned frame is only displayed as the cell output.
convert_string_to_numeric(data_without_lables)

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,0,0,0,215,45076,0,0,0,0,...,0,0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0
1,0,0,0,0,162,4528,0,0,0,0,...,1,1,1.0,0.0,1.00,0.00,0.0,0.00,0.0,0.0
2,0,0,0,0,236,1228,0,0,0,0,...,2,2,1.0,0.0,0.50,0.00,0.0,0.00,0.0,0.0
3,0,0,0,0,233,2032,0,0,0,0,...,3,3,1.0,0.0,0.33,0.00,0.0,0.00,0.0,0.0
4,0,0,0,0,239,486,0,0,0,0,...,4,4,1.0,0.0,0.25,0.00,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4898426,0,0,0,0,212,2288,0,0,0,0,...,3,255,1.0,0.0,0.33,0.05,0.0,0.01,0.0,0.0
4898427,0,0,0,0,219,236,0,0,0,0,...,4,255,1.0,0.0,0.25,0.05,0.0,0.01,0.0,0.0
4898428,0,0,0,0,218,3610,0,0,0,0,...,5,255,1.0,0.0,0.20,0.05,0.0,0.01,0.0,0.0
4898429,0,0,0,0,219,1234,0,0,0,0,...,6,255,1.0,0.0,0.17,0.05,0.0,0.01,0.0,0.0


In [None]:
# drop_duplicates() returns a new frame and leaves its receiver untouched, so
# the result must be assigned back for later cells to see the de-duplicated
# data. The surviving rows keep their original index labels.
data_without_lables = data_without_lables.drop_duplicates()
data_without_lables


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,0,0,0,215,45076,0,0,0,0,...,0,0,0.0,0.0,0.00,0.00,0.0,0.00,0.0,0.0
1,0,0,0,0,162,4528,0,0,0,0,...,1,1,1.0,0.0,1.00,0.00,0.0,0.00,0.0,0.0
2,0,0,0,0,236,1228,0,0,0,0,...,2,2,1.0,0.0,0.50,0.00,0.0,0.00,0.0,0.0
3,0,0,0,0,233,2032,0,0,0,0,...,3,3,1.0,0.0,0.33,0.00,0.0,0.00,0.0,0.0
4,0,0,0,0,239,486,0,0,0,0,...,4,4,1.0,0.0,0.25,0.00,0.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4898426,0,0,0,0,212,2288,0,0,0,0,...,3,255,1.0,0.0,0.33,0.05,0.0,0.01,0.0,0.0
4898427,0,0,0,0,219,236,0,0,0,0,...,4,255,1.0,0.0,0.25,0.05,0.0,0.01,0.0,0.0
4898428,0,0,0,0,218,3610,0,0,0,0,...,5,255,1.0,0.0,0.20,0.05,0.0,0.01,0.0,0.0
4898429,0,0,0,0,219,1234,0,0,0,0,...,6,255,1.0,0.0,0.17,0.05,0.0,0.01,0.0,0.0


<a name="3"></a>
## 3 - Kmeans



In [None]:
def Kmeans(data_frame, k = 3):
  