In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import gc
import pickle
import math
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import radius_neighbors_graph
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.cluster import contingency_matrix
from sklearn.cluster import SpectralClustering
from scipy.stats import entropy

## Loading data

In [None]:
path_train = "/content/drive/MyDrive/Colab Notebooks/NAD/kddcup.data_10_percent.gz"
path_test = "/content/drive/MyDrive/Colab Notebooks/NAD/corrected.gz"

In [None]:
columns = [str(i) for i in range(1, 43)]
data_train = pd.read_csv(path_train, compression='gzip', header=None, names=columns)
data_test = pd.read_csv(path_test, compression='gzip', header=None, names=columns)

In [None]:
DATA_TRAIN_ROWS = data_train.shape[0]
all_data = pd.concat([data_train, data_test])
print(all_data.shape)

(805050, 42)


In [None]:
all_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,33,34,35,36,37,38,39,40,41,42
0,0,tcp,http,SF,181,5450,0,0,0,0,...,9,1.0,0.0,0.11,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


In [None]:
all_data.isnull().sum()

1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    0
41    0
42    0
dtype: int64

In [None]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 805050 entries, 0 to 311028
Data columns (total 42 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   1       805050 non-null  int64  
 1   2       805050 non-null  object 
 2   3       805050 non-null  object 
 3   4       805050 non-null  object 
 4   5       805050 non-null  int64  
 5   6       805050 non-null  int64  
 6   7       805050 non-null  int64  
 7   8       805050 non-null  int64  
 8   9       805050 non-null  int64  
 9   10      805050 non-null  int64  
 10  11      805050 non-null  int64  
 11  12      805050 non-null  int64  
 12  13      805050 non-null  int64  
 13  14      805050 non-null  int64  
 14  15      805050 non-null  int64  
 15  16      805050 non-null  int64  
 16  17      805050 non-null  int64  
 17  18      805050 non-null  int64  
 18  19      805050 non-null  int64  
 19  20      805050 non-null  int64  
 20  21      805050 non-null  int64  
 21  22      80

### Categorical features to numerical

In [None]:
# the columns that contain non numbers values
object_cols = all_data.select_dtypes(exclude=['number']).columns.tolist()
object_cols.pop()
object_cols

['2', '3', '4']

In [None]:
for i in range(0,len(object_cols)):
    print("\n*column:", object_cols[i])
    print(all_data[object_cols[i]].value_counts())


*column: 2
icmp    448571
tcp     309422
udp      47057
Name: 2, dtype: int64

*column: 3
ecr_i      445752
private    189403
http       105530
smtp        17991
other        9422
            ...  
urh_i          14
tim_i          14
tftp_u          2
icmp            2
red_i           1
Name: 3, Length: 67, dtype: int64

*column: 4
SF        626819
S0        105019
REJ        68820
RSTO        1972
RSTR        1775
S3           299
SH           191
S1            84
S2            46
RSTOS0        13
OTH           12
Name: 4, dtype: int64


In [None]:
def dataFrame_encoding(dataFrame, object_column):
    """One-hot encode a single column of a frame.

    Args:
        dataFrame: pandas DataFrame holding the column to expand.
        object_column: name of the column to replace by indicator columns.

    Returns:
        A new DataFrame in which `object_column` has been replaced by one
        `<object_column>_<value>` indicator column per distinct value; the
        remaining columns are carried over unchanged.
    """
    columns_to_expand = [object_column]
    return pd.get_dummies(dataFrame, columns=columns_to_expand)

def dataFrame_label_encoding(dataFrame, object_column):
    """Replace one column by integer codes from a freshly fitted LabelEncoder.

    Note that the column is overwritten on the frame that was passed in; the
    same (mutated) frame object is returned for convenience.

    Args:
        dataFrame: pandas DataFrame holding the column to encode.
        object_column: name of the column to encode.

    Returns:
        `dataFrame` itself, with `object_column` replaced by its codes.
    """
    encoded_values = LabelEncoder().fit_transform(dataFrame[object_column])
    dataFrame[object_column] = encoded_values
    return dataFrame

In [None]:
# data encoding (hot encoding)
for object_col in object_cols:
    all_data = dataFrame_encoding(all_data, object_col)

In [None]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 805050 entries, 0 to 311028
Columns: 120 entries, 1 to 4_SH
dtypes: float64(15), int64(23), object(1), uint8(81)
memory usage: 307.9+ MB


In [None]:
enc_data_train = all_data.iloc[:DATA_TRAIN_ROWS,:]
enc_data_test = all_data.iloc[DATA_TRAIN_ROWS:,:]
print(enc_data_train.shape)
print(enc_data_test.shape)

(494021, 120)
(311029, 120)


In [None]:
# Separate the target column first so that no feature matrix (including the
# 2.5% sample below) still carries the string label in column "42".
enc_train_label = enc_data_train['42']
enc_data_train = enc_data_train.drop("42", axis='columns')

enc_test_label = enc_data_test['42']
enc_data_test = enc_data_test.drop("42", axis='columns')

# Stratified 2.5% subsample of the training features together with the labels
# of exactly those rows. The fixed random_state makes the sample (and every
# score computed from it) reproducible across kernel restarts.
enc_data_train25, _, enc_train25_label, _ = train_test_split(
    enc_data_train, enc_train_label,
    test_size=0.975, train_size=0.025,
    stratify=enc_train_label, random_state=42)

In [None]:
enc_test_label.value_counts()

smurf.              164091
normal.              60593
neptune.             58001
snmpgetattack.        7741
mailbomb.             5000
guess_passwd.         4367
snmpguess.            2406
satan.                1633
warezmaster.          1602
back.                 1098
mscan.                1053
apache2.               794
processtable.          759
saint.                 736
portsweep.             354
ipsweep.               306
httptunnel.            158
pod.                    87
nmap.                   84
buffer_overflow.        22
multihop.               18
named.                  17
sendmail.               17
ps.                     16
rootkit.                13
xterm.                  13
teardrop.               12
xlock.                   9
land.                    9
xsnoop.                  4
ftp_write.               3
loadmodule.              2
perl.                    2
udpstorm.                2
worm.                    2
phf.                     2
sqlattack.               2
i

In [None]:
enc_train_label.value_counts()

smurf.              280790
neptune.            107201
normal.              97278
back.                 2203
satan.                1589
ipsweep.              1247
portsweep.            1040
warezclient.          1020
teardrop.              979
pod.                   264
nmap.                  231
guess_passwd.           53
buffer_overflow.        30
land.                   21
warezmaster.            20
imap.                   12
rootkit.                10
loadmodule.              9
ftp_write.               8
multihop.                7
phf.                     4
perl.                    3
spy.                     2
Name: 42, dtype: int64

In [None]:
del all_data
del data_train
del data_test

## Evaluation

In [None]:
def per_eval(clusters, labels_train):
  # Evaluation helper whose body was removed from this copy of the notebook.
  # As written, the only remaining statement returns four names that are never
  # assigned inside the function, so calling it raises NameError unless the
  # removed code is restored.
  # Callers in this notebook pass `clusters` as a list of lists of row indices
  # and `labels_train` as a numpy array of labels, and unpack the result as
  # (percision, recall, ftotal, entropy).
  # NOTE(review): presumably the indices in `clusters` are positions into
  # `labels_train` -- confirm against the removed implementation.
    #   / removed code /
  return percision,recall,ftotal,entropy

def per_eval_test_data(clusters, labels_train, clusters_test, labels_test):
  # Evaluation helper for held-out data whose body was removed from this copy
  # of the notebook. As written, the only remaining statement returns four
  # names that are never assigned inside the function, so calling it raises
  # NameError unless the removed code is restored.
  # Callers in this notebook pass the per-cluster index lists of a fitted
  # K_Means model for `clusters` / `clusters_test` and pandas label Series for
  # `labels_train` / `labels_test`.
  # NOTE(review): how the train clusters are used to score the test clusters
  # cannot be determined from here -- confirm against the removed code.
#   / removed code /
  return percision,recall,ftotal,entropy

## K-Means

In [None]:

class K_Means():
  """Lloyd's k-means over a dense 2-D numeric array.

  After `fit`:
    * `centroid_loc[c]` is the centroid of cluster `c` (float array, shape (k, m)),
    * `labels[c]` is the list of row indices assigned to cluster `c`,
    * `clusters[c]` is the list of the corresponding data rows.
  """

  def __init__(self,n_clusters=2,max_iter=100):
    """
    n_clusters: number of centroids (k).
    max_iter:   upper bound on the number of centroid updates performed by `fit`.
    """
    self.n_clusters = n_clusters
    self.max_iter = max_iter

  def check_repeat(self,point,data):
    """Return True unless `point` is equal to exactly one row of `data`."""
    repeated = sum(1 for row in data if np.array_equal(point, row))
    return repeated != 1

  def euclidean(self,point, data):
    """
    Euclidean distance between point & data.
    Point has dimensions (m,), data has dimensions (n,m), and output will be of size (n,).
    """
    return np.sqrt(np.sum((point - data)**2, axis=1))

  def _init_centroids(self, data):
    """Pick `n_clusters` pairwise-distinct rows of `data` at random (uses `random`)."""
    k = self.n_clusters
    chosen = []
    # Walk a random permutation of the rows and keep the first k distinct ones.
    # This always terminates, unlike re-drawing until a distinct row shows up.
    for row_idx in random.sample(range(len(data)), len(data)):
      candidate = data[row_idx]
      if not any(np.array_equal(candidate, picked) for picked in chosen):
        chosen.append(candidate)
        if len(chosen) == k:
          break
    if len(chosen) < k:
      raise ValueError(
          "need at least %d distinct rows to initialise %d centroids, found %d"
          % (k, k, len(chosen)))
    # Float dtype on purpose: an integer centroid array would truncate the rows.
    return np.array(chosen, dtype=float)

  def _assign(self, data, centroids):
    """Index of the nearest centroid for every row of `data` (ties -> lowest index)."""
    nearest = np.zeros(len(data), dtype=int)
    best = np.full(len(data), np.inf)
    # One vectorised pass per centroid keeps memory at O(n) instead of O(n*k).
    for c_idx, centroid in enumerate(centroids):
      dist = self.euclidean(centroid, data)
      closer = dist < best  # strict '<' keeps the first minimum, like np.argmin
      nearest[closer] = c_idx
      best[closer] = dist[closer]
    return nearest

  def fit(self,data):
    """
    Cluster the rows of `data` (array-like, shape (n, m)).

    Stops when the centroids no longer change or after `max_iter` updates,
    whichever comes first. When the iteration cap is hit, `labels`/`clusters`
    describe the assignment made against the centroids of the last-but-one update.

    Raises ValueError for empty / non-2-D input, for n_clusters < 1, or when
    the data has fewer than `n_clusters` distinct rows.
    """
    data = np.asarray(data, dtype=float)
    if data.ndim != 2 or data.shape[0] == 0:
      raise ValueError("data must be a non-empty 2-D array")
    k=self.n_clusters
    if k < 1:
      raise ValueError("n_clusters must be at least 1")

    self.centroid_loc = self._init_centroids(data)
    self.clusters = [[] for _ in range(k)] #Each row represents the cluster number and holds the member rows
    self.labels = [[] for _ in range(k)]   #Each row represents the cluster number and holds the member row indices

    iterations = 0
    while iterations < self.max_iter:
      #Assigning
      nearest = self._assign(data, self.centroid_loc)
      members = [np.flatnonzero(nearest == c) for c in range(k)]
      self.labels = [idx.tolist() for idx in members]
      self.clusters = [list(data[idx]) for idx in members]

      #Updating; a centroid that lost all of its points keeps its previous location
      old_centroid_loc = self.centroid_loc
      self.centroid_loc = np.array([
          data[idx].mean(axis=0) if len(idx) else old_centroid_loc[c]
          for c, idx in enumerate(members)])

      if np.array_equal(self.centroid_loc, old_centroid_loc):
        break
      iterations += 1
      print(iterations)
    print('Done')
    for cluster in self.clusters:
      print(len(cluster))


  def get_centroids(self):
    """Centroid matrix of shape (n_clusters, m) computed by `fit`."""
    return self.centroid_loc

  def get_labels(self):
    """Per-cluster lists of training row indices computed by `fit`."""
    return self.labels

  def get_clusters(self):
    """Per-cluster lists of training rows computed by `fit`."""
    return self.clusters

  def predict(self,point):
    """
    Return the index of the centroid closest to `point`.

    Side effect kept from the original implementation: `point` is appended to
    `clusters[idx]` (but no index is added to `labels`).
    """
    distances = self.euclidean(point,self.centroid_loc)
    idx = np.argmin(distances) #Gets the index of the shortest distance (Index represent the cluster number)
    self.clusters[idx].append(point)
    return idx

  def predictTestDataLabels(self,data,centroids=None):
    """
    Assign every row of `data` to its nearest centroid and store the result in
    `labels_test` (one list of row indices per centroid).

    centroids: centroid matrix to use; defaults to the fitted `centroid_loc`.
    """
    if centroids is None:
      centroids = self.centroid_loc
    centroids = np.asarray(centroids, dtype=float)
    data = np.asarray(data, dtype=float)
    nearest = self._assign(data, centroids)
    self.labels_test = [np.flatnonzero(nearest == c).tolist()
                        for c in range(len(centroids))]

  def getTestDataLabels(self):
    """Per-centroid lists of row indices from the last `predictTestDataLabels` call."""
    return self.labels_test



### Test K-Means

**7 Clusters**

In [None]:
model_7 = K_Means(n_clusters = 7)
model_7.fit(data)

**15 Clusters**

In [None]:
model_15 = K_Means(n_clusters = 15)
model_15.fit(data)

**23 Clusters**

In [None]:
model_23 = K_Means(n_clusters = 23)
model_23.fit(data)

**31 Clusters**

In [None]:
model_31 = K_Means(n_clusters = 31)
model_31.fit(data)

**45 Clusters**

In [None]:
model_45 = K_Means(n_clusters=45)
model_45.fit(data)

**Prediction**

In [None]:
model_7.predictTestDataLabels(scaled_test_data,model_7.get_centroids())

In [None]:
model_15.predictTestDataLabels(scaled_test_data,model_15.get_centroids())

In [None]:
model_23.predictTestDataLabels(scaled_test_data,model_23.get_centroids())

In [None]:
model_31.predictTestDataLabels(scaled_test_data,model_31.get_centroids())

In [None]:
model_45.predictTestDataLabels(scaled_test_data,model_45.get_centroids())

**Saving Outputs**

In [None]:
# Context manager closes the handle even if pickling fails part-way.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_7.p', 'wb') as file:
    pickle.dump(model_7, file)

In [None]:
# Context manager closes the handle even if pickling fails part-way.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_15.p', 'wb') as file:
    pickle.dump(model_15, file)

In [None]:
# Context manager closes the handle even if pickling fails part-way.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_23.p', 'wb') as file:
    pickle.dump(model_23, file)

In [None]:
# Context manager closes the handle even if pickling fails part-way.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_31.p', 'wb') as file:
    pickle.dump(model_31, file)

In [None]:
# Context manager closes the handle even if pickling fails part-way.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_45.p', 'wb') as file:
    pickle.dump(model_45, file)

**Loading Outputs**

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# Context manager closes the handle even if unpickling fails.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_7.p', 'rb') as file:
    model_7 = pickle.load(file)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# Context manager closes the handle even if unpickling fails.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_15.p', 'rb') as file:
    model_15 = pickle.load(file)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# Context manager closes the handle even if unpickling fails.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_23.p', 'rb') as file:
    model_23 = pickle.load(file)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# Context manager closes the handle even if unpickling fails.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_31.p', 'rb') as file:
    model_31 = pickle.load(file)

In [None]:
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
# Context manager closes the handle even if unpickling fails.
with open('/content/drive/MyDrive/Colab Notebooks/Pattern/projects/Assignment2/model_45.p', 'rb') as file:
    model_45 = pickle.load(file)

**Evaluating Clusters**

In [None]:
print('Percision,Recall,F1,Entropy \n',per_eval_test_data(model_7.labels, enc_train_label, model_7.getTestDataLabels(), enc_test_label))

Percision,Recall,F1,Entropy 
 0.9079410601583774 0.76134437918244 0.8098711563880899 0.4724252038768968


In [None]:
print('Percision,Recall,F1,Entropy \n',per_eval_test_data(model_15.labels, enc_train_label, model_15.getTestDataLabels(), enc_test_label))

Percision,Recall,F1,Entropy 
 0.9094296673300559 0.7068826788549926 0.7595786273820956 0.3718986117052112


In [None]:
print('Percision,Recall,F1,Entropy \n',per_eval_test_data(model_23.labels, enc_train_label, model_23.getTestDataLabels(), enc_test_label))

Percision,Recall,F1,Entropy 
 0.8353883399940198 0.5605009214168368 0.6439767033841309 0.346937629587989


In [None]:
print('Percision,Recall,F1,Entropy \n',per_eval_test_data(model_31.labels, enc_train_label, model_31.getTestDataLabels(), enc_test_label))

Percision,Recall,F1,Entropy 
 0.9135128878657615 0.6930343557462837 0.7470155393811239 0.34331698088169327


In [None]:
print('Percision,Recall,F1,Entropy \n',per_eval_test_data(model_45.labels, enc_train_label, model_45.getTestDataLabels(), enc_test_label))

Percision,Recall,F1,Entropy 
 0.8328548141813142 0.5355852444057375 0.6186606248141209 0.3440473137692984


## Spectral Clustering:

### Implementation:

In [None]:
from sklearn.metrics.pairwise import rbf_kernel
NUM_CLUSTERS = 23

def spectral_clustering(data, num_clusters, k_neighbors, random_state=None):
    """Spectral clustering on a k-nearest-neighbour connectivity graph.

    Steps: build the kNN adjacency matrix A, form the Laplacian L = D - A and
    the row-normalised Laplacian D^-1 L, take the eigenvectors belonging to the
    `num_clusters` smallest eigenvalues, normalise each row of that embedding
    to unit length and run KMeans on the rows.

    Args:
        data: feature matrix accepted by `kneighbors_graph` (one row per sample).
        num_clusters: number of eigenvectors to keep and of KMeans clusters.
        k_neighbors: number of neighbours per sample in the graph.
        random_state: forwarded to KMeans; None keeps the unseeded behaviour.

    Returns:
        1-D array with one cluster label per row of `data`.

    Raises:
        numpy.linalg.LinAlgError: if a sample has degree 0 (D is singular).
    """
    # Weight Matrix (other affinities tried: rbf_kernel, radius_neighbors_graph,
    # cosine_similarity -- crashed the session --, 1 - jaccard distance)
    A = kneighbors_graph(data, k_neighbors).toarray()
    print("A is done")

    # Degree Matrix: only its diagonal is ever needed, so keep it as a vector
    # instead of materialising (and later inverting) a dense n x n matrix.
    degrees = np.sum(A, axis=1)
    if np.any(degrees == 0):
        raise np.linalg.LinAlgError("degree matrix is singular: a sample has no neighbours")
    print("D is done")

    # Laplacian Matrix L = D - A, built without allocating D
    L = -A
    L[np.diag_indices_from(L)] += degrees
    print("L is done")

    # D^-1 L is a row scaling because D is diagonal; done in place to save memory
    L /= degrees[:, np.newaxis]
    La = L
    print("La is done")

    # NOTE(review): the kNN graph is directed, so La is not symmetric and `eig`
    # may return complex pairs (only the real parts are used below). Symmetrising
    # A would allow `eigh`, but would change the reported results.
    eigen_val, eigen_vec = np.linalg.eig(La)
    idx = np.real(eigen_val).argsort()[:num_clusters]
    eigen_vec = np.real(eigen_vec[:,idx])

    rows_norm = np.linalg.norm(eigen_vec, axis=1)
    rows_norm[rows_norm == 0] = 1.0  # leave all-zero rows as zeros instead of NaN
    normalized_eig_vectors = eigen_vec / rows_norm[:, np.newaxis]
    clusters = KMeans(n_clusters=num_clusters, random_state=random_state).fit_predict(normalized_eig_vectors)

    return clusters


def clusters_rows(cluters_labels, k):
    """Group row positions by cluster label.

    Args:
        cluters_labels: iterable with one cluster label per row, in row order.
        k: number of clusters; valid labels are 0 .. k-1.

    Returns:
        A list of `k` lists; entry `c` holds the positions (0-based, in input
        order) of the rows labelled `c`. Rows whose label is outside 0 .. k-1
        (e.g. a -1 noise label) are left out, but still count as a position so
        the indices of the following rows stay aligned with the input.
    """
    clusters = [[] for _ in range(k)]
    valid_labels = set(range(k))
    for row, cluter_label in enumerate(cluters_labels):
        if cluter_label in valid_labels:
            clusters[int(cluter_label)].append(row)
    return clusters

In [None]:
spectral_model = spectral_clustering(enc_data_train25, NUM_CLUSTERS, 100)
print(spectral_model.shape)

A is done
D is done
L is done
La is done




(12350,)


In [None]:
clusters_rows_idx= clusters_rows(spectral_model, NUM_CLUSTERS)
# print(clusters_rows_idx)

### Built-in function:

In [None]:
model = SpectralClustering(n_clusters=NUM_CLUSTERS, affinity='nearest_neighbors', n_neighbors=100, assign_labels='kmeans').fit(enc_data_train25)
# print(model.labels_.shape)

In [None]:
clusters_rows_idx_BIn = clusters_rows(model.labels_, NUM_CLUSTERS)
# print(clusters_rows_idx)

**Scaled Data:**

In [None]:
model_scaled = SpectralClustering(n_clusters=NUM_CLUSTERS, affinity='nearest_neighbors', n_neighbors=100, assign_labels='kmeans').fit(data_train25)
# print(model.labels_.shape)



In [None]:
clusters_rows_idx_scaled_BIn = clusters_rows(model_scaled.labels_, NUM_CLUSTERS)
# print(clusters_rows_idx)

### Evaluation:

#### Evaluation spectral clustering (the implementation):

In [None]:
# Cluster row ids are positions inside the 2.5% sample, so they must be scored
# against the labels of exactly those sampled rows (looked up by index), not
# against the labels of the full training set.
sample_labels = enc_train_label.loc[enc_data_train25.index].to_numpy()
# `entropy_score` (not `entropy`) so the scipy.stats.entropy import is not clobbered.
percision, recall, ftotal, entropy_score = per_eval(clusters_rows_idx, sample_labels)
print("Percision:",percision)
print("Recall:",recall)
print("f1:",ftotal)
print("Entropy:",entropy_score)

Percision: 0.976761133603239
Recall: 0.34082993274050183
f1: 0.4460768674717294
Entropy: 0.10595037261097427


#### Evaluation of spectral clustering (the built-in function):

In [None]:
# Cluster row ids are positions inside the 2.5% sample, so they must be scored
# against the labels of exactly those sampled rows (looked up by index), not
# against the labels of the full training set.
sample_labels = enc_train_label.loc[enc_data_train25.index].to_numpy()
# `entropy_score` (not `entropy`) so the scipy.stats.entropy import is not clobbered.
percision, recall, ftotal, entropy_score = per_eval(clusters_rows_idx_BIn, sample_labels)
print("Percision:",percision)
print("Recall:",recall)
print("f1:",ftotal)
print("Entropy:",entropy_score)

Percision: 0.9767611336032388
Recall: 0.34211743375337504202
f1:0.4482836668118859
Entropy: 0.12765192974549107


## DB-Scan

In [None]:
import numpy as np

def dbscan(X, eps, min_samples):
    """Density-based clustering (DBSCAN) of the rows of `X`.

    A point is a core point when it has at least `min_samples` other points
    strictly closer than `eps` (the point itself is not counted). Clusters are
    grown from core points; points reachable only from a core point join the
    cluster as border points; everything else is noise and appears in no cluster.

    Args:
        X: array-like of shape (n_samples, n_features).
        eps: neighbourhood radius (strict `<` comparison).
        min_samples: minimum number of neighbours for a core point.

    Returns:
        A list with one entry per discovered cluster; each entry is the list of
        row indices belonging to that cluster. Empty when no cluster is found.
    """
    X = np.asarray(X)
    n_points = X.shape[0]

    # 0 = not visited yet, -1 = noise, >0 = id of the owning cluster
    labels = np.zeros(n_points, dtype=int)
    cluster_id = 1
    clusters = []

    for i in range(n_points):
        if labels[i] != 0:
            # Already assigned to a cluster or marked as noise
            continue

        neighbors = find_neighbors(X, i, eps)
        if len(neighbors) < min_samples:
            # Provisional noise; may later be claimed as a border point
            labels[i] = -1
            continue

        # Core point: open a new cluster seeded with it (cluster k lives at clusters[k-1])
        labels[i] = cluster_id
        clusters.append([i])
        expand_cluster(X, labels, cluster_id, neighbors, eps, min_samples, clusters)
        cluster_id += 1

    return clusters

def find_neighbors(X, i, eps):
    """Indices (ascending) of all rows of `X` other than `i` strictly closer than `eps` to row `i`."""
    X = np.asarray(X)
    # One vectorised distance computation instead of a Python loop over every row
    within = np.linalg.norm(X - X[i], axis=1) < eps
    within[i] = False
    return np.flatnonzero(within).tolist()

def expand_cluster(X, labels, cluster_id, neighbors, eps, min_samples, clusters):
    """Grow cluster `cluster_id` from the seed list `neighbors`.

    Updates `labels` and `clusters[cluster_id-1]` in place; `neighbors` is used
    as the work queue and is extended in place as new core points are found.
    """
    members = clusters[cluster_id-1]
    queued = set(neighbors)  # every index is enqueued at most once
    i = 0
    while i < len(neighbors):
        j = neighbors[i]
        i += 1
        if labels[j] == -1:
            # Former noise point reachable from a core point: it is a border
            # point of this cluster and must be reported as a member too.
            labels[j] = cluster_id
            members.append(j)
        elif labels[j] == 0:
            labels[j] = cluster_id
            members.append(j)

            new_neighbors = find_neighbors(X, j, eps)
            if len(new_neighbors) >= min_samples:
                # j is itself a core point: everything it reaches joins the queue
                for candidate in new_neighbors:
                    if candidate not in queued:
                        queued.add(candidate)
                        neighbors.append(candidate)

def euclidean(point, data):
  """Euclidean distance between `point` and `data`, reducing over the first axis."""
  squared_diff = (point - data)**2
  total = np.sum(squared_diff, axis=0)
  return np.sqrt(total)

def predictTestDataLabels(data,centroids,k):
  """Bucket the rows of `data` by nearest centroid.

  Returns a list of `k` lists; entry `c` holds the indices of the rows whose
  closest centroid (by `euclidean`, first minimum on ties) is `centroids[c]`.
  """
  buckets = [[] for _ in range(k)]
  n_centroids = len(centroids)
  for row_idx in range(len(data)):
    sample = data[row_idx]
    dists = [euclidean(sample, centroids[c]) for c in range(n_centroids)]
    nearest = np.argmin(dists)  # index of the shortest distance == cluster number
    buckets[nearest].append(row_idx)
  return buckets

In [None]:
data = data
data_test = enc_data_test.to_numpy()

In [None]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(data)
distances, indices = nbrs.kneighbors(data)

In [None]:
import plotly.express as px
distances = np.sort(distances, axis=0)
distances = distances[:,1]

In [None]:
px.line(distances)

In [None]:
clusters = dbscan(data, eps=0.50487, min_samples=24)

In [None]:
# performace evaluation on training data
percision,recall,f1,entropy = per_eval(clusters,np.array(enc_train_label))
print('Percision = ' + str(percision))
print('recall = ' + str(recall))
print('f1 = ' + str(f1))
print('entropy = ' + str(entropy))

Percision = 0.9557085020242914
recall = 0.771934702822814
f1 = 0.8144968092174271
entropy = 0.7146384994491659




---


