In [None]:
%matplotlib inline

# KDD99 Unsupervised Learning

## 0. Libraries

In [None]:
import numpy as np
import pandas as pd

## 1. Data Description

**Intrinsic attributes**

These attributes are extracted from the headers' area of the network packets.

Col|Feature name  | description |	type
---|--------------|-------------|------------
1  |duration 	  |length (number of seconds) of the connection |continuous
2  |protocol_type |type of the protocol, e.g. tcp, udp, etc. |discrete
3  |service 	  |network service on the destination, e.g., http, telnet, etc. |discrete
4  |flag 	      |normal or error status of the connection. The possible status are this: SF, S0, S1, S2, S3, OTH, REJ, RSTO, RSTOS0, SH, RSTRH, SHR 	|discrete 
5  |src_bytes 	  |number of data bytes from source to destination 	|continuous
6  |dst_bytes 	  |number of data bytes from destination to source 	|continuous
7  |land 	      |1 if connection is from/to the same host/port; 0 otherwise 	|discrete
8  |wrong_fragment|sum of bad checksum packets in a connection 	|continuous
9  |urgent 	      |number of urgent packets. Urgent packets are packets with the urgent bit activated 	|continuous


**Class attribute**

The 42nd attribute is the ***class_attack*** attribute; it indicates the type of connection of each instance: either normal or the name of the attack. The values it can take are the following: *anomaly, dict, dict_simple, eject, eject-fail, ffb, ffb_clear, format, format_clear, format-fail, ftp-write, guest, imap, land, load_clear, loadmodule, multihop, perl_clear, perlmagic, phf, rootkit, spy, syslog, teardrop, warez, warezclient, warezmaster, pod, back, ipsweep, neptune, nmap, portsweep, satan, smurf and normal*.

**Categories of class attribute**


class_attack |Category
-------|--------------
smurf| dos
neptune| dos
back| dos
teardrop| dos
pod| dos
land| dos
normal|normal
satan|probe
ipsweep|probe
portsweep|probe
nmap|probe
warezclient|r2l
guess_passwd|r2l
warezmaster|r2l
imap|r2l
ftp_write|r2l
multihop|r2l
phf|r2l
spy|r2l
buffer_overflow|u2r
rootkit|u2r
loadmodule|u2r
perl|u2r

## 2. Load Data

In [None]:
# Read only the nine intrinsic attributes (columns 0-8) and the class label (column 41).
data = pd.read_csv(
    './data/KDD/KDDTrain+.txt',
    header=None,
    usecols=list(range(9)) + [41]
)

In [None]:
# Name the loaded columns: the nine intrinsic attributes followed by the class label.
data.columns = [
    "duration", "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent",
    "class_attack",
]

In [None]:
data.head()

In [None]:
# Feature frame: every intrinsic attribute, leaving the class label out.
trainDS = data.loc[:, [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
    'dst_bytes', 'land', 'wrong_fragment', 'urgent',
]]

In [None]:
trainDS.head()

In [None]:
# Keep the labels in their own one-column frame, stored with the categorical dtype.
dataLabels = data['class_attack'].astype('category').to_frame()

In [None]:
dataLabels.head()

## 3. Data Preparation

### 3.1 Encoding categorical features

In [None]:
# import libraries
import sklearn.preprocessing as pp

**Encoding protocol_type**

In [None]:
trainDS.protocol_type.unique()

In [None]:
# Binarize protocol_type against its distinct values (in order of first appearance)
# and name each resulting indicator column "is_<value>".
protocol_type_bin = pp.label_binarize(
    trainDS['protocol_type'],
    classes=trainDS['protocol_type'].unique())
protocol_type_DataFrame = pd.DataFrame(
    protocol_type_bin,
    columns=['is_%s' % value for value in trainDS['protocol_type'].unique()])

**Encoding service**

In [None]:
trainDS.service.unique()

In [None]:
# Binarize service against its distinct values (in order of first appearance)
# and name each resulting indicator column "is_<value>".
service_bin = pp.label_binarize(
    trainDS['service'],
    classes=trainDS['service'].unique())
service_DataFrame = pd.DataFrame(
    service_bin,
    columns=['is_%s' % value for value in trainDS['service'].unique()])

**Encoding flag**

In [None]:
trainDS.flag.unique()

In [None]:
# Binarize flag against its distinct values (in order of first appearance)
# and name each resulting indicator column "is_<value>".
flag_bin = pp.label_binarize(
    trainDS['flag'],
    classes=trainDS['flag'].unique())
flag_DataFrame = pd.DataFrame(
    flag_bin,
    columns=['is_%s' % value for value in trainDS['flag'].unique()])

**Concatenating the whole data set**

In [None]:
# Put the indicator columns side by side with the original features (axis=1 joins columns).
trainDS = pd.concat(
    [trainDS, protocol_type_DataFrame, service_DataFrame, flag_DataFrame],
    axis=1)


**Selecting only numeric features**

In [None]:
# Model inputs: the six raw numeric attributes plus every "is_*" indicator column.
continuousCols = ["duration", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent"]
continuousCols.extend(name for name in trainDS.columns if name.startswith("is_"))
trainDS = trainDS[continuousCols]

### 3.2 Input Normalization

In [None]:
# Fit a min-max scaler (default settings) on the numeric feature frame.
scaler = pp.MinMaxScaler()
scaler.fit(trainDS);

In [None]:
# Apply the fitted scaler and wrap the result in a DataFrame so the column names are kept.
trainDS_scaled = pd.DataFrame(
    data=scaler.transform(trainDS),
    columns=continuousCols)

In [None]:
trainDS_scaled.describe()

### 3.3 Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Number of input features = number of columns of the scaled frame.
n_features = len(trainDS_scaled.columns)

In [None]:
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Total number of features: %d" % n_features)

In [None]:
# Fit a PCA that keeps as many components as there are features,
# so the whole explained-variance spectrum can be inspected below.
pca = PCA(whiten=False, n_components=n_features)
pca.fit(trainDS_scaled)

In [None]:
# Cumulative explained-variance ratio: entry k-1 is the share explained by the first k components
np.cumsum(pca.explained_variance_ratio_)

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Share of variance left unexplained after keeping the first k components, drawn as steps.
plt.plot(1 - np.cumsum(pca.explained_variance_ratio_), drawstyle='steps-post')
plt.title('PCA Reconstruction Error');

In [None]:
# Entry k-1 of the cumulative sum is the variance explained by the first k components,
# so `1 - cumsum` is the residual (reconstruction) error for k = 1, 2, ...
# Counting the entries still above 10% gives the largest k that is NOT good enough;
# one more component is needed to bring the error down to 10% or less.
# The result is capped at the number of components the fitted PCA actually has.
n_factors = min(
    int(np.sum(1 - np.cumsum(pca.explained_variance_ratio_) > 0.10)) + 1,
    len(pca.explained_variance_ratio_))
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Number of factors with 10% of reonstraction Error:  " + str(n_factors))

In [None]:
# Refit the PCA keeping only the number of components selected above.
pca = PCA(n_components=n_factors)
pca.fit(X=trainDS_scaled)

In [None]:
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Explained Variance Ratio")
# Total share of variance captured by the retained components (shown as the cell output).
pca.explained_variance_ratio_.sum()

In [None]:
trainDS_pca = pca.transform(trainDS_scaled)

### 3.4 Labels categories

In [None]:
# Lookup from each raw class_attack label to its attack family.
# The families are listed once, each with the labels that belong to it.
categories = {
    attack: family
    for family, attacks in [
        ('dos', ['smurf', 'neptune', 'back', 'teardrop', 'pod', 'land']),
        ('normal', ['normal']),
        ('probe', ['satan', 'ipsweep', 'portsweep', 'nmap']),
        ('r2l', ['warezclient', 'guess_passwd', 'warezmaster', 'imap',
                 'ftp_write', 'multihop', 'phf', 'spy']),
        ('u2r', ['buffer_overflow', 'rootkit', 'loadmodule', 'perl']),
    ]
    for attack in attacks
}

In [None]:
# Attack family of every row, looked up from the raw class_attack label.
dataLabels['category_attack'] = pd.Categorical(dataLabels["class_attack"].map(categories))
# True for attacks, False for normal traffic. The comparison must be `!=`:
# testing `== 'normal'` would flag the normal connections as attacks.
dataLabels['is_attack'] = pd.Categorical(dataLabels["class_attack"] != 'normal')

In [None]:
dataLabels.head()

In [None]:
dataLabels.describe()

## 4. Modeling

## 4.1 Cluster metrics

Two desirable objectives for any cluster assignment:
* **homogeneity**: each cluster contains only members of a single class.
* **completeness**: all members of a given class are assigned to the same cluster.

The main cluster metrics are:

* **Homogeneity Score**: A clustering result satisfies homogeneity if all of its clusters contain only data points which are members of a single class.
    * Bounded scores: 0.0 is as bad as it can be, 1.0 is a perfect score

* **Completeness Score**: A clustering result satisfies completeness if all the data points that are members of a given class are elements of the same cluster.
    * Bounded scores: 0.0 is as bad as it can be, 1.0 is a perfect score

* **V measure Scores** : the harmonic mean between homogeneity and completeness: v = 2 * (homogeneity * completeness) / (homogeneity + completeness)
    * Bounded scores: 0.0 is as bad as it can be, 1.0 is a perfect score

* **Adjusted Rand index**: is a function that measures the similarity of the two assignments, ignoring permutations and with chance normalization
    * Bounded range [-1, 1]: negative values are bad (independent labelings), similar clusterings have a positive ARI, 1.0 is the perfect match score.

## 4.2 K-means

* **Parameters**: number of clusters
* **Scalability**:	Very large n_samples, medium n_clusters
* **Usecase**:	General-purpose, even cluster size, flat geometry, not too many clusters
* **Geometry (metric used)**: Distances between points
		 	

In [None]:
from sklearn.cluster import KMeans
from sklearn import metrics

### 4.2.1 Random centroid initialization (5 clusters)

In [None]:
n_Clusters = 5
# random_state pins the random centroid initialisation so that the metrics below
# are the same after every "Restart & Run All".
random_K_means = KMeans(init='random', n_clusters=n_Clusters, n_init=10, random_state=0)

In [None]:
%time random_K_means.fit(trainDS_pca)

In [None]:
# scikit-learn's clustering scores take (labels_true, labels_pred). Homogeneity and
# completeness are mirror images of each other, so passing the cluster labels first
# would report each one under the other's name. The ground truth therefore goes first.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Cluster metrics")
print("Homogeneity Score: %s" % metrics.homogeneity_score(dataLabels.category_attack, random_K_means.labels_))
print("Completeness Score: %s" % metrics.completeness_score(dataLabels.category_attack, random_K_means.labels_))
print("V measure score: %s" % metrics.v_measure_score(dataLabels.category_attack, random_K_means.labels_))
print("Adjusted Rand Index: %s" % metrics.adjusted_rand_score(dataLabels.category_attack, random_K_means.labels_))

#### Visualize the results on PCA-reduced data

In [None]:
# Visualize the results on PCA-reduced data

# Project onto two whitened components for plotting. The projector gets its own name
# so that the `pca` model fitted during data preparation is not overwritten.
pca_2d = PCA(n_components=2, whiten=True)
DS_projected = pca_2d.fit_transform(trainDS_pca)

n_Clusters = 5
# random_state pins the random initialisation so the plot is the same on every run.
random_K_means_2 = KMeans(init='random', n_clusters=n_Clusters, n_init=10, random_state=0)
# Only the fitted model is needed afterwards (labels_ / cluster_centers_).
random_K_means_2.fit(DS_projected);

In [None]:
colors = "rgbcm"

fig, axes = plt.subplots(ncols=2, figsize=(14, 7))
ax1, ax2 = axes

# Left panel: K-means assignment, one colour per cluster, with its centroid drawn as an X
for cluster_id, color in zip(set(random_K_means_2.labels_), colors):
    in_cluster = random_K_means_2.labels_ == cluster_id
    ax1.scatter(DS_projected[in_cluster, 0], DS_projected[in_cluster, 1],
                c=color, s=10, label=cluster_id, alpha=0.5, edgecolors='none')
    center_x, center_y = random_K_means_2.cluster_centers_[cluster_id, :2]
    ax1.scatter(center_x, center_y, marker='x', s=169, linewidths=3,
                color=color, zorder=10)
ax1.set_title('K-mean prediction - Random centroid initialization (5 clusters)')
ax1.legend()

# Right panel: the same points coloured by their attack category
for category, color in zip(set(dataLabels.category_attack), colors):
    in_category = (dataLabels.category_attack == category).values
    ax2.scatter(DS_projected[in_category, 0], DS_projected[in_category, 1],
                c=color, s=10, label=category, alpha=0.5, edgecolors='none')

ax2.set_title('Category Attacks (5 categories)')
ax2.legend();

#### Visualize the results on PCA-reduced data 3D

In [None]:
# Project onto three whitened components for the 3-D plot. The projector gets its own
# name so that the `pca` model fitted during data preparation is not overwritten.
pca_3d = PCA(n_components=3, whiten=True)
DS_projected = pca_3d.fit_transform(trainDS_pca)

n_Clusters = 5
# random_state pins the random initialisation so the plot is the same on every run.
random_K_means_3 = KMeans(init='random', n_clusters=n_Clusters, n_init=15, random_state=0)
# Only the fitted model is needed afterwards (labels_ / cluster_centers_).
random_K_means_3.fit(DS_projected);

In [None]:
from mpl_toolkits.mplot3d import Axes3D  # kept: older Matplotlib needs it to register '3d'
colors =  "rgbcm"
fig = plt.figure(figsize=(8,8))
# Figure.gca() no longer accepts keyword arguments (removed in Matplotlib 3.6);
# add_subplot creates the 3-D axes on both old and new versions.
ax = fig.add_subplot(111, projection='3d')

# Plot the clusters
for i, color in zip(set(random_K_means_3.labels_), colors):
    idx = np.where( random_K_means_3.labels_== i)
    ax.scatter(DS_projected[idx, 0], DS_projected[idx, 1], DS_projected[idx, 2], c=color, label=i, s=50, alpha = 0.3,
               edgecolors='none')
    # Plot the centroids as X
    ax.scatter(random_K_means_3.cluster_centers_[i, 0], random_K_means_3.cluster_centers_[i, 1],
                random_K_means_3.cluster_centers_[i, 2], marker='x', linewidths=3, s = 150,
                color=color, zorder=10)
ax.set_title('K-mean prediction - Random centroid initialization (5 clusters)')
ax.legend();

#### Exercise 1: 
Repeat the experiment with the K-means++ cluster initialization and compare the results

### 4.2.2 K-means++ centroid initialization (5 clusters)

### 4.2.3 Random centroid initialization (2 clusters)

In [None]:
n_Clusters = 2
# random_state pins the random centroid initialisation so that the metrics below
# are the same after every "Restart & Run All".
random_K_means = KMeans(init='random', n_clusters=n_Clusters, n_init=15, random_state=0)

In [None]:
%time random_K_means.fit(trainDS_pca)

In [None]:
# scikit-learn's clustering scores take (labels_true, labels_pred). Homogeneity and
# completeness are mirror images of each other, so passing the cluster labels first
# would report each one under the other's name. The ground truth therefore goes first.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Cluster metrics")
print("Homogeneity Score: %s" % metrics.homogeneity_score(dataLabels.is_attack, random_K_means.labels_))
print("Completeness Score: %s" % metrics.completeness_score(dataLabels.is_attack, random_K_means.labels_))
print("V measure score: %s" % metrics.v_measure_score(dataLabels.is_attack, random_K_means.labels_))
print("Adjusted Rand Index: %s" % metrics.adjusted_rand_score(dataLabels.is_attack, random_K_means.labels_))

#### Visualize the results on PCA-reduced data

In [None]:
# Project onto two whitened components for plotting. The projector gets its own name
# so that the `pca` model fitted during data preparation is not overwritten.
pca_2d = PCA(n_components=2, whiten=True)
DS_projected = pca_2d.fit_transform(trainDS_pca)

n_Clusters = 2
# random_state pins the random initialisation so the plot is the same on every run.
random_K_means_2 = KMeans(init='random', n_clusters=n_Clusters, n_init=15, random_state=0)
# Only the fitted model is needed afterwards (labels_ / cluster_centers_).
random_K_means_2.fit(DS_projected);

In [None]:
colors = "br"

fig, axes = plt.subplots(ncols=2, figsize=(14, 7))
ax1, ax2 = axes

# Left panel: K-means assignment, one colour per cluster, with its centroid drawn as an X
for cluster_id, color in zip(set(random_K_means_2.labels_), colors):
    in_cluster = random_K_means_2.labels_ == cluster_id
    ax1.scatter(DS_projected[in_cluster, 0], DS_projected[in_cluster, 1],
                c=color, s=10, label=cluster_id, alpha=0.3, edgecolors='none')
    center_x, center_y = random_K_means_2.cluster_centers_[cluster_id, :2]
    ax1.scatter(center_x, center_y, marker='x', s=169, linewidths=3,
                color=color, zorder=10)
ax1.set_title('K-mean prediction - Random centroid initialization (2 clusters)')
ax1.legend()

# Right panel: the same points coloured by the value of the is_attack label
for flag_value, color in zip(set(dataLabels.is_attack), colors):
    has_value = (dataLabels.is_attack == flag_value).values
    ax2.scatter(DS_projected[has_value, 0], DS_projected[has_value, 1],
                c=color, s=10, label=flag_value, alpha=0.3, edgecolors='none')

ax2.set_title('Category Attacks (2 categories)')
ax2.legend();

#### Exercise 2: 
Repeat the experiment with the K-means++ cluster initialization and compare the results

### 4.2.4 K-means++ centroid initialization (2 clusters)

### 4.2.5 Find the best number of Clusters

In [None]:
def getK_meansMesures(initArg = 'random', n_clusters = 2, labels = None, data = None,
                      random_state = None):
    """Fit K-means on ``data`` and score the clustering against ``labels``.

    Parameters
    ----------
    initArg : value passed to ``KMeans`` as ``init``.
    n_clusters : number of clusters to fit.
    labels : ground-truth class of every sample in ``data``.
    data : samples to cluster.
    random_state : forwarded to ``KMeans``. The default ``None`` keeps the
        original unseeded behaviour; pass an integer for repeatable scores.

    Returns
    -------
    list
        ``[n_clusters, homogeneity, completeness, v_measure, adjusted_rand]``.
    """
    model = KMeans(init=initArg, n_clusters=n_clusters, n_init=10,
                   random_state=random_state)
    model.fit(data)
    # scikit-learn's scores take (labels_true, labels_pred). Homogeneity and
    # completeness swap roles when the two are exchanged, so the ground truth
    # must come first for each score to match its name.
    return [n_clusters,
            metrics.homogeneity_score(labels, model.labels_),
            metrics.completeness_score(labels, model.labels_),
            metrics.v_measure_score(labels, model.labels_),
            metrics.adjusted_rand_score(labels, model.labels_)]

#### Class Attack (23 categories)

In [None]:
# One row of scores per cluster count from 2 to 23, evaluated against the raw attack classes.
measures = np.array([
    getK_meansMesures(initArg='random', n_clusters=n_Clusters,
                      labels=dataLabels.class_attack, data=trainDS_pca)
    for n_Clusters in range(2, 24)
])

In [None]:
plt.figure(figsize=(15, 5))
# Column 0 of `measures` is the cluster count; columns 1-4 are the four scores, in this order.
for column, score_name in enumerate(['Homogeneity', 'Completeness', 'V measure', 'Adjusted Rand'],
                                    start=1):
    plt.plot(measures[:, 0], measures[:, column], label=score_name)
plt.legend()
plt.ylim(0, 1)
plt.title("Cluster measures")
plt.xlabel("Number of clusters")
plt.ylabel("Score");

#### Category Attack (5 categories)

In [None]:
# One row of scores per cluster count from 2 to 12, evaluated against the attack categories.
measures = np.array([
    getK_meansMesures(initArg='random', n_clusters=n_Clusters,
                      labels=dataLabels.category_attack, data=trainDS_pca)
    for n_Clusters in range(2, 13)
])

In [None]:
plt.figure(figsize=(9, 5))
# Column 0 of `measures` is the cluster count; columns 1-4 are the four scores, in this order.
for column, score_name in enumerate(['Homogeneity', 'Completeness', 'V measure', 'Adjusted Rand'],
                                    start=1):
    plt.plot(measures[:, 0], measures[:, column], label=score_name)
plt.legend()
plt.ylim(0, 1)
plt.title("Cluster measures")
plt.xlabel("Number of clusters")
plt.ylabel("Score");

#### Exercise 3: 
Find the best number of clusters for the attribute *is_attack* (2 categories) and compare the results with the other experiments

#### Is Attack (2 categories)

## 4.3 DBSCAN

Density-based spatial clustering of applications with noise (DBSCAN) is a density-based data clustering algorithm: given a set of points in some space, it groups together points that are closely packed together (points with many nearby neighbors), marking as outliers points that lie alone in low-density regions (whose nearest neighbors are too far away). 

* **Parameters**: 
    * *eps*: The maximum distance between two samples for them to be considered as in the same neighborhood
    * *min_samples*: The number of samples (or total weight) in a neighborhood for a point to be considered as a core point. This includes the point itself.
* **Scalability**:	Very large n_samples, medium n_clusters
* **Usecase**:	Non-flat geometry, uneven cluster sizes
* **Geometry (metric used)**: Distances between nearest points
		 	

DBSCAN	neighborhood size	Very large n_samples, medium n_clusters	Non-flat geometry, uneven cluster sizes	Distances between nearest points


In [None]:
from sklearn.cluster import DBSCAN

In [None]:
db = DBSCAN(eps=0.5, min_samples=125)
db.fit(trainDS_pca)
# Outliers are labelled -1, which is not a cluster: drop it before counting.
n_clusters_ = len(set(db.labels_) - {-1})

In [None]:
# scikit-learn's clustering scores take (labels_true, labels_pred). Homogeneity and
# completeness are mirror images of each other, so passing the cluster labels first
# would report each one under the other's name. The ground truth therefore goes first.
# Single-argument print(...) produces the same output on Python 2 and Python 3.
print("Cluster metrics")
print("Number of clusters found: %s" % n_clusters_)
print("Number of outlayers: %s" % (db.labels_ == -1).sum())
print("Homogeneity Score: %s" % metrics.homogeneity_score(dataLabels.class_attack, db.labels_))
print("Completeness Score: %s" % metrics.completeness_score(dataLabels.class_attack, db.labels_))
print("V measure score: %s" % metrics.v_measure_score(dataLabels.class_attack, db.labels_))
print("Adjusted Rand Index: %s" % metrics.adjusted_rand_score(dataLabels.class_attack, db.labels_))

#### Visualize the results on PCA-reduced data

In [None]:
# Project onto two whitened components for plotting. The projector gets its own name
# so that the `pca` model fitted during data preparation is not overwritten.
pca_2d = PCA(n_components=2, whiten=True)
DS_projected = pca_2d.fit_transform(trainDS_pca)

# Re-run DBSCAN on the 2-D projection so that the clusters match what is plotted.
db = DBSCAN(eps=0.5, min_samples=125).fit(DS_projected)
n_clusters_ = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0) #outliers are labelled -1


In [None]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 7) )
ax1, ax2 = axes.ravel()

# Boolean mask that is True for the samples DBSCAN reported as core samples
core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True

unique_labels = set(db.labels_)
colors = plt.cm.Spectral(np.linspace(0, 1, len(unique_labels)))

for k, col in zip(unique_labels, colors):
    # Black is used for noise (label -1). Otherwise the RGBA row is turned into a
    # tuple so that Matplotlib always reads it as one colour.
    col = 'k' if k == -1 else tuple(col)

    class_member_mask = (db.labels_ == k)

    # Core samples of this cluster: large markers
    xy = DS_projected[class_member_mask & core_samples_mask]
    ax1.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=14)

    # Non-core samples of this cluster: small markers
    xy = DS_projected[class_member_mask & ~core_samples_mask]
    ax1.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=6)

ax1.set_title('Estimated clusters: %d & outlayers: %d' % (n_clusters_, sum(db.labels_==-1)))

colors = plt.cm.Spectral(np.linspace(0, 1, len(set(dataLabels.category_attack))))
for i, color in zip(set(dataLabels.category_attack), colors):
    idx = np.where(dataLabels.category_attack == i)
    # `color=` (not `c=`): a bare RGBA row given as `c` is ambiguous, because scatter
    # maps it as four data values whenever the group happens to contain four points.
    ax2.scatter(DS_projected[idx, 0], DS_projected[idx, 1], color=tuple(color), s = 30, label=i,
                alpha = 0.5, edgecolors='none')

ax2.set_title('Category Attacks (5 categories)')
ax2.legend();


plt.show()

#### Exercise 4: 
Repeat the experiment with *eps* = 1 and *min_samples* = 250 and compare the results with the last experiment

### 4.3.1 Find the best parameter *eps* (with min_samples = 125)

In [None]:
def getDBSCANMesures(eps = 0.5, min_samples = 100, labels = None, data = None ):
    """Fit DBSCAN on ``data`` and score the clustering against ``labels``.

    Parameters
    ----------
    eps : value passed to ``DBSCAN`` as ``eps``.
    min_samples : value passed to ``DBSCAN`` as ``min_samples``.
    labels : ground-truth class of every sample in ``data``.
    data : samples to cluster.

    Returns
    -------
    list
        ``[eps, min_samples, homogeneity, completeness, v_measure,
        adjusted_rand, n_clusters, n_outliers]``.
    """
    model = DBSCAN(eps = eps, min_samples = min_samples)
    model.fit(data)
    # Outliers are labelled -1 and do not count as a cluster.
    n_clusters_ = len(set(model.labels_) - {-1})
    # Vectorised count, instead of iterating the label array with the builtin sum().
    n_outlayers_ = int(np.sum(model.labels_ == -1))
    # scikit-learn's scores take (labels_true, labels_pred). Homogeneity and
    # completeness swap roles when the two are exchanged, so the ground truth
    # must come first for each score to match its name.
    return [eps, min_samples,
            metrics.homogeneity_score(labels, model.labels_),
            metrics.completeness_score(labels, model.labels_),
            metrics.v_measure_score(labels, model.labels_),
            metrics.adjusted_rand_score(labels, model.labels_),
            n_clusters_,
            n_outlayers_]

In [None]:
# One row of scores per eps value produced by np.arange(0.25, 1.26, 0.25), with min_samples fixed at 125.
eps_measures = np.array([
    getDBSCANMesures(eps=eps_, min_samples=125,
                     labels=dataLabels.class_attack, data=trainDS_pca)
    for eps_ in np.arange(0.25,1.26,0.25)
])

In [None]:
plt.figure(figsize=(9, 5))
# Column 0 of `eps_measures` is eps; columns 2-5 are the four scores, in this order.
for column, score_name in enumerate(['Homogeneity', 'Completeness', 'V measure', 'Adjusted Rand'],
                                    start=2):
    plt.plot(eps_measures[:, 0], eps_measures[:, column], label=score_name)
plt.legend()
plt.ylim(0, 1)
plt.title("Cluster measures")
plt.xlabel("eps")
plt.ylabel("Score");

### 4.3.2 Find the best parameter *min_samples* (with eps = 0.75)

In [None]:
# One row of scores per min_samples value (25, 75, ..., 475), with eps fixed at 0.75.
min_s_measures = np.array([
    getDBSCANMesures(eps=0.75, min_samples=min_samples_,
                     labels=dataLabels.class_attack, data=trainDS_pca)
    for min_samples_ in range(25, 501, 50)
])

In [None]:
plt.figure(figsize=(9, 5))
# Column 1 of `min_s_measures` is min_samples; columns 2-5 are the four scores, in this order.
for column, score_name in enumerate(['Homogeneity', 'Completeness', 'V measure', 'Adjusted Rand'],
                                    start=2):
    plt.plot(min_s_measures[:, 1], min_s_measures[:, column], label=score_name)
plt.legend()
plt.title("Cluster measures")
plt.xlabel("min_sample")
plt.ylim(0, 1)
plt.ylabel("Score");