In [None]:
%matplotlib inline

# KDD99 Supervised Learning - Model Performance

## 0. Libraries

In [None]:
import numpy as np
import pandas as pd

-----

## 1. Data Description

**Intrinsic attributes**

These attributes are extracted from the headers' area of the network packets.

Col|Feature name  | description |	type
---|--------------|-------------|------------
1  |duration 	  |length (number of seconds) of the connection |continuous
2  |protocol_type |type of the protocol, e.g. tcp, udp, etc. |discrete
3  |service 	  |network service on the destination, e.g., http, telnet, etc. |discrete
4  |flag 	      |normal or error status of the connection. The possible statuses are: SF, S0, S1, S2, S3, OTH, REJ, RSTO, RSTOS0, SH, RSTRH, SHR 	|discrete 
5  |src_bytes 	  |number of data bytes from source to destination 	|continuous
6  |dst_bytes 	  |number of data bytes from destination to source 	|continuous
7  |land 	      |1 if connection is from/to the same host/port; 0 otherwise 	|discrete
8  |wrong_fragment|sum of bad checksum packets in a connection 	|continuous
9  |urgent 	      |number of urgent packets. Urgent packets are packets with the urgent bit activated 	|continuous


**Class attribute**

The 42nd attribute is the ***class_attack*** attribute. It indicates the type of each connection: either normal or the name of a specific attack. It can take the following values: *anomaly, dict, dict_simple, eject, eject-fail, ffb, ffb_clear, format, format_clear, format-fail, ftp-write, guest, imap, land, load_clear, loadmodule, multihop, perl_clear, perlmagic, phf, rootkit, spy, syslog, teardrop, warez, warezclient, warezmaster, pod, back, ipsweep, neptune, nmap, portsweep, satan, smurf and normal*.

**Categories of class attribute**


class_attack |Category
-------|--------------
smurf| dos
neptune| dos
back| dos
teardrop| dos
pod| dos
land| dos
normal|normal
satan|probe
ipsweep|probe
portsweep|probe
nmap|probe
warezclient|r2l
guess_passwd|r2l
warezmaster|r2l
imap|r2l
ftp_write|r2l
multihop|r2l
phf|r2l
spy|r2l
buffer_overflow|u2r
rootkit|u2r
loadmodule|u2r
perl|u2r

----

## 2. Load Data

### 2.1 Loading File

In [None]:
# dtype to read each raw column as, keyed by 0-based column position.
# The file is read with header=None, so at parse time the columns are labelled by
# integer position; a dtype mapping keyed by feature name would not match any column.
column_dtypes = {0: 'float64',    # duration
                 1: 'object',     # protocol_type
                 2: 'object',     # service
                 3: 'object',     # flag
                 4: 'float64',    # src_bytes
                 5: 'float64',    # dst_bytes
                 6: 'object',     # land
                 7: 'float64',    # wrong_fragment
                 8: 'float64',    # urgent
                 41: 'object'}    # class_attack

# Only the nine intrinsic attributes and the class label (column 41) are loaded.
data = pd.read_csv('./data/KDD/KDDTrain+.txt', header=None,
                   usecols=sorted(column_dtypes),
                   dtype=column_dtypes)

In [None]:
data.columns=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
                 "wrong_fragment","urgent", "class_attack"]

In [None]:
data.protocol_type = data.protocol_type.astype('category')
data.service = data.service.astype('category')
data.flag = data.flag.astype('category')
data.class_attack = data.class_attack.astype('category')

In [None]:
data.head()

In [None]:
DS = pd.DataFrame(data[['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 
                        'wrong_fragment', 'urgent']])

In [None]:
DS.head()

In [None]:
dataLabels = pd.DataFrame(data['class_attack'], dtype='category')

In [None]:
dataLabels["is_normal"] = np.array(dataLabels.class_attack == 'normal',dtype='int')

In [None]:
dataLabels.head()

-----

## 3. Data Preparation

### 3.1 Encoding categorical features

In [None]:
# import libraries
import sklearn.preprocessing as pp

In [None]:
attack_class = list(set(dataLabels.class_attack.unique().tolist()))

In [None]:
print attack_class

In [None]:
lb_train_attack_class = pp.LabelBinarizer()
lb_train_attack_class.fit(attack_class)
lb_train_attack_class.transform(dataLabels.class_attack).shape

In [None]:
lb_train_attack_class.classes_.shape

In [None]:
train_attack_class_bin = lb_train_attack_class.transform(data.class_attack)

##pp.label_binarize(trainLabels.class_attack, 
                        #              classes = attack_class)
trainLabels_encoded = pd.DataFrame(train_attack_class_bin, 
                                       columns = ['is_'+x for x in attack_class])

**Encoding protocol_type**

In [None]:
protocol_type_class = list(set(DS.protocol_type.unique().tolist()))

In [None]:
print protocol_type_class

In [None]:
train_protocol_type_bin = pp.label_binarize(DS.protocol_type, 
                                      classes = protocol_type_class)
train_protocol_type_DataFrame = pd.DataFrame(train_protocol_type_bin, 
                                       columns = ['is_'+x for x in protocol_type_class])

**Encoding service**

In [None]:
service_class = list(set(DS.service.unique().tolist()))

In [None]:
print service_class

In [None]:
train_service_bin = pp.label_binarize(DS.service, 
                                      classes = service_class)
train_service_DataFrame = pd.DataFrame(train_service_bin, 
                                       columns = ['is_'+x for x in service_class])

**Encoding flag**

In [None]:
flag_class = list(set(data.flag.unique().tolist()))

In [None]:
print flag_class

In [None]:
train_flag_bin = pp.label_binarize(DS.flag, 
                                    classes = flag_class)
train_flag_DataFrame = pd.DataFrame(train_flag_bin, 
                                 columns = ['is_'+x for x in flag_class])

**Concatenating the whole data set**

In [None]:
# Column-wise join of the raw features with the three sets of indicator columns.
encoded_parts = [DS, train_protocol_type_DataFrame, train_service_DataFrame, train_flag_DataFrame]
DS_encoded = pd.concat(encoded_parts, axis = 1)


**Selecting only numeric features**

In [None]:
# Numeric raw features first, then every indicator column created by the encoders above.
numeric_base = ["duration","src_bytes","dst_bytes","land","wrong_fragment","urgent"]
indicator_cols = [name for name in DS_encoded.columns if name.startswith("is_")]
continuousCols_train = numeric_base + indicator_cols

# Keep only those columns and cast the whole frame to float64.
DS_encoded = pd.DataFrame(DS_encoded[continuousCols_train], dtype='float64')
print(DS_encoded.shape)

### 3.2 Input Normalization

#### 3.2.1 Training Data Set

In [None]:
scaler = pp.MinMaxScaler().fit(DS_encoded)

In [None]:
DS_scaled = pd.DataFrame(scaler.transform(DS_encoded), columns =  continuousCols_train)

In [None]:
DS_scaled.describe()

### 3.3 Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

In [None]:
n_features = DS_scaled.columns.size

In [None]:
print "Total number of features: %d" %n_features

In [None]:
pca = PCA(n_components=n_features, whiten=False)
pca.fit(DS_scaled)

In [None]:
#accum explained variance ration
pca.explained_variance_ratio_[0:].cumsum()

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Share of variance still unexplained after each additional component.
unexplained_variance = 1 - np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(12, 6))
plt.plot(unexplained_variance, drawstyle = 'steps-post')
plt.title('PCA Reconstruction Error');

In [None]:
# Unexplained variance after keeping the first k components, for k = 1 .. n_features.
residual_variance = 1 - np.cumsum(pca.explained_variance_ratio_)
# Every k whose residual is still above 10% is too few components, so the smallest k
# that reaches a residual of 10% or less is one more than the number of such entries
# (capped at the number of available components).
n_too_few = int(np.count_nonzero(residual_variance > 0.10))
n_factors = min(n_too_few + 1, residual_variance.size)
print(" ".join(["Number of factors with 10% of reonstraction Error: ", str(n_factors)]))

In [None]:
pca = PCA(n_components=n_factors)
pca.fit(DS_scaled)

In [None]:
print "Explained Variance Ratio"
sum(pca.explained_variance_ratio_)

In [None]:
DS_pca = pca.transform(DS_scaled)

### 3.4 Split Data Set: Training + Testing

In [None]:
from sklearn.cross_validation import StratifiedShuffleSplit

In [None]:
sss = StratifiedShuffleSplit(dataLabels.is_normal, n_iter = 1, 
                                 train_size = 0.1, 
                                 test_size = 0.066,
                                 random_state = np.random.random_integers(0,100000))

In [None]:
train_ix, test_ix = [(train_ix, test_ix) for train_ix, test_ix in sss][0]

In [None]:
trainDS = DS_pca[train_ix, :]
trainLabels = dataLabels.is_normal[train_ix]
testDS = DS_pca[test_ix, :]
testLabels = dataLabels.is_normal[test_ix]

In [None]:
print "Train Data Set shape: ", trainDS.shape
print "Train Label shape: ", trainLabels.shape

In [None]:
print "Test Data Set shape: ", testDS.shape
print "Test Label shape: ", testLabels.shape

-----

## 3. SVM Model Evaluation

### 3.1 Receiver operating Characteristic (ROC)

In [None]:
from sklearn import metrics

In [None]:
from sklearn.svm import SVC

In [None]:
def getMetrics(model = None, name = "", X_train = None, Y_train = None, X_test = None, Y_test = None):
    """Fit ``model`` on the training data and collect ROC statistics on the test data.

    Parameters
    ----------
    model : object with ``fit(X, y)`` whose result exposes ``decision_function(X)``.
        It is fitted in place.
    name : label stored under the ``"name"`` key.
    X_train, Y_train : passed to ``model.fit``.
    X_test : passed to ``decision_function`` to obtain the scores.
    Y_test : true labels passed to ``metrics.roc_curve`` together with the scores.

    Returns
    -------
    dict with the keys ``name``, ``model``, ``y_score``, ``fpr``, ``tpr``,
    ``thresholds`` and ``roc_auc``.
    """
    fitted = model.fit(X_train, Y_train)
    scores = fitted.decision_function(X_test)
    false_pos, true_pos, cutoffs = metrics.roc_curve(Y_test, scores)
    return {
        "name": name,
        "model": model,
        "y_score": scores,
        "fpr": false_pos,
        "tpr": true_pos,
        "thresholds": cutoffs,
        "roc_auc": metrics.auc(false_pos, true_pos),
    }
    

In [None]:
def plotROC(models = [], plt = None):
    """Draw the ROC curve of one or several evaluated models.

    Parameters
    ----------
    models : a single result dict or a list/tuple of them. Each dict must provide the
        keys ``fpr``, ``tpr``, ``name`` and ``roc_auc`` (as returned by ``getMetrics``).
    plt : object offering the pyplot drawing functions used below. When omitted,
        ``matplotlib.pyplot`` is used.

    Nothing is drawn when ``models`` is empty.
    """
    if plt is None:
        import matplotlib.pyplot as plt
    if not isinstance(models, (list, tuple)):
        models = [models]
    if not models:
        return
    for m in models:
        plt.plot(m["fpr"], m["tpr"], label='ROC curve %s (area = %0.2f)' % (m["name"],m["roc_auc"]))
    # The chance diagonal and the axes decoration do not depend on the model, so they
    # are drawn once after all curves rather than once per curve.
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    

**SVM with:**

* kernel = linear

* Penalty parameter C of the error term **C = 1e-1**


In [None]:
SVC_linear = getMetrics(model = SVC(C = 1e-1, kernel = 'linear'), 
                        name="SVC kernel 'linear'",
                        X_train = trainDS,
                        Y_train = trainLabels,
                        X_test = testDS,
                        Y_test = testLabels)

In [None]:
# Plot of a ROC curve
plt.figure(figsize=(7,7))
plotROC(SVC_linear, plt)
plt.show()

**Model SVC with:**

* kernel = rbf

* Penalty parameter C of the error term **C = 10**

* Kernel coefficient **gamma = 1e2**

In [None]:
SVC_RBF = getMetrics(model = SVC(C = 1e1, kernel = 'rbf', gamma = 1e2), 
                     name="SVC kernel 'rbt'",
                     X_train = trainDS,
                     Y_train = trainLabels,
                     X_test = testDS,
                     Y_test = testLabels)

In [None]:
# Plot of a ROC curve
plt.figure(figsize=(7,7))
plotROC([SVC_RBF, SVC_linear], plt)
plt.show()

**Model SVC with:**

* kernel = 'poly'

* Polynomial degree **degree = 2**

* Penalty parameter C of the error term **C = 1e1**

* Kernel coefficient **gamma = 1e-1**

In [None]:
SVC_Poly = getMetrics(model = SVC(C = 1e1, kernel = 'poly', gamma = 1e-1, degree = 2, probability=True), 
                     name="SVC kernel 'poly'",
                     X_train = trainDS,
                     Y_train = trainLabels,
                     X_test = testDS,
                     Y_test = testLabels)

In [None]:
# Plot of a ROC curve
plt.figure(figsize=(7,7))
plotROC([SVC_RBF, SVC_linear,SVC_Poly], plt)
plt.show()

### 3.2 Cumulative Response (Gain Charts)

Gain or lift is a measure of the effectiveness of a classification model calculated as the ratio between the results obtained with and without the model. Gain and lift charts are visual aids for evaluating performance of classification models. However, in contrast to the confusion matrix that evaluates models on the whole population, a gain or lift chart evaluates model performance in a portion of the population.

In [None]:
def getLiftTable(y_score = None , Y = None, nBins = 10):
    """Build the cumulative-response / lift table for one set of scores.

    Instances are ordered by ascending score. For each of ``nBins`` equally spaced
    cut points the table reports which share of the instances lies at or below the
    cut and which share of all ``target == 0`` instances has been reached by then.

    Parameters
    ----------
    y_score : 1-D sequence of scores, one per instance.
    Y : 1-D sequence of targets, same length as ``y_score``; instances with value 0
        are the ones being counted.
    nBins : number of cut points (at least 1). When it exceeds the number of
        instances, one cut point per instance is used.

    Returns
    -------
    pandas.DataFrame with the columns
        ``%count``  - percentage of instances covered (ends at 100),
        ``%target`` - percentage of the ``target == 0`` instances covered
                      (NaN when there is no such instance),
        ``lift``    - ``%target / %count``.
    An empty frame with these columns is returned for empty input.

    Raises
    ------
    TypeError  if ``y_score`` or ``Y`` is None.
    ValueError if the two sequences differ in length or ``nBins`` < 1.
    """
    if y_score is None or Y is None:
        raise TypeError("y_score and Y are both required")
    scores = np.asarray(y_score, dtype=float).ravel()
    targets = np.asarray(Y).ravel()
    if scores.shape[0] != targets.shape[0]:
        raise ValueError("y_score and Y must have the same length")
    if nBins < 1:
        raise ValueError("nBins must be at least 1")

    total = scores.shape[0]
    if total == 0:
        return pd.DataFrame(columns=["%count", "%target", "lift"])

    # Row counts at which the table is sampled: strictly increasing, last one == total,
    # so the x axis is derived from the same cut points as the y axis.
    bins = min(int(nBins), total)
    cuts = (np.arange(1, bins + 1) * total) // bins

    # Stable ascending sort, then running count of target == 0 instances.
    order = np.argsort(scores, kind="mergesort")
    reached = np.cumsum(targets[order] == 0)
    n_targets = reached[-1]

    pct_count = 100.0 * cuts / total
    if n_targets == 0:
        pct_target = np.full(bins, np.nan)
    else:
        pct_target = 100.0 * reached[cuts - 1] / n_targets

    df = pd.DataFrame({"%count": pct_count, "%target": pct_target},
                      columns=["%count", "%target"])
    df["lift"] = df["%target"] / df["%count"]
    return df

In [None]:
def plotCumulativeResponse(models = [], plt = None, Y = None, nBins = 100):
    """Plot the cumulative response (gain) curve of one or several evaluated models.

    Parameters
    ----------
    models : a single result dict or a list/tuple of them; each must provide the keys
        ``y_score`` and ``name``.
    plt : object offering the pyplot drawing functions used below. When omitted,
        ``matplotlib.pyplot`` is used.
    Y : true targets matching each model's ``y_score``. When omitted, the
        notebook-level ``testLabels`` is used, as before.
    nBins : number of points per curve, forwarded to ``getLiftTable`` (default 100).
    """
    if plt is None:
        import matplotlib.pyplot as plt
    if Y is None:
        # Backward-compatible default: labels of the notebook's test split.
        Y = testLabels
    if not isinstance(models, (list, tuple)):
        models = [models]
    for m in models:
        liftTable = getLiftTable(y_score=m["y_score"], Y = Y, nBins = nBins)
        plt.plot(liftTable["%count"],liftTable["%target"],
                 label = '%s' % m["name"])
    # Reference line of a model that ranks instances at random.
    plt.plot([0, 100], [0, 100], 'k--', label = "Random")
    plt.xlim([0.0, 100.0])
    plt.ylim([0.0, 105.0])
    plt.xlabel('Percentage of test instances (decresing by score)')
    plt.ylabel('Percentage of positives targeted')
    plt.title('Cumulative response of classifiers')
    plt.legend(loc="upper left")
    plt.grid()
    

In [None]:
# Cumulative response of the three SVM variants on one figure.
gain_models = [SVC_RBF, SVC_linear, SVC_Poly]
plt.figure(figsize=(10,7))
plotCumulativeResponse(gain_models, plt)

### 3.3 Lift Chart

The lift chart shows how much more likely we are to receive positive responses than if we contact a random sample of customers.

In [None]:
def plotLift(models = [], plt = None, Y = None, nBins = 100):
    """Plot the lift curve of one or several evaluated models.

    Parameters
    ----------
    models : a single result dict or a list/tuple of them; each must provide the keys
        ``y_score`` and ``name``.
    plt : object offering the pyplot drawing functions used below. When omitted,
        ``matplotlib.pyplot`` is used.
    Y : true targets matching each model's ``y_score``. When omitted, the
        notebook-level ``testLabels`` is used, as before.
    nBins : number of points per curve, forwarded to ``getLiftTable`` (default 100).
    """
    if plt is None:
        import matplotlib.pyplot as plt
    if Y is None:
        # Backward-compatible default: labels of the notebook's test split.
        Y = testLabels
    if not isinstance(models, (list, tuple)):
        models = [models]
    for m in models:
        liftTable = getLiftTable(y_score=m["y_score"], Y = Y, nBins = nBins)
        plt.plot(liftTable["%count"],liftTable["lift"],
                 label = 'Lift %s' % m["name"])
    # A model that ranks instances at random has a lift of 1 everywhere.
    plt.plot([0, 100], [1, 1], 'k--', label = "Random")
    plt.xlim([0.0, 100.0])
    plt.ylim([0.0, 3.0])
    plt.xlabel('Percentage of test instances (decresing by score)')
    plt.ylabel('Lift')
    plt.title('Lift of classifiers')
    plt.legend(loc="upper right")
    plt.grid()
    

In [None]:
# Lift of the three SVM variants on one figure.
lift_models = [SVC_RBF, SVC_linear, SVC_Poly]
plt.figure(figsize=(10,7))
plotLift(lift_models, plt)