In [None]:
%matplotlib inline

# KDD99 Supervised Learning

## 0. Libraries

In [None]:
import os

import numpy as np
import pandas as pd

-----

## 1. Data Description

**Intrinsic attributes**

These attributes are extracted from the headers' area of the network packets.

Col|Feature name  | description |	type
---|--------------|-------------|------------
1  |duration 	  |length (number of seconds) of the connection |continuous
2  |protocol_type |type of the protocol, e.g. tcp, udp, etc. |discrete
3  |service 	  |network service on the destination, e.g., http, telnet, etc. |discrete
4  |flag 	      |normal or error status of the connection. The possible statuses are: SF, S0, S1, S2, S3, OTH, REJ, RSTO, RSTOS0, SH, RSTRH, SHR 	|discrete
5  |src_bytes 	  |number of data bytes from source to destination 	|continuous
6  |dst_bytes 	  |number of data bytes from destination to source 	|continuous
7  |land 	      |1 if connection is from/to the same host/port; 0 otherwise 	|discrete
8  |wrong_fragment|sum of bad checksum packets in a connection 	|continuous
9  |urgent 	      |number of urgent packets. Urgent packets are packets with the urgent bit activated 	|continuous


**Class attribute**

The 42nd attribute is the ***class_attack*** attribute. It indicates the type of connection each instance represents: either normal traffic or a specific attack. The values it can take are the following: *anomaly, dict, dict_simple, eject, eject-fail, ffb, ffb_clear, format, format_clear, format-fail, ftp-write, guest, imap, land, load_clear, loadmodule, multihop, perl_clear, perlmagic, phf, rootkit, spy, syslog, teardrop, warez, warezclient, warezmaster, pod, back, ipsweep, neptune, nmap, portsweep, satan, smurf and normal*.

**Categories of the class attribute**


class_attack |Category
-------|--------------
smurf| dos
neptune| dos
back| dos
teardrop| dos
pod| dos
land| dos
normal|normal
satan|probe
ipsweep|probe
portsweep|probe
nmap|probe
warezclient|r2l
guess_passwd|r2l
warezmaster|r2l
imap|r2l
ftp_write|r2l
multihop|r2l
phf|r2l
spy|r2l
buffer_overflow|u2r
rootkit|u2r
loadmodule|u2r
perl|u2r

----

## 2. Load Data

### 2.1 Loading Training Data

In [None]:
# Load the nine intrinsic attributes plus the class label (file column 41).
# The file has no header row (header=None), so at parse time the columns are
# identified by position; the dtype mapping is therefore keyed by position.
# Keying it by feature name, as before, matched no column at all.
trainingData = pd.read_csv('./data/KDD/KDDTrain+.txt', header=None, usecols=[0,1,2,3,4,5,6,7,8,41], 
                   dtype = {0: 'float64',   # duration
                            1: 'object',    # protocol_type
                            2: 'object',    # service
                            3: 'object',    # flag
                            4: 'float64',   # src_bytes
                            5: 'float64',   # dst_bytes
                            6: 'object',    # land
                            7: 'float64',   # wrong_fragment
                            8: 'float64',   # urgent
                            41: 'object'})  # class_attack

In [None]:
trainingData.columns=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
                 "wrong_fragment","urgent", "class_attack"]

In [None]:
# Store the three nominal features and the class label as pandas categoricals.
for column in ['protocol_type', 'service', 'flag', 'class_attack']:
    trainingData[column] = trainingData[column].astype('category')

In [None]:
trainingData.head()

In [None]:
trainDS = trainingData[['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 
         'wrong_fragment', 'urgent']]

In [None]:
trainDS.head()

In [None]:
trainLabels = pd.DataFrame(trainingData['class_attack'], dtype='category')

In [None]:
trainLabels["is_normal"] = np.array(trainLabels.class_attack == 'normal',dtype='int')

In [None]:
trainLabels.head()

In [None]:
trainLabels.shape

### 2.2 Loading Testing Data

In [None]:
# Load the nine intrinsic attributes plus the class label (file column 41).
# The file has no header row (header=None), so at parse time the columns are
# identified by position; the dtype mapping is therefore keyed by position.
# Keying it by feature name, as before, matched no column at all.
testData = pd.read_csv('./data/KDD/KDDTest+.txt', header=None, usecols=[0,1,2,3,4,5,6,7,8,41],
                   dtype = {0: 'float64',   # duration
                            1: 'object',    # protocol_type
                            2: 'object',    # service
                            3: 'object',    # flag
                            4: 'float64',   # src_bytes
                            5: 'float64',   # dst_bytes
                            6: 'object',    # land
                            7: 'float64',   # wrong_fragment
                            8: 'float64',   # urgent
                            41: 'object'})  # class_attack

In [None]:
testData.columns=["duration","protocol_type","service","flag","src_bytes","dst_bytes","land",
                 "wrong_fragment","urgent", "class_attack"]

In [None]:
# Store the three nominal features and the class label as pandas categoricals.
for column in ['protocol_type', 'service', 'flag', 'class_attack']:
    testData[column] = testData[column].astype('category')

In [None]:
testData.head()

In [None]:
testDS = testData[['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 
         'wrong_fragment', 'urgent']]

In [None]:
testDS.head()

In [None]:
testLabels = pd.DataFrame(testData['class_attack'], dtype='category')

In [None]:
testLabels["is_normal"] = np.array(testLabels.class_attack == 'normal',dtype='int')

In [None]:
testLabels.head()

In [None]:
testLabels.shape

-----

## 3. Data Preparation

### 3.1 Encoding categorical features

In [None]:
# import libraries
import sklearn.preprocessing as pp

In [None]:
# Union of the labels seen in either split. Sorted so the order is reproducible
# and agrees with the sorted class order LabelBinarizer uses for its columns
# (plain set iteration order is arbitrary).
attack_class = sorted(set(trainLabels.class_attack.unique().tolist() +
                          testLabels.class_attack.unique().tolist()))

In [None]:
print attack_class

In [None]:
lb_train_attack_class = pp.LabelBinarizer()
lb_train_attack_class.fit(attack_class)
lb_train_attack_class.transform(trainLabels.class_attack).shape

In [None]:
lb_train_attack_class.classes_.shape

In [None]:
# One indicator column per attack class for the training labels.
train_attack_class_bin = lb_train_attack_class.transform(trainLabels.class_attack)

# The binarizer emits its columns in the order of `classes_`, which is not the
# arbitrary set order of `attack_class`; name the columns from `classes_` so
# every header sits on the column that actually encodes that class.
trainLabels_encoded = pd.DataFrame(train_attack_class_bin, 
                                   columns = ['is_'+x for x in lb_train_attack_class.classes_])

In [None]:
# One indicator column per attack class for the test labels, produced with the
# binarizer fitted on the union of train and test classes.
test_attack_class_bin = lb_train_attack_class.transform(testLabels.class_attack)

# The binarizer emits its columns in the order of `classes_`, which is not the
# arbitrary set order of `attack_class`; name the columns from `classes_` so
# every header sits on the column that actually encodes that class.
testLabels_encoded = pd.DataFrame(test_attack_class_bin, 
                                  columns = ['is_'+x for x in lb_train_attack_class.classes_])

**Encoding protocol_type**

In [None]:
# Every protocol seen in either split. Sorted so the one-hot column order is
# the same on every run (plain set iteration order is arbitrary).
protocol_type_class = sorted(set(trainDS.protocol_type.unique().tolist() +
                                 testDS.protocol_type.unique().tolist()))

In [None]:
print protocol_type_class

In [None]:
train_protocol_type_bin = pp.label_binarize(trainDS.protocol_type, 
                                      classes = protocol_type_class)
train_protocol_type_DataFrame = pd.DataFrame(train_protocol_type_bin, 
                                       columns = ['is_'+x for x in protocol_type_class])

In [None]:
test_protocol_type_bin = pp.label_binarize(testDS.protocol_type, 
                                      classes = protocol_type_class)
test_protocol_type_DataFrame = pd.DataFrame(test_protocol_type_bin, 
                                       columns = ['is_'+x for x in protocol_type_class])

**Encoding service**

In [None]:
# Every service seen in either split. Sorted so the one-hot column order is
# the same on every run (plain set iteration order is arbitrary).
service_class = sorted(set(trainDS.service.unique().tolist() +
                           testDS.service.unique().tolist()))

In [None]:
print service_class

In [None]:
train_service_bin = pp.label_binarize(trainDS.service, 
                                      classes = service_class)
train_service_DataFrame = pd.DataFrame(train_service_bin, 
                                       columns = ['is_'+x for x in service_class])

In [None]:

test_service_bin = pp.label_binarize(testDS.service, 
                                     classes = service_class)
test_service_DataFrame = pd.DataFrame(test_service_bin, 
                                      columns = ['is_'+x for x in service_class])

**Encoding flag**

In [None]:
# Every connection flag seen in either split. Sorted so the one-hot column
# order is the same on every run (plain set iteration order is arbitrary).
flag_class = sorted(set(trainDS.flag.unique().tolist() +
                        testDS.flag.unique().tolist()))

In [None]:
print flag_class

In [None]:
train_flag_bin = pp.label_binarize(trainDS.flag, 
                                    classes = flag_class)
train_flag_DataFrame = pd.DataFrame(train_flag_bin, 
                                 columns = ['is_'+x for x in flag_class])

In [None]:
test_flag_bin = pp.label_binarize(testDS.flag, 
                                  classes = flag_class)
test_flag_DataFrame = pd.DataFrame(test_flag_bin, 
                                   columns = ['is_'+x for x in flag_class])

**Concatenating the whole data set**

In [None]:
trainDS_encoded = pd.concat([trainDS, train_protocol_type_DataFrame, train_service_DataFrame, 
                     train_flag_DataFrame], axis = 1)


In [None]:
testDS_encoded = pd.concat([testDS, test_protocol_type_DataFrame, test_service_DataFrame, 
                     test_flag_DataFrame], axis = 1)



**Selecting only numeric features**

In [None]:
continuousCols_train = ["duration","src_bytes","dst_bytes","land","wrong_fragment","urgent"] + \
            [c for c in trainDS_encoded.columns if c.startswith("is_")]
trainDS_encoded = pd.DataFrame(trainDS_encoded[continuousCols_train], dtype='float64')
print trainDS_encoded.shape

In [None]:
continuousCols_test = ["duration","src_bytes","dst_bytes","land","wrong_fragment","urgent"] + \
            [c for c in testDS_encoded.columns if c.startswith("is_")]
testDS_encoded = pd.DataFrame(testDS_encoded[continuousCols_test], dtype='float64')
print testDS_encoded.shape

### 3.2 Input Normalization

#### 3.2.1 Training Data Set

In [None]:
scaler = pp.MinMaxScaler().fit(trainDS_encoded)

In [None]:
trainDS_scaled = pd.DataFrame(scaler.transform(trainDS_encoded), columns =  continuousCols_train)

In [None]:
trainDS_scaled.describe()

#### 3.2.2 Test Data Set

**WARNING**: The test set is scaled with the scaler fitted on *trainDS*

In [None]:
testDS_scaled = pd.DataFrame(scaler.transform(testDS_encoded), columns =  continuousCols_test)

In [None]:
testDS_scaled.describe()

### 3.3 Principal Component Analysis (PCA)

In [None]:
from sklearn.decomposition import PCA

In [None]:
n_features = trainDS_scaled.columns.size

In [None]:
print "Total number of features: %d" %n_features

In [None]:
pca = PCA(n_components=n_features, whiten=False)
pca.fit(trainDS_scaled)

In [None]:
#accum explained variance ration
pca.explained_variance_ratio_[0:].cumsum()

In [None]:
import matplotlib.pyplot as plt

In [None]:
plt.figure(figsize=(12, 6))
plt.plot(1 - pca.explained_variance_ratio_.cumsum(), drawstyle = 'steps-post')
plt.title('PCA Reconstruction Error');

In [None]:
# Residual (unexplained) variance after keeping the first k components.
residual_variance = 1 - pca.explained_variance_ratio_.cumsum()

# Count the leading components whose residual is still above 10%.
# NOTE(review): keeping exactly this many components leaves the residual just
# above 0.10; one more component is needed to reach <= 10% -- confirm intent.
n_factors = sum(residual_variance > 0.10)
print "Number of factors with 10% of reonstraction Error: ", n_factors

In [None]:
pca = PCA(n_components=n_factors)
pca.fit(trainDS_scaled)

In [None]:
print "Explained Variance Ratio"
sum(pca.explained_variance_ratio_)

In [None]:
trainDS_pca = pca.transform(trainDS_scaled)

**WARNING**: Applying the PCA fitted on *trainDS_scaled* to *testDS_scaled*

In [None]:
testDS_pca = pca.transform(testDS_scaled)

-----

## 4. Modeling

In [None]:
from sklearn import metrics

## 4.3 Support Vector Machine

* **Parameters**: 
   * *C* : Penalty parameter C of the error term
   * *gamma* : Kernel coefficient for ‘rbf’, ‘poly’ and ‘sigmoid’.
   * *degree* : Degree of the polynomial kernel function (‘poly’). Ignored by all other kernels.
   * *kernel* : ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’ or a callable 
* **Usecase**:	Classification
* **Complexity**: The core of an SVM is a quadratic programming problem (QP), separating support vectors from the rest of the training data.
		 	

In [None]:
from sklearn.svm import SVC

In [None]:
from sklearn.cross_validation import StratifiedShuffleSplit

In [None]:
from sklearn.externals import joblib

In [None]:
def getSVMMesures(C = 1, kernel = 'rbf', max_iter = 1e3, tol = 1e-3,
                  train_size = None, n_experiments = 1,
                  train_labels = None, train_data = None, gamma = 0.0, degree = 3, order = None):
    """Fit an SVC on repeated stratified shuffle splits and summarise its scores.

    A fresh ``SVC`` is fitted for every split and scored on that split's own
    training and held-out indices, so each score pair belongs to the model that
    produced it. (``fit`` returns the estimator itself, so refitting one shared
    instance would leave only the last fit available for scoring.)

    Parameters
    ----------
    C, kernel, max_iter, tol, gamma, degree
        Forwarded unchanged to ``SVC``.
    train_size : float
        Training size handed to ``StratifiedShuffleSplit``; the held-out size
        is ``0.2 * train_size``. Required.
    n_experiments : int
        Number of shuffle splits (``n_iter``).
    train_labels, train_data
        Labels and feature matrix. Both are converted with ``np.asarray`` and
        indexed with the positional indices produced by the splitter.
    order
        Opaque tag copied into the result (the notebook uses it as a run number).

    Returns
    -------
    list
        ``[C, kernel, max_iter, tol, train_size,
        mean/max/min training score, mean/max/min held-out score,
        order, gamma, degree, model]`` where ``model`` is the estimator fitted
        on the last split.

    Raises
    ------
    ValueError
        If ``train_size``, ``train_labels`` or ``train_data`` is missing.
    """
    if train_size is None:
        raise ValueError("train_size is required: the held-out size is derived from it")
    if train_labels is None or train_data is None:
        raise ValueError("train_labels and train_data are required")

    def _new_model():
        # Every split gets its own estimator with identical hyper-parameters.
        return SVC(C = C, gamma = gamma, kernel = kernel, degree = degree, tol = tol, max_iter = max_iter)

    # Plain arrays make the positional indices below independent of any pandas index.
    labels = np.asarray(train_labels)
    data = np.asarray(train_data)

    test_size = train_size * 0.2
    sss = StratifiedShuffleSplit(labels, n_iter = n_experiments, 
                                 train_size = train_size, 
                                 test_size = test_size,
                                 # same inclusive 0..100000 range as the deprecated random_integers
                                 random_state = np.random.randint(0, 100001))

    model = _new_model()
    scores = []
    for train_ix, test_ix in sss:
        model = _new_model()
        model.fit(data[train_ix, :], labels[train_ix])
        scores.append((model.score(data[train_ix, :], labels[train_ix]),
                       model.score(data[test_ix, :], labels[test_ix])))

    meanScores = np.mean(scores, axis = 0)
    maxScores = np.max(scores, axis = 0)
    minScores = np.min(scores, axis = 0)

    return [model.C,
            model.kernel,
            model.max_iter,
            model.tol,
            train_size,
            meanScores[0],# mean training-split score
            maxScores[0], # max training-split score
            minScores[0], # min training-split score
            meanScores[1],# mean held-out score
            maxScores[1], # max held-out score
            minScores[1], # min held-out score
            order,
            model.gamma,
            model.degree,
            model] # estimator fitted on the last split

### 4.3.1 SVM: Kernel linear

In [None]:
svmModel = SVC(C=1, kernel='linear', cache_size= 200, tol = 1e-3, max_iter = -1)
%time svmModel.fit(trainDS_pca, trainLabels.is_normal)

In [None]:
print "Mean accuracy on the given train data and labels: "
%time svmModel.score(trainDS_pca, trainLabels.is_normal)

In [None]:
print "Mean accuracy on the given test data and labels: "
%time svmModel.score(testDS_pca, testLabels.is_normal)

#### 4.3.1.1 Finding out the Training Set Size

#### WARNING: IT TAKES APPROX. 4 MIN

In [None]:
%%time 

# Linear kernel, C = 1: one result row per training fraction n*0.1 for
# n = 1..5, each summarising 10 stratified splits.
linear_size_rows = []
for n in range(1,6):
    linear_size_rows.append(getSVMMesures(C = 1, kernel = 'linear', train_size = n*0.1, 
                                          n_experiments = 10, train_labels = trainLabels.is_normal, 
                                          train_data = trainDS_pca))
SVM_Linear_measures_size = np.array(linear_size_rows)


In [None]:
# Left panel: training-split scores; right panel: held-out scores.
fig, axes = plt.subplots(ncols=2, figsize=(20, 10))
x_values = SVM_Linear_measures_size[:, 4]  # result column 4: train_size

# (axis, first score column, label suffix, title, y-label) for each panel
panels = [(axes[0], 5, 'in', 'SVM: select the size of training Set (kernel = linear). E_in', "Scores In"),
          (axes[1], 8, 'out', "SVM: select the size of training Set (kernel = linear). E_out", "Scores Out")]

for ax, first_col, suffix, title, ylabel in panels:
    # consecutive result columns hold the mean, max and min score
    for shift, (stat, colour) in enumerate([('mean', 'b'), ('max', 'g'), ('min', 'r')]):
        ax.plot(x_values, SVM_Linear_measures_size[:, first_col + shift],
                label = '%s Score %s' % (stat, suffix), c = colour)
    ax.legend(loc = 4)
    ax.grid()
    ax.set_title(title)
    ax.set_xlabel("% of Train Data Set")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.65, 0.97)


*Percentage of training set* **= 10%**

In [None]:
train_size = 0.1

#### 4.3.1.2 Finding out the best regularization parameter

In [None]:
Cs = [ 1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2 ]

In [None]:
%%time
# Linear kernel: one result row per candidate C, 6 stratified splits each.
linear_C_rows = []
for c in Cs:
    linear_C_rows.append(getSVMMesures(C = c, kernel = 'linear', 
                                       train_size = train_size, n_experiments = 6, 
                                       train_labels = trainLabels.is_normal, 
                                       train_data = trainDS_pca))
SVM_Linear_measures_C = np.array(linear_C_rows)

In [None]:
# Left panel: training-split scores; right panel: held-out scores.
fig, axes = plt.subplots(ncols=2, figsize=(20, 10))
x_values = SVM_Linear_measures_C[:, 0]  # result column 0: C

# (axis, first score column, label suffix, title, y-label) for each panel
panels = [(axes[0], 5, 'in', 'SVM: Regularization paramenter (kernel = linear). E_in', "Scores In"),
          (axes[1], 8, 'out', "SVM: Regularization parameter (kernel = linear). E_out", "Scores Out")]

for ax, first_col, suffix, title, ylabel in panels:
    # consecutive result columns hold the mean, max and min score
    for shift, (stat, colour) in enumerate([('mean', 'b'), ('max', 'g'), ('min', 'r')]):
        ax.plot(x_values, SVM_Linear_measures_C[:, first_col + shift],
                label = '%s Score %s' % (stat, suffix), c = colour)
    ax.legend(loc = 3)
    ax.set_xscale("log")
    ax.grid()
    ax.set_title(title)
    ax.set_xlabel("C: Regularization Parameter")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.4, 1)


*Penalty parameter C of the error term* **C = 1e-1**

New model with C = 1e-1

In [None]:
%%time
# Linear kernel with C = 1e-1: ten independent repetitions (tagged 0..9 via
# `order`), each summarising 6 stratified splits.
linear_rows = []
for n in range(10):
    linear_rows.append(getSVMMesures(C = 1e-1, kernel = 'linear', 
                                     train_size = train_size, 
                                     n_experiments = 6, 
                                     train_labels = trainLabels.is_normal, 
                                     train_data = trainDS_pca, order = n))
SVM_Linear_measures = np.array(linear_rows)


In [None]:
# Left panel: training-split scores; right panel: held-out scores.
fig, axes = plt.subplots(ncols=2, figsize=(20, 10))
x_values = SVM_Linear_measures[:, 11]  # result column 11: order (run number)

# (axis, first score column, label suffix, title, y-label) for each panel
panels = [(axes[0], 5, 'in', 'SVM: (kernel = linear, C = 1e-1). E_in', "Scores In"),
          (axes[1], 8, 'out', "SVM (kernel = linear, C = 1e-1). E_out", "Scores Out")]

for ax, first_col, suffix, title, ylabel in panels:
    # consecutive result columns hold the mean, max and min score
    for shift, (stat, colour) in enumerate([('mean', 'b'), ('max', 'g'), ('min', 'r')]):
        ax.plot(x_values, SVM_Linear_measures[:, first_col + shift],
                label = '%s Score %s' % (stat, suffix), c = colour)
    ax.legend(loc = 3)
    ax.grid()
    ax.set_title(title)
    ax.set_xlabel("Experimenet Number")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.9, 1)


#### 4.3.1.3 Saving the best SVM Linear model

In [None]:
# Pick the repetition with the highest mean held-out score (result column 8)
# rather than a fixed row read off an earlier plot: the splits are seeded
# randomly on every run, so a hard-coded row does not stay the best one.
# Result column 14 holds the fitted model.
best_run = int(np.argmax(SVM_Linear_measures[:, 8].astype('float64')))
svmModel = SVM_Linear_measures[best_run][14]

In [None]:
# Create the output directory together with any missing parent, and do nothing
# when it already exists, so the cell survives a re-run.
if not os.path.isdir('./models/SVM_Linear'):
    os.makedirs('./models/SVM_Linear')

In [None]:
joblib.dump(svmModel, './models/SVM_Linear/SVM_linear.pkl')

### 4.3.2 SVM: Radial Basis Function (RBF) kernel

#### WARNING: IT TAKES APPROX. 17 MIN

In [None]:
svmModel = SVC(C=1, kernel='rbf')
%time svmModel.fit(trainDS_pca, trainLabels.is_normal)

In [None]:
print "Mean accuracy on the given train data and labels: "
svmModel.score(trainDS_pca, trainLabels.is_normal)

In [None]:
print "Mean accuracy on the given test data and labels: "
svmModel.score(testDS_pca, testLabels.is_normal)

In [None]:
predicted = svmModel.predict(testDS_pca)

In [None]:
print "Confusion Matrix"
metrics.confusion_matrix(testLabels.is_normal, predicted)

#### 4.3.2.1 Finding out the Training Set Size

In [None]:
%%time 
# RBF kernel, C = 1: one result row per training fraction n*0.1 for
# n = 1..5, each summarising 6 stratified splits.
rbf_size_rows = []
for n in range(1,6):
    rbf_size_rows.append(getSVMMesures(C = 1, kernel = 'rbf', 
                                       train_size = n*0.1, n_experiments = 6, 
                                       train_labels = trainLabels.is_normal, 
                                       train_data = trainDS_pca))
SVM_RBF_measures_size = np.array(rbf_size_rows)


In [None]:
# Left panel: training-split scores; right panel: held-out scores.
fig, axes = plt.subplots(ncols=2, figsize=(20, 10))
x_values = SVM_RBF_measures_size[:, 4]  # result column 4: train_size

# (axis, first score column, label suffix, title, y-label) for each panel
panels = [(axes[0], 5, 'in', 'SVM: select the size of training Set (kernel = rbf). E_in', "Scores In"),
          (axes[1], 8, 'out', "SVM: select the size of training Set (kernel = rbf). E_out", "Scores Out")]

for ax, first_col, suffix, title, ylabel in panels:
    # consecutive result columns hold the mean, max and min score
    for shift, (stat, colour) in enumerate([('mean', 'b'), ('max', 'g'), ('min', 'r')]):
        ax.plot(x_values, SVM_RBF_measures_size[:, first_col + shift],
                label = '%s Score %s' % (stat, suffix), c = colour)
    ax.legend(loc = 1)
    ax.grid()
    ax.set_title(title)
    ax.set_xlabel("% of Train Data Set")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.7, 0.97)


*Percentage of training set* **= 10%**

In [None]:
train_size = 0.1

#### 4.3.2.2 Finding out the best regularization parameter

In [None]:
Cs = [ 1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2]

In [None]:
%%time 
# RBF kernel: one result row per candidate C, 6 stratified splits each.
rbf_C_rows = []
for c in Cs:
    rbf_C_rows.append(getSVMMesures(C = c, kernel = 'rbf', 
                                    train_size = train_size, n_experiments = 6, 
                                    train_labels = trainLabels.is_normal, 
                                    train_data = trainDS_pca))
SVM_RBF_measures_C = np.array(rbf_C_rows)


In [None]:
# Left panel: training-split scores; right panel: held-out scores.
fig, axes = plt.subplots(ncols=2, figsize=(20, 10))
x_values = SVM_RBF_measures_C[:, 0]  # result column 0: C

# (axis, first score column, label suffix, title, y-label) for each panel
panels = [(axes[0], 5, 'in', 'SVM: Regularization parameter (kernel = rbf). E_in', "Scores In"),
          (axes[1], 8, 'out', "SVM: Regularization parameter (kernel = rbf). E_out", "Scores Out")]

for ax, first_col, suffix, title, ylabel in panels:
    # consecutive result columns hold the mean, max and min score
    for shift, (stat, colour) in enumerate([('mean', 'b'), ('max', 'g'), ('min', 'r')]):
        ax.plot(x_values, SVM_RBF_measures_C[:, first_col + shift],
                label = '%s Score %s' % (stat, suffix), c = colour)
    ax.legend(loc = 3)
    ax.grid()
    ax.set_xscale("log")
    ax.set_title(title)
    ax.set_xlabel("C: Regularization Parameter")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.9, 1)


*Penalty parameter C of the error term* **C = 10**

#### 4.3.2.3 Finding out the best *gamma* with regularization parameter C = 10

In [None]:
gammas = [ 1e5, 1e4, 1e3, 1e2, 1e1, 1e0, 1e-1, 1e-2]

In [None]:
%%time 
# RBF kernel with C = 10: one result row per candidate gamma, 6 stratified
# splits each.
rbf_gamma_rows = []
for g in gammas:
    rbf_gamma_rows.append(getSVMMesures(C = 10, gamma = g, kernel = 'rbf', 
                                        train_size = train_size, n_experiments = 6, 
                                        train_labels = trainLabels.is_normal, 
                                        train_data = trainDS_pca))
SVM_RBF_measures_gamma = np.array(rbf_gamma_rows)


In [None]:
# Left panel: training-split scores; right panel: held-out scores.
fig, axes = plt.subplots(ncols=2, figsize=(20, 10))
x_values = SVM_RBF_measures_gamma[:, 12]  # result column 12: gamma

# (axis, first score column, label suffix, title, y-label) for each panel
panels = [(axes[0], 5, 'in', 'SVM: gamma (kernel = rbf). E_in', "Scores In"),
          (axes[1], 8, 'out', "SVM: gamma (kernel = rbf). E_out", "Scores Out")]

for ax, first_col, suffix, title, ylabel in panels:
    # consecutive result columns hold the mean, max and min score
    for shift, (stat, colour) in enumerate([('mean', 'b'), ('max', 'g'), ('min', 'r')]):
        ax.plot(x_values, SVM_RBF_measures_gamma[:, first_col + shift],
                label = '%s Score %s' % (stat, suffix), c = colour)
    ax.legend(loc = 3)
    ax.grid()
    ax.set_xscale("log")
    ax.set_title(title)
    ax.set_xlabel("Kernel coefficient gamma")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.925, 0.975)


*Penalty parameter C of the error term* **C = 10**

*Kernel coefficient* **gamma = 1e2**

New model with C = 10 and gamma = 1e2, trained on 10% of the training set

In [None]:
train_size = 0.1

In [None]:
%%time
# RBF kernel with C = 10 and gamma = 1e2: ten independent repetitions (tagged
# 0..9 via `order`), each summarising 6 stratified splits.
rbf_rows = []
for n in range(10):
    rbf_rows.append(getSVMMesures(C = 10, gamma= 1e2, kernel = 'rbf', 
                                  train_size = train_size, 
                                  n_experiments = 6, 
                                  train_labels = trainLabels.is_normal, 
                                  train_data = trainDS_pca, order = n))
SVM_RBF_measures = np.array(rbf_rows)

In [None]:
# Left panel: training-split scores; right panel: held-out scores.
fig, axes = plt.subplots(ncols=2, figsize=(20, 10))
x_values = SVM_RBF_measures[:, 11]  # result column 11: order (run number)

# (axis, first score column, label suffix, title, y-label) for each panel
panels = [(axes[0], 5, 'in', 'SVM: (kernel = rbf, C = 10, gamma = 1e2). E_in', "Scores In"),
          (axes[1], 8, 'out', "SVM (kernel = rbf, C = 10, gamma = 1e2). E_out", "Scores Out")]

for ax, first_col, suffix, title, ylabel in panels:
    # consecutive result columns hold the mean, max and min score
    for shift, (stat, colour) in enumerate([('mean', 'b'), ('max', 'g'), ('min', 'r')]):
        ax.plot(x_values, SVM_RBF_measures[:, first_col + shift],
                label = '%s Score %s' % (stat, suffix), c = colour)
    ax.legend(loc = 3)
    ax.grid()
    ax.set_title(title)
    ax.set_xlabel("Experimenet Number")
    ax.set_ylabel(ylabel)
    ax.set_ylim(0.925, 0.975)


#### 4.3.2.4 Saving the best SVM Radial Basis Function (RBF) model

In [None]:
# Create the output directory together with any missing parent, and do nothing
# when it already exists, so the cell survives a re-run.
if not os.path.isdir('./models/SVM_RBF'):
    os.makedirs('./models/SVM_RBF')

In [None]:
# Pick the repetition with the highest mean held-out score (result column 8)
# rather than a fixed row read off an earlier plot: the splits are seeded
# randomly on every run, so a hard-coded row does not stay the best one.
# Result column 14 holds the fitted model.
best_run = int(np.argmax(SVM_RBF_measures[:, 8].astype('float64')))
svmModel = SVM_RBF_measures[best_run][14]

In [None]:
joblib.dump(svmModel, './models/SVM_RBF/SVM_RBF.pkl')

## Exercise 1
Find the best parameters for an SVC with a polynomial kernel:
* Training size
* Degree
* Regularization parameter (C)
* Gamma