# Lab 2: Network Intrusion Detection

In [1]:
# Imports
from matplotlib import pyplot as plt

import pandas
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier 

from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import StratifiedKFold

from sklearn.naive_bayes import GaussianNB

from sklearn.decomposition import RandomizedPCA 
from sklearn.decomposition import PCA
from sklearn.lda import LDA

from sklearn.pipeline import Pipeline
from sklearn import metrics as mt

import seaborn as sns
import pandas as pd
import numpy as np


%matplotlib inline



## Data Preparation:

* Define and prepare your class variables. Use proper variable representations (int, float, one-hot, etc.). Use pre-processing methods (as needed) for dimensionality reduction, scaling, etc. Remove variables that are not needed/useful for the analysis.

In [2]:
# Load the UNSW_NB15 training set into a pandas DataFrame.
# 'utf-8-sig' strips a leading byte-order mark, if the file has one, so it
# does not end up glued to the first column name.
df = pd.read_csv('UNSW_NB15_training_set.csv', encoding='utf-8-sig')

# Attributes that are not useful to us during this first analysis pass:
#   id:         an internal variable that only references an observation.
#   attack_cat: first try to just predict `label`. attack_cat will obviously
#               correlate 1:1 with label, so leaving it in would leak the
#               answer. We can circle back and swap it in for label to see
#               how well we do at the attack-type level.
non_useful_features_list = ['id', 'attack_cat']

# Row count we expect once the unwanted records have been filtered out.
EXPECTED_OBSERVATIONS = 82328

# Drop whichever of those columns are present in a single, non-mutating call.
# Filtering the list first keeps the cell safe to re-run.
df = df.drop([feature for feature in non_useful_features_list if feature in df], axis=1)

# Overwrite the dataframe with one that excludes the unwanted records, i.e.
# rows where is_ftp_login == 2, and confirm the remaining row count.
# NOTE(review): is_ftp_login is presumably a binary flag, which is why the
# value 2 is treated as invalid - confirm against the dataset description.
if "is_ftp_login" in df:
    df = df[df.is_ftp_login != 2]
    if len(df) == EXPECTED_OBSERVATIONS:
        print ("duplicate record deleted successfully: " + str(len(df)) + " observations remaining" )
    else:
        # Previously a wrong count was silent; surface it so a changed input
        # file does not go unnoticed.
        print ("unexpected row count after filtering is_ftp_login: " + str(len(df))
               + " observations remaining (expected " + str(EXPECTED_OBSERVATIONS) + ")")

# Check whether any non-useful feature is still in the dataframe; if so, the
# removal above did not work.
for feature in non_useful_features_list:
    if feature in df:
        print ("[" + feature + "]" + "still found, check removal code. (Should not see this)" )

duplicate record deleted successfully: 82328 observations remaining


* Describe the final dataset that is used for classification/regression (include a description of any newly formed variables you created).

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the prepared
# dataframe; the table below lists the numeric columns, ending with `label`.
df.describe()

Unnamed: 0,dur,spkts,dpkts,sbytes,dbytes,rate,sttl,dttl,sload,dload,...,ct_src_dport_ltm,ct_dst_sport_ltm,ct_dst_src_ltm,is_ftp_login,ct_ftp_cmd,ct_flw_http_mthd,ct_src_ltm,ct_srv_dst,is_sm_ips_ports,label
count,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,...,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0,82328.0
mean,1.006783,18.666893,17.546303,7994.267,13234.4,82414.89,180.973448,95.70541,64552150.0,630577.1,...,4.92904,3.663092,7.456528,0.008187,0.008284,0.129749,6.46848,9.16461,0.011126,0.550578
std,4.710557,133.919593,115.576881,171646.4,151475.1,148622.9,101.512436,116.66547,179865600.0,2393055.0,...,8.389724,5.915518,11.415443,0.09011,0.091439,0.638697,8.544117,11.121571,0.104893,0.497438
min,0.0,1.0,0.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
25%,8e-06,2.0,0.0,114.0,0.0,28.60585,62.0,0.0,11203.56,0.0,...,1.0,1.0,1.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0
50%,0.01412,6.0,2.0,534.0,178.0,2651.198,254.0,29.0,577075.1,2112.632,...,1.0,1.0,3.0,0.0,0.0,0.0,3.0,5.0,0.0,1.0
75%,0.719362,12.0,10.0,1280.0,956.0,111111.1,254.0,252.0,65142860.0,15858.18,...,4.0,3.0,6.0,0.0,0.0,0.0,7.0,11.0,0.0,1.0
max,59.999989,10646.0,11018.0,14355770.0,14657530.0,1000000.0,255.0,253.0,5268000000.0,20821110.0,...,59.0,38.0,63.0,1.0,2.0,16.0,60.0,62.0,1.0,1.0


## Modeling and Evaluation:
* Choose and explain your evaluation metrics that you will use (i.e., accuracy, precision, recall, F-measure, or any metric we have discussed). Why are the measure(s) appropriate for analyzing the results of your modeling? Give a detailed explanation backing up any assertions.

*  Choose the method you will use for dividing your data into training and testing splits (i.e., are you using Stratified 10-fold cross validation? Why?). Explain why your chosen method is appropriate or use more than one method as appropriate. For example, if you are using time series data then you should be using continuous training and testing sets across time.

*  Create three different classification/regression models for each task (e.g., random forest, KNN, and SVM for task one and the same or different algorithms for task two). Two modeling techniques must be new (but the third could be SVM or logistic regression). Adjust parameters as appropriate to increase generalization performance using your chosen metric. You must investigate different parameters of the algorithms! 

In [5]:
# Sanity-check the random forest + k-fold cross-validation workflow on the
# small Pima Indians diabetes dataset before applying it to UNSW_NB15.
# NOTE(review): this cell needs network access - confirm the URL is still served.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# The file has no header row, so the column names are supplied explicitly.
dataframe = pd.read_csv(url, names=names)
array = dataframe.values

X = array[:,0:8]   # the first eight columns in `names` are the features
Y = array[:,8]     # the ninth column ('class') is the target

num_folds = 10
num_instances = len(X)
seed = 7
num_trees = 100
max_features = 3   # number of features considered at each split

# NOTE: shuffle is left at its default, so the folds are contiguous slices of
# the data and random_state has no influence on them (KFold only uses it when
# shuffle=True).
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)

# Seed the forest itself: it is the stochastic part of this cell, and without
# a fixed random_state the printed score changes from run to run.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features,
                               random_state=seed)

# Mean accuracy (the classifier's default score) across the folds.
results = cross_validation.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

0.769480519481


In [7]:
# Random forest on the UNSW_NB15 data, evaluated with 10-fold cross-validation.

# Raw value matrix, retained in case a later cell expects `array` to hold the
# UNSW values rather than the Pima example's.
array = df.values

# Select the target by name and the features by dtype instead of by hard-coded
# column positions: the positions are not tied to where `label` actually sits,
# and the classifier cannot be fit on non-numeric columns.
# NOTE(review): any non-numeric columns are simply left out here; one-hot
# encode them if they should take part in the model.
feature_df = df.select_dtypes(include=[np.number]).drop('label', axis=1)
X = feature_df.values
Y = df['label'].values

num_folds = 10
num_instances = len(X)

seed = 7
num_trees = 100
max_features = 3   # number of features considered at each split

# NOTE: shuffle is left at its default, so random_state has no influence on
# the folds (KFold only uses it when shuffle=True).
kfold = cross_validation.KFold(n=num_instances, n_folds=num_folds, random_state=seed)

# Seed the forest so the cross-validated score is reproducible between runs.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features,
                               random_state=seed)

# Mean accuracy (the classifier's default score) across the folds.
results = cross_validation.cross_val_score(model, X, Y, cv=kfold)
print(results.mean())



   




*  Analyze the results using your chosen method of evaluation. Use visualizations of the results to bolster the analysis. Explain any visuals and analyze why they are interesting to someone that might use this model.

* Discuss the advantages of each model for each classification task, if any. If there are not advantages, explain why. Is any model better than another? Is the difference significant with 95% confidence? Use proper statistical comparison methods. You must use statistical comparison techniques—be sure they are appropriate for your chosen method of validation. 

* Which attributes from your analysis are most important? Use proper methods discussed in class to evaluate the importance of different attributes. Discuss the results and hypothesize about why certain attributes are more important than others for a given classification task.

## Deployment:
* How useful is your model for interested parties (i.e., the companies or organizations that might want to use it for prediction)? How would you measure the model's value if it was used by these parties? How would you deploy your model for interested parties? What other data should be collected? How often would the model need to be updated, etc.? 

## Exceptional Work:

In [37]:
# PCA works on numeric input only, so keep just the float64 / int64 columns.
# NOTE(review): this selection still contains `label` - confirm whether the
# target is meant to be part of the PCA input.
df_copy = df.select_dtypes(['float64', 'int64'])

# The data-preparation cell drops attack_cat from df, so a bare df.attack_cat
# raises AttributeError on a fresh Restart & Run All. Use the column if it is
# still present; otherwise re-read just that column and align it with the
# rows that survived filtering.
# NOTE(review): the alignment assumes df still carries the row index assigned
# by read_csv - confirm no cell resets the index in between.
if 'attack_cat' in df:
    y = df.attack_cat
else:
    y = pd.read_csv('UNSW_NB15_training_set.csv', encoding='utf-8-sig',
                    usecols=['attack_cat']).loc[df.index, 'attack_cat']


###################################################################
# Percentage of variance explained by the first three components
###################################################################
# One 3-component fit is enough: the leading components do not depend on how
# many are retained, so the first and first-two ratios are slices of it.
# NOTE(review): the features are not standardized, so columns with very large
# variance (e.g. sload in the describe() table) will dominate the leading
# components - consider scaling before PCA.
pca = PCA(n_components=3)
x_pca = pca.fit_transform(df_copy)

explained_pct = 100 * pca.explained_variance_ratio_

# Index the scalar explicitly rather than relying on implicit conversion of a
# one-element array inside the %f format.
print('explained variance ratio (first component): %3.2f'
      % explained_pct[0])

print('explained variance ratio (first two components): %s'
      % (explained_pct[:2],))

print('explained variance ratio (first three components): %s'
      % (explained_pct,))



explained variance ratio (first component): 80.63
explained variance ratio (first two components): [ 80.62648708  18.61891863]
explained variance ratio (first three components): [ 80.62648708  18.61891863   0.75445077]
