In [2]:
import pandas as pd

# Intrusion Detection

## Objectives

- Predict whether a connection is good (normal) or has been used to perform an intrusion or attack.

## Algorithms/Models

- Multinomial Logistic Regression (Classification)
- Support Vector Machine (Classification)
- Random Forest (Classification)

## Data Consolidation

In [24]:
names = ['duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate', 'status']
detections = pd.read_csv('raw_files/intrusion_detection.csv', header=0, names=names)
detections.to_csv('intrusion_detection.csv', index=False)

## Data Preview

In [25]:
data = pd.read_csv('intrusion_detection.csv')
data.head()

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,status
0,0,tcp,http,SF,239,486,0,0,0,0,...,19,1.0,0.0,0.05,0.0,0.0,0.0,0.0,0.0,normal.
1,0,tcp,http,SF,235,1337,0,0,0,0,...,29,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
2,0,tcp,http,SF,219,1337,0,0,0,0,...,39,1.0,0.0,0.03,0.0,0.0,0.0,0.0,0.0,normal.
3,0,tcp,http,SF,217,2032,0,0,0,0,...,49,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.
4,0,tcp,http,SF,217,2032,0,0,0,0,...,59,1.0,0.0,0.02,0.0,0.0,0.0,0.0,0.0,normal.


## Data Dictionary

- duration = length (number of seconds) of the connection 
- protocol_type = type of the protocol, e.g. tcp, udp, etc.
- service = network service on the destination, e.g., http, telnet, etc.
- src_bytes = number of data bytes from source to destination
- dst_bytes = number of data bytes from destination to source
- flag = normal or error status of the connection 
- land = 1 if connection is from/to the same host/port; 0 otherwise 
- wrong_fragment = number of wrong fragments
- urgent = number of urgent packets
- hot = number of hot indicators
- num_failed_logins = number of failed login attempts 
- logged_in = 1 if successfully logged in; 0 otherwise 
- num_compromised = number of compromised conditions 
- root_shell = 1 if root shell is obtained; 0 otherwise 
- su_attempted = 1 if su root command attempted; 0 otherwise 
- num_root = number of root accesses 
- num_file_creations = number of file creation operations 
- num_shells = number of shell prompts
- num_access_files = number of operations on access control files
- num_outbound_cmds = number of outbound commands in an ftp session 
- is_hot_login = 1 if the login belongs to the hot list; 0 otherwise
- is_guest_login = 1 if the login is a guest login; 0 otherwise
- count = number of connections to the same host as the current connection in the past two seconds
- serror_rate = % of connections that have SYN errors 
- rerror_rate = % of connections that have REJ errors 
- same_srv_rate = % of connections to the same service 
- diff_srv_rate = % of connections to different services
- srv_count = number of connections to the same service as the current connection in the past two seconds
- srv_serror_rate = % of connections that have SYN errors 
- srv_rerror_rate = % of connections that have REJ errors 
- srv_diff_host_rate = % of connections to different hosts 
- status = normal or type of attack

## Basic Literature

- Buczak, A., & Guven, E. (2016). A Survey of Data Mining and Machine Learning Methods for Cyber Security Intrusion Detection. *IEEE Communications Surveys & Tutorials, 18(2)*, 446-452. https://doi.org/10.1109/COMST.2015.2494502.
- Dhanabal, L., & Shantharajaj, S.P. (2015). A Study on NSL-KDD Dataset for Intrusion Detection System Based on Classification Algorithms. *International Journal of Advanced Research in Computer and Communication Engineering, 4(6)*, 446-452. https://doi.org/10.17148/IJARCCE.2015.4696.

## Source

http://archive.ics.uci.edu/ml/datasets/kdd+cup+1999+data