In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
# Show which files are present in the local ./dataset directory.
print(os.listdir('./dataset'))

In [None]:
# Print the full text of the kddcup.names file.
with open("./dataset/kddcup.names",'r') as f:
    print(f.read())

In [None]:
# Comma-separated feature names; they are used below as the column names
# when the data file is read.
cols="""duration,
protocol_type,
service,
flag,
src_bytes,
dst_bytes,
land,
wrong_fragment,
urgent,
hot,
num_failed_logins,
logged_in,
num_compromised,
root_shell,
su_attempted,
num_root,
num_file_creations,
num_shells,
num_access_files,
num_outbound_cmds,
is_host_login,
is_guest_login,
count,
srv_count,
serror_rate,
srv_serror_rate,
rerror_rate,
srv_rerror_rate,
same_srv_rate,
diff_srv_rate,
srv_diff_host_rate,
dst_host_count,
dst_host_srv_count,
dst_host_same_srv_rate,
dst_host_diff_srv_rate,
dst_host_same_src_port_rate,
dst_host_srv_diff_host_rate,
dst_host_serror_rate,
dst_host_srv_serror_rate,
dst_host_rerror_rate,
dst_host_srv_rerror_rate"""

# Split on commas, trim the surrounding whitespace/newlines of every piece
# and discard pieces that end up empty.
columns = [name for name in map(str.strip, cols.split(',')) if name]

# The label column is appended after the feature names.
columns += ['target']
#print(columns)
print(len(columns))

In [None]:
# Print the full text of the training_attack_types file.
with open("./dataset/training_attack_types",'r') as f:
    print(f.read())

In [None]:
# Lookup table: label found in the data -> attack category
# ('normal', 'dos', 'probe', 'r2l' or 'u2r').
attacks_types = {
    'normal':          'normal',
    'back':            'dos',
    'buffer_overflow': 'u2r',
    'ftp_write':       'r2l',
    'guess_passwd':    'r2l',
    'imap':            'r2l',
    'ipsweep':         'probe',
    'land':            'dos',
    'loadmodule':      'u2r',
    'multihop':        'r2l',
    'neptune':         'dos',
    'nmap':            'probe',
    'perl':            'u2r',
    'phf':             'r2l',
    'pod':             'dos',
    'portsweep':       'probe',
    'rootkit':         'u2r',
    'satan':           'probe',
    'smurf':           'dos',
    'spy':             'r2l',
    'teardrop':        'dos',
    'warezclient':     'r2l',
    'warezmaster':     'r2l',
}

In [None]:
# Read the data file; it is read without a header row, so the names come
# from `columns`.
path = "./dataset/kddcup.data_10_percent.gz"
df = pd.read_csv(path,names=columns)

#Adding Attack Type column
# The labels are assumed to end with a '.' (e.g. "smurf."). Strip exactly
# that character instead of blindly chopping the last one, so a label
# without the dot is not mangled.
labels = df['target'].str.rstrip('.')

# Fail with a clear message when a label has no category; .map() alone
# would silently produce NaN for it.
unknown_labels = set(labels.unique()) - set(attacks_types)
if unknown_labels:
    raise KeyError(
        "No attack category defined for label(s): {}".format(
            sorted(map(str, unknown_labels))
        )
    )
df['Attack Type'] = labels.map(attacks_types)

df.head()

In [None]:
# Number of rows per raw label.
df['target'].value_counts()

In [None]:
# Number of rows per attack category.
df['Attack Type'].value_counts()

In [None]:
# Data type of every column.
df.dtypes


In [None]:
# Count of missing values per column.
df.isnull().sum()


In [None]:
# Repeats the three inspections above in one cell. Only the last expression
# of a cell is displayed, so just the missing-value counts show up here.
df['Attack Type'].value_counts()
df.dtypes
df.isnull().sum()

In [None]:
#Finding categorical features
# Numeric (and boolean) columns, selected through the public select_dtypes
# API rather than the private DataFrame._get_numeric_data().
num_cols = df.select_dtypes(include=['number', 'bool']).columns

# Everything that is not numeric and is not one of the two label columns.
# Built in column order so the result is the same on every run (a set
# difference has no defined order).
label_cols = {'target', 'Attack Type'}
cate_cols = [
    col for col in df.columns
    if col not in num_cols and col not in label_cols
]

cate_cols


In [None]:
#Visualization
def bar_graph(feature, data=None):
    """Draw a bar chart of how often each value of one column occurs.

    Parameters
    ----------
    feature : str
        Label of the column whose value counts are plotted.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``df`` is used, which keeps existing ``bar_graph('col')`` calls working.

    Returns
    -------
    None
        The chart is drawn as a side effect; nothing is returned, so a cell
        ending in a call to this function shows only the figure.
    """
    frame = df if data is None else data
    ax = frame[feature].value_counts().plot(kind="bar")

    # Label the chart so it can be read on its own.
    ax.set_title("Distribution of {}".format(feature))
    ax.set_xlabel(feature)
    ax.set_ylabel("count")

In [None]:
# Value distribution of the protocol_type column.
bar_graph('protocol_type')

In [None]:
# Open a wide, short figure (15 x 3) before plotting the service column.
plt.figure(figsize=(15,3))
bar_graph('service')

In [None]:
# Value distribution of the flag column.
bar_graph('flag')

In [None]:
# Value distribution of the logged_in column.
bar_graph('logged_in')

In [None]:
# Value distribution of the raw labels.
bar_graph('target')

In [None]:
# Value distribution of the attack categories.
bar_graph('Attack Type')

In [None]:
# Column labels of the frame at this point.
df.columns


In [None]:
# Drop columns that contain NaN. `axis` is passed by keyword because
# DataFrame.dropna only accepts keyword arguments in pandas >= 2.0, where
# dropna('columns') raises TypeError.
df = df.dropna(axis='columns')

# Keep only columns with more than one distinct value (constant columns carry
# no information). .copy() gives an independent frame so that later column
# assignments do not operate on a slice of the previous one.
df = df.loc[:, df.nunique() > 1].copy()

# Correlate the numeric features only. The frame still holds string columns
# (e.g. protocol_type, target), which DataFrame.corr() no longer skips
# silently in pandas >= 2.0.
corr = df.select_dtypes(include=['number', 'bool']).corr()

plt.figure(figsize=(15,12))

sns.heatmap(corr)

plt.show()


In [None]:
# Feature pairs whose correlation is inspected before deciding which
# redundant features to drop. Add a tuple here to inspect another pair.
corr_pairs = [
    ('num_root', 'num_compromised'),
    ('srv_serror_rate', 'serror_rate'),
    ('srv_count', 'count'),
    ('srv_rerror_rate', 'rerror_rate'),
    ('dst_host_same_srv_rate', 'dst_host_srv_count'),
    ('dst_host_srv_serror_rate', 'dst_host_serror_rate'),
    ('dst_host_srv_rerror_rate', 'dst_host_rerror_rate'),
    ('dst_host_same_srv_rate', 'same_srv_rate'),
    ('dst_host_srv_count', 'same_srv_rate'),
    ('dst_host_same_src_port_rate', 'srv_count'),
    ('dst_host_serror_rate', 'serror_rate'),
    ('dst_host_serror_rate', 'srv_serror_rate'),
    ('dst_host_srv_serror_rate', 'serror_rate'),
    ('dst_host_srv_serror_rate', 'srv_serror_rate'),
    ('dst_host_rerror_rate', 'rerror_rate'),
    ('dst_host_rerror_rate', 'srv_rerror_rate'),
    ('dst_host_srv_rerror_rate', 'rerror_rate'),
    ('dst_host_srv_rerror_rate', 'srv_rerror_rate'),
]

# One Series.corr() call per pair (default method), collected in a single
# table indexed by (feature, other feature).
pair_corr = pd.Series(
    {(left, right): df[left].corr(df[right]) for left, right in corr_pairs},
    name='correlation',
)
pair_corr

In [None]:
# Features ignored for the analysis because they are highly correlated with
# another feature. Each entry records: dropped feature -> (correlated
# feature, recorded correlation).
redundant_features = {
    'num_root':                 ('num_compromised',    0.9938277978738366),
    'srv_serror_rate':          ('serror_rate',        0.9983615072725952),
    'srv_rerror_rate':          ('rerror_rate',        0.9947309539817937),
    'dst_host_srv_serror_rate': ('srv_serror_rate',    0.9993041091850098),
    # NOTE(review): the partner is recorded as 'rerror_rate', but that pair is
    # never computed in this notebook; it is probably an serror feature -- confirm.
    'dst_host_serror_rate':     ('rerror_rate',        0.9869947924956001),
    'dst_host_rerror_rate':     ('srv_rerror_rate',    0.9821663427308375),
    'dst_host_srv_rerror_rate': ('rerror_rate',        0.9851995540751249),
    'dst_host_same_srv_rate':   ('dst_host_srv_count', 0.9736854572953938),
}

# A single, non in-place drop. errors='ignore' makes the cell safe to re-run:
# columns that are already gone are skipped instead of raising KeyError.
df = df.drop(columns=list(redundant_features), errors='ignore')

In [None]:
# First rows of the frame after the correlated features were dropped.
df.head()

In [None]:
# Column labels that remain.
df.columns


In [None]:
# Standard deviation of every numeric feature, smallest first.
# numeric_only=True is required because the frame still holds string columns
# at this point; pandas >= 2.0 raises on them instead of skipping them.
df_std = df.std(numeric_only=True).sort_values(ascending=True)
df_std

In [None]:
# Distinct protocol_type values and their frequencies (inspected before encoding).
df['protocol_type'].value_counts()


In [None]:
#protocol_type feature mapping
pmap = {'icmp':0,'tcp':1,'udp':2}

# Encode only while the column still holds the raw labels, so that re-running
# this cell does not turn the already-encoded integers into NaN.
if not pd.api.types.is_numeric_dtype(df['protocol_type']):
    encoded = df['protocol_type'].map(pmap)
    # .map() yields NaN for any value missing from pmap; fail loudly instead
    # of passing NaN on to the model.
    if encoded.isnull().any():
        unmapped = df.loc[encoded.isnull(), 'protocol_type'].unique().tolist()
        raise ValueError("protocol_type values without a mapping: {}".format(unmapped))
    df['protocol_type'] = encoded

In [None]:
# Distinct flag values and their frequencies (inspected before encoding).
df['flag'].value_counts()

In [None]:
#flag feature mapping
fmap = {'SF':0,'S0':1,'REJ':2,'RSTR':3,'RSTO':4,'SH':5 ,'S1':6 ,'S2':7,'RSTOS0':8,'S3':9 ,'OTH':10}
df['flag'] = df['flag'].map(fmap)

In [None]:
# First rows after protocol_type and flag were mapped to integers.
df.head()

In [None]:
# Remove the service column in place.
# NOTE(review): presumably dropped because it is a string column that has no
# numeric mapping in this notebook -- confirm. Re-running this cell raises
# KeyError because the column is already gone.
df.drop('service',axis = 1,inplace= True)

In [None]:
# Data type of every remaining column.
df.dtypes

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score

In [None]:
# Drop the raw label; 'Attack Type' is the prediction target.
# errors='ignore' keeps the cell re-runnable once the column is gone.
df = df.drop(columns=['target'], errors='ignore')
print(df.shape)

# Target variable and train set
Y = df[['Attack Type']]
X = df.drop(['Attack Type',], axis=1)

# Split test and train data
# The split happens BEFORE scaling: fitting the scaler on all rows would let
# the min/max of the test rows leak into the training features.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

# Learn the scaling range from the training rows only and apply the same
# transformation to the test rows. X itself is left unscaled.
sc = MinMaxScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

print(X_train.shape, X_test.shape)
print(Y_train.shape, Y_test.shape)

In [None]:
# Gaussian Naive Bayes
from sklearn.naive_bayes import GaussianNB

In [None]:
from sklearn.svm import SVC

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

In [None]:
def fun(input_dim=30, n_classes=5):
    """Build and compile the feed-forward network used for classification.

    Parameters
    ----------
    input_dim : int, optional
        Number of input features per sample. Defaults to 30, the value that
        was previously hard-coded; pass ``X_train.shape[1]`` if the feature
        set changes.
    n_classes : int, optional
        Number of output classes. Defaults to 5 (normal, dos, probe, r2l, u2r).

    Returns
    -------
    A compiled Keras ``Sequential`` model: Dense(30, relu) -> Dense(1, sigmoid)
    -> Dense(n_classes, softmax), with categorical cross-entropy loss, the
    Adam optimizer and accuracy as the reported metric.
    """
    model = Sequential()

    # First layer: 30 units (its output dimension), fed with `input_dim` features.
    model.add(Dense(30,input_dim=input_dim,activation='relu',kernel_initializer='random_uniform'))

    # No input_dim here: in a Sequential model the output of the previous
    # layer is the input of the next one.
    # NOTE(review): this layer has a single unit, so everything the softmax
    # layer sees passes through one value -- confirm this bottleneck is intended.
    model.add(Dense(1,activation='sigmoid',kernel_initializer='random_uniform'))

    # One output per class.
    model.add(Dense(n_classes,activation='softmax'))

    # categorical_crossentropy is the loss for multi-class targets.
    model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

    return model

In [None]:
# Wrap the model builder `fun` in KerasClassifier, configured with
# 100 epochs and a batch size of 64.
model7 = KerasClassifier(build_fn=fun,epochs=100,batch_size=64)

In [None]:
# Train the classifier; the single-column label frame is flattened to 1-D.
model7.fit(X_train, Y_train.values.ravel())

# Time the predictions on the test split and report the duration
# (start_time used to be set but never read).
start_time = time.perf_counter()
Y_test_pred7 = model7.predict(X_test)
end_time = time.perf_counter()
print("Testing time: ", end_time - start_time)

# Same measurement for the predictions on the training split.
start_time = time.perf_counter()
Y_train_pred7 = model7.predict(X_train)
end_time = time.perf_counter()
print("Train-set prediction time: ", end_time - start_time)


In [None]:
# Accuracy of the predictions on the training split.
accuracy_score(Y_train,Y_train_pred7)


In [None]:
# Accuracy of the predictions on the test split.
accuracy_score(Y_test,Y_test_pred7)

In [None]:
# Predict on the test split again (same call that produced Y_test_pred7).
aa = model7.predict(X_test)

In [None]:
# Display the classifier wrapper object.
model7

In [None]:
# Save the wrapped Keras model under the path "KDD_model".
model7.model.save("KDD_model")

In [None]:
# Call predict on the wrapped Keras model directly instead of the wrapper.
# NOTE(review): presumably returns per-class scores rather than labels -- confirm.
model7.model.predict(X_test)

In [None]:
import tensorflow as tf

In [None]:
# Load the model back from the path it was saved to.
mo = tf.keras.models.load_model("KDD_model/")

In [None]:
# Display the reloaded model object.
mo

In [None]:
# Predict on the test split with the reloaded model and display the result.
kk = mo.predict(X_test)
kk

In [None]:
# Shape of the reloaded model's prediction array.
kk.shape

In [None]:
# Display the training feature array.
X_train