In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical



In [2]:
# Read the NSL-KDD training file. header=None: the file is read without a
# header row and the column names are assigned separately afterwards.
data_train = pd.read_csv(
    'nsl-kdd/KDDTrain+.txt',
    header=None,
)

In [3]:
# Names for the 43 positional columns of the header-less file, in file order.
data_train.columns = [
    # leading per-connection fields
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent',
    # hot ... is_guest_login
    'hot', 'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
    'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
    'num_access_files', 'num_outbound_cmds', 'is_host_login', 'is_guest_login',
    # count / srv_* rate fields
    'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
    'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
    # dst_host_* fields
    'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
    'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
    'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
    'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
    'dst_host_srv_rerror_rate',
    # last two columns: 'outcome' is used as the label later in the notebook
    'outcome', 'level',
]

In [4]:
# Display the training frame to check the column names assigned above.
data_train

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,outcome,level
0,0,tcp,ftp_data,SF,491,0,0,0,0,0,...,0.17,0.03,0.17,0.00,0.00,0.00,0.05,0.00,normal,20
1,0,udp,other,SF,146,0,0,0,0,0,...,0.00,0.60,0.88,0.00,0.00,0.00,0.00,0.00,normal,15
2,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,19
3,0,tcp,http,SF,232,8153,0,0,0,0,...,1.00,0.00,0.03,0.04,0.03,0.01,0.00,0.01,normal,21
4,0,tcp,http,SF,199,420,0,0,0,0,...,1.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,normal,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125968,0,tcp,private,S0,0,0,0,0,0,0,...,0.10,0.06,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20
125969,8,udp,private,SF,105,145,0,0,0,0,...,0.96,0.01,0.01,0.00,0.00,0.00,0.00,0.00,normal,21
125970,0,tcp,smtp,SF,2231,384,0,0,0,0,...,0.12,0.06,0.00,0.00,0.72,0.00,0.01,0.00,normal,18
125971,0,tcp,klogin,S0,0,0,0,0,0,0,...,0.03,0.05,0.00,0.00,1.00,1.00,0.00,0.00,neptune,20


In [6]:
# Columns that preprocess() leaves out of numeric scaling. Besides the three
# nominal fields and the 0/1-valued fields, this also lists 'level' and
# 'outcome' so they are never passed through the scaler.
cat_cols = [
    'is_host_login', 'protocol_type', 'service', 'flag', 'land',
    'logged_in', 'is_guest_login', 'level', 'outcome',
]

In [7]:
def preprocess(dataframe):
    """Scale the numeric columns and one-hot encode the nominal columns.

    Every column that is not listed in the module-level ``cat_cols`` is scaled
    through ``Scaling``. ``protocol_type``, ``service`` and ``flag`` are then
    expanded with ``pd.get_dummies``. ``outcome`` is kept as-is (it is not
    converted to a binary label).

    The input frame is left untouched; a new frame is returned. (Previously the
    numeric columns of the caller's frame were replaced in place, which made
    the raw values unrecoverable and the call unsafe to repeat.)

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame containing every column named in ``cat_cols`` plus the numeric
        columns to scale.

    Returns
    -------
    pandas.DataFrame
        The non-scaled columns (original relative order), followed by the
        scaled numeric columns, followed by the generated dummy columns.
    """
    df_num = dataframe.drop(cat_cols, axis=1)
    num_cols = df_num.columns
    scaled_df = Scaling(df_num, num_cols)
    # Scaling may hand back a frame with a fresh RangeIndex. Re-attach the
    # caller's index so the column assignment below lines up row-for-row
    # instead of aligning on mismatched labels and producing NaN.
    scaled_df.index = dataframe.index

    # drop() without inplace returns a new frame, so the caller's data survives.
    result = dataframe.drop(labels=num_cols, axis="columns")
    result[num_cols] = scaled_df[num_cols]

    # No need to convert 'outcome' to binary, keep it as it is

    return pd.get_dummies(result, columns=['protocol_type', 'service', 'flag'])

In [8]:
def Scaling(df_num, cols):
    """Fit a ``RobustScaler`` on ``df_num`` and return the scaled values.

    Parameters
    ----------
    df_num : pandas.DataFrame or array-like
        Numeric data to scale; the scaler is fitted on this same data.
    cols : sequence of str
        Column labels for the returned frame, one per column of ``df_num``.

    Returns
    -------
    pandas.DataFrame
        Scaled values labelled with ``cols``. When ``df_num`` carries a row
        index it is preserved, so the result can be aligned back onto the
        source frame; otherwise a default RangeIndex is used.

    Notes
    -----
    The fitted scaler is local to this call and is not returned, so the same
    scaling cannot be re-applied to other data afterwards.
    """
    robust_scaler = RobustScaler()
    scaled_values = robust_scaler.fit_transform(df_num)
    # fit_transform returns a bare ndarray; carry the input's index over
    # explicitly so index-aligned assignment by the caller stays correct.
    row_index = getattr(df_num, "index", None)
    return pd.DataFrame(scaled_values, columns=cols, index=row_index)

In [9]:
# Run the preprocessing step on the training frame; scaled_train is the frame
# the features and labels are taken from in the following cells.
scaled_train = preprocess(data_train)

In [10]:
# Feature matrix: every column except the two that are not model inputs.
x = scaled_train.drop(columns=['outcome', 'level']).to_numpy()
# Label vector: the 'outcome' column as stored in scaled_train at this point.
y = scaled_train['outcome'].to_numpy()

In [11]:
# For each nominal field, the scaled_train columns whose name contains that
# field name (i.e. the dummy columns generated for it).
one_hot_encoding_mapping = {
    col: scaled_train.filter(like=col).columns
    for col in ['protocol_type', 'service', 'flag']
}

In [109]:
# Reduce the feature matrix to 20 principal components.
# random_state is pinned (same seed as the train/test split) so that, when
# scikit-learn selects a randomized SVD solver for this input size, the fitted
# components -- and everything trained on them -- are reproducible across runs.
pca = PCA(n_components=20, random_state=42)
pca = pca.fit(x)
x_reduced = pca.transform(x)
print("Number of original features is {} and of reduced features is {}".format(x.shape[1], x_reduced.shape[1]))

Number of original features is 122 and of reduced features is 20


In [15]:
# Hold out 20% of the PCA-reduced rows; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_reduced,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Inspect the training labels (string class names at this point).
y_train

array(['normal', 'neptune', 'neptune', ..., 'neptune', 'normal', 'normal'],
      dtype=object)

In [124]:
# Distinct class labels in the 'outcome' column, in order of first appearance.
unique_labels = scaled_train['outcome'].unique()

# Label -> integer index mapping used to encode and decode the classes.
outcome_mapping = {label: idx for idx, label in enumerate(unique_labels)}

# Integer-encoded labels, kept in their own variable. The column in
# scaled_train is deliberately NOT overwritten: doing so made this cell
# non-re-runnable (a second run rebuilt the mapping from integers, after which
# looking up the string labels in y_train / y_test raised KeyError).
outcome_encoded = scaled_train['outcome'].map(outcome_mapping)

In [125]:
# Show the label -> index mapping built in the previous cell.
outcome_mapping

{'normal': 0,
 'neptune': 1,
 'warezclient': 2,
 'ipsweep': 3,
 'portsweep': 4,
 'teardrop': 5,
 'nmap': 6,
 'satan': 7,
 'smurf': 8,
 'pod': 9,
 'back': 10,
 'guess_passwd': 11,
 'ftp_write': 12,
 'multihop': 13,
 'rootkit': 14,
 'buffer_overflow': 15,
 'imap': 16,
 'warezmaster': 17,
 'phf': 18,
 'land': 19,
 'loadmodule': 20,
 'spy': 21,
 'perl': 22}

In [19]:
# to_categorical comes from tensorflow.keras.utils (imported in the top import
# cell) so the notebook uses a single Keras implementation throughout instead
# of mixing the standalone `keras` package with `tensorflow.keras`.

# Translate the string labels in y_train / y_test to their integer indices.
# An unseen label raises KeyError here rather than being silently mis-encoded.
y_train_encoded = [outcome_mapping[label] for label in y_train]
y_test_encoded = [outcome_mapping[label] for label in y_test]

# Convert encoded labels to one-hot rows; num_classes is fixed to the mapping
# size so both arrays have the same width even if a class is absent from one.
y_train_categorical = to_categorical(y_train_encoded, num_classes=len(outcome_mapping))
y_test_categorical = to_categorical(y_test_encoded, num_classes=len(outcome_mapping))

In [20]:
# Single-layer LSTM classifier. Each sample is presented as a sequence of
# x_train.shape[1] steps with one value per step; the softmax layer has one
# unit per entry in outcome_mapping.
model = Sequential([
    LSTM(units=32, input_shape=(x_train.shape[1], 1)),
    Dense(units=len(outcome_mapping), activation='softmax'),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Train for 10 epochs. Both 2-D arrays get a trailing axis of length 1 so they
# match the (steps, 1) input shape the LSTM was declared with; the held-out
# split is used as validation data.
history = model.fit(
    np.expand_dims(x_train, axis=-1),
    y_train_categorical,
    epochs=10,
    batch_size=32,
    validation_data=(np.expand_dims(x_test, axis=-1), y_test_categorical),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [22]:
import pickle

In [23]:
# Serialize the trained model object to 'model.pkl' in the working directory.
# NOTE(review): this pickles a Keras model directly; Keras' own model.save()
# is the documented persistence route -- confirm that pickling round-trips on
# the installed TensorFlow version before relying on this file.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

In [83]:
# One hand-written connection record used to demonstrate inference.
# Keys are the raw column names (before scaling / one-hot encoding);
# 'outcome' and 'level' are intentionally absent since they are not inputs.
input_data = {
    'duration': 10,
    'protocol_type': 'tcp',
    'service': 'http',
    'flag': 'SF',
    'src_bytes': 1000,
    'dst_bytes': 500,
    'land': 0,
    'wrong_fragment': 0,
    'urgent': 0,
    'hot': 0,
    'num_failed_logins': 0,
    'logged_in': 1,
    'num_compromised': 0,
    'root_shell': 0,
    'su_attempted': 0,
    'num_root': 0,
    'num_file_creations': 0,
    'num_shells': 0,
    'num_access_files': 0,
    'num_outbound_cmds': 0,
    'is_host_login': 0,
    'is_guest_login': 0,
    'count': 150,
    'srv_count': 25,
    'serror_rate': 0.05,
    'srv_serror_rate': 0.04,
    'rerror_rate': 0.02,
    'srv_rerror_rate': 0.01,
    'same_srv_rate': 0.8,
    'diff_srv_rate': 0.2,
    'srv_diff_host_rate': 0.01,
    'dst_host_count': 255,
    'dst_host_srv_count': 100,
    'dst_host_same_srv_rate': 0.4,
    'dst_host_diff_srv_rate': 0.1,
    'dst_host_same_src_port_rate': 0.05,
    'dst_host_srv_diff_host_rate': 0.01,
    'dst_host_serror_rate': 0.06,
    'dst_host_srv_serror_rate': 0.05,
    'dst_host_rerror_rate': 0.03,
    'dst_host_srv_rerror_rate': 0.02
}

In [76]:
# Peek at the first rows of the preprocessed frame to see its column layout.
scaled_train.head()

Unnamed: 0,land,logged_in,is_host_login,is_guest_login,outcome,level,duration,src_bytes,dst_bytes,wrong_fragment,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,0,0,0,normal,20,0.0,1.619565,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,normal,15,0.0,0.369565,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0
2,0,0,0,0,neptune,19,0.0,-0.15942,0.0,0.0,...,0,0,0,0,1,0,0,0,0,0
3,0,1,0,0,normal,21,0.0,0.681159,15.800388,0.0,...,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,normal,21,0.0,0.561594,0.813953,0.0,...,0,0,0,0,0,0,0,0,1,0


In [86]:
# Build a one-row frame from input_data, keeping only the keys that are also
# columns of scaled_train and following scaled_train's column order. The
# nominal fields are skipped here because scaled_train only holds their dummy
# columns, not the raw names.
custom_data = pd.DataFrame(
    {col: [input_data[col]] for col in scaled_train.columns if col in input_data}
)
custom_data

Unnamed: 0,land,logged_in,is_host_login,is_guest_login,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,...,dst_host_count,dst_host_srv_count,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate
0,0,1,0,0,10,1000,500,0,0,0,...,255,100,0.4,0.1,0.05,0.01,0.06,0.05,0.03,0.02


In [87]:
# List every column currently in custom_data, one name per line.
for col in custom_data.columns:
    print(col)

land
logged_in
is_host_login
is_guest_login
duration
src_bytes
dst_bytes
wrong_fragment
urgent
hot
num_failed_logins
num_compromised
root_shell
su_attempted
num_root
num_file_creations
num_shells
num_access_files
num_outbound_cmds
count
srv_count
serror_rate
srv_serror_rate
rerror_rate
srv_rerror_rate
same_srv_rate
diff_srv_rate
srv_diff_host_rate
dst_host_count
dst_host_srv_count
dst_host_same_srv_rate
dst_host_diff_srv_rate
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
dst_host_serror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
dst_host_srv_rerror_rate


In [88]:
# List every column of the preprocessed training frame, one name per line,
# for comparison with the custom_data columns.
for col in scaled_train.columns:
    print(col)

land
logged_in
is_host_login
is_guest_login
outcome
level
duration
src_bytes
dst_bytes
wrong_fragment
urgent
hot
num_failed_logins
num_compromised
root_shell
su_attempted
num_root
num_file_creations
num_shells
num_access_files
num_outbound_cmds
count
srv_count
serror_rate
srv_serror_rate
rerror_rate
srv_rerror_rate
same_srv_rate
diff_srv_rate
srv_diff_host_rate
dst_host_count
dst_host_srv_count
dst_host_same_srv_rate
dst_host_diff_srv_rate
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
dst_host_serror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
dst_host_srv_rerror_rate
protocol_type_icmp
protocol_type_tcp
protocol_type_udp
service_IRC
service_X11
service_Z39_50
service_aol
service_auth
service_bgp
service_courier
service_csnet_ns
service_ctf
service_daytime
service_discard
service_domain
service_domain_u
service_echo
service_eco_i
service_ecr_i
service_efs
service_exec
service_finger
service_ftp
service_ftp_data
service_gopher
service_harvest
service_hostnames
service_h

In [104]:
# One-hot column names for the three nominal fields, listed in the order in
# which they must be appended to the input row.
column_names = [
    'protocol_type_icmp', 'protocol_type_tcp', 'protocol_type_udp',
    'service_IRC', 'service_X11', 'service_Z39_50', 'service_aol', 'service_auth', 'service_bgp',
    'service_courier', 'service_csnet_ns', 'service_ctf', 'service_daytime', 'service_discard',
    'service_domain', 'service_domain_u', 'service_echo', 'service_eco_i', 'service_ecr_i', 'service_efs',
    'service_exec', 'service_finger', 'service_ftp', 'service_ftp_data', 'service_gopher', 'service_harvest',
    'service_hostnames', 'service_http', 'service_http_2784', 'service_http_443', 'service_http_8001',
    'service_imap4', 'service_iso_tsap', 'service_klogin', 'service_kshell', 'service_ldap', 'service_link',
    'service_login', 'service_mtp', 'service_name', 'service_netbios_dgm', 'service_netbios_ns',
    'service_netbios_ssn', 'service_netstat', 'service_nnsp', 'service_nntp', 'service_ntp_u',
    'service_other', 'service_pm_dump', 'service_pop_2', 'service_pop_3', 'service_printer', 'service_private',
    'service_red_i', 'service_remote_job', 'service_rje', 'service_shell', 'service_smtp', 'service_sql_net',
    'service_ssh', 'service_sunrpc', 'service_supdup', 'service_systat', 'service_telnet', 'service_tftp_u',
    'service_tim_i', 'service_time', 'service_urh_i', 'service_urp_i', 'service_uucp', 'service_uucp_path',
    'service_vmnet', 'service_whois',
    'flag_OTH', 'flag_REJ', 'flag_RSTO', 'flag_RSTOS0', 'flag_RSTR', 'flag_S0', 'flag_S1', 'flag_S2', 'flag_S3',
    'flag_SF', 'flag_SH'
]

# Single all-zero row, built in one step. (DataFrame.append, used here before,
# was removed in pandas 2.0, so the frame is constructed directly instead.)
custom_data_df = pd.DataFrame(np.zeros((1, len(column_names))), columns=column_names)

# Switch on the one-hot column matching each nominal field of input_data.
# The name is validated first: assigning to a column that does not exist would
# silently ADD a column and change the feature count expected downstream.
for field in ('protocol_type', 'service', 'flag'):
    one_hot_col = field + '_' + input_data[field]
    if one_hot_col not in custom_data_df.columns:
        raise ValueError(
            "Unknown {} value {!r}: no one-hot column {!r}".format(
                field, input_data[field], one_hot_col
            )
        )
    custom_data_df[one_hot_col] = 1

In [105]:
# Show the one-hot row built for the nominal fields of input_data.
custom_data_df

Unnamed: 0,protocol_type_icmp,protocol_type_tcp,protocol_type_udp,service_IRC,service_X11,service_Z39_50,service_aol,service_auth,service_bgp,service_courier,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0.0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0


In [106]:
# Append the one-hot columns to the right of the numeric / binary fields.
# One-hot columns left over from an earlier run of this cell are dropped first,
# so re-running the cell does not duplicate them (the assignment reads and
# rebinds the same name, which previously doubled the columns on each rerun).
custom_data = pd.concat(
    [custom_data.drop(columns=custom_data_df.columns, errors="ignore"), custom_data_df],
    axis=1,
)
for col in custom_data.columns:
    print(col)

land
logged_in
is_host_login
is_guest_login
duration
src_bytes
dst_bytes
wrong_fragment
urgent
hot
num_failed_logins
num_compromised
root_shell
su_attempted
num_root
num_file_creations
num_shells
num_access_files
num_outbound_cmds
count
srv_count
serror_rate
srv_serror_rate
rerror_rate
srv_rerror_rate
same_srv_rate
diff_srv_rate
srv_diff_host_rate
dst_host_count
dst_host_srv_count
dst_host_same_srv_rate
dst_host_diff_srv_rate
dst_host_same_src_port_rate
dst_host_srv_diff_host_rate
dst_host_serror_rate
dst_host_srv_serror_rate
dst_host_rerror_rate
dst_host_srv_rerror_rate
protocol_type_icmp
protocol_type_tcp
protocol_type_udp
service_IRC
service_X11
service_Z39_50
service_aol
service_auth
service_bgp
service_courier
service_csnet_ns
service_ctf
service_daytime
service_discard
service_domain
service_domain_u
service_echo
service_eco_i
service_ecr_i
service_efs
service_exec
service_finger
service_ftp
service_ftp_data
service_gopher
service_harvest
service_hostnames
service_http
service_ht

In [107]:
# Show the assembled single-row input (raw fields followed by one-hot columns).
custom_data

Unnamed: 0,land,logged_in,is_host_login,is_guest_login,duration,src_bytes,dst_bytes,wrong_fragment,urgent,hot,...,flag_REJ,flag_RSTO,flag_RSTOS0,flag_RSTR,flag_S0,flag_S1,flag_S2,flag_S3,flag_SF,flag_SH
0,0,1,0,0,10,1000,500,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0.0


In [110]:
# Select the columns by name in exactly the order of the training feature
# matrix (scaled_train without 'outcome' / 'level') instead of trusting that
# custom_data happens to be ordered identically; a missing column raises
# KeyError here. A plain float ndarray is passed because pca was fitted on an
# ndarray, not on a frame with feature names.
feature_columns = scaled_train.columns.drop(['outcome', 'level'])
# NOTE(review): the numeric fields in custom_data are still the raw values
# from input_data, whereas the training features were RobustScaler-scaled in
# preprocess(). The fitted scaler is not kept anywhere, so the same scaling
# cannot be applied here -- predictions for this row are unreliable until the
# scaler is persisted at training time and applied before this transform.
custom_data_reduced = pca.transform(custom_data[feature_columns].to_numpy(dtype=float))



In [120]:
import pickle

# Specify the path to the pickled model file
model_file_path = 'model.pkl'  # Replace with the actual file path

# Load the pickled model
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load a model.pkl that this notebook (or another trusted source) wrote.
with open(model_file_path, 'rb') as model_file:
    model = pickle.load(model_file)

# Now 'model' contains the previously pickled model (this rebinds the same
# name that held the in-memory trained model).

In [121]:
# The network was trained on inputs shaped (samples, components, 1), so add
# the trailing axis explicitly here as well, instead of passing the 2-D array
# and depending on implicit input-rank handling inside predict().
predicted_probabilities = model.predict(
    custom_data_reduced.reshape(
        (custom_data_reduced.shape[0], custom_data_reduced.shape[1], 1)
    )
)



In [122]:
# Show the per-class probabilities returned for the single input row.
predicted_probabilities

array([[4.6816358e-04, 6.8556212e-02, 3.3962981e-05, 3.0318476e-02,
        3.5976004e-03, 1.2593243e-04, 4.1653213e-01, 3.1434763e-02,
        9.3788065e-02, 3.1283870e-01, 1.1208211e-03, 1.5852530e-02,
        1.8374657e-03, 7.5940569e-03, 1.7142333e-03, 3.8045624e-04,
        1.1319865e-03, 1.4457171e-03, 1.6973961e-04, 5.9851287e-03,
        1.0685424e-03, 3.2326831e-03, 7.7280408e-04]], dtype=float32)

In [127]:
# Get the predicted class label (index with highest probability)
predicted_class_index = np.argmax(predicted_probabilities)

# Inverse mapping to get the class label string
predicted_class = list(outcome_mapping.keys())[predicted_class_index]

In [128]:
# Show the decoded class label for the custom input row.
predicted_class

'nmap'