In [194]:
#Read in KDD99 Data Set¶
import pandas as pd

df1 = pd.read_csv('kddcup1.csv')

print("Read {} rows.".format(len(df1)))
# df = df.sample(frac=0.1, replace=False) # Uncomment this line to sample only 10% of the dataset

df1.dropna(inplace=True,axis=1) # For now, just drop NA's (rows with missing values)

# The CSV file has no column heads, so add them
df1.columns = [
    'duration',
    'protocol_type',
    'service',
    'flag',
    'src_bytes',
    'dst_bytes',
    'land',
    'wrong_fragment',
    'urgent',
    'hot',
    'dst_host_srv_rerror_rate',
    'outcome'
]

# display 5 rows
df1[0:5]

Read 494020 rows.


Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,dst_host_srv_rerror_rate,outcome
0,0,tcp,http,SF,181,5450,0,0,0,0,0.0,normal
1,0,tcp,http,SF,239,486,0,0,0,0,0.0,normal
2,0,tcp,http,SF,235,1337,0,0,0,0,0.0,normal
3,0,tcp,http,SF,219,1337,0,0,0,0,0.0,normal
4,0,tcp,http,SF,217,2032,0,0,0,0,0.0,normal


In [195]:
ENCODING = 'utf-8'

def expand_categories(values):
    result = []
    s = values.value_counts()
    t = float(len(values))
    for v in s.index:
        result.append("{}:{}%".format(v,round(100*(s[v]/t),2)))
    return "[{}]".format(",".join(result))
        
def analyze(df):
    print()
    cols = df.columns.values
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in cols:
        uniques = df[col].unique()
        unique_count = len(uniques)
        if unique_count>100:
            print("** {}:{} ({}%)".format(col,unique_count,int(((unique_count)/total)*100)))
        else:
            print("** {}:{}".format(col,expand_categories(df[col])))
            expand_categories(df[col])

In [196]:
# Analyze KDD-99

import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore

analyze(df1)


494020 rows
** duration:2495 (0%)
** protocol_type:[icmp:57.41%,tcp:38.47%,udp:4.12%]
** service:[ecr_i:56.96%,private:22.45%,http:13.01%,smtp:1.97%,other:1.46%,domain_u:1.19%,ftp_data:0.96%,eco_i:0.33%,ftp:0.16%,finger:0.14%,urp_i:0.11%,telnet:0.1%,ntp_u:0.08%,auth:0.07%,pop_3:0.04%,time:0.03%,csnet_ns:0.03%,remote_job:0.02%,gopher:0.02%,imap4:0.02%,discard:0.02%,domain:0.02%,iso_tsap:0.02%,systat:0.02%,shell:0.02%,echo:0.02%,rje:0.02%,whois:0.02%,sql_net:0.02%,printer:0.02%,nntp:0.02%,courier:0.02%,sunrpc:0.02%,netbios_ssn:0.02%,mtp:0.02%,vmnet:0.02%,uucp_path:0.02%,uucp:0.02%,klogin:0.02%,bgp:0.02%,ssh:0.02%,supdup:0.02%,nnsp:0.02%,login:0.02%,hostnames:0.02%,efs:0.02%,daytime:0.02%,link:0.02%,netbios_ns:0.02%,pop_2:0.02%,ldap:0.02%,netbios_dgm:0.02%,exec:0.02%,http_443:0.02%,kshell:0.02%,name:0.02%,ctf:0.02%,netstat:0.02%,Z39_50:0.02%,IRC:0.01%,urh_i:0.0%,X11:0.0%,tim_i:0.0%,pm_dump:0.0%,tftp_u:0.0%,red_i:0.0%]
** flag:[SF:76.6%,S0:17.61%,REJ:5.44%,RSTR:0.18%,RSTO:0.12%,SH:0.02%,S

In [197]:
#df = df1.drop('protocol_type', axis=1)

# Encode a numeric column as zscores
def encode_numeric_zscore(df, name, mean=None, sd=None):
    if mean is None:
        mean = df[name].mean()

    if sd is None:
        sd = df[name].std()

    df[name] = (df[name] - mean) / sd

    
# Encode text values to dummy variables(i.e. [1,0,0],[0,1,0],[0,0,1] for red,green,blue)
def encode_text_dummy(df, name):
    dummies = pd.get_dummies(df[name])
    for x in dummies.columns:
        dummy_name = f"{name}-{x}"
        df[dummy_name] = dummies[x]
    df.drop(name, axis=1, inplace=True)

In [198]:
encode_numeric_zscore(df1, 'duration')
encode_numeric_zscore(df1, 'src_bytes')
encode_numeric_zscore(df1, 'dst_bytes')
encode_numeric_zscore(df1, 'wrong_fragment')
encode_numeric_zscore(df1, 'urgent')
encode_numeric_zscore(df1, 'hot')
encode_numeric_zscore(df1, 'dst_host_srv_rerror_rate')
encode_text_dummy(df1, 'protocol_type')
encode_text_dummy(df1, 'service')
encode_text_dummy(df1, 'flag')
encode_text_dummy(df1, 'land')


In [200]:
# Convert to numpy - Classification
x_columns1 = df1.columns.drop('outcome')
x1 = df1[x_columns1].values
dummies1 = pd.get_dummies(df1['outcome']) # Classification
outcomes1 = dummies1.columns
num_classes1 = len(outcomes1)
y1 = dummies1.values

In [201]:
df1.groupby('outcome')['outcome'].count()

outcome
back                 2203
buffer_overflow        30
ftp_write               8
guess_passwd           53
imap                   12
ipsweep              1247
land                   21
loadmodule              9
multihop                7
neptune            107201
nmap                  231
normal              97277
perl                    3
phf                     4
pod                   264
portsweep            1040
rootkit                10
satan                1589
smurf              280790
spy                     2
teardrop              979
warezclient          1020
warezmaster            20
Name: outcome, dtype: int64

# Defining a neural network model using Keras, 
# a high-level neural networks API that runs on top of TensorFlow or other backend engines. 
# The model is a feedforward neural network with multiple layers. 
# Here's a brief explanation of the model

In [202]:
from sklearn.model_selection import train_test_split

x1_train, x1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size=0.25, random_state=42)

from sklearn import metrics
from IPython.display import display, HTML 
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation

model = Sequential()
model.add(Dense(25, input_dim=x1.shape[1], activation='relu'))
model.add(Dense(3, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(x1.shape[1])) # Multiple output neurons
model.compile(loss='mean_squared_error', optimizer='adam')

################ SAVING THE LAST OUTPUT LAYER ##################

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense

# Assuming you have already defined your original model as 'model'
# and it looks like the one in your previous code

# Create a new model with just the last layer
last_layer = Dense(x1.shape[1], name='output_layer')(model.layers[-1].output)
output_model = Model(inputs=model.input, outputs=last_layer)

# Save the last layer model to a file
output_model.save('last_layer_model.h5')

1. `Sequential()`: This line initializes a sequential neural network model, 
    which means you can add layers sequentially one after another.

2. `model.add(Dense(25, input_dim=x1.shape[1], activation='relu'))`: 
    This line adds the first layer to the model. 
    It's a fully connected (Dense) layer with 25 units/neurons, and it takes an input of shape `x1.shape[1]`. 
    The activation function used here is ReLU (Rectified Linear Unit).

3. `model.add(Dense(3, activation='relu'))`: 
    This adds a second fully connected layer with 3 units and ReLU activation. 
    This layer reduces the output dimensions from the previous layer to 3 units.

4. `model.add(Dense(25, activation='relu'))`: 
    Another fully connected layer with 25 units and ReLU activation is added. 
    This increases the output dimensions back to 25 units.

5. `model.add(Dense(x1.shape[1]))`: 
    The final layer in your model is another fully connected layer 
    with the same number of units as the input dimensions (`x1.shape[1]`). 
    This layer typically serves as the output layer in autoencoders or similar models, 
    where we're trying to reconstruct the input.

It's important to note that the architecture we've provided does not have an explicit activation 
function for the output layer. But, we might want to add an appropriate activation function 
(e.g., sigmoid for binary classification, softmax for multi-class classification) to the output layer, 
especially if you're using this network for classification tasks.

Also, ensure that we compile the model with an appropriate loss function, optimizer, and evaluation metrics 
before training it on your data. 

# SAVE THE LAST LAYER

If we want to save the weights and architecture of just the last layer of your Keras model, 
we can create a new model that includes only that layer and then save it separately. 

In this code:

1. We create a new model (`output_model`) that takes the input from your original model 
and has only the last dense layer (`Dense(x1.shape[1])`). We name this last layer 'output_layer'.

2. The `Model` constructor is used to create this new model with the specified input and output layers.

3. Finally, we save the `output_model` to a file named 'last_layer_model.h5' using the `save` method. 
This file will contain the weights and architecture of just the last layer, 
allowing you to load and use it separately in the future.

After saving, we can load this saved last layer model using 
`tensorflow.keras.models.load_model('last_layer_model.h5')` 
if we need to use it in a different script or at a later time.

In [203]:

# Convert the output into a Pandas dataframe
pd1 = pd.DataFrame(x1, columns=['output_neuron_{}'.format(i+1) for i in range(x1.shape[1])])

# Save the output to a CSV file
pd1.to_csv('dense_output1.csv', index=False)

In [204]:
from tensorflow.keras.callbacks import EarlyStopping

model = Sequential()
model.add(Dense(10, input_dim=x1.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(50, input_dim=x1.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(10, input_dim=x1.shape[1], kernel_initializer='normal', activation='relu'))
model.add(Dense(1, kernel_initializer='normal'))
model.add(Dense(y1.shape[1],activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto')
model.fit(x1_train,y1_train,validation_data=(x1_test,y1_test),callbacks=[monitor],verbose=2,epochs=1000)

Epoch 1/1000
11579/11579 - 9s - loss: 0.2601 - val_loss: 0.0950 - 9s/epoch - 775us/step
Epoch 2/1000
11579/11579 - 8s - loss: 0.0824 - val_loss: 0.0739 - 8s/epoch - 728us/step
Epoch 3/1000
11579/11579 - 8s - loss: 0.0710 - val_loss: 0.0548 - 8s/epoch - 727us/step
Epoch 4/1000
11579/11579 - 8s - loss: 0.0564 - val_loss: 0.0557 - 8s/epoch - 730us/step
Epoch 5/1000
11579/11579 - 8s - loss: 0.0518 - val_loss: 0.0476 - 8s/epoch - 726us/step
Epoch 6/1000
11579/11579 - 8s - loss: 0.0658 - val_loss: 0.0466 - 8s/epoch - 730us/step
Epoch 7/1000
11579/11579 - 8s - loss: 0.0504 - val_loss: 0.0459 - 8s/epoch - 722us/step
Epoch 8/1000
11579/11579 - 8s - loss: 0.0452 - val_loss: 0.0432 - 8s/epoch - 730us/step
Epoch 9/1000
11579/11579 - 8s - loss: 0.0499 - val_loss: 0.0455 - 8s/epoch - 722us/step
Epoch 10/1000
11579/11579 - 8s - loss: 0.0435 - val_loss: 0.0456 - 8s/epoch - 728us/step
Epoch 11/1000
11579/11579 - 8s - loss: 0.0468 - val_loss: 0.0422 - 8s/epoch - 720us/step
Epoch 12/1000
11579/11579 - 8s

<keras.callbacks.History at 0x7f9e80060490>

In [205]:
import numpy as np

# Measure accuracy
pred = model.predict(x1_test)
pred = np.argmax(pred,axis=1)
y_eval = np.argmax(y1_test,axis=1)
score = metrics.accuracy_score(y_eval, pred)
print("Validation score: {}".format(score))

Validation score: 0.9905428930002834
