In [None]:
import numpy as np
import pandas as pd
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the connection records from the CSV file; the path is relative to the
# notebook's working directory, so the file must sit next to the notebook.
dataset = pd.read_csv('kddcup.data_10_percent_corrected.csv')

In [None]:
# Bare expression as the last line of the cell: Jupyter renders a preview of the frame.
dataset

## **Features:**
 
  

*   **duration:**	 
    length (number of seconds) of the connection				
*   **protocol_type:**	 	 
    type of the protocol, e.g. tcp, udp, etc.					
*   **service:**	 	 
    network service on the destination, e.g., http, telnet, etc.	
*   **flag:**	 	 
    normal or error status of the connection 	
*   **src_bytes:** 	 
    number of data bytes from source to destination			
*   **dst_bytes:**		 
    number of data bytes from destination to source
*   **land:**		 
    1 if connection is from/to the same host/port; 0 otherwise	
*   **wrong_fragment:**		 
    number of “wrong” fragments	
*   **urgent:**		 
    number of urgent packets
*   **hot:**		 
    number of “hot” indicators
*   **num_failed_logins:** 	 
    number of failed login attempts
*   **logged_in:**		 
    1 if successfully logged in; 0 otherwise
*   **num_compromised:**		 
    number of “compromised” conditions
*   **root_shell:**		 
    1 if root shell is obtained; 0 otherwise	
*   **su_attempted:**		 
    1 if “su root” command attempted; 0 otherwise
*   **num_root:**		 
    number of “root” accesses
*   **num_file_creations:**		 
    number of file creation operations	
*   **num_shells:**		 
    number of shell prompts
*   **num_access_files:**		 
    number of operations on access control files	
*   **num_outbound_cmds:**		 
    number of outbound commands in an ftp session	
*   **is_host_login:**		 
    1 if the login belongs to the “hot” list; 0 otherwise
*   **is_guest_login:**		 
    1 if the login is a “guest” login; 0 otherwise
*   **count:**		 
    number of connections to the same host as the current connection in the past two seconds
*   **srv_count:**		 
    Number of connection to the same service (port number)
*   **serror_rate:**		 
    Percentage of connections that have activated flag (#4) s0,s1,s2 or s3, among the connections aggregated in count (#23)
*   **srv_serror_rate:**		 
    Percentage of connections that have activated flag (#4) s0,s1,s2 or s3, among the connections aggregated in srv count (#24)
*   **rerror_rate:**		 
    Percentage of connections that have activated flag (#4) REJ, among the connections aggregated in count (#23)
*   **srv_rerror_rate:** 	 
    Percentage of connections that have activated flag (#4) REJ, among the connections aggregated in srv count (#24)
*   **same_srv_rate:**		 
    Percentage of connections that were to the same services, among the connections aggregated in count (#23)
*   **diff_srv_rate:**		 
    Percentage of connections that were to the different services, among the connections aggregated in count (#23)
*   **srv_count:**		 
    Number of connection to the same service (port number)
*   **srv_diff_host_rate:**		 
    Percentage of connections that were to different destination machines among the connections aggregated in srv count (#24)
*   **dst_host_count:**		 
    Number of connections having the same destination host IP address
*   **dst_host_srv_count:**		 
    Number of connections having same port number
*   **dst_host_same_srv_rate:**		 
    Percentage of connections that were to the same service among the connections aggregated in dst host count (#32)
*   **dst_host_diff_srv_rate:**		 
    Percentage of connections that were to different service among the connections aggregated in dst host count (#32)
*   **dst_host_same_src_port_rate:**		 
    Percentage of connections that were to the same source port among the connections aggregated in dst host srv count (#33)
*   **dst_host_srv_diff_host_rate:**		 
    Percentage of connections that were to the different destination machines among the connections aggregated in dst host srv count (#33)
*   **dst_host_serror_rate:**		 
    Percentage of connections that have activated flag (#4) s0,s1,s2 or s3, among the connections aggregated in dst host count (#32)
*   **dst_host_srv_serror_rate:**		 
    Percentage of connections that have activated flag (#4) s0,s1,s2 or s3, among the connections aggregated in dst host srv count (#33)
*   **dst_host_rerror_rate:**		 
    Percentage of connections that have activated flag (#4) REJ, among the connections aggregated in dst host count (#32)	
*   **dst_host_srv_rerror_rate:**		 
    Percentage of connections that have activated flag (#4) REJ, among the connections aggregated in dst host srv count (#33)
*   **label:**		 
    Attack class label

In [None]:
# Call the target column 'outcome' instead of 'label' from here on.
dataset = dataset.rename(columns={'label': 'outcome'})

## **Analyzing our database**

To analyze our database we need to get the percentage of every possible value in each column, to do that:



*   expand all categories values in each column
*   get number of categories values in each column in the dataset
*   get the percentage of every categories value in each column
*   run the function on each column 


In [None]:
import pandas as pd
import os
import numpy as np
from sklearn import metrics
from scipy.stats import zscore

def expand_categories(values):
    """Render the value distribution of a column as a compact string.

    Args:
        values: pandas Series whose distinct values should be summarised.

    Returns:
        A string of the form ``[value:pct%,value:pct%,...]`` in the order
        produced by ``value_counts()``. Each percentage is the value's count
        divided by ``len(values)``, rounded to two decimals.
    """
    counts = values.value_counts()
    total = float(len(values))
    parts = [
        "{}:{}%".format(category, round(100 * (counts[category] / total), 2))
        for category in counts.index
    ]
    return "[{}]".format(",".join(parts))
        
def analyze(df):
    """Print a per-column summary of a DataFrame.

    Output starts with a blank line and the row count. Then, for each column:

    * more than 100 distinct values: prints the number of distinct values and
      that number as a whole percentage of the row count;
    * otherwise: prints the share of every distinct value, formatted by
      ``expand_categories``.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None. The summary is written to stdout.
    """
    print()
    total = float(len(df))

    print("{} rows".format(int(total)))
    for col in df.columns.values:
        unique_count = len(df[col].unique())
        if unique_count > 100:
            # High-cardinality column: listing every value would be unreadable.
            print("** {}:{} ({}%)".format(
                col, unique_count, int((unique_count / total) * 100)))
        else:
            # The breakdown is computed once; the original repeated this call
            # and threw the second result away.
            print("** {}:{}".format(col, expand_categories(df[col])))

# Print the column-by-column summary for the loaded dataset.
analyze(dataset)

## **Preprocessing the dataset**

To Preprocess our database we need to :

*   Drop the features that are not required
*   Separate the features and labels
*   Encode the categorical labels into integers

    > To encode the categorical labels we use scikit-learn's `LabelEncoder`, which maps each distinct label (any hashable, comparable value, not only numbers) to an integer. Calling `fit_transform` fits the encoder and returns the encoded labels in one step.
*   One-hot encoding the labels

    > One hot encoding is a process of converting categorical data variables so they can be provided to machine learning algorithms to improve predictions. One hot encoding is a crucial part of feature engineering for machine learning.

*   Feature scaling the input data


In [None]:
# Remove the columns the model does not use, addressed by position.
# NOTE(review): this rebinds `dataset`, so running the cell twice drops a
# further set of columns; re-run the loading cell first.
unused_positions = [0, 1, 2, 3, 6, 11, 13, 14, 15, 20, 21]
dataset = dataset.drop(columns=dataset.columns[unused_positions])

# The last remaining column is the label; everything before it is a feature.
feature_frame = dataset.iloc[:, :-1]
label_column = dataset.iloc[:, -1]

# Labels: raw values -> integer codes -> dense one-hot rows.
label_encoder_y = LabelEncoder()
label_codes = label_encoder_y.fit_transform(label_column.values)
onehotencoder = OneHotEncoder(categories='auto')
y = onehotencoder.fit_transform(label_codes.reshape(-1, 1)).toarray()

# Features: standardise every column.
# NOTE(review): the scaler is fit on all rows before the train/test split, so
# test-set statistics influence the scaling; consider fitting on the training
# rows only.
scaler = StandardScaler()
X = scaler.fit_transform(feature_frame.values)

### Splitting the dataset into train and test sets
X_train : the subset of the features X used to fit the machine learning model.

X_test : the subset of the features X held out to evaluate the fitted model.

y_train : the labels that correspond to X_train, used to fit the model.

y_test : the labels that correspond to X_test, used to evaluate the fitted model.

In [None]:
# Hold out 30% of the samples for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

### Creating CNN Model 1

Creating a CNN model for a classification problem using Keras layers:


1.   Creates an instance of the Sequential class, which is a Keras model.

2.   Adds a **1D convolutional layer** to the model with **64 filters**, a kernel size of **3**, and a **ReLU** activation function. The input shape is specified as **(X_train.shape[1], 1)** which means that the input data is **1-dimensional** and the number of features is equal to **X_train.shape[1]**.

3.   Adds a max **pooling layer** to the model with a pool size of **2**.

4.   Adds another **1D convolutional layer** to the model with **32 filters**, a kernel size of **3**, and a **ReLU** activation function.

5.   Adds another max **pooling layer** to the model with a pool size of **2**.

6.   **Flattens** the output from the **convolutional layers** into a **1-dimensional** array.

7.   Adds a **fully connected layer** to the model with **128 neurons** and a **ReLU** activation function.

8.   Adds a **dropout layer** to the model with a dropout rate of **0.2**. Dropout is a regularization technique that randomly drops out **(sets to zero)** some of the neurons in the layer during training, **which helps prevent overfitting**.

9.   Adds the **output layer** to the model. The number of neurons in the output layer is **equal** to the number of classes in the problem, which is **y_train.shape[1]**. The activation function used is **softmax**, which produces a probability distribution over the classes.

10.   Compiles the model by specifying the optimizer **(adam)**, loss function **(categorical cross-entropy)**, and evaluation metric **(accuracy)**.

11.   Prints a summary of the model architecture, including the layers, number of parameters, and output shapes.

    > Creates a CNN model with two 1D convolutional layers, two max pooling layers, a fully connected layer, a dropout layer, and an output layer. The model is compiled with the Adam optimizer and categorical cross-entropy loss function, and is evaluated using the accuracy metric.

In [None]:
# Model 1: two Conv1D/MaxPooling1D stages followed by a dense classification head.
classifier = Sequential([
    # Each sample is presented as X_train.shape[1] steps with a single channel.
    Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=32, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Collapse the feature maps into one vector for the dense layers.
    Flatten(),
    Dense(units=128, activation='relu'),
    # Dropout regularises the dense layer to limit overfitting.
    Dropout(rate=0.2),
    # One softmax unit per one-hot label column.
    Dense(units=y_train.shape[1], activation='softmax'),
])

# Adam + categorical cross-entropy, tracking accuracy.
classifier.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Show the layer stack, output shapes and parameter counts.
classifier.summary()

### Creating CNN Model 2

**Model 2:** Change activation function to LeakyReLU, optimizer to RMSprop, loss function to mean squared error, and increase the dropout rate.

In [None]:
from keras.layers import LeakyReLU
from keras.optimizers import RMSprop

# Model 2: wider layers, LeakyReLU activations, heavier dropout, RMSprop + MSE loss.
classifier2 = Sequential([
    Conv1D(filters=128, kernel_size=3, activation=LeakyReLU(alpha=0.2), input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=64, kernel_size=3, activation=LeakyReLU(alpha=0.2)),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=256, activation=LeakyReLU(alpha=0.2)),
    Dropout(rate=0.4),
    # One softmax unit per one-hot label column.
    Dense(units=y_train.shape[1], activation='softmax'),
])

# NOTE(review): the loss here is mean squared error, so its values are on a
# different scale from Model 1's categorical cross-entropy.
classifier2.compile(optimizer=RMSprop(), loss='mean_squared_error', metrics=['accuracy'])
classifier2.summary()


# **Creating CNN Model 3**
**Model 3:** Change activation function to tanh, optimizer to SGD, loss function to binary cross-entropy, and decrease the dropout rate.

In [None]:
from keras.optimizers import SGD

# Model 3: narrower layers, tanh activations, lighter dropout, SGD + binary cross-entropy.
classifier3 = Sequential([
    Conv1D(filters=32, kernel_size=3, activation='tanh', input_shape=(X_train.shape[1], 1)),
    MaxPooling1D(pool_size=2),
    Conv1D(filters=16, kernel_size=3, activation='tanh'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(units=64, activation='tanh'),
    Dropout(rate=0.1),
    # One softmax unit per one-hot label column.
    Dense(units=y_train.shape[1], activation='softmax'),
])

# NOTE(review): binary cross-entropy scores each label column as an independent
# yes/no target, so this loss is not directly comparable with Model 1's
# categorical cross-entropy.
classifier3.compile(optimizer=SGD(), loss='binary_crossentropy', metrics=['accuracy'])
classifier3.summary()


In these examples, I changed the activation function to LeakyReLU and tanh, optimizer to RMSprop and SGD, loss function to mean squared error and binary cross-entropy, and adjusted the dropout rate. These changes can influence the model's learning behavior and potentially impact accuracy. Remember to select appropriate activation functions, optimizers, and loss functions based on your specific task and dataset characteristics.

Feel free to modify other parameters or explore different combinations to further investigate their effects on model performance.

## Training the model

Train a machine learning model using a deep learning neural network called "classifier" on some input data **X_train** with corresponding output data **y_train**. 

The purpose of the neural network is to predict the output values **(y)** for new input values **(X)** that it has not seen before.

The **"fit"** method of the **"classifier"** object is used to train the neural network.

The **"fit"** method takes in the input data and output data along with some other parameters:

 

*   **X_train.reshape(X_train.shape[0], X_train.shape[1], 1)** - This reshapes the input data to a 3D array. The first dimension is the number of training examples, the second dimension is the length of each input sequence, and the third dimension is the number of features (in this case, there is only one feature). This is necessary for the input shape of the neural network.

*   **y_train** - This is the output data that corresponds to the input data.

*   **epochs=10** - This specifies the number of times the entire dataset will be passed through the neural network during training.

*   **batch_size=128** - This specifies the number of samples that will be used in each training iteration. In this case, the training data will be divided into batches of 128 samples and the neural network will be trained on each batch.

*   **validation_data=(X_test.reshape(X_test.shape[0], X_test.shape[1], 1), y_test)** - This specifies the validation data, which is used to evaluate the performance of the neural network during training. The validation data is also reshaped to a 3D array.

>The output of the **"fit"** method is stored in the **"history"** variable. This object contains information about the performance of the neural network during training, such as the training and validation loss and accuracy for each epoch.









In [None]:
# Add a trailing axis of size 1 so the arrays match the (steps, 1) input shape of the first Conv1D layer.
train_inputs = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
val_inputs = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Train model 1; the held-out split is scored after every epoch.
history = classifier.fit(
    train_inputs,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(val_inputs, y_test),
)


In [None]:
# Add a trailing axis of size 1 so the arrays match the (steps, 1) input shape of the first Conv1D layer.
train_inputs = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
val_inputs = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Train model 2 with the same schedule as model 1.
history2 = classifier2.fit(
    train_inputs,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(val_inputs, y_test),
)

In [None]:
# Add a trailing axis of size 1 so the arrays match the (steps, 1) input shape of the first Conv1D layer.
train_inputs = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
val_inputs = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

# Train model 3 with the same schedule as models 1 and 2.
history3 = classifier3.fit(
    train_inputs,
    y_train,
    epochs=10,
    batch_size=128,
    validation_data=(val_inputs, y_test),
)

## Evaluating the model

The "evaluate" method takes the following parameters:

*  **X_test.reshape((X_test.shape[0], X_test.shape[1], 1))** - This reshapes the test input data to the same 3D array format used during the training. The first dimension is the number of test examples, the second dimension is the length of each input sequence, and the third dimension is the number of features (in this case, there is only one feature).

*  **y_test** - This is the corresponding output data for the test input data.

*  **verbose=0** - This specifies the level of verbosity of the evaluation process. A value of 0 means that no progress messages will be displayed during the evaluation.

>The output of the "evaluate" method is stored in the "score" variable. Because the model was compiled with `metrics=['accuracy']`, "score" is a list rather than a single number: `score[0]` is the test loss and `score[1]` is the test accuracy. Which metrics appear depends on the type of model and the problem being solved. For example, if the model is a classifier, the metric might be accuracy, precision, or recall. If the model is a regressor, the metric might be mean squared error or mean absolute error.



In [None]:
# Plot the accuracy and loss curves
import matplotlib.pyplot as plt

# Score model 1 on the held-out split; evaluate() returns [loss, accuracy].
test_inputs = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
score = classifier.evaluate(test_inputs, y_test, verbose=0)
test_loss, test_accuracy = score[0], score[1]
print('Test loss:', test_loss)
print('Test accuracy:', test_accuracy)

# One figure per metric: the training curve first, then the held-out curve.
for metric, title, ylabel in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    fig, ax = plt.subplots()
    ax.plot(history.history[metric])
    ax.plot(history.history['val_' + metric])
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel('Epoch')
    ax.legend(['Train', 'Test'], loc='upper left')
    plt.show()



### Evaluating the 3 models

In [None]:
# Plot the accuracy and loss curves
import matplotlib.pyplot as plt

# Score every model on the same held-out inputs; evaluate() returns [loss, accuracy].
test_inputs = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
score1 = classifier.evaluate(test_inputs, y_test, verbose=0)
score2 = classifier2.evaluate(test_inputs, y_test, verbose=0)
score3 = classifier3.evaluate(test_inputs, y_test, verbose=0)

for model_no, model_score in enumerate((score1, score2, score3), start=1):
    print('Model {} - Test loss:'.format(model_no), model_score[0])
    print('Model {} - Test accuracy:'.format(model_no), model_score[1])

# Curves are drawn as train 1..3, then test 1..3, so the legend uses that order.
histories = (history, history2, history3)
curve_labels = (
    ['Model {} Train'.format(i) for i in (1, 2, 3)]
    + ['Model {} Test'.format(i) for i in (1, 2, 3)]
)

for metric, title, ylabel, legend_loc in (
    ('accuracy', 'Model Accuracy', 'Accuracy', 'lower right'),
    ('loss', 'Model Loss', 'Loss', 'upper right'),
):
    for prefix in ('', 'val_'):
        for run in histories:
            plt.plot(run.history[prefix + metric])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(curve_labels, loc=legend_loc)
    plt.show()


In [None]:
# Write model 1 (`classifier`) to an .h5 file in the working directory.
# NOTE(review): the file name says "best", but model 1 is saved unconditionally;
# the scores of the three models are not compared here.
classifier.save('the_best_cnn_model.h5')

## TEST THE CNN1 Model

#### create the example

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
# A single hand-written connection record. The keys are listed in the column
# order that the positional drop in the preparation cell relies on. Entries
# marked "dropped" sit at one of the positions removed there
# (0, 1, 2, 3, 6, 11, 13, 14, 15, 20, 21), so their values never reach the model.
database2 = {
    'duration': [0],  # dropped before prediction
    'protocol_type': ['tcp'],  # dropped before prediction
    'service': ['http'],  # dropped before prediction
    'flag': ['SF'],  # dropped before prediction
    'src_bytes': [943],
    'dst_bytes': [0],
    'land': [0],  # dropped before prediction
    'wrong_fragment': [0],
    'urgent': [0],
    'hot': [0],
    'num_failed_logins': [0],
    'logged_in': [0],  # dropped before prediction
    'num_compromised': [0],
    'root_shell': [0],  # dropped before prediction
    'su_attempted': [0],  # dropped before prediction
    'num_root': [0],  # dropped before prediction
    'num_file_creations': [0],
    'num_shells': [0],
    'num_access_files': [0],
    'num_outbound_cmds': [0],
    'is_host_login': [0],  # dropped before prediction
    'is_guest_login': [0],  # dropped before prediction
    'count': [87],
    'srv_count': [87],
    'serror_rate': [1.0],
    'srv_serror_rate': [1.0],
    'rerror_rate': [0.0],
    'srv_rerror_rate': [0.0],
    'same_srv_rate': [1.0],
    'diff_srv_rate': [0.0],
    'srv_diff_host_rate': [0.0],
    'dst_host_count': [255],
    'dst_host_srv_count': [255],
    'dst_host_same_srv_rate': [1.0],
    'dst_host_diff_srv_rate': [0.0],
    'dst_host_same_src_port_rate': [0.0],
    'dst_host_srv_diff_host_rate': [0.0],
    'dst_host_serror_rate': [1.0],
    'dst_host_srv_serror_rate': [1.0],
    'dst_host_rerror_rate': [0.0],
    'dst_host_srv_rerror_rate': [0.0],
    'outcome': ['smurf.']  # label column (last position); presumably the expected class, TODO confirm
}

# One-row DataFrame built from the record above.
df = pd.DataFrame(database2)

### Prepare the example

In [None]:
# Rebuild the frame from the raw record so this cell can be re-run on its own.
df = pd.DataFrame(database2)

# Drop the same column positions that were removed from the training data.
df = df.drop(df.columns[[0, 1, 2, 3, 6, 11, 13, 14, 15, 20, 21]], axis=1)

# Separate the features and the label (last column).
X2 = df.iloc[:, :-1].values
y2 = df.iloc[:, -1].values

# Encode the example's own label. These two encoders only ever see this one
# example, so they know a single class; they say nothing about the classes the
# network was trained on.
label_encoder_y2 = LabelEncoder()
y2 = label_encoder_y2.fit_transform(y2)

onehotencoder = OneHotEncoder(categories='auto')
y2 = onehotencoder.fit_transform(y2.reshape(-1, 1)).toarray()

# Scale the features with the statistics learned from the training data.
# Fitting a fresh StandardScaler on a single row (as this cell used to do)
# subtracts the row from itself: every feature becomes 0 and the network is fed
# the same all-zero vector whatever the example contains. It also replaced the
# fitted training `scaler`.
X2 = scaler.transform(X2)

# Add the trailing channel axis the Conv1D input expects; sizes are taken from
# the data instead of hard-coding 30 features.
X2_reshaped = X2.reshape((X2.shape[0], X2.shape[1], 1))

### Testing the model 

In [None]:
# Make predictions on the modified example: one row of class probabilities per input row.
predictions = classifier.predict(X2_reshaped)

# Index of the most probable class for each row, kept as a column vector.
predictions_reshaped = predictions.argmax(axis=1).reshape(-1, 1)

# Map the class indices back to label strings with the encoder that was fitted
# on the training labels. The softmax columns follow that encoder's integer
# codes, so the arg-max index is the encoded label.
# The previous decoding went through encoders fitted on the single example
# label, which can only ever return that label, so the printed "prediction" was
# the expected answer regardless of what the network produced.
decoded_predictions = label_encoder_y.inverse_transform(predictions_reshaped.ravel())

# Print the predictions
print("Predictions:", decoded_predictions)
