In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import pickle 
from os import path

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn import metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

# **Importing Datasets**

In [2]:
# Load the two UNSW-NB15 partitions and stack them row-wise into one frame.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of `train` and `test` both start at 0 and are duplicated,
# which makes any later label-based alignment ambiguous.
train = pd.read_csv("UNSW_NB15_training-set.csv")
test = pd.read_csv("UNSW_NB15_testing-set.csv")
data = pd.concat([train, test], axis=0, ignore_index=True)

FileNotFoundError: [Errno 2] No such file or directory: 'UNSW_NB15_training-set.csv'

In [None]:
# Treat the '-' placeholder in `service` as a missing value.
# The result is assigned back instead of calling replace(..., inplace=True) on
# the selected column: the in-place form operates on an intermediate object,
# so it is not guaranteed to update `data` (pandas warns about this pattern
# and it has no effect under copy-on-write).
data['service'] = data['service'].replace('-', np.nan)

In [None]:
# Overlay the per-service record counts of both label values on a single axes:
# green bars for label == 1, yellow bars for label == 0.
f, axes = plt.subplots(figsize=(40, 20))
sns.histplot(data.loc[data['label'] == 1, 'service'], color='green', ax=axes)
sns.histplot(data.loc[data['label'] == 0, 'service'], color='yellow', ax=axes)
axes.tick_params('x', labelrotation=45, width=6, labelsize=20)

In [None]:
# Count the missing values in each column.
data.isnull().sum()

In [None]:
# Drop, in place, every row that contains at least one missing value.
data.dropna(inplace=True)

In [None]:
# Number of remaining records per attack category.
data['attack_cat'].value_counts()

In [None]:
# Number of remaining records per value of `state`.
data['state'].value_counts()

In [None]:
# Load the feature-description table (decoded as cp1252) and display it.
features = pd.read_csv("NUSW-NB15_features.csv",encoding='cp1252')
features

In [None]:
# Preview the first rows of the feature-description table.
features.head()

In [None]:
# Lower-case the declared feature types so that they can be compared against
# lower-case type names. Note that the column key 'Type ' is written with a
# trailing space.
features['Type '] = features['Type '].str.lower()

In [None]:
# Split the described feature names into one Series per declared type.
nominal_names = features.loc[features['Type '] == 'nominal', 'Name']
integer_names = features.loc[features['Type '] == 'integer', 'Name']
binary_names = features.loc[features['Type '] == 'binary', 'Name']
float_names = features.loc[features['Type '] == 'float', 'Name']

In [None]:
# Keep, for each declared type, only the names that are actual columns of
# `data`; names from the description table that do not match a column exactly
# are dropped by the intersection.
cols = data.columns
nominal_names = cols.intersection(nominal_names)
integer_names = cols.intersection(integer_names)
binary_names = cols.intersection(binary_names)
float_names = cols.intersection(float_names)

In [None]:
# Cast every integer-typed feature column to a numeric dtype.
# pd.to_numeric returns a new Series, so the result has to be stored back
# into `data`; calling it without assignment leaves the frame untouched.
for c in integer_names:
    data[c] = pd.to_numeric(data[c])

In [None]:
# Cast every binary-typed feature column to a numeric dtype.
# pd.to_numeric returns a new Series, so the result has to be stored back
# into `data`; calling it without assignment leaves the frame untouched.
for c in binary_names:
    data[c] = pd.to_numeric(data[c])

In [None]:
# Cast every float-typed feature column to a numeric dtype.
# pd.to_numeric returns a new Series, so the result has to be stored back
# into `data`; calling it without assignment leaves the frame untouched.
for c in float_names:
    data[c] = pd.to_numeric(data[c])

In [None]:
# Summarise column dtypes and non-null counts.
data.info()

In [None]:
# Pie chart of the binary label distribution.
# value_counts() orders its result by frequency, not by label value, so the
# wedge names are derived from the index of the counts (0 -> 'normal', any
# other value -> 'abnormal') rather than from a fixed list that would silently
# swap the two names whenever label 1 is the more frequent one.
label_counts = data.label.value_counts()
label_names = ['normal' if value == 0 else 'abnormal' for value in label_counts.index]
plt.figure(figsize=(8,8))
plt.pie(label_counts,labels=label_names,autopct='%0.2f%%')
plt.title("Pie chart distribution of normal and abnormal labels",fontsize=16)
plt.legend()
plt.show()

In [None]:
# Pie chart of the attack-category distribution.
# The wedge sizes come from value_counts(), which is ordered by frequency, so
# the wedge names must come from the same object's index. unique() returns the
# categories in order of first appearance and would attach the wrong name to
# most wedges.
attack_counts = data.attack_cat.value_counts()
plt.figure(figsize=(8,8))
plt.pie(attack_counts,labels=attack_counts.index,autopct='%0.2f%%')
plt.title('Pie chart distribution of multi-class labels')
plt.legend(loc='best')
plt.show()

In [None]:
# Columns to one-hot encode: every non-numeric column except the multi-class
# target `attack_cat`, which is handled separately.
# The target is removed by name; slicing off the first element only works
# while 'attack_cat' happens to sort first among the non-numeric column names.
num_col = data.select_dtypes(include='number').columns
cat_col = data.columns.difference(num_col)
cat_col = cat_col.drop('attack_cat', errors='ignore')
cat_col

In [None]:
# Work on an independent copy of the categorical columns and preview it.
data_cat = data[cat_col].copy()
data_cat.head()

In [None]:
# One-hot encode the categorical columns.
data_cat = pd.get_dummies(data_cat,columns=cat_col)

In [None]:
# Append the one-hot columns to the right of the existing columns of `data`.
data = pd.concat([data, data_cat],axis=1)

In [None]:
# Remove the original categorical columns now that their one-hot versions exist.
data.drop(columns=cat_col,inplace=True)

In [None]:
# Names of the numeric columns to rescale, leaving out 'id' and 'label'.
num_col = list(data.select_dtypes(include='number').columns)
num_col.remove('id')
num_col.remove('label')

In [None]:
minmax_scale = MinMaxScaler(feature_range=(0, 1))
def normalization(df,col):
  """Rescale the listed columns of ``df`` to the [0, 1] range, one at a time.

  Every column is fitted and transformed on its own with the shared
  ``minmax_scale`` object, so after the call the scaler holds the fit of the
  last column processed.

  Args:
    df: DataFrame whose listed columns are overwritten in place.
    col: iterable of column names to rescale.

  Returns:
    The same ``df`` object that was passed in.
  """
  for name in col:
    values = np.array(df[name])
    # The scaler works on 2-D input, so present the column as (n_rows, 1).
    column_2d = values.reshape(len(values), 1)
    df[name] = minmax_scale.fit_transform(column_2d)
  return df

In [None]:
# Preview the frame before rescaling.
data.head()

In [None]:
# Rescale the columns listed in `num_col` on a copy of the frame and rebind
# `data` to the rescaled copy.
data = normalization(data.copy(),num_col)

In [None]:
# Preview the frame after rescaling.
data.head()

In [None]:
# Text form of the binary label: 0 becomes 'normal', every other value 'abnormal'.
bin_label = pd.DataFrame(data.label.map(lambda x:'normal' if x==0 else 'abnormal'))

In [None]:
# Binary-classification copy of the data, with the text labels in 'label'.
bin_data = data.copy()
bin_data['label'] = bin_label

In [None]:
# Encode the text labels as integers with a fitted LabelEncoder and store the
# encoded values back into bin_data['label'].
le1 = preprocessing.LabelEncoder()
enc_label = bin_label.apply(le1.fit_transform)
bin_data['label'] = enc_label

In [None]:
# Show the classes learned by the binary label encoder.
le1.classes_

In [None]:
# Persist the binary encoder's classes to le1_classes.npy.
np.save("le1_classes.npy",le1.classes_,allow_pickle=True)

In [None]:
# Multi-class copy of the data, plus the attack category as a one-column frame.
multi_data = data.copy()
multi_label = pd.DataFrame(multi_data.attack_cat)

In [None]:
# One-hot encode the attack category inside multi_data.
multi_data = pd.get_dummies(multi_data,columns=['attack_cat'])

In [None]:
# Encode the attack categories as integers and store them in
# multi_data['label'], replacing the binary label there.
le2 = preprocessing.LabelEncoder()
enc_label = multi_label.apply(le2.fit_transform)
multi_data['label'] = enc_label

In [None]:
# Show the classes learned by the multi-class label encoder.
le2.classes_

In [None]:
# Persist the multi-class encoder's classes to le2_classes.npy.
np.save("le2_classes.npy",le2.classes_,allow_pickle=True)

In [None]:
# Include the label in the columns used for the correlation matrix.
# Guarded so that re-running this cell does not add 'label' a second time,
# which would produce duplicate columns when `num_col` is used for selection.
if 'label' not in num_col:
    num_col.append('label')

In [None]:
# Correlation Matrix for Binary Labels
# Pairwise correlation between the columns listed in `num_col` of bin_data,
# drawn as a heatmap without cell annotations.
plt.figure(figsize=(20,8))
corr_bin = bin_data[num_col].corr()
sns.heatmap(corr_bin,vmax=1.0,annot=False)
plt.title('Correlation Matrix for Binary Labels',fontsize=16)
plt.show()

In [None]:
# Rebuild `num_col` from the numeric columns of multi_data; this replaces the
# list used for the binary correlation matrix.
num_col = list(multi_data.select_dtypes(include='number').columns)

In [None]:
# Correlation Matrix for Multi-class Labels
# Pairwise correlation between the numeric columns of multi_data, drawn as a
# heatmap without cell annotations.
plt.figure(figsize=(20,8))
corr_multi = multi_data[num_col].corr()
sns.heatmap(corr_multi,vmax=1.0,annot=False)
plt.title('Correlation Matrix for Multi Labels',fontsize=16)
plt.show()

In [None]:
# Single-column DataFrame holding the attack category.
l4=data[['attack_cat']]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Integer-encode the attack category and write the codes back into `data`.
# LabelEncoder works on a 1-D sequence of labels, so the column is passed as a
# Series rather than as a single-column DataFrame, which it only accepts after
# flattening the input and emitting a conversion warning.
lb_encoder4 = LabelEncoder()
l4_ = lb_encoder4.fit_transform(data["attack_cat"])

data["attack_cat"] = l4_

In [None]:
def constant_feature_detect(data, threshold=0.98):
    """Return the columns of ``data`` that are dominated by a single value.

    A column is reported when its most frequent non-null value accounts for at
    least ``threshold`` of all rows. Null entries count towards the row total
    but are never treated as the dominant value.

    Args:
        data: DataFrame to scan; every column is checked.
        threshold: minimum share of rows (between 0 and 1) that the most
            frequent value must cover for the column to be reported.

    Returns:
        List of the reported column names, in column order. The number of
        reported columns is also printed.
    """
    quasi_constant_feature = []
    n_rows = len(data)
    for feature in data.columns:
        counts = data[feature].value_counts()
        # An empty frame or an all-null column has no dominant value; skip it
        # instead of indexing into an empty result or dividing by zero.
        if n_rows == 0 or counts.empty:
            continue
        predominant = counts.max() / n_rows
        if predominant >= threshold:
            quasi_constant_feature.append(feature)
    print(len(quasi_constant_feature), ' variables are found to be almost constant')
    return quasi_constant_feature

# Flag every column of `data` whose most frequent value covers at least 90% of
# the rows (a looser cut-off than the function's 0.98 default).
# NOTE(review): the scan covers all columns currently in `data`, which
# includes the target columns - confirm they can never be flagged.
quasi_constant_feature = constant_feature_detect(data=data, threshold=0.9)


In [None]:
# Remove the flagged quasi-constant columns from `data` in place.
data.drop(quasi_constant_feature,axis=1,inplace=True)

In [None]:
# Build the model inputs and the multi-class target by column name.
# Positional slicing (all but the last three columns as X, third-from-last as
# Y) depends on whichever column order survives the one-hot encoding and the
# quasi-constant drop: the one-hot columns are appended after 'attack_cat' and
# 'label', so the slice can pick an indicator column as the target and leave
# both targets inside X.
# 'id' is left out as well; it is excluded from the rescaled feature list
# earlier in the notebook.
# errors='ignore' tolerates 'id'/'label' having been removed already, while a
# missing 'attack_cat' still fails loudly on the lookup below.
X         = data.drop(columns=['id', 'label', 'attack_cat'], errors='ignore')
Y         = np.array(data['attack_cat'])
Y         = Y.reshape(len(Y),1)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import train_test_split

# Set aside 20% of the data for testing
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, shuffle=True, random_state=43)

# Further split the training data into 80% for training and 20% for validation
# (64% / 16% / 20% of all rows for train / validation / test overall).
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20, random_state=42)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.decomposition import PCA

In [None]:
# Standardise the features: the scaler is fitted on the training rows only and
# then applied to the test rows.
# NOTE(review): X_val is not transformed in this cell.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Derive extra features from a k-nearest-neighbour model: the distances from
# each row to its 5 nearest training rows.
n_components = 10
knn = KNeighborsClassifier(n_neighbors=5)
# The classifier expects a 1-D label vector; y_train is a column vector.
knn.fit(X_train_scaled, np.ravel(y_train))
# Training rows: call kneighbors() without a query set so that a point is not
# returned as its own nearest neighbour. Querying with X_train_scaled would
# make the first distance 0 for every training row, which is never the case
# for unseen rows, so the train and test features would not be comparable.
distances_train, indices_train = knn.kneighbors()
distances_test, indices_test = knn.kneighbors(X_test_scaled)

In [None]:
# Append the neighbour-distance columns to the scaled feature columns.
X_train_combined = np.hstack([X_train_scaled, distances_train])
X_test_combined = np.hstack([X_test_scaled, distances_test])

# Reshape data for CNN-LSTM input: add a trailing axis of length 1, giving
# (rows, columns, 1).
X_train_combined_reshaped = np.expand_dims(X_train_combined, axis=-1)
X_test_combined_reshaped = np.expand_dims(X_test_combined, axis=-1)


In [None]:
# Conv1D -> max-pooling -> two stacked LSTMs (each followed by dropout) ->
# 10-way softmax output. The input is one sequence per row, with a single
# channel and one step per column of the combined feature matrix.
model_combined = Sequential([
    Conv1D(filters=64, kernel_size=3, activation='relu',
           input_shape=(X_train_combined_reshaped.shape[1], 1)),
    MaxPooling1D(pool_size=3),
    # return_sequences=True keeps the full sequence for the second LSTM.
    LSTM(units=40, activation='relu', return_sequences=True),
    Dropout(0.1),
    LSTM(units=20, activation='relu'),
    Dropout(0.05),
    Dense(units=10, activation='softmax'),
])

In [None]:
# List the layer objects of the assembled model.
model_combined.layers

In [None]:
# NOTE(review): learning_rate=0.1 is far above the Adam default in Keras
# (0.001) - confirm this value is intentional.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)

# Integer class targets, so the sparse variant of categorical cross-entropy.
model_combined.compile(loss='sparse_categorical_crossentropy',
                      optimizer=optimizer,
                      metrics=['accuracy'])

# Stop after 3 epochs without val_loss improvement; keep the best model on disk.
callbacks_combined = [EarlyStopping(monitor='val_loss', patience=3),
                     ModelCheckpoint(filepath='best_model_combined', monitor='val_loss', save_best_only=True)]

# The validation rows must go through the same feature pipeline as the training
# rows (fitted scaler -> neighbour distances -> trailing channel axis).
# Passing the raw X_val gives the model inputs that are unscaled, lack the
# distance columns and do not have the expected (rows, columns, 1) shape.
X_val_scaled = scaler.transform(X_val)
distances_val, indices_val = knn.kneighbors(X_val_scaled)
X_val_combined = np.concatenate([X_val_scaled, distances_val], axis=1)
X_val_combined_reshaped = X_val_combined.reshape(X_val_combined.shape[0], X_val_combined.shape[1], 1)

history_combined = model_combined.fit(X_train_combined_reshaped, y_train, epochs=100,
                                      validation_data=(X_val_combined_reshaped, y_val),
                                      batch_size=500,
                                      callbacks=callbacks_combined)

In [None]:
import matplotlib.pyplot as plt

def plot_training_vs_validation(history, model):
    """Plot training and validation curves side by side.

    The left panel shows accuracy and the right panel shows loss, each with
    one line for the training values and one for the validation values.

    Args:
        history: object whose ``history`` mapping provides the 'accuracy',
            'val_accuracy', 'loss' and 'val_loss' series.
        model: string appended to both panel titles.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12, 5))

    # One entry per panel: axes, then (key, line style, legend label) for the
    # training and validation curves, then the title prefix and y-axis label.
    panels = (
        (ax[0],
         ('accuracy', 'b-', "Training Accuracy"),
         ('val_accuracy', 'r-', "Validation Accuracy"),
         'Training vs Validation Accuracy - ', 'Accuracy'),
        (ax[1],
         ('loss', 'g-', "Training Loss"),
         ('val_loss', 'c-', "Validation Loss"),
         'Training vs Validation Loss - ', 'Loss'),
    )

    for panel, train_curve, val_curve, title_prefix, y_label in panels:
        for key, line_style, legend_label in (train_curve, val_curve):
            panel.plot(history.history[key], line_style, label=legend_label)
        panel.set_title(title_prefix + model)
        panel.set_xlabel('Epochs')
        panel.set_ylabel(y_label)
        panel.grid(True)
        panel.legend()

    plt.tight_layout()
    plt.show()



In [None]:
# Plot the accuracy and loss curves of the run, titled with the given model name.
plot_training_vs_validation(history_combined,"ADAM-LSTM")