In [None]:
import numpy as np  # for array
import pandas as pd  # for csv files and dataframe
import matplotlib.pyplot as plt  # for plotting
import seaborn as sns  # plotting
from scipy import stats

import pickle  # To load data int disk
from prettytable import PrettyTable  # To print in tabular format

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer
from sklearn.metrics import auc, f1_score, roc_curve
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_predict

In [None]:
train = pd.read_csv('https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_training-set.csv')
test = pd.read_csv('https://raw.githubusercontent.com/Nir-J/ML-Projects/master/UNSW-Network_Packet_Classification/UNSW_NB15_testing-set.csv')
df = pd.concat([train, test]).drop(['id'],axis=1)

In [None]:
print(df.columns)

Index(['dur', 'proto', 'service', 'state', 'spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label'],
      dtype='object')


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.model_selection import train_test_split


# Perform necessary preprocessing
# Drop unnecessary columns
df.drop(['dur', 'proto', 'service', 'state'], axis=1, inplace=True)
# Replace missing values with 0
df.fillna(0, inplace=True)

# Convert categorical features to numeric using LabelEncoder
le = LabelEncoder()
categorical_features = ['spkts', 'dpkts', 'sbytes',
       'dbytes', 'rate', 'sttl', 'dttl', 'sload', 'dload', 'sloss', 'dloss',
       'sinpkt', 'dinpkt', 'sjit', 'djit', 'swin', 'stcpb', 'dtcpb', 'dwin',
       'tcprtt', 'synack', 'ackdat', 'smean', 'dmean', 'trans_depth',
       'response_body_len', 'ct_srv_src', 'ct_state_ttl', 'ct_dst_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm',
       'is_ftp_login', 'ct_ftp_cmd', 'ct_flw_http_mthd', 'ct_src_ltm',
       'ct_srv_dst', 'is_sm_ips_ports', 'attack_cat', 'label']
# Apply RobustScaler to normalize numeric features
numeric_features = df.select_dtypes(include=['float64']).columns
scaler = RobustScaler()
df[numeric_features] = scaler.fit_transform(df[numeric_features])

# Import label encoder
from sklearn import preprocessing
  
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
for col in categorical_features:
  df[col]=df[col].astype(str)
  df[col]=label_encoder.fit_transform(df[col])
for feature in categorical_features:
    df[feature] = le.fit_transform(df[feature])



# Further analysis with the selected features
# X_train and X_test contain the selected features based on ANOVA and Information Gain
# y_train and y_test are the corresponding target labels


In [None]:
import numpy as np
def is_float(element: any) -> bool:
    #If you expect None to be passed:
    if element is None: 
        return False
    try:
        float(element)
        return True
    except ValueError:
        return False

df=df.replace('?',np.nan)
df=df.replace('-',np.nan)

for col in df.columns:
  df=df[df[col].apply(lambda x: is_float(str(x)))]
  if df[col].dtype == 'object':
    df[col]=df[col].astype(float)
  df[col]=df[col].fillna(df[col].mean())

In [None]:
# Perform feature selection using ANOVA and Information Gain
from imblearn.under_sampling import EditedNearestNeighbours
from imblearn.over_sampling import RandomOverSampler
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from scipy.stats import pearsonr
import numpy as np


X = df.drop('label', axis=1)
y = df['label']
 # Number of top features to select
enn = EditedNearestNeighbours(sampling_strategy='auto')
X_enn, y_enn = enn.fit_resample(X, y)
k = int(len(X_enn.columns) * 0.33) 

# Correlation Coefficient
corr_top_k_features = []
for feature in X.columns:
    corr, _ = pearsonr(X[feature], y)
    if abs(corr) >= 0.1:  # Set correlation coefficient threshold
        corr_top_k_features.append(feature)


# ANOVA
f_selector = SelectKBest(f_classif, k=k)
f_selector.fit(X, y)
f_feature_scores = f_selector.scores_
f_top_k_idx = f_feature_scores.argsort()[-k:][::-1]
f_top_k_features = X.columns[f_top_k_idx].tolist()

# Information Gain
mi_selector = SelectKBest(mutual_info_classif, k=k)
mi_selector.fit(X, y)
mi_feature_scores = mi_selector.scores_
mi_top_k_idx = mi_feature_scores.argsort()[-k:][::-1]
mi_top_k_features = X.columns[mi_top_k_idx].tolist()


# Select common top features from ANOVA, Information Gain, and Correlation Coefficient
top_k_features = list(set(f_top_k_features).intersection(mi_top_k_features).intersection(corr_top_k_features))



In [None]:
# Create and fit the Isolation Forest model
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
X=df[top_k_features]
clf = IsolationForest(n_estimators=100, random_state=42, contamination='auto')
clf.fit(X)

# Predict the anomaly scores for each data point
scores = clf.decision_function(X)

# Find the indices of the outliers
outlier_indices = np.where(clf.predict(X) == -1)[0]

# Remove the outliers from the dataset
data_clean = df.drop(df.index[outlier_indices])

# Split the cleaned data into features (X_clean) and labels (y_clean)
X_clean = data_clean.drop(['label'], axis=1)
y_clean = data_clean['label']


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# Create and train the Random Forest classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1-score:', f1)


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Conv1D, LSTM, Dense, Flatten
from keras.utils import to_categorical


# Reshape X_clean to 3D array (samples, timesteps, features)
X_clean = np.array(X_clean)
X_clean = X_clean.reshape(X_clean.shape[0], X_clean.shape[1], 1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_clean, y_clean, test_size=0.2, random_state=42)

# Create the CNN LSTM model
model = Sequential()
model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=(X_clean.shape[1], 1)))
model.add(LSTM(units=64, activation='relu'))
model.add(Dense(units=1, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Fit the model to the training data
model.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model on the test data
scores = model.evaluate(X_test, y_test)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test loss: nan
Test accuracy: 0.14643052220344543


In [None]:
X_train_ = X_train.reshape((X_train.shape[0],1,X_train.shape[1],1))
X_test_ = X_test.reshape((X_test.shape[0],1,X_test.shape[1],1))

In [None]:
import numpy as np
import pandas as pd
import random
import cv2
import os
from imutils import paths
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import accuracy_score,precision_score, recall_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Activation
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import backend as K
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow.keras as keras
from sklearn.preprocessing import MinMaxScaler as SC


input_ = keras.layers.Input(shape=(None,39,1))
cnn1 = keras.layers.TimeDistributed(keras.layers.Conv1D(filters = 128,kernel_size =3,activation = 'relu'),input_shape=(None,39,1))(input_)
Norm1 = keras.layers.TimeDistributed(keras.layers.BatchNormalization())(cnn1)
Pool1 = keras.layers.TimeDistributed(keras.layers.MaxPool1D(pool_size=2, strides=2))(Norm1)
cnn2 = keras.layers.TimeDistributed(keras.layers.Conv1D(filters=64, kernel_size=3,activation='relu'))(Pool1)
Norm2 = keras.layers.TimeDistributed(keras.layers.BatchNormalization())(cnn2)
Pool2 = keras.layers.TimeDistributed(keras.layers.MaxPool1D(pool_size=2, strides=2))(Norm2)
Flat = keras.layers.TimeDistributed(keras.layers.Flatten())(Pool2)
lstm1 = keras.layers.LSTM(50,activation ='tanh',return_sequences =True)(Flat)
lstm2 = keras.layers.LSTM(1,activation='sigmoid')(lstm1)
model = keras.Model(inputs = [input_],outputs=[lstm2])
model.compile(optimizer = 'adam',loss = 'binary_crossentropy',metrics=['accuracy'])


In [None]:
model.fit(X_train_,y_train,epochs=20)
model.summary()
y_pred=model.predict(X_test_)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None, 39, 1)]     0         
                                                                 
 time_distributed (TimeDistr  (None, None, 37, 128)    512       
 ibuted)                                                         
                                                                 
 time_distributed_1 (TimeDis  (None, None, 37, 128)    512       
 tributed)                                                       
                                                                 
 time_distributed_2 (TimeDis  (None, None, 18, 128)    0         
 tributed)                  

In [1]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print("Accuracy: {:.2f}%".format(accuracy * 100))
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1-score: {:.2f}%".format(f1 * 100))



NameError: ignored

In [None]:
# X_ = tf.constant(X).reshape(-1,1)

# # Create labels (using tensors)
# y_ = tf.constant(y).reshape(-1,1)
print(X)
model.fit(tf.expand_dims(X,axis=-1),y,epochs=100)
# model.fit(X_train,y_train)