# Data Pre-Processing:

In [None]:
# Importing the formatted dataset from a google drive.
!pip install --upgrade --no-cache-dir gdown
!gdown https://drive.google.com/uc?id=1cvwrlIvVDWuJikwQPfSNBN_7x3Jpcpks 

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gdown
  Downloading gdown-4.6.0-py3-none-any.whl (14 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.4.0
    Uninstalling gdown-4.4.0:
      Successfully uninstalled gdown-4.4.0
Successfully installed gdown-4.6.0
Downloading...
From: https://drive.google.com/uc?id=1cvwrlIvVDWuJikwQPfSNBN_7x3Jpcpks
To: /content/UNSW-NB15_data.csv
100% 256M/256M [00:01<00:00, 138MB/s]


In [None]:
# Converting the csv file to a dataframe.
import pandas as pd
df = pd.read_csv("UNSW-NB15_data.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
# Dropping the 'Unnamed: 0' (unessecary) and 'attack_cat' (this column maps 100% to the target column, so using it would not give us a generalizable model) columns:
df = df.drop(labels = ['Unnamed: 0', 'attack_cat'], axis = 1)
# Drop null values in the following columns (these colums have some null values):
df = df.dropna(subset = ['ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'])

# Numerical Features:

Creating the testing and training data with normalization:

In [None]:
# Creating the testing/training data by normalizing the features using z-score normalization.
from sklearn.model_selection import train_test_split
numerical_features = ['dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'sload', 'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz', 'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime', 'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat', 'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm', 'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm']
df_numerical = df[numerical_features]
df_numerical =(df_numerical - df_numerical.astype('float32').mean())/(df_numerical.astype('float32').std())
y = df.label
x = df_numerical
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

Creating a logistic regression model:

In [None]:
# Creating the model.
from sklearn.linear_model import LogisticRegression
log_reg_numerical = LogisticRegression(solver = 'lbfgs')
log_reg_numerical.fit(X_train, y_train)

LogisticRegression()

In [None]:
# Getting the accuracy of this model.
log_reg_numerical.score(X_test, y_test)

0.9707813306132416

In [None]:
# Confusion matrix.
from sklearn.metrics import confusion_matrix
y_pred = log_reg_numerical.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              246485                  10
Actual Negative                7409                   9


Creating a decision tree model:

In [None]:
# Creating the model.
from sklearn.tree import DecisionTreeClassifier
decision_tree_numerical = DecisionTreeClassifier()
decision_tree_numerical.fit(X_train, y_train)

DecisionTreeClassifier()

In [None]:
# Getting the accuracy of decision tree model.
decision_tree_numerical.score(X_test, y_test)

0.9952227731545844

In [None]:
# Confusion matrix.
y_pred = decision_tree_numerical.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              245977                 518
Actual Negative                 695                6723


Creating a random forest model of 70 trees:

In [None]:
# Creates random forest model.
from sklearn.ensemble import RandomForestClassifier
random_forest_numerical = RandomForestClassifier(n_estimators = 70)
random_forest_numerical.fit(X_train, y_train)

RandomForestClassifier(n_estimators=70)

In [None]:
# Gets accuracy of random forest model.
random_forest_numerical.score(X_test, y_test)

0.995526026631169

In [None]:
# Confusion matrix.
y_pred = random_forest_numerical.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              246272                 223
Actual Negative                 105                7313


# Categorical Features:

Creating the testing/training data with normalization nd binary encoding:

In [None]:
# Performing binary encoding on the categorical features.
!pip install category_encoders
import category_encoders as ce
categorical_features = ['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service', 'ct_ftp_cmd']
df_binary = df[categorical_features].join(df['label'])
binary_encoder = ce.BinaryEncoder()
df_binary = binary_encoder.fit_transform(df_binary)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting category_encoders
  Downloading category_encoders-2.5.1.post0-py2.py3-none-any.whl (72 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.4/72.4 KB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category_encoders
Successfully installed category_encoders-2.5.1.post0


In [None]:
# Normalize the features using z-score normalization.
df_categorical = df_binary.drop(labels = 'label', axis = 1)
df_categorical =(df_categorical - df_categorical.astype('float32').mean())/(df_categorical.astype('float32').std())
y = df_binary.label
x = df_categorical
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

Creating a logistic regression model:

In [None]:
# Creating the model.
log_reg_categorical = LogisticRegression(solver = 'lbfgs')
log_reg_categorical.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
# Getting the accuracy of this model.
log_reg_categorical.score(X_test, y_test)

0.992229621957127

In [None]:
# Confusion matrix.
y_pred = log_reg_categorical.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              245670                 825
Actual Negative                1148                6270


Creating a decison tree model:

In [None]:
# Creating the model.
decision_tree_categorical = DecisionTreeClassifier()
decision_tree_categorical.fit(X_train, y_train)

DecisionTreeClassifier()

In [None]:
# Getting the accuracy of decision tree model.
decision_tree_categorical.score(X_test, y_test)

0.997573972187324

In [None]:
# Confusion matrix.
y_pred = decision_tree_categorical.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              246181                 314
Actual Negative                 302                7116


Creating a random forest model of 70 trees:

In [None]:
# Creates random forest model.
from sklearn.ensemble import RandomForestClassifier
random_forest_categorical = RandomForestClassifier(n_estimators = 70)
random_forest_categorical.fit(X_train, y_train)

RandomForestClassifier(n_estimators=70)

In [None]:
# Gets accuracy of random forest model.
random_forest_categorical.score(X_test, y_test)

0.9979087325186186

In [None]:
# Confusion matrix.
y_pred = random_forest_categorical.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              246233                 262
Actual Negative                 269                7149


# Numerical and Categorical Features:

Creating the testing/training data with normalization:

In [None]:
# Joining the categorical features and numerical features together to create the final dataframe.
df_numerical = df[numerical_features]
df_final = df_numerical.join(df_binary)

In [None]:
# Creating the training/testing data by normalizing the features using z-score normalization.
df_both = df_final.drop(labels = 'label', axis = 1)
df_both =(df_both - df_both.astype('float32').mean())/(df_both.astype('float32').std())
y = df_final.label
x = df_both
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)

Creating the logistic regression model:

In [None]:
# Creating the model.
log_reg_final = LogisticRegression(solver = 'lbfgs')
log_reg_final.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [None]:
# Getting the accuracy of this model.
log_reg_final.score(X_test, y_test)

0.9961955472937581

In [None]:
# Confusion matrix.
y_pred = log_reg_final.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              245883                 612
Actual Negative                 354                7064


Creating the decision tree model:

In [None]:
# Creating the model.
decision_tree_final = DecisionTreeClassifier()
decision_tree_final.fit(X_train, y_train)

DecisionTreeClassifier()

In [None]:
# Getting the accuracy of decision tree model.
decision_tree_final.score(X_test, y_test)

0.9989208902261798

In [None]:
# Confusion matrix.
y_pred = decision_tree_final.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              246347                 148
Actual Negative                 126                7292


Creating the random forest model using 70 trees:

In [None]:
# Creates random forest model.
random_forest_final = RandomForestClassifier(n_estimators = 70)
random_forest_final.fit(X_train, y_train)

RandomForestClassifier(n_estimators=70)

In [None]:
# Gets accuracy of random forest model.
random_forest_final.score(X_test, y_test)

0.9990587327155365

In [None]:
# Confusion matrix.
y_pred = random_forest_final.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm, columns=['Predicted Positive', 'Predicted Negative'], index=['Actual Positive', 'Actual Negative'])
print(cm_df)

                 Predicted Positive  Predicted Negative
Actual Positive              246312                 183
Actual Negative                  56                7362
