In [None]:
# Mount Google Drive so the dataset files under /content/drive are reachable.
# google.colab is only importable inside a Colab runtime.
from google.colab import drive
# The mount point must match the '/content/drive/...' paths used by the later cells.

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np  # Library for array manipulation
import pandas as pd  # Library for handling CSV files and dataframes
import matplotlib.pyplot as plt  # Library for plotting
import seaborn as sns  # Library for advanced plotting
from scipy import stats  # Library for statistical functions

import pickle  # Library for loading data from disk
from prettytable import PrettyTable  # Library for creating tabular output

import warnings  # Library for handling warnings
warnings.filterwarnings("ignore")  # Ignore warning messages

from sklearn.preprocessing import StandardScaler  # Library for data scaling
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # Libraries for encoding categorical variables

from sklearn.model_selection import train_test_split  # Library for splitting data into train and test sets
from sklearn.metrics import accuracy_score, confusion_matrix, make_scorer  # Libraries for model evaluation
from sklearn.metrics import auc, f1_score, roc_curve  # Libraries for specific evaluation metrics
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV  # Libraries for hyperparameter tuning
from sklearn.model_selection import cross_validate, cross_val_predict  # Libraries for cross-validation


# Reading Data

In [None]:
# Collects preprocessing metadata (column lists, ...) that is pickled to disk later on.
saved_dict = dict()

**Combining Multiple CSV Files:** We read the four CSV files that make up the dataset, each holding a different subset of the records, and combine them into a single DataFrame called all_data. The files have no header row, are assumed to be located in the directory specified by path, and are named UNSW-NB15_1.csv, UNSW-NB15_2.csv, and so on.

In [None]:
# The raw data comes as four header-less parts: UNSW-NB15_1.csv ... UNSW-NB15_4.csv.
path = '/content/drive/MyDrive/PPP/IDS_DATASET/UNSW-NB15_{}.csv'
dfs = [pd.read_csv(path.format(part), header=None) for part in range(1, 5)]
# ignore_index=True renumbers the rows 0..n-1 across the concatenated parts.
all_data = pd.concat(dfs, ignore_index=True)

In [None]:
# Load the feature-description table; its 'Name' column provides the headers
# for the header-less data files. The file is decoded as ISO-8859-1.
features_csv = '/content/drive/MyDrive/PPP/IDS_DATASET/NUSW-NB15_features.csv'
df_col = pd.read_csv(features_csv, encoding='ISO-8859-1')

In [None]:
# Inspect the header of the feature-description table
# (note the trailing space in 'Type ' in the output).
df_col.columns

Index(['No.', 'Name', 'Type ', 'Description'], dtype='object')

In [None]:
# Display the feature-description table: one row per dataset column,
# giving its position (No.), name, declared type and a short description.
df_col  # rendered as the cell output


Unnamed: 0,No.,Name,Type,Description
0,1,srcip,nominal,Source IP address
1,2,sport,integer,Source port number
2,3,dstip,nominal,Destination IP address
3,4,dsport,integer,Destination port number
4,5,proto,nominal,Transaction protocol
5,6,state,nominal,Indicates to the state and its dependent proto...
6,7,dur,Float,Record total duration
7,8,sbytes,Integer,Source to destination transaction bytes
8,9,dbytes,Integer,Destination to source transaction bytes
9,10,sttl,Integer,Source to destination time to live value



**Standardizing Column Names:** We modify the column names in the df_col DataFrame by making them lowercase and removing all spaces (leading, trailing, and embedded). This is achieved by applying a lambda function using the apply() method on the 'Name' column.

**Updating DataFrame Columns:** We rename the columns of the all_data DataFrame using the modified column names from df_col. By assigning the modified column names to the columns attribute of the all_data DataFrame, we ensure that the column names are consistent and follow a standardized format.

These operations are performed to improve the consistency and readability of the column names in the dataset, facilitating further analysis and processing of the data.

In [None]:
# Normalise the feature names: remove every space (not only leading/trailing) and lower-case.
df_col['Name'] = [raw_name.strip().replace(' ', '').lower() for raw_name in df_col['Name']]
# The data files were read without a header, so attach the cleaned names as column labels.
all_data.columns = df_col['Name']

**Saving Column Names:** We extract the column names from the df_col DataFrame, excluding the column with the name "label". The resulting list of column names is assigned to the saved_dict dictionary with the key "columns".


**Excluding "label" Column:** We filter out the column name "label" from the list using the condition df_col['Name'] != 'label' to ensure it is not included in the final list of column names.


This code allows us to store the list of column names, excluding the "label" column, in the saved_dict dictionary for future use in the code.

In [None]:
# Keep every feature name except 'label' for later reuse.
# Note: 'attack_cat' stays in this list; only 'label' is filtered out.
saved_dict['columns'] = [name for name in df_col['Name'] if name != 'label']

In [None]:
# df_col is not needed once the names are attached and saved; release it.
del df_col
# Show (rows, columns) of the combined dataset.
all_data.shape

(2540047, 49)

# Data cleaning and preprocessing

**Counting Missing Values:** We use the isnull() function to identify missing values in the all_data DataFrame. The resulting Boolean DataFrame indicates whether each element is null or not.


**Summing Missing Values:** We apply the sum() function to the Boolean DataFrame to calculate the number of missing values in each column. This provides a count of the missing values present in each column of the DataFrame.


By executing this code, we can identify and understand the distribution of missing values across different columns in the all_data DataFrame.

In [None]:
# Number of missing values in each column.
all_data.isnull().sum()

Name
srcip                     0
sport                     0
dstip                     0
dsport                    0
proto                     0
state                     0
dur                       0
sbytes                    0
dbytes                    0
sttl                      0
dttl                      0
sloss                     0
dloss                     0
service                   0
sload                     0
dload                     0
spkts                     0
dpkts                     0
swin                      0
dwin                      0
stcpb                     0
dtcpb                     0
smeansz                   0
dmeansz                   0
trans_depth               0
res_bdy_len               0
sjit                      0
djit                      0
stime                     0
ltime                     0
sintpkt                   0
dintpkt                   0
tcprtt                    0
synack                    0
ackdat                    0
is_sm_ips_ports

**Counting Unique Values:** We access the 'attack_cat' column of the DataFrame all_data and use the value_counts() method. This method counts the occurrences of each unique value in the column.


**Attack Category Distribution:** The resulting output provides a count of occurrences for each unique value in the 'attack_cat' column. This helps us understand the distribution of different attack categories present in the dataset.


By running this code, we can obtain insights into the frequency of different attack categories and their relative representation in the 'attack_cat' column of the all_data DataFrame.

In [None]:
# Frequency of each raw attack category. value_counts() skips NaN by default,
# so rows without a category do not appear in the output.
all_data['attack_cat'].value_counts()

Generic             215481
Exploits             44525
 Fuzzers             19195
DoS                  16353
 Reconnaissance      12228
 Fuzzers              5051
Analysis              2677
Backdoor              1795
Reconnaissance        1759
 Shellcode            1288
Backdoors              534
Shellcode              223
Worms                  174
Name: attack_cat, dtype: int64

Handling Missing Values: We fill the missing values in the 'attack_cat' column of the DataFrame all_data with the value 'normal'. This is done using the fillna() method.


Standardizing Values: We apply a lambda function using the apply() method to each element in the 'attack_cat' column. The lambda function removes leading/trailing spaces and converts the values to lowercase.


Updated 'attack_cat' Column: The modified values are assigned back to the 'attack_cat' column of all_data, overwriting the original values.


By executing this code, we handle missing values in the 'attack_cat' column by replacing them with the value 'normal'. We also ensure consistency by removing leading/trailing spaces and converting the values to lowercase. This preprocessing step prepares the 'attack_cat' column for further modeling tasks.

In [None]:
# Rows without an attack category are labelled 'normal'; every label is then
# trimmed and lower-cased so that variants such as ' Fuzzers ' collapse into one.
all_data['attack_cat'] = (
    all_data['attack_cat']
    .fillna('normal')
    .map(str.strip)
    .map(str.lower)
)

In [None]:
# Re-check the category counts after filling and normalising the labels.
all_data['attack_cat'].value_counts()

normal            2218764
generic            215481
exploits            44525
fuzzers             24246
dos                 16353
reconnaissance      13987
analysis             2677
backdoor             1795
shellcode            1511
backdoors             534
worms                 174
Name: attack_cat, dtype: int64

The output below counts how often each distinct value occurs in the 'ct_flw_http_mthd' column, which helps us understand how this HTTP-method feature is distributed in the dataset. Rows where the value is missing are not included in these counts.

In [None]:
# Distribution of ct_flw_http_mthd; NaN entries are omitted by value_counts().
all_data["ct_flw_http_mthd"].value_counts()

0.0     986791
1.0     188008
6.0       7902
4.0       6564
3.0        738
2.0        588
5.0        430
9.0        270
14.0       224
12.0       132
30.0        60
8.0         56
16.0        48
36.0        36
10.0        30
25.0        25
Name: ct_flw_http_mthd, dtype: int64

**Handling Missing Values:** We fill the missing values in the 'ct_flw_http_mthd' column of the DataFrame all_data with the value 0 using the fillna() method.

**Updated 'ct_flw_http_mthd' Column:** The modified values are assigned back to the 'ct_flw_http_mthd' column of all_data, replacing the missing values with 0.

By executing this code, we handle missing values in the 'ct_flw_http_mthd' column by replacing them with 0. This ensures that the column contains valid values for further modeling tasks.

In [None]:
# Missing ct_flw_http_mthd values are imputed with 0.
# A single fillna is sufficient: repeating it on an already filled column is a no-op.
all_data['ct_flw_http_mthd'] = all_data['ct_flw_http_mthd'].fillna(0)

The output below counts how often each distinct value occurs in the 'is_ftp_login' column, which helps us understand the distribution of values present in the column. Rows where the value is missing are not included in these counts.

In [None]:
# Distribution of is_ftp_login; NaN entries are omitted by value_counts().
all_data['is_ftp_login'].value_counts()

0.0    1066593
1.0      43389
4.0        156
2.0         30
Name: is_ftp_login, dtype: int64

We handle missing values in the 'is_ftp_login' column by replacing them with 0, and then convert the column's data type to integer. This ensures consistency and prepares the 'is_ftp_login' column for further modeling tasks.

In [None]:
# Missing is_ftp_login entries become 0; the column is then stored as int rather than float.
all_data['is_ftp_login'] = all_data['is_ftp_login'].fillna(0).astype(int)

In [None]:
# Total number of missing values left in the whole frame (should be 0 after the fills above).
all_data.isnull().sum().sum()

0

# Information about the dataset

In [None]:
# List all column labels now attached to the combined frame.
all_data.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'attack_cat',
       'label'],
      dtype='object', name='Name')

In [None]:
# How many columns there are of each dtype.
all_data.dtypes.value_counts()

int64      29
float64    11
object      9
dtype: int64

In [None]:
# Columns whose dtype is not numeric.
all_data.select_dtypes(exclude=np.number).columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'service',
       'ct_ftp_cmd', 'attack_cat'],
      dtype='object', name='Name')

In [None]:
# ct_ftp_cmd is reported as non-numeric above; list its distinct raw values to see why.
all_data['ct_ftp_cmd'].unique()

array([0, 1, 6, 2, 4, 8, 5, 3, '0', '1', ' ', '2', '4'], dtype=object)

We replace entries in the 'ct_ftp_cmd' column that consist of a single space character with the value "0" and then convert the column's data type to integer.

In [None]:
# The column mixes ints, digit strings and a single-space placeholder.
# Map the placeholder to "0", then cast everything to int.
all_data["ct_ftp_cmd"] = all_data["ct_ftp_cmd"].replace({" ": "0"}).astype(int)

In [None]:
# ct_ftp_cmd was already cleaned and cast to int in the previous cell; repeating the
# conversion would be a no-op, so verify the result instead of redoing the work.
assert pd.api.types.is_integer_dtype(all_data["ct_ftp_cmd"]), \
    "ct_ftp_cmd should be an integer column after cleaning"

In [None]:
# Distribution of ct_ftp_cmd after the integer conversion.
all_data.ct_ftp_cmd.value_counts()

0    2496377
1      40077
2       1264
4        960
3        729
6        332
5        290
8         18
Name: ct_ftp_cmd, dtype: int64

In [None]:
# Columns with a numeric dtype (ct_ftp_cmd is now among them).
all_data.select_dtypes(include=np.number).columns

Index(['dur', 'sbytes', 'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'sload',
       'dload', 'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm', 'label'],
      dtype='object', name='Name')

In [None]:
# Record which columns are treated as binary flags.
# NOTE: at this point is_ftp_login still holds the values 2 and 4; it is clamped to {0, 1} in a later cell.
saved_dict['binary_col'] = ['is_sm_ips_ports', 'is_ftp_login']

In [None]:
# Print the value distribution of each flag column, each followed by a blank line.
for flag_col in ('is_sm_ips_ports', 'is_ftp_login'):
    print(all_data[flag_col].value_counts(), end='\n\n')

0    2535852
1       4195
Name: is_sm_ips_ports, dtype: int64

0    2496472
1      43389
4        156
2         30
Name: is_ftp_login, dtype: int64



We modify the values in the 'is_ftp_login' column: any value greater than 1 is replaced with 1. This ensures that every value in the 'is_ftp_login' column is either 0 or 1.

In [None]:
# Any value above 1 is collapsed to 1 so that is_ftp_login only holds 0 or 1.
all_data['is_ftp_login'] = all_data['is_ftp_login'].clip(upper=1)
# Confirm that only the two flag values remain.
all_data['is_ftp_login'].value_counts()

0    2496472
1      43575
Name: is_ftp_login, dtype: int64

In [None]:
# Frequency of each value in the 'service' column.
all_data['service'].value_counts()

-           1246397
dns          781668
http         206273
ftp-data     125783
smtp          81645
ftp           49090
ssh           47160
pop3           1533
dhcp            172
ssl             142
snmp            113
radius           40
irc              31
Name: service, dtype: int64

We modify the values in the 'service' column: every occurrence of "-" is replaced with the string "None". This gives the placeholder for missing or undefined services an explicit, consistent label in the 'service' column.

In [None]:
# "-" is the placeholder found in the raw 'service' column; relabel it with the
# literal string "None" (a str, not Python's None). Only exact "-" values are affected.
all_data['service'] = all_data['service'].replace('-', 'None')

In [None]:
# Confirm that '-' has been replaced by 'None'.
all_data['service'].value_counts()

None        1246397
dns          781668
http         206273
ftp-data     125783
smtp          81645
ftp           49090
ssh           47160
pop3           1533
dhcp            172
ssl             142
snmp            113
radius           40
irc              31
Name: service, dtype: int64

In [None]:
# Number of distinct attack categories.
all_data['attack_cat'].nunique()

11

In [None]:
# Category counts before merging spelling variants (both 'backdoor' and 'backdoors' are present).
all_data['attack_cat'].value_counts()

normal            2218764
generic            215481
exploits            44525
fuzzers             24246
dos                 16353
reconnaissance      13987
analysis             2677
backdoor             1795
shellcode            1511
backdoors             534
worms                 174
Name: attack_cat, dtype: int64

We modify the values in the 'attack_cat' column: the string 'backdoors' is replaced with 'backdoor', ensuring consistency in the attack category labels. Additionally, the values in the 'attack_cat' column are converted to lowercase and stripped of any leading or trailing whitespace.

In [None]:
# Normalise first, then merge the plural spelling into 'backdoor'.
# - Stripping/lower-casing before the replacement keeps this cell correct even on
#   un-normalised labels (e.g. ' Backdoors'), which a case-sensitive match would miss.
# - A plain (non-regex) replace matches whole values only, which is the intent here;
#   a regex replace would also rewrite the text inside longer labels.
all_data['attack_cat'] = (
    all_data['attack_cat']
    .str.strip()
    .str.lower()
    .replace('backdoors', 'backdoor')
)

In [None]:
# Category counts after the 'backdoors' -> 'backdoor' merge.
all_data["attack_cat"].value_counts()

normal            2218764
generic            215481
exploits            44525
fuzzers             24246
dos                 16353
reconnaissance      13987
analysis             2677
backdoor             2329
shellcode            1511
worms                 174
Name: attack_cat, dtype: int64

We split the data in all_data into a training set (train) and a test set (test). The training set contains 40% of the data, while the test set contains 60% of the data.

In [None]:
# Shuffle-split the rows: 60% go to `test`, the remaining 40% to `train`.
# random_state pins the shuffle so the split is reproducible.
# NOTE(review): no `stratify=` is passed, so rare classes (e.g. 'worms') are not
# guaranteed to keep their proportion in each part.
train, test = train_test_split(all_data, test_size=0.6, random_state=16)

In [None]:
# Class distribution inside the training split.
train["attack_cat"].value_counts()

normal            887702
generic            86054
exploits           17943
fuzzers             9623
dos                 6579
reconnaissance      5490
analysis            1049
backdoor             901
shellcode            609
worms                 68
Name: attack_cat, dtype: int64

In [None]:
# (rows, columns) of the training split.
train.shape

(1016018, 49)

The "attack_cat" and "label" columns are removed from the test DataFrame. This operation is done to prepare the test data for prediction, as the target labels ("attack_cat" and "label") are not available during inference.

In [None]:
# Remove both target columns so the test split contains features only.
test = test.drop(columns=["attack_cat", "label"])

In [None]:
# Remaining columns of the test split after removing the targets.
test.columns

Index(['srcip', 'sport', 'dstip', 'dsport', 'proto', 'state', 'dur', 'sbytes',
       'dbytes', 'sttl', 'dttl', 'sloss', 'dloss', 'service', 'sload', 'dload',
       'spkts', 'dpkts', 'swin', 'dwin', 'stcpb', 'dtcpb', 'smeansz',
       'dmeansz', 'trans_depth', 'res_bdy_len', 'sjit', 'djit', 'stime',
       'ltime', 'sintpkt', 'dintpkt', 'tcprtt', 'synack', 'ackdat',
       'is_sm_ips_ports', 'ct_state_ttl', 'ct_flw_http_mthd', 'is_ftp_login',
       'ct_ftp_cmd', 'ct_srv_src', 'ct_srv_dst', 'ct_dst_ltm', 'ct_src_ltm',
       'ct_src_dport_ltm', 'ct_dst_sport_ltm', 'ct_dst_src_ltm'],
      dtype='object', name='Name')

In [None]:
# Export of the training split is currently disabled (line kept for reference):
#train.to_csv('/content/drive/MyDrive/PPP/IDS_DATASET/train_alldata_EDA.csv', index=False)
# Write the test split (target columns already removed) to Drive, without the index column.
test.to_csv('/content/drive/MyDrive/PPP/IDS_DATASET/test_alldata_EDA.csv', index=False)

In [None]:
# (rows, columns) of the training split, which still includes the target columns.
train.shape

(1016018, 49)

In [None]:
# (rows, columns) of the test split. Two target columns were dropped earlier, so 47
# columns are expected; NOTE(review): a stored output showing 49 would be stale - re-run to refresh.
test.shape

(1524029, 49)

In [None]:
# Persist the collected preprocessing metadata to Drive.
# The context manager guarantees the file is flushed and closed once the dump
# finishes, instead of leaving the handle open until garbage collection.
with open('/content/drive/MyDrive/PPP/IDS_DATASET/saved_dict', 'wb') as fh:
    pickle.dump(saved_dict, fh)

In [None]:
# Release the combined frame now that the train/test splits have been created.
del all_data