<center><h1>Data Explotation </center>

In [60]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Set the warning mode to 'none' to suppress the SettingWithCopyWarning
pd.options.mode.chained_assignment = None

In [3]:
# Setting up the working directory
os.chdir('..')

In [33]:
df = pd.read_csv(r'data/iot_23.csv', low_memory=False)

In [34]:
df.head(5)

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,1536227000.0,CeqqKl3hyLQmO8LK98,192.168.100.111,17576.0,78.1.220.212,8081.0,tcp,-,3e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
1,1536227000.0,C2oHQWo1EFGH8D9x7,192.168.100.111,17576.0,152.84.7.111,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
2,1536227000.0,CJLVjs4BByG04mczXc,192.168.100.111,17576.0,173.36.41.67,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
3,1536227000.0,C0z4uS9AWHDH2s4S7,192.168.100.111,17576.0,87.13.21.104,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
4,1536227000.0,CxbNVk3liFNUIlqSPi,192.168.100.111,17576.0,99.110.163.140,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan


In [35]:
# Checking the count of the columns
print(df.columns)
print('\n')
print('Total number of columns', len(df.columns))

Index(['ts', 'uid', 'id.orig_h', 'id.orig_p', 'id.resp_h', 'id.resp_p',
       'proto', 'service', 'duration', 'orig_bytes', 'resp_bytes',
       'conn_state', 'local_orig', 'local_resp', 'missed_bytes', 'history',
       'orig_pkts', 'orig_ip_bytes', 'resp_pkts', 'resp_ip_bytes', 'label'],
      dtype='object')


Total number of columns 21


In [36]:
# Checking the data types of the dataframe
df.dtypes

ts               float64
uid               object
id.orig_h         object
id.orig_p        float64
id.resp_h         object
id.resp_p        float64
proto             object
service           object
duration          object
orig_bytes        object
resp_bytes        object
conn_state        object
local_orig        object
local_resp        object
missed_bytes     float64
history           object
orig_pkts        float64
orig_ip_bytes    float64
resp_pkts        float64
resp_ip_bytes    float64
label             object
dtype: object

## Dataset Description
1. ts --	The time when the capture was done, expressed in Unix Time
2. uid --	The ID of the capture
3. id_orig_h --	The IP address where the attack happened, either IPv4 or IPv6
4. id_orig_p --	The port used by the responder
5. id_resp_h --	The IP address of the device on which the capture happened
6. id_resp_p --	The port used for the response from the device where the capture happened
7. proto --	The network protocol used for the data package
8. service --	The application protocol
9. duration --	The amount of time data was traded between the device and the attacker
10. orig_bytes --	The amount of data sent to the device
11. resp_bytes --	The amount of data sent by the device
12. conn_state --	The state of the connection
13. local_orig --	Whether the connection originated locally
14. local_resp --	Whether the response originated locally
15. missed_bytes --	Number of missed bytes in a message
16. history --	The history of the state of the connection	
17. orig_pkts --	Number of packets being sent to the device	
18. orig_ip_bytes --	Number of bytes being sent to the device
19. resp_pkts --	Number of packets being sent from the device
20. resp_ip_bytes --	Number of bytes being sent from the device
21. label --	The type of capture, benign or malicious


In [37]:
# Check for different types of attacks and their frequencies in the data
df['label'].value_counts()

PartOfAHorizontalPortScan     3389036
Okiru                         1313012
Benign                         688812
DDoS                           638506
C&C                             15286
C&C-HeartBeat                    1332
Attack                            538
C&C-FileDownload                   46
C&C-Torii                          30
FileDownload                       13
C&C-HeartBeat-FileDownload          8
Okiru-Attack                        3
C&C-Mirai                           1
Name: label, dtype: int64

<center><h1>Data Cleansing </center>

In [38]:
# Check if any null data exists in the dataset
df.isna().sum()

ts               0
uid              0
id.orig_h        0
id.orig_p        0
id.resp_h        0
id.resp_p        0
proto            0
service          0
duration         0
orig_bytes       0
resp_bytes       0
conn_state       0
local_orig       0
local_resp       0
missed_bytes     0
history          0
orig_pkts        0
orig_ip_bytes    0
resp_pkts        0
resp_ip_bytes    0
label            0
dtype: int64

In [42]:
# Convert Unix timestamp to datetime
df['ts'] = pd.to_datetime(df['ts'], unit='s')

In [44]:
# Remove duplicate rows based on all columns
df.drop_duplicates(inplace=True)

## Outlier Treatment
1. Z-Score Calculation: For every data point, compute its Z-score, which represents how many standard deviations the data point is away from the mean of the dataset.

2. Threshold Determination: Set a threshold (commonly set to a value like 3) to determine how extreme a Z-score has to be for the corresponding data point to be considered an outlier.

3. Outlier Identification: Data points with Z-scores greater than the threshold are identified as outliers. Essentially, points that are too far from the mean, based on the set threshold, are flagged.

4. Outlier Removal: Using the identified outliers from the previous step, these data points are excluded or replaced to cleanse the dataset.

5. Clean Dataset: After removal, the result is a dataset with reduced extremities and variations caused by outliers.

In [58]:
# Define a threshold for considering values as outliers
threshold = 3

# Copy the original DataFrame to a new one to retain all columns
df_no_outliers = df.copy(deep = True)

# Iterate through each numerical column
for column in df.select_dtypes(include=np.number):
    # Calculate the z-scores for the column
    z_scores = np.abs(stats.zscore(df_no_outliers[column]))
    
    # Identify outliers
    outliers = np.where(z_scores > threshold)
    
    # Create a mask to filter out outliers
    mask = ~(z_scores > threshold)
    
    # Replace outlier values with NaN
    df_no_outliers[column][~mask] = np.nan

# Optionally, you can drop rows with any missing values after outlier treatment
df_no_outliers.dropna(inplace=True)

In [54]:
df_no_outliers

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,...,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes,label
0,2018-09-06 09:43:43.384673024,CeqqKl3hyLQmO8LK98,192.168.100.111,17576.0,78.1.220.212,8081.0,tcp,-,3e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
1,2018-09-06 09:43:43.384677888,C2oHQWo1EFGH8D9x7,192.168.100.111,17576.0,152.84.7.111,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
2,2018-09-06 09:43:43.384681984,CJLVjs4BByG04mczXc,192.168.100.111,17576.0,173.36.41.67,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
3,2018-09-06 09:43:43.384694016,C0z4uS9AWHDH2s4S7,192.168.100.111,17576.0,87.13.21.104,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
4,2018-09-06 09:43:43.384698112,CxbNVk3liFNUIlqSPi,192.168.100.111,17576.0,99.110.163.140,8081.0,tcp,-,2e-06,0,...,S0,-,-,0.0,S,2.0,80.0,0.0,0.0,PartOfAHorizontalPortScan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6046618,2019-07-03 14:39:12.145404928,CbiAr234EfpzQInxbg,0.0.0.0,68.0,255.255.255.255,67.0,udp,dhcp,90.034713,3300.0,...,S0,-,-,0.0,D,11.0,3608.0,0.0,0.0,Benign
6046619,2019-07-03 14:39:12.076456192,CqEhZf1lYmVokqECn3,::,143.0,ff02::16,0.0,icmp,-,90.39997,340.0,...,OTH,-,-,0.0,-,9.0,844.0,0.0,0.0,Benign
6046620,2019-07-03 14:39:12.588412928,Cr5vKO1CGfMQwFp22c,::,135.0,ff02::1:ffd5:cdf,136.0,icmp,-,89.82403000000001,72.0,...,OTH,-,-,0.0,-,3.0,216.0,0.0,0.0,Benign
6046621,2019-07-03 14:39:13.612461056,CQ5cJ21U9NUbUWIznd,fe80::5bcc:698e:39d5:cdf,143.0,ff02::16,0.0,icmp,-,45.215915,200.0,...,OTH,-,-,0.0,-,8.0,648.0,0.0,0.0,Benign


<center><h1>Exploratory Data Analysis </center>