<center><h1>Data Exploration</h1></center>

In [3]:
import pandas as pd
import numpy as np
import os
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
# Turn off pandas' chained-assignment check so SettingWithCopyWarning is not shown.
# NOTE(review): this applies to the whole session and can hide assignments that
# silently fail to modify the intended DataFrame.
pd.set_option('mode.chained_assignment', None)

In [4]:
# Setting up the working directory: switch to the parent of the current working
# directory so that relative paths such as 'data/iot_23.csv' resolve.
# The target is resolved only once per kernel session; re-running this cell
# therefore does not climb one directory higher on every execution.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('..')
os.chdir(PROJECT_ROOT)

In [None]:
# Read the dataset from disk. low_memory=False makes pandas parse the file in a
# single pass instead of in chunks, so each column gets one inferred dtype.
df = pd.read_csv(
    'data/iot_23.csv',
    low_memory=False,
)

In [None]:
# Preview the first five rows (five is the default row count of head())
df.head()

In [None]:
# Show the column labels, a blank separator, and then the number of columns
print(df.columns)
print('\n')
print('Total number of columns', df.shape[1])

In [None]:
# Checking the data types of the dataframe: shows the dtype pandas assigned to
# each column when the CSV was read
df.dtypes

## Dataset Description
1. ts --	The time when the capture was done, expressed in Unix Time
2. uid --	The ID of the capture
3. id_orig_h --	The IP address where the attack happened, either IPv4 or IPv6
4. id_orig_p --	The port used by the originator
5. id_resp_h --	The IP address of the device on which the capture happened
6. id_resp_p --	The port used for the response from the device where the capture happened
7. proto --	The network protocol used for the data package
8. service --	The application protocol
9. duration --	The amount of time data was traded between the device and the attacker
10. orig_bytes --	The amount of data sent to the device
11. resp_bytes --	The amount of data sent by the device
12. conn_state --	The state of the connection
13. local_orig --	Whether the connection originated locally
14. local_resp --	Whether the response originated locally
15. missed_bytes --	Number of missed bytes in a message
16. history --	The history of the state of the connection	
17. orig_pkts --	Number of packets being sent to the device	
18. orig_ip_bytes --	Number of bytes being sent to the device
19. resp_pkts --	Number of packets being sent from the device
20. resp_ip_bytes --	Number of bytes being sent from the device
21. label --	The type of capture, benign or malicious


In [None]:
# Check for different types of attacks and their frequencies in the data:
# counts the rows for each distinct 'label' value, most frequent first
df['label'].value_counts()

<center><h1>Data Cleansing</h1></center>

In [None]:
# Check if any null data exists in the dataset: number of missing (NaN/None)
# entries per column.
# NOTE(review): placeholder strings (e.g. '-') are not counted by isna() —
# confirm how the CSV encodes missing values.
df.isna().sum()

In [None]:
# Convert Unix timestamp to datetime; unit='s' interprets the values as seconds
# since the epoch. The 'ts' column of df is overwritten with the result.
df['ts'] = pd.to_datetime(df['ts'], unit='s')

In [None]:
# Remove rows that are exact duplicates across all columns (the first occurrence
# of each is kept). The result is rebound to `df` rather than using
# inplace=True, which pandas discourages and which offers no performance gain.
df = df.drop_duplicates()

## Outlier Treatment
1. Z-Score Calculation: For every data point, compute its Z-score, $z = (x - \mu) / \sigma$, which represents how many standard deviations ($\sigma$) the data point is away from the mean ($\mu$) of the dataset.

2. Threshold Determination: Set a threshold (commonly set to a value like 3) to determine how extreme a Z-score has to be for the corresponding data point to be considered an outlier.

3. Outlier Identification: Data points whose absolute Z-score is greater than the threshold are identified as outliers. Essentially, points that are too far from the mean in either direction, based on the set threshold, are flagged.

4. Outlier Removal: Using the identified outliers from the previous step, these data points are excluded or replaced to cleanse the dataset.

5. Clean Dataset: After removal, the result is a dataset with reduced extremities and variations caused by outliers.

In [None]:
# Z-score cut-off: a value whose absolute z-score exceeds this is an outlier
threshold = 3

# One flag per row of df; it stays True as long as no numeric column marks the
# row as an outlier
keep_rows = np.ones(len(df), dtype=bool)

# Iterate through each numerical column
for column in df.select_dtypes(include=np.number):
    # nan_policy='omit' leaves missing values out of the mean/std, so a single
    # NaN does not turn every z-score of the column into NaN (which would hide
    # all of that column's outliers)
    z_scores = np.abs(
        np.asarray(stats.zscore(df[column].to_numpy(), nan_policy='omit'), dtype=float)
    )

    # NaN z-scores compare as False, so missing values are not flagged here;
    # rows containing them are removed by dropna() below
    keep_rows &= ~(z_scores > threshold)

# Keep only the rows without any outlier, then drop rows that still contain
# missing values. Filtering whole rows replaces the former chained assignment
# `frame[column][mask] = np.nan`, which is not guaranteed to write into the
# frame, and it preserves the original integer dtypes.
df_no_outliers = df.loc[keep_rows].dropna()

In [None]:
# Display the frame that remains after outlier treatment
df_no_outliers

<center><h1>Exploratory Data Analysis</h1></center>

In [None]:
x 