# MSADS 508 CyberSentinel Security Solutions
## Data Preparation, Balancing, and Test/Train/Validation Splits

In [1]:
import boto3
import sagemaker
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

# Open a SageMaker session and look up the values it exposes for later use:
# the session's default bucket, the execution role and the boto3 region name.
sess = sagemaker.Session()
bucket = sess.default_bucket()
role = sagemaker.get_execution_role()
region = boto3.Session().region_name



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
!aws s3 cp s3://msads-508-sp25-team6/MSADS\ 508\ Final\ Project.csv .

download: s3://msads-508-sp25-team6/MSADS 508 Final Project.csv to ./MSADS 508 Final Project.csv


In [3]:
# List the contents of the team bucket.
!aws s3 ls s3://msads-508-sp25-team6/

                           PRE Data/
2025-03-15 07:04:28 1138005184 MSADS 508 Final Project.csv


In [4]:
# Load the downloaded CSV into a DataFrame.
# NOTE(review): the two port columns come back as object dtype (see the dtypes
# printout below) because some entries are hexadecimal strings; they are
# converted to integers in a later cell.
df = pd.read_csv("MSADS 508 Final Project.csv")

  df = pd.read_csv("MSADS 508 Final Project.csv")


In [5]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packet Length Mean,Bwd Packet Length Mean,Label
0,192.168.4.118,4504,203.73.24.75,80,tcp,3974862.0,29,44,2.965517,1359.340909,ddos
1,192.168.4.118,4504,203.73.24.75,80,tcp,63.0,1,1,0.0,0.0,ddos
2,192.168.4.118,4505,203.73.24.75,80,tcp,476078.0,2,6,43.0,506.166667,ddos
3,192.168.4.118,4505,203.73.24.75,80,tcp,151.0,2,1,0.0,0.0,ddos
4,192.168.4.118,4506,203.73.24.75,80,tcp,472507.0,2,5,36.5,210.0,ddos


### Change and validate the data types

In [6]:
# Show the dtype pandas assigned to each column on load.
print(df.dtypes)

Source IP                  object
Source Port                object
Destination IP             object
Destination Port           object
Protocol                   object
Flow Duration             float64
Total Fwd Packets           int64
Total Backward Packets      int64
Fwd Packet Length Mean    float64
Bwd Packet Length Mean    float64
Label                      object
dtype: object


In [7]:
def convert_to_int(value):
    """Convert a port-like value to an integer, or return None if impossible.

    Strings carrying a hexadecimal prefix ("0x" or "0X", optionally padded
    with whitespace) are parsed as base 16; every other value is handed to
    ``int()``.

    Args:
        value: Raw cell value (str, int, float, None, ...).

    Returns:
        The parsed integer, or None when the value is missing or cannot be
        interpreted as an integer (None, NaN, infinity, non-numeric text).
    """
    try:
        if isinstance(value, str):
            text = value.strip()
            # Accept both "0x" and "0X" so upper-case prefixes are not lost.
            if text.lower().startswith('0x'):
                return int(text, 16)
            return int(text)
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: bad text or NaN;
        # OverflowError: +/- infinity. All are treated as "no usable value".
        return None

In [8]:
# Address and label columns are kept as generic Python objects.
for text_column in ('Source IP', 'Destination IP', 'Label'):
    df[text_column] = df[text_column].astype(object)

# Port columns: some entries are hexadecimal strings, which convert_to_int
# turns into integers. The few entries it cannot parse come back missing and
# are replaced with 0, which represents "no port value".
# The problem Source Port records are at indices 13102903 and 13163723;
# Destination Port follows the same rules.
for port_column in ('Source Port', 'Destination Port'):
    df[port_column] = df[port_column].map(convert_to_int).fillna(0).astype(int)

# Label-encode the categorical protocol values as integer codes.
df['Protocol'] = pd.factorize(df['Protocol'])[0]

# Packet counters are whole numbers.
for count_column in ('Total Fwd Packets', 'Total Backward Packets'):
    df[count_column] = df[count_column].astype(int)

# Duration and mean packet lengths are continuous measurements.
for measure_column in ('Flow Duration', 'Fwd Packet Length Mean', 'Bwd Packet Length Mean'):
    df[measure_column] = df[measure_column].astype(float)

In [9]:
# Confirm the column dtypes after the conversions above.
print(df.dtypes)

Source IP                  object
Source Port                 int64
Destination IP             object
Destination Port            int64
Protocol                    int64
Flow Duration             float64
Total Fwd Packets           int64
Total Backward Packets      int64
Fwd Packet Length Mean    float64
Bwd Packet Length Mean    float64
Label                      object
dtype: object


### Split Source IP and Destination IP into their Octet breakdowns

In [10]:
# Break each source address into its dot-separated parts, one column per part.
# The four-column assignment assumes every address has exactly four parts
# (IPv4) — TODO confirm there are no malformed or IPv6 addresses.
df[['Source Octet 1', 'Source Octet 2', 'Source Octet 3', 'Source Octet 4']] = df['Source IP'].str.split('.', expand = True)

In [11]:
# Cast the four source-octet columns to integers.
df[['Source Octet 1', 'Source Octet 2', 'Source Octet 3', 'Source Octet 4']] = df[['Source Octet 1', 'Source Octet 2', 'Source Octet 3', 'Source Octet 4']].astype(int)

In [12]:
# Preview the first five rows, now including the source-octet columns.
df.head(n=5)

Unnamed: 0,Source IP,Source Port,Destination IP,Destination Port,Protocol,Flow Duration,Total Fwd Packets,Total Backward Packets,Fwd Packet Length Mean,Bwd Packet Length Mean,Label,Source Octet 1,Source Octet 2,Source Octet 3,Source Octet 4
0,192.168.4.118,4504,203.73.24.75,80,0,3974862.0,29,44,2.965517,1359.340909,ddos,192,168,4,118
1,192.168.4.118,4504,203.73.24.75,80,0,63.0,1,1,0.0,0.0,ddos,192,168,4,118
2,192.168.4.118,4505,203.73.24.75,80,0,476078.0,2,6,43.0,506.166667,ddos,192,168,4,118
3,192.168.4.118,4505,203.73.24.75,80,0,151.0,2,1,0.0,0.0,ddos,192,168,4,118
4,192.168.4.118,4506,203.73.24.75,80,0,472507.0,2,5,36.5,210.0,ddos,192,168,4,118


In [None]:
# Break each destination address into its dot-separated parts, one column per part.
# The four-column assignment assumes every address has exactly four parts
# (IPv4) — TODO confirm there are no malformed or IPv6 addresses.
df[['Destination Octet 1', 'Destination Octet 2', 'Destination Octet 3', 'Destination Octet 4']] = df['Destination IP'].str.split('.', expand = True)

In [None]:
# Cast the four destination-octet columns to integers. The string split that
# created them yields text, so without this cast they would remain strings.
df[['Destination Octet 1', 'Destination Octet 2', 'Destination Octet 3', 'Destination Octet 4']] = df[['Destination Octet 1', 'Destination Octet 2', 'Destination Octet 3', 'Destination Octet 4']].astype(int)

In [None]:
# Preview the first five rows after the destination-octet step.
df.head(n=5)

### Z-Score Standardization of Flow Duration, Total Fwd Packets, Total Backward Packets, Fwd Packet Length Mean, and Bwd Packet Length Mean

In [None]:
# Standardise each continuous feature to a z-score: subtract the column mean
# and divide by the column standard deviation (pandas' default std()).
# NOTE(review): the statistics are taken over the full frame, i.e. before the
# train/validation/test slicing further down — confirm this is intended.
for feature in (
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Fwd Packet Length Mean',
    'Bwd Packet Length Mean',
):
    centered = df[feature] - df[feature].mean()
    df[feature] = centered / df[feature].std()

In [None]:
# Preview the first five rows after standardisation.
df.head(n=5)

In [None]:
# Shares of the dataset assigned to each split.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

# The three shares must cover the whole dataset (tolerance absorbs float rounding).
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Shuffle the rows once, with a fixed seed, before the positional slicing in
# the following cells. Without this the train/validation/test sets are
# consecutive chunks of the file, and the first rows previewed above all carry
# the same label, so a chunk is not guaranteed to contain every class.
# NOTE(review): sample(frac=1) builds a full shuffled copy of the frame —
# make sure the instance has enough memory for it.
df = df.sample(frac=1, random_state = 35)

In [None]:
# Row counts for the training and validation sets, truncated to whole rows;
# whatever is left over afterwards becomes the test set.
train_size, validation_size = (
    int(len(df) * share) for share in (train_ratio, val_ratio)
)

In [None]:
# Training set: the first train_size rows, by position.
train_df = df.iloc[:train_size]

In [None]:
# Validation set: the validation_size rows that follow the training rows.
validation_df = df.iloc[train_size:train_size + validation_size]

In [None]:
# Test set: every row after the training and validation rows.
test_df = df.iloc[train_size + validation_size:]

In [None]:
# Number of rows in the training set.
print(train_df.shape[0])

In [None]:
# Number of rows in the validation set.
print(validation_df.shape[0])

In [None]:
# Number of rows in the test set.
print(test_df.shape[0])

In [None]:
# Number of rows in the full frame, for comparison with the three splits.
print(df.shape[0])

In [None]:
# Class distribution of the training set before any balancing.
print(train_df['Label'].value_counts())

In [None]:
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
# Feature matrix for SMOTE. SMOTE interpolates between samples, so every
# feature must be numeric: the raw dotted-string IP columns are replaced by
# the per-octet columns derived from them earlier in the notebook.
feature_columns = [
    'Source Octet 1', 'Source Octet 2', 'Source Octet 3', 'Source Octet 4',
    'Source Port',
    'Destination Octet 1', 'Destination Octet 2', 'Destination Octet 3', 'Destination Octet 4',
    'Destination Port',
    'Protocol',
    'Flow Duration',
    'Total Fwd Packets',
    'Total Backward Packets',
    'Fwd Packet Length Mean',
    'Bwd Packet Length Mean',
]

# pd.to_numeric leaves numeric columns as they are and converts any octet
# column still stored as digit strings; it raises on non-numeric text.
x_train = train_df[feature_columns].apply(pd.to_numeric)

In [None]:
# Target vector: the Label column of the training rows.
y_train = train_df['Label']

In [None]:
# SMOTE over-sampler with a fixed random_state so resampling is repeatable.
smote = SMOTE(random_state = 35)

In [None]:
# Target number of training rows per SMOTE batch.
batch_size = 100000

# Always produce at least one batch: a plain floor division gives 0 when the
# training set is smaller than batch_size, and a zero-way split is an error.
n_train_rows = len(x_train)
n_batches = max(1, n_train_rows // batch_size)

# Spread the rows as evenly as possible (the first `extra` batches get one
# additional row) and cut with positional slices. Slicing the pandas objects
# directly avoids passing them to np.array_split, which relies on the
# deprecated DataFrame.swapaxes.
base, extra = divmod(n_train_rows, n_batches)
batch_sizes = [base + 1] * extra + [base] * (n_batches - extra)
batch_edges = np.cumsum([0] + batch_sizes)

x_batches = [x_train.iloc[start:stop] for start, stop in zip(batch_edges[:-1], batch_edges[1:])]
y_batches = [y_train.iloc[start:stop] for start, stop in zip(batch_edges[:-1], batch_edges[1:])]

In [None]:
# Resample each batch independently and collect the results for stacking.
x_resampled_list = []
y_resampled_list = []

for x_batch, y_batch in zip(x_batches, y_batches):
    if pd.Series(y_batch).nunique() < 2:
        # A batch holding a single class has nothing to balance, and SMOTE
        # rejects such a target, so the batch is passed through unchanged.
        x_res, y_res = x_batch, y_batch
    else:
        x_res, y_res = smote.fit_resample(x_batch, y_batch)
    x_resampled_list.append(x_res)
    y_resampled_list.append(y_res)

In [None]:
# Join the per-batch results back together: feature blocks are stacked
# row-wise and the label vectors are joined end to end.
x_train_resampled = np.concatenate(x_resampled_list, axis=0)
y_train_resampled = np.concatenate(y_resampled_list, axis=0)

In [None]:
# Run SMOTE once over the entire training set.
# NOTE(review): this rebinds x_train_resampled / y_train_resampled, replacing
# the batch-wise result stacked in the previous cell — confirm which of the
# two resampling strategies is meant to feed train_balanced.
x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)

In [None]:
# Rebuild a labelled training frame from the resampled features and targets,
# reusing the feature column names of x_train.
train_balanced = pd.DataFrame(
    x_train_resampled, columns=x_train.columns
).assign(Label=y_train_resampled)

# Class distribution after balancing.
print(train_balanced['Label'].value_counts())