# Augmenting the original dataset

## Date
2024.06.17

## Description
This notebook performs essential validations on the original dataset produced in the laboratory, 'D1_encoded_categorical.csv'.

It then applies several data augmentation (resampling) techniques, each generating a different dataset. These datasets are saved under the Dataset_simulazione folder.


In [2]:
import os
import pandas as pd

# Folder holding the input dataset and the generated CSV files, one level above
# the working directory. Built with os.path.join/os.pardir instead of a
# hard-coded backslash, which is only a path separator on Windows.
folder = os.path.join(os.pardir, 'Dataset_simulazione')
file = 'D1_encoded_categorical.csv'

file_path = os.path.join(folder, file)

# Load the dataset and discard the 'tactic_mitre' column.
df = pd.read_csv(file_path).drop(columns='tactic_mitre')

# Display (rows, columns) of the loaded frame.
df.shape


(436404, 13)

In [5]:
# Drop exact duplicate rows, then every row that still contains a missing value.
df = df.drop_duplicates().dropna()

# Keep only the rows whose 'techniques_mitre' label is neither
# 'not present' nor 'command_and_control'.
df = df[~df['techniques_mitre'].isin(['not present', 'command_and_control'])]

In [6]:
# Display (rows, columns) of `df` as it stands when this cell runs.
df.shape

(208735, 13)

# SMOTE


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline


# Select the feature columns and the target label.
# An explicit copy is taken because the encoded columns are written back into X;
# assigning into a frame derived from `df` triggers pandas' SettingWithCopyWarning.
X = df[['resp_pkts', 'service', 'local_resp', 'protocol', 'duration',
        'conn_state', 'orig_pkts', 'dest_port', 'orig_bytes', 'local_orig',
        'resp_bytes', 'src_port']].copy()
y = df['techniques_mitre']

# Columns that are re-encoded as integer codes below.
categorical_cols = ['service', 'local_resp', 'protocol', 'conn_state', 'local_orig']  # Adjust as needed

# Label-encode each categorical column, keeping the fitted encoder per column so
# the integer codes can be mapped back with `inverse_transform` if needed.
label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Hold out 20% of the rows; only the training part is resampled below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Resampling pipeline: SMOTE over-sampling followed by random under-sampling.
# NOTE(review): with sampling_strategy='auto' SMOTE appears to bring every class
# up to the majority count already, which would leave the under-sampling step with
# nothing to remove - confirm whether another strategy was intended.
# NOTE(review): SMOTE interpolates the integer category codes like continuous
# values; consider imblearn's SMOTENC for the categorical columns - TODO confirm.
resample_pipeline = Pipeline([
    ('smote', SMOTE(sampling_strategy='auto', random_state=42)),  # Oversample minority classes
    ('undersample', RandomUnderSampler(sampling_strategy='auto', random_state=42))  # Undersample majority classes
])

# Fit the pipeline on the training split and resample it.
X_resampled, y_resampled = resample_pipeline.fit_resample(X_train, y_train)

# Combine the features and target into a single DataFrame (built once, written twice).
combined_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)

# NOTE(review): this copy goes to the current working directory rather than
# `folder` - presumably a leftover; confirm before removing.
combined_df.to_csv('combined_resampled_data.csv', index=False)

# NOTE(review): the file-list cell at the end of the notebook refers to
# 'smote.csv', not '01.smote.csv' - confirm which name is intended.
file = r'01.smote.csv'
file_path = os.path.join(folder, file)

# Save the combined DataFrame next to the other generated datasets.
combined_df.to_csv(file_path, index=False)


techniques_mitre
account_discovery_domain                 115474
benign                                   115474
domain_trust_discovery                   115474
group_policy_discovery                   115474
network_service_discovery                115474
reconnaissance_scan_ip_blocks            115474
reconnaissance_vulnerability_scanning    115474
reconnaissance_wordlist_scanning         115474
remote_system_discovery                  115474
Name: count, dtype: int64
       resp_pkts  service  local_resp  protocol   duration  conn_state  \
153          9.0        8           0         0  30.836542           2   
14247        1.0        2           0         2   0.001456           2   
14424        1.0        2           0         2   0.000784           2   
16291        4.0        5           0         0   0.786347           4   
17088        1.0        2           0         2   0.017180           2   

       orig_pkts  dest_port  orig_bytes  local_orig  resp_bytes  src_port  \
153

# ADASYN


In [18]:

from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import LabelEncoder



# Select the feature columns and the target label.
# An explicit copy is taken because the encoded columns are written back into X;
# assigning into a frame derived from `df` triggers pandas' SettingWithCopyWarning.
X = df[['resp_pkts', 'service', 'local_resp', 'protocol', 'duration',
        'conn_state', 'orig_pkts', 'dest_port', 'orig_bytes', 'local_orig',
        'resp_bytes', 'src_port']].copy()
y = df['techniques_mitre']

# Label-encode each categorical column. The fitted encoder is stored per column
# (the dict was previously created but never filled) so the integer codes can be
# mapped back with `inverse_transform` if needed.
label_encoders = {}
categorical_cols = ['service', 'local_resp', 'protocol', 'conn_state', 'local_orig']  # adjust based on your dataset
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Hold out 20% of the rows; only the training part is resampled below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize ADASYN
adasyn = ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=5)

# Apply ADASYN to the training split
X_resampled, y_resampled = adasyn.fit_resample(X_train, y_train)

# Combine the features and target into a single DataFrame
combined_resampled_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)

file = r'adasyn.csv'
file_path = os.path.join(folder, file)

# Save the combined resampled DataFrame to a CSV file
combined_resampled_df.to_csv(file_path, index=False)

# Output the new class distributions (same report as the other resampling cells)
print("New class distribution:")
print(y_resampled.value_counts())





# Borderline-SMOTE

In [19]:
# import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import LabelEncoder



# Select the feature columns and the target label.
# An explicit copy is taken because the encoded columns are written back into X;
# assigning into a frame derived from `df` triggers pandas' SettingWithCopyWarning.
X = df[['resp_pkts', 'service', 'local_resp', 'protocol', 'duration',
        'conn_state', 'orig_pkts', 'dest_port', 'orig_bytes', 'local_orig',
        'resp_bytes', 'src_port']].copy()
y = df['techniques_mitre']

# Label-encode each categorical column. The fitted encoder is stored per column
# (the dict was previously created but never filled) so the integer codes can be
# mapped back with `inverse_transform` if needed.
label_encoders = {}
categorical_cols = ['service', 'local_resp', 'protocol', 'conn_state', 'local_orig']  # adjust based on your dataset
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Hold out 20% of the rows; only the training part is resampled below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Borderline-SMOTE
blsmote = BorderlineSMOTE(sampling_strategy='auto', random_state=42, kind='borderline-1')

# Apply Borderline-SMOTE to the training split
X_resampled, y_resampled = blsmote.fit_resample(X_train, y_train)

# Combine the features and target into a single DataFrame
combined_resampled_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)

# Define the file path for saving the DataFrame
file = 'borderline_smote.csv'
file_path = os.path.join(folder, file)

# Save the combined resampled DataFrame to a CSV file
combined_resampled_df.to_csv(file_path, index=False)

# Output the new class distributions
print("New class distribution:")
print(y_resampled.value_counts())




New class distribution:
techniques_mitre
benign                                   115474
network_service_discovery                115474
remote_system_discovery                  115474
domain_trust_discovery                   115474
reconnaissance_vulnerability_scanning    115474
account_discovery_domain                 115474
reconnaissance_wordlist_scanning         115474
reconnaissance_scan_ip_blocks            115474
group_policy_discovery                   115474
Name: count, dtype: int64


# Tomek Links

In [25]:

from sklearn.model_selection import train_test_split
from imblearn.under_sampling import TomekLinks
from sklearn.preprocessing import LabelEncoder



# Select the feature columns and the target label.
# An explicit copy is taken because the encoded columns are written back into X;
# assigning into a frame derived from `df` triggers pandas' SettingWithCopyWarning.
X = df[['resp_pkts', 'service', 'local_resp', 'protocol', 'duration',
        'conn_state', 'orig_pkts', 'dest_port', 'orig_bytes', 'local_orig',
        'resp_bytes', 'src_port']].copy()
y = df['techniques_mitre']

# Label-encode each categorical column. The fitted encoder is stored per column
# (the dict was previously created but never filled) so the integer codes can be
# mapped back with `inverse_transform` if needed.
label_encoders = {}
categorical_cols = ['service', 'local_resp', 'protocol', 'conn_state', 'local_orig']  # adjust based on your dataset
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Hold out 20% of the rows; only the training part is resampled below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Tomek Links for under-sampling (library defaults)
tl = TomekLinks()

# Apply Tomek Links to the training split
X_resampled, y_resampled = tl.fit_resample(X_train, y_train)

# Combine the features and target into a single DataFrame
combined_resampled_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)

# Define the file path for saving the DataFrame
file = 'tomek_links.csv'
file_path = os.path.join(folder, file)

# Save the combined resampled DataFrame to a CSV file
combined_resampled_df.to_csv(file_path, index=False)

# Output the new class distributions
print("New class distribution:")
print(y_resampled.value_counts())

New class distribution:
techniques_mitre
network_service_discovery                115467
benign                                    48353
reconnaissance_vulnerability_scanning      1004
reconnaissance_wordlist_scanning            577
remote_system_discovery                     431
domain_trust_discovery                      238
reconnaissance_scan_ip_blocks                62
account_discovery_domain                     40
group_policy_discovery                       25
Name: count, dtype: int64


# SMOTEENN

In [26]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import LabelEncoder



# Select the feature columns and the target label.
# An explicit copy is taken because the encoded columns are written back into X;
# assigning into a frame derived from `df` triggers pandas' SettingWithCopyWarning.
X = df[['resp_pkts', 'service', 'local_resp', 'protocol', 'duration',
        'conn_state', 'orig_pkts', 'dest_port', 'orig_bytes', 'local_orig',
        'resp_bytes', 'src_port']].copy()
y = df['techniques_mitre']

# Label-encode each categorical column. The fitted encoder is stored per column
# (the dict was previously created but never filled) so the integer codes can be
# mapped back with `inverse_transform` if needed.
label_encoders = {}
categorical_cols = ['service', 'local_resp', 'protocol', 'conn_state', 'local_orig']  # adjust based on your dataset
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Hold out 20% of the rows; only the training part is resampled below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SMOTEENN
smote_enn = SMOTEENN(random_state=42)

# Apply SMOTEENN to the training split
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

# Combine the features and target into a single DataFrame
combined_resampled_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)

# Define the file path for saving the DataFrame
file = 'smoteenn.csv'
file_path = os.path.join(folder, file)

# Save the combined resampled DataFrame to a CSV file
combined_resampled_df.to_csv(file_path, index=False)

# Output the new class distributions
print("New class distribution:")
print(y_resampled.value_counts())



New class distribution:
techniques_mitre
reconnaissance_wordlist_scanning         115474
network_service_discovery                115405
reconnaissance_scan_ip_blocks            115357
group_policy_discovery                   114503
account_discovery_domain                 114351
remote_system_discovery                  113998
domain_trust_discovery                   113080
reconnaissance_vulnerability_scanning    111585
benign                                   107187
Name: count, dtype: int64


# SMOTETomek

In [27]:
from sklearn.model_selection import train_test_split
from imblearn.combine import SMOTETomek
from sklearn.preprocessing import LabelEncoder


# Select the feature columns and the target label.
# An explicit copy is taken because the encoded columns are written back into X;
# assigning into a frame derived from `df` triggers pandas' SettingWithCopyWarning.
X = df[['resp_pkts', 'service', 'local_resp', 'protocol', 'duration',
        'conn_state', 'orig_pkts', 'dest_port', 'orig_bytes', 'local_orig',
        'resp_bytes', 'src_port']].copy()
y = df['techniques_mitre']

# Label-encode each categorical column. The fitted encoder is stored per column
# (the dict was previously created but never filled) so the integer codes can be
# mapped back with `inverse_transform` if needed.
label_encoders = {}
categorical_cols = ['service', 'local_resp', 'protocol', 'conn_state', 'local_orig']  # adjust based on your dataset
for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col])
    label_encoders[col] = le

# Hold out 20% of the rows; only the training part is resampled below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize SMOTETomek
smote_tomek = SMOTETomek(random_state=42)

# Apply SMOTETomek to the training split
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

# Combine the features and target into a single DataFrame
combined_resampled_df = pd.concat([X_resampled, y_resampled.reset_index(drop=True)], axis=1)

# Define the file path for saving the DataFrame
file = 'smotetomek.csv'
file_path = os.path.join(folder, file)

# Save the combined resampled DataFrame to a CSV file
combined_resampled_df.to_csv(file_path, index=False)

# Output the new class distributions
print("New class distribution:")
print(y_resampled.value_counts())


New class distribution:
techniques_mitre
reconnaissance_wordlist_scanning         115474
reconnaissance_scan_ip_blocks            115473
network_service_discovery                115470
account_discovery_domain                 115334
remote_system_discovery                  115324
group_policy_discovery                   115258
domain_trust_discovery                   115030
reconnaissance_vulnerability_scanning    114734
benign                                   114001
Name: count, dtype: int64


In [41]:
# Names of the generated CSV files, one per resampling technique.
# NOTE(review): the SMOTE cell writes '01.smote.csv', while this list refers to
# 'smote.csv' - confirm which file name is intended.
files = ['smote.csv', 'adasyn.csv', 'borderline_smote.csv',
         'tomek_links.csv', 'smoteenn.csv', 'smotetomek.csv']


smote.csv techniques_mitre
account_discovery_domain                 115474
benign                                   115474
domain_trust_discovery                   115474
group_policy_discovery                   115474
network_service_discovery                115474
reconnaissance_scan_ip_blocks            115474
reconnaissance_vulnerability_scanning    115474
reconnaissance_wordlist_scanning         115474
remote_system_discovery                  115474
Name: count, dtype: int64

adasyn.csv techniques_mitre
benign                                   115870
reconnaissance_vulnerability_scanning    115751
domain_trust_discovery                   115533
account_discovery_domain                 115480
reconnaissance_scan_ip_blocks            115478
remote_system_discovery                  115475
reconnaissance_wordlist_scanning         115475
network_service_discovery                115474
group_policy_discovery                   115473
Name: count, dtype: int64

borderline_smote.csv techni