# Preprocessing

## Data standardization

In [None]:
# Fit the scaler on the training split only, then reuse that fitted scaler for
# the validation and test splits so all three are transformed consistently.
# Writing through .iloc[:, :] updates the existing objects in place instead of
# rebinding the names to new arrays.
scaler = StandardScaler()
X_train.iloc[:, :] = scaler.fit_transform(X_train)
for held_out_split in (X_val, X_test):
    held_out_split.iloc[:, :] = scaler.transform(held_out_split)

In [None]:
# Notebook display of the training features to inspect them at this point
# in the preprocessing pipeline.
X_train

## Data balance

In [None]:
# Count the training samples in each encoded class and recover a readable
# name for every class so the distribution can be plotted.
# np.unique(..., return_counts=True) tallies all classes in a single call
# instead of building a boolean mask over y_train once per class.
encoded_classes, class_counts = np.unique(y_train, return_counts=True)
label_frequencies = class_counts.tolist()
# NOTE(review): assumes output_encoder.inverse_transform accepts a 1-D array of
# encoded labels of any length (the original passed length-1 arrays) — confirm.
labels = [str(name) for name in output_encoder.inverse_transform(encoded_classes)]

In [None]:
# Order the classes from most to least frequent: argsort gives ascending
# positions, so flipping it yields the descending order. The same ordering is
# applied to both lists to keep counts and names aligned.
sorted_indices = np.flip(np.argsort(label_frequencies))
label_frequencies = [label_frequencies[position] for position in sorted_indices]
labels = [labels[position] for position in sorted_indices]

In [None]:
# Bar chart of the per-class sample counts held in `labels` /
# `label_frequencies`; the figure is saved to disk and then displayed.
figure_path = os.path.join('Data', 'Figures', '5G NIDD', 'attack_type_distribution.png')
# savefig does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

plt.figure(figsize=(12, 6), dpi=500)
plt.bar(x=labels, height=label_frequencies)
plt.xlabel('Attack Types')
plt.ylabel('Frequency')
plt.title('Distribution of Attack Types in 5G NIDD Dataset')
plt.xticks(rotation=45)  # tilt the class names so they do not overlap
plt.tight_layout()
plt.savefig(figure_path)
plt.show()


## SMOTE

In [None]:
# Resample the training split with SMOTE (presumably imbalanced-learn's
# implementation — confirm against the imports). random_state=42 pins the seed
# so repeated runs give the same resampled data. Only X_train / y_train are
# passed in, so the validation and test splits are not resampled here.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Count the samples in each encoded class of the resampled labels and recover
# a readable name for every class so the distribution can be plotted.
# np.unique(..., return_counts=True) tallies all classes in a single call
# instead of building a boolean mask over y_resampled once per class.
encoded_classes, class_counts = np.unique(y_resampled, return_counts=True)
label_frequencies = class_counts.tolist()
# NOTE(review): assumes output_encoder.inverse_transform accepts a 1-D array of
# encoded labels of any length (the original passed length-1 arrays) — confirm.
labels = [str(name) for name in output_encoder.inverse_transform(encoded_classes)]

In [None]:
# Order the classes from most to least frequent: argsort gives ascending
# positions, so flipping it yields the descending order. The same ordering is
# applied to both lists to keep counts and names aligned.
sorted_indices = np.flip(np.argsort(label_frequencies))
label_frequencies = [label_frequencies[position] for position in sorted_indices]
labels = [labels[position] for position in sorted_indices]

In [None]:
# Bar chart of the per-class sample counts held in `labels` /
# `label_frequencies` (computed from the resampled labels); the figure is
# saved to disk and then displayed.
figure_path = os.path.join(
    'Data', 'Figures', '5G NIDD', 'attack_type_distribution_with_SMOTE.png'
)
# savefig does not create missing folders, so make sure the target directory
# exists before writing (no-op when it is already there).
os.makedirs(os.path.dirname(figure_path), exist_ok=True)

plt.figure(figsize=(12, 6), dpi=500)
plt.bar(x=labels, height=label_frequencies)
plt.xlabel('Attack Types')
plt.ylabel('Frequency')
plt.title('Distribution of Attack Types in 5G NIDD Dataset with SMOTE')
plt.xticks(rotation=45)  # tilt the class names so they do not overlap
plt.tight_layout()
plt.savefig(figure_path)
plt.show()


## PCA

In [None]:
# Fit PCA on the resampled training features only, then project the validation
# and test features with that same fitted transform.
# NOTE(review): presumably scikit-learn's PCA, where a float n_components in
# (0, 1) selects enough components to explain that share of variance — confirm.
pca = PCA(n_components=0.9)
X_train_resampled_pca = pca.fit_transform(X_resampled)
X_val_pca, X_test_pca = (pca.transform(split) for split in (X_val, X_test))

# Report the variance ratio of each kept component, then how many were kept.
print(pca.explained_variance_ratio_)
print(pca.n_components_)