After the feature_extraction preprocessing and testing, I am keeping only the features that were both chosen by the feature selection AND can be computed using Python's scapy library.

The rest remains the same, resulting in the final model that will be used in the IDS.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.neural_network import MLPClassifier
from category_encoders import TargetEncoder
from sklearn.preprocessing import OneHotEncoder

# Load the training and test CSVs from the sibling "archive" directory.
# Forward slashes are used on purpose: they resolve on Windows as well as on
# POSIX systems, whereas a backslash-separated path is only understood as a
# directory path on Windows.
dftrain = pd.read_csv("../archive/Train_data.csv")
dftest = pd.read_csv("../archive/Test_data.csv")


In [2]:
# Raw input columns used by the model; the same list is applied to both CSVs.
feature_columns = ['service', 'dst_bytes', 'count', 'same_srv_rate', 'protocol_type', 'flag']
# Binary target: 'normal' -> 0, 'anomaly' -> 1.
label_to_int = {'normal': 0, 'anomaly': 1}

x_train = dftrain[feature_columns]
y_train = dftrain['class'].map(label_to_int)
x_test = dftest[feature_columns].copy()

# Hold out 20% of the labelled rows for validation; the seed is fixed.
x_val_train, x_val_test, y_val_train, y_val_test = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Columns of the training split whose dtype is object or category.
cat_cols = x_val_train.select_dtypes(include=['object', 'category']).columns
n = 11  # cardinality threshold: more than n distinct values -> target encode

# Count the distinct values of every categorical column once, then partition
# the columns around the threshold (order of cat_cols is preserved).
distinct_counts = {name: x_val_train[name].nunique() for name in cat_cols}
target_cols = [name for name, k in distinct_counts.items() if k > n]
one_hot_cols = [name for name, k in distinct_counts.items() if k <= n]

u = x_val_train['service'].nunique()
print(u)
print("High-cardinality columns (target encode):", target_cols)
print("Low-cardinality columns (one-hot encode):", one_hot_cols)

# print(x_val_train.columns)

# print(x_val_test.columns)


#label encoder
# categorical_cols = ['protocol_type', 'service', 'flag']
# for col in categorical_cols:
#     le = LabelEncoder()
#     combined = pd.concat([dftrain[col], dftest[col]], axis=0)
#     le.fit(combined)
#     dftrain[col] = le.transform(dftrain[col])
#     dftest[col] = le.transform(dftest[col])


# Target-encode the high-cardinality 'service' column.
# The encoder is fitted on the training split only, so the validation and test
# rows are encoded with statistics learned from the training split.
te = TargetEncoder(cols=['service'])
x_val_train['service'] = te.fit_transform(x_val_train[['service']], y_val_train)

# Transform validation and test sets with the already-fitted encoder.
# Plain column assignment is used (same form as for the training split) so the
# string column is *replaced* by the numeric encoding. Writing through
# `.loc[:, 'service']` instead sets values into the existing column, which --
# depending on the pandas version -- keeps the original string/object dtype or
# is rejected as an incompatible-dtype assignment.
x_val_test['service'] = te.transform(x_val_test[['service']])
x_test['service'] = te.transform(x_test[['service']])


# One-hot encode the low-cardinality categorical columns.
# The training split defines the reference column layout.
x_val_train = pd.get_dummies(x_val_train, columns=one_hot_cols)
train_layout = x_val_train.columns

# Encode the other two frames independently, then force them onto the training
# layout: dummy columns missing from a frame are added and filled with 0, and
# columns the training split never produced are dropped.
x_val_test = pd.get_dummies(x_val_test, columns=one_hot_cols).reindex(
    columns=train_layout, fill_value=0
)
x_test = pd.get_dummies(x_test, columns=one_hot_cols).reindex(
    columns=train_layout, fill_value=0
)

# Inspection output: encoded training/validation frames and their columns.
# Each argument is emitted on its own line, exactly as separate print() calls would.
print(
    x_val_train.head(),
    x_val_train.columns,
    y_val_train,
    x_val_test.head(),
    x_val_test.columns,
    sep="\n",
)

print("target " , target_cols)
print(x_val_train.head(), len(x_val_train.columns), sep="\n")
# NOTE(review): `_dim` is a private attribute of the encoder -- debugging aid only.
print(te._dim, te.cols, sep="\n")

# Final model inputs: four numeric columns plus a subset of the dummy columns.
selected_features = [
    'service',
    'dst_bytes',
    'count',
    'same_srv_rate',
    'protocol_type_icmp',
    'protocol_type_tcp',
    'flag_RSTR',
    'flag_S0',
    'flag_SF',
]

# Restrict all three frames to the same ordered feature list.
x_val_train, x_val_test, x_test = (
    frame[selected_features] for frame in (x_val_train, x_val_test, x_test)
)

print(len(x_val_train.columns), x_val_train.columns, sep="\n")

65
High-cardinality columns (target encode): ['service']
Low-cardinality columns (one-hot encode): ['protocol_type', 'flag']
        service  dst_bytes  count  same_srv_rate  protocol_type_icmp  \
2199   0.903138          0      1           1.00                True   
15288  0.954664          0    135           0.13               False   
4139   0.386397          0     11           0.09               False   
3505   0.954664          0    109           0.05               False   
19281  0.999946          0    132           0.18               False   

       protocol_type_tcp  protocol_type_udp  flag_OTH  flag_REJ  flag_RSTO  \
2199               False              False     False     False      False   
15288               True              False     False     False      False   
4139                True              False     False     False      False   
3505                True              False     False     False      False   
19281               True              False     Fals

In [3]:
def _as_scaled_frame(frame, transform):
    """Apply ``transform`` to ``frame`` and re-wrap the result with the frame's row/column labels."""
    return pd.DataFrame(transform(frame), columns=frame.columns, index=frame.index)


# Cast every column to float so all three frames are purely numeric.
x_val_train = x_val_train.astype(float)
x_val_test = x_val_test.astype(float)
x_test = x_test.astype(float)

print(x_val_train.head(), x_val_train.columns, len(x_val_train.columns), sep="\n")

# Fit the scaler on the training split only; validation and test are
# transformed with the fitted training statistics.
scaler = StandardScaler()
x_val_train_scaled = _as_scaled_frame(x_val_train, scaler.fit_transform)
x_val_test_scaled = _as_scaled_frame(x_val_test, scaler.transform)
x_test_scaled = _as_scaled_frame(x_test, scaler.transform)

print(
    x_val_train_scaled.head(),
    x_val_test_scaled.head(),
    y_val_test,
    x_val_train_scaled.columns,
    sep="\n",
)

        service  dst_bytes  count  same_srv_rate  protocol_type_icmp  \
2199   0.903138        0.0    1.0           1.00                 1.0   
15288  0.954664        0.0  135.0           0.13                 0.0   
4139   0.386397        0.0   11.0           0.09                 0.0   
3505   0.954664        0.0  109.0           0.05                 0.0   
19281  0.999946        0.0  132.0           0.18                 0.0   

       protocol_type_tcp  flag_RSTR  flag_S0  flag_SF  
2199                 0.0        0.0      0.0      1.0  
15288                1.0        0.0      1.0      0.0  
4139                 1.0        0.0      0.0      1.0  
3505                 1.0        0.0      1.0      0.0  
19281                1.0        0.0      1.0      0.0  
Index(['service', 'dst_bytes', 'count', 'same_srv_rate', 'protocol_type_icmp',
       'protocol_type_tcp', 'flag_RSTR', 'flag_S0', 'flag_SF'],
      dtype='object')
9
        service  dst_bytes     count  same_srv_rate  protocol_ty

In [4]:
# MLP with two hidden layers of 50 units each, trained with a fixed seed.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=500,
    random_state=42,
)
# fit() is left as the last expression so the notebook displays the estimator.
mlp.fit(x_val_train_scaled, y_val_train)

0,1,2
,hidden_layer_sizes,"(50, ...)"
,activation,'relu'
,solver,'adam'
,alpha,0.0001
,batch_size,'auto'
,learning_rate,'constant'
,learning_rate_init,0.001
,power_t,0.5
,max_iter,500
,shuffle,True


In [5]:
from sklearn.metrics import classification_report, confusion_matrix, log_loss

# Hard class predictions and class probabilities for the validation split.
y_val_pred = mlp.predict(x_val_test_scaled)
y_val_prob = mlp.predict_proba(x_val_test_scaled)

# Overall accuracy on the held-out validation rows.
accuracy = mlp.score(x_val_test_scaled, y_val_test)
print("Validation Accuracy:", round(accuracy, 4))

# Confusion matrix (rows: true class, columns: predicted class).
cm = confusion_matrix(y_val_test, y_val_pred)
print("Confusion Matrix:\n", cm)

# Per-class precision / recall / F1.
print(classification_report(y_val_test, y_val_pred))

# Log loss computed from the predicted probabilities.
loss = log_loss(y_val_test, y_val_prob)
print("Log Loss:", round(loss, 4))



Validation Accuracy: 0.9748
Confusion Matrix:
 [[2640   34]
 [  93 2272]]
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      2674
           1       0.99      0.96      0.97      2365

    accuracy                           0.97      5039
   macro avg       0.98      0.97      0.97      5039
weighted avg       0.98      0.97      0.97      5039

Log Loss: 0.07


In [6]:
import os

import joblib

# Bundle the fitted preprocessing objects together with the trained model so
# that they can be loaded as a single artifact.
model_package = {
    # "columns": x_val_train.columns,
    "target_encoder": te,
    # NOTE: despite the key name, this holds the list of column names that were
    # one-hot encoded with pd.get_dummies, not a fitted encoder object.
    "one_hot_encoder": one_hot_cols,
    "scaler": scaler,
    "selected_features": selected_features,
    "mlp_model": mlp
}

# Save model and preprocessing components.
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists first (otherwise a fresh checkout fails here).
model_path = "models/mlp_model.joblib"
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Left as the last expression so the notebook shows the written file name(s).
joblib.dump(model_package, model_path)

['models/mlp_model.joblib']