In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import joblib


In [2]:
# Read the sampled dataset from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer='sampled_dataset.csv')

In [3]:
# Remove the `id.orig_h` and `id.resp_h` columns so they never reach the model
# (presumably the endpoint host addresses — confirm against the dataset schema).
# Reassigning instead of using inplace=True keeps the step chainable and avoids
# mutating the frame object that was loaded from disk.
df = df.drop(columns=['id.orig_h', 'id.resp_h'])

In [4]:
# One-hot encode `conn_state`. With drop_first=True the first category gets no
# indicator column of its own, so it is represented by all indicators being False.
df = pd.get_dummies(
    data=df,
    columns=['conn_state'],
    drop_first=True,
)


In [5]:
# Display the encoded frame to check the columns produced by the one-hot step.
df

Unnamed: 0,id.orig_p,id.resp_p,orig_ip_bytes,label,packet_size,proto_icmp,proto_tcp,proto_udp,conn_state_REJ,conn_state_RSTO,conn_state_RSTOS0,conn_state_RSTR,conn_state_RSTRH,conn_state_S0,conn_state_S1,conn_state_S2,conn_state_SF,conn_state_SH,conn_state_SHR
0,58408.0,23.0,0.153846,Benign,0.058594,False,True,False,False,False,False,False,False,True,False,False,False,False,False
1,43763.0,42675.0,0.102564,Benign,0.039062,False,False,True,False,False,False,False,False,True,False,False,False,False,False
2,35912.0,23.0,0.461538,Benign,0.175781,False,True,False,False,False,False,False,False,True,False,False,False,False,False
3,40726.0,23.0,0.153846,Benign,0.058594,False,True,False,False,False,False,False,False,True,False,False,False,False,False
4,40822.0,23.0,0.461538,Benign,0.175781,False,True,False,False,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18458562,14742.0,62336.0,0.000000,Malicious,0.000000,False,True,False,False,False,False,False,False,False,False,False,False,False,False
18458563,6820.0,62336.0,0.000000,Malicious,0.000000,False,True,False,False,False,False,False,False,False,False,False,False,False,False
18458564,46494.0,23.0,0.153846,Benign,0.058594,False,True,False,False,False,False,False,False,True,False,False,False,False,False
18458565,15769.0,23.0,0.102564,Malicious,0.039062,False,True,False,False,False,False,False,False,True,False,False,False,False,False


In [5]:
from sklearn.utils import resample

# Split the rows by class label.
df_benign = df[df.label == 'Benign']
df_malicious = df[df.label == 'Malicious']

# Balancing is meaningless (and resample would misbehave) if a class is absent.
if df_benign.empty or df_malicious.empty:
    raise ValueError("Both 'Benign' and 'Malicious' rows are required to balance the classes")

# Balance the classes by downsampling whichever class is larger to the size of
# the smaller one. Sampling is without replacement, so the requested size must
# never exceed the class being sampled; choosing the larger class guarantees that.
# On a tie the Malicious rows are the ones passed through resample.
n_per_class = min(len(df_benign), len(df_malicious))
if len(df_malicious) >= len(df_benign):
    balanced_parts = [
        df_benign,
        resample(df_malicious,
                 replace=False,            # sample without replacement
                 n_samples=n_per_class,    # match the smaller class
                 random_state=42),         # reproducible results
    ]
else:
    balanced_parts = [
        resample(df_benign,
                 replace=False,
                 n_samples=n_per_class,
                 random_state=42),
        df_malicious,
    ]

# Benign rows first, Malicious rows second.
df1 = pd.concat(balanced_parts)

# Display new class distribution
print(df1['label'].value_counts())


label
Benign       8769642
Malicious    8769642
Name: count, dtype: int64


In [6]:
# Target: the class label column. Features: every other column of the balanced frame.
# NOTE(review): the displayed frame shows an `Unnamed: 0` header; if that is a real
# column rather than the index, a row id is being fed to the model — confirm.
y = df1['label']
X = df1.drop(columns=['label'])


In [7]:

# Fit the encoder on the string labels, then map each label to its integer code.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)


In [8]:

# Hold out 20% of the rows for evaluation. Stratifying on the encoded labels
# keeps the class proportions the same in the train and test splits.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)


In [9]:
# Build a 100-tree random forest; the fixed seed makes training repeatable.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

In [10]:
# Fit the forest on the training split.
rf_model.fit(X=X_train, y=y_train)

In [11]:
# Predict encoded class labels for the held-out rows.
y_pred = rf_model.predict(X=X_test)


In [12]:
# Score the held-out predictions against the true encoded labels.
class_report = classification_report(y_true=y_test, y_pred=y_pred)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

# Report accuracy first, then the confusion matrix and the per-class report,
# each preceded by its heading on its own line.
print(f"Accuracy: {accuracy:.4f}")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")

Accuracy: 0.9942
Confusion Matrix:
[[1749890    4039]
 [  16476 1737452]]
Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99   1753929
           1       1.00      0.99      0.99   1753928

    accuracy                           0.99   3507857
   macro avg       0.99      0.99      0.99   3507857
weighted avg       0.99      0.99      0.99   3507857



In [14]:
# Serialize the trained classifier to disk. Only `rf_model` is written here;
# the fitted `label_encoder` is not part of this file.
joblib.dump(value=rf_model, filename='networkdetection.joblib')


['networkdetection.joblib']