In [32]:
import pandas as pd

# Load the training and testing data.
# Forward slashes are accepted on Windows, Linux and macOS, whereas the
# previous 'dataset\\...' form only resolved correctly on Windows.
DATA_DIR = 'dataset'
train_df = pd.read_csv(f'{DATA_DIR}/UNSW_NB15_training-set.csv')
test_df = pd.read_csv(f'{DATA_DIR}/UNSW_NB15_testing-set.csv')

# Display the first few rows of the training data
print(train_df.head())


   id       dur proto service state  spkts  dpkts  sbytes  dbytes       rate  \
0   1  0.121478   tcp       -   FIN      6      4     258     172  74.087490   
1   2  0.649902   tcp       -   FIN     14     38     734   42014  78.473372   
2   3  1.623129   tcp       -   FIN      8     16     364   13186  14.170161   
3   4  1.681642   tcp     ftp   FIN     12     12     628     770  13.677108   
4   5  0.449454   tcp       -   FIN     10      6     534     268  33.373826   

   ...  ct_dst_sport_ltm  ct_dst_src_ltm  is_ftp_login  ct_ftp_cmd  \
0  ...                 1               1             0           0   
1  ...                 1               2             0           0   
2  ...                 1               3             0           0   
3  ...                 1               3             1           1   
4  ...                 1              40             0           0   

   ct_flw_http_mthd  ct_src_ltm  ct_srv_dst  is_sm_ips_ports  attack_cat  \
0                 0   

In [26]:
!pip install scapy spacy pymongo requests scikit-learn pandas numpy imbalanced-learn 



In [33]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
import pickle


# Preprocess the loaded train/test frames, tune a random forest on
# SMOTE-balanced training data, evaluate it on the untouched test split, and
# persist the model, the scaler and the attack-category label encoder.
SEED = 42
TARGET = 'attack_cat'

# Drop unnecessary columns.
# Build new frames instead of mutating train_df/test_df in place: the in-place
# drop raised KeyError when this cell was executed a second time.
train_prepared = train_df.drop(columns=['id', 'label'])
test_prepared = test_df.drop(columns=['id', 'label'])

# Combine datasets for encoding so that a category present in only one of the
# two splits still receives a code.
cat_columns = train_prepared.select_dtypes(include=['object']).columns
combined_df = pd.concat([train_prepared[cat_columns], test_prepared[cat_columns]], axis=0)

# Encode categorical features, keeping one fitted encoder per column.
encoders = {}
for col in cat_columns:
    col_encoder = LabelEncoder()
    col_encoder.fit(combined_df[col])
    train_prepared[col] = col_encoder.transform(train_prepared[col])
    test_prepared[col] = col_encoder.transform(test_prepared[col])
    encoders[col] = col_encoder

# Pick the target encoder by name. Previously the loop variable was pickled,
# which is only the attack_cat encoder if attack_cat happens to be the last
# object-typed column. Looking it up here also fails fast (before the grid
# search) if the target column was not encoded.
encoder = encoders[TARGET]

# Separate features and labels
X_train = train_prepared.drop(columns=[TARGET])
y_train = train_prepared[TARGET]
X_test = test_prepared.drop(columns=[TARGET])
y_test = test_prepared[TARGET]

# Standardize the features; the scaler statistics come from the training split only.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Apply SMOTE to the training split only.
# Resampling the concatenated train+test data and re-splitting it (as was done
# before) puts synthetic points, and points interpolated from training rows,
# into the evaluation set, which inflates the reported scores.
smote = SMOTE(random_state=SEED)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Use a 10% subset of the resampled training data for hyperparameter tuning
X_tune, _, y_tune, _ = train_test_split(
    X_train_resampled, y_train_resampled, test_size=0.9, random_state=SEED
)

# Initialize the model (seeded so repeated runs give the same forest)
clf = RandomForestClassifier(random_state=SEED)

# Define a smaller parameter grid
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# Perform Grid Search with a subset of data, using single job
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=3, n_jobs=1, verbose=2)
grid_search.fit(X_tune, y_tune)

# Use the best estimator.
# NOTE(review): with GridSearchCV's default refit, best_estimator_ is trained on
# X_tune only (the 10% subset). Consider refitting the best parameters on the
# full X_train_resampled before saving - confirm whether that is intended.
best_clf = grid_search.best_estimator_

# Evaluate the model on the real, un-resampled test split
y_pred = best_clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Save the best model
with open('best_model.pkl', 'wb') as model_file:
    pickle.dump(best_clf, model_file)

# Save the scaler
with open('scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

# Save the label encoder for attack categories
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)


Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.5s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=   4.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   9.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   9.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=   9.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   4.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   4.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=50; total time=   4.4s
[CV] END max_depth