<a href="https://www.kaggle.com/code/dhaks13/rscv-rf-on-drone-ids-dataset?scriptVersionId=225729324" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drone-ids-dataset/GPS_Dataset_3D_8_Channels_Authentic_and_Simulated.xlsx
/kaggle/input/drone-ids-dataset/WSN-DS.csv


# WSN dataset

# Pre-processing

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
wsn_df = pd.read_csv('/kaggle/input/drone-ids-dataset/WSN-DS.csv')

# Drop irrelevant columns
wsn_df.drop(columns=[' id', ' Time', ' send_code '], inplace=True)

# Encode categorical target variable
encoder = LabelEncoder()
wsn_df['Attack type'] = encoder.fit_transform(wsn_df['Attack type'])
print(encoder.classes_)

wsn_df = wsn_df.astype(np.float32)

['Blackhole' 'Flooding' 'Grayhole' 'Normal' 'TDMA']


## Optimize Feature Selection

In [3]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
y = wsn_df['Attack type']
X = wsn_df.drop('Attack type',axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize RF with default parameters for feature selection
rf = RandomForestClassifier(
    n_estimators=150, 
    min_samples_split=15, 
    min_samples_leaf=5, 
    max_features=5, 
    max_depth=200, 
    criterion='entropy', 
    bootstrap=True,
    random_state=42
)

# Fit RF on training data
rf.fit(X_train, y_train)

# Get feature importance scores
feature_importances = pd.DataFrame({
    'Feature': X_train.columns, 
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Display top 10 most important features
print(feature_importances.head(10))


             Feature  Importance
3              ADV_S    0.217610
0              Is_CH    0.180222
12   Data_Sent_To_BS    0.098849
10            DATA_S    0.096756
14    Expaned Energy    0.096439
7              SCH_S    0.069793
9               Rank    0.061723
13     dist_CH_To_BS    0.043501
6             JOIN_R    0.026741
4              ADV_R    0.022524


In [4]:
# Set importance threshold (e.g., keep features with importance > 0.01)
selected_features = feature_importances[feature_importances['Importance'] > 0.01]['Feature'].tolist()

# Filter dataset with selected features
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

print(f"Selected Features: {selected_features}")
print(f"Reduced feature set shape: {X_train_selected.shape}")

Selected Features: [' ADV_S', ' Is_CH', ' Data_Sent_To_BS', ' DATA_S', 'Expaned Energy', ' SCH_S', 'Rank', ' dist_CH_To_BS', ' JOIN_R', ' ADV_R', ' who CH', ' JOIN_S', ' DATA_R', ' Dist_To_CH', ' SCH_R']
Reduced feature set shape: (299728, 15)


## Normalization

In [5]:
from scipy.stats import zscore

# Apply Z-score normalization
X_selected = wsn_df[selected_features]
X_normalized = X_selected.apply(zscore)

print(X_normalized.head())

      ADV_S     Is_CH   Data_Sent_To_BS    DATA_S  Expaned Energy     SCH_S  \
0  0.355289  2.763714          2.206935 -1.053636        3.232062  0.258106   
1 -0.129878 -0.361832         -0.232198 -0.161081       -0.352658 -0.104904   
2 -0.129878 -0.361832         -0.232198 -0.090616       -0.353539 -0.104904   
3 -0.129878 -0.361832         -0.232198 -0.161081       -0.356900 -0.104904   
4 -0.129878 -0.361832         -0.232198 -0.090616       -0.358977 -0.104904   

       Rank   dist_CH_To_BS    JOIN_R     ADV_R    who CH    JOIN_S    DATA_R  \
0 -0.659800        2.139262  5.171597 -0.985272 -0.446206 -1.882417  4.890899   
1 -0.523578       -0.448907 -0.157198 -0.417438 -0.446093  0.531232 -0.320918   
2  0.634312       -0.448907 -0.157198 -0.417438 -0.446180  0.531232 -0.320918   
3  0.429979       -0.448907 -0.157198 -0.417438 -0.446093  0.531232 -0.320918   
4  1.042979       -0.448907 -0.157198 -0.417438 -0.446180  0.531232 -0.320918   

    Dist_To_CH     SCH_R  
0    -1.029

## Handle Class Imbalance with SMOTE

In [6]:
from imblearn.over_sampling import SMOTE

y = wsn_df['Attack type']
print(y)
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_normalized, y)

0         3.0
1         3.0
2         3.0
3         3.0
4         3.0
         ... 
374656    3.0
374657    3.0
374658    3.0
374659    3.0
374660    3.0
Name: Attack type, Length: 374661, dtype: float32


## Splitting Data for training-80% testing-20%

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model

1. Random Forest(RF)
2. Decision Tree(DT)
3. Gaussian Naive Bayes (GNB)
4. Adaptive Boosting (AdaBoost)
5. Logistics Regression (LR)

In [8]:
# Import Required Modules
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report


# Define hyperparameter grids for each model
rf_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [3, 5, 10],
    'max_depth': [50, 100, 150, 200, None],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

dt_param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 3, 5],
    'criterion': ['gini', 'entropy']
}

gnb_param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
}

ab_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R']
}

lr_param_grid = {
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300],
    'C': [0.01, 0.1, 1.0, 10]
}


## Haperparameter Tuning

In [9]:
print("Running RF")
rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(rf, rf_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)
print("RF fitted")

Running RF
RF fitted


In [10]:
print("Running DT")
dt = DecisionTreeClassifier(random_state=42)
dt_random = RandomizedSearchCV(dt, dt_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
dt_random.fit(X_train, y_train)
print("DT fitted")

Running DT
DT fitted


In [11]:
print("Running GBN")
gnb = GaussianNB()
gnb_random = RandomizedSearchCV(gnb, gnb_param_grid, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
gnb_random.fit(X_train, y_train)
print("GBN fitted")

Running GBN




GBN fitted


In [12]:
print("Running AB")
ab = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), random_state=42)
ab_random = RandomizedSearchCV(ab, ab_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
ab_random.fit(X_train, y_train)
print("AB fitted")

Running AB




AB fitted


In [13]:
print("Running LR")
lr = LogisticRegression(random_state=42)
lr_random = RandomizedSearchCV(lr, lr_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
lr_random.fit(X_train, y_train)
print("LR fitted")

Running LR


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.90049432 0.8999503  0.90059062     

LR fitted


In [14]:
print("Best Parameters for Random Forest:", rf_random.best_params_)
print("Best Parameters for Decision Tree:", dt_random.best_params_)
print("Best Parameters for Gaussian Naive Bayes:", gnb_random.best_params_)
print("Best Parameters for AdaBoost:", ab_random.best_params_)
print("Best Parameters for Logistic Regression:", lr_random.best_params_)

Best Parameters for Random Forest: {'n_estimators': 200, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 10, 'max_depth': 50, 'criterion': 'entropy', 'bootstrap': True}
Best Parameters for Decision Tree: {'min_samples_split': 10, 'min_samples_leaf': 1, 'max_depth': 30, 'criterion': 'entropy'}
Best Parameters for Gaussian Naive Bayes: {'var_smoothing': 1e-06}
Best Parameters for AdaBoost: {'n_estimators': 50, 'learning_rate': 1, 'algorithm': 'SAMME'}
Best Parameters for Logistic Regression: {'solver': 'liblinear', 'penalty': 'l1', 'max_iter': 200, 'C': 0.01}


## Train Final Models Using Best Hyperparameters

In [15]:
# model defining
rf_best = RandomForestClassifier(**rf_random.best_params_, random_state=42)
dt_best = DecisionTreeClassifier(**dt_random.best_params_, random_state=42)
gnb_best = GaussianNB(**gnb_random.best_params_)
ab_best = AdaBoostClassifier(**ab_random.best_params_, random_state=42)
lr_best = LogisticRegression(**lr_random.best_params_, random_state=42)

# Train models
rf_best.fit(X_train, y_train)
dt_best.fit(X_train, y_train)
gnb_best.fit(X_train, y_train)
ab_best.fit(X_train, y_train)
lr_best.fit(X_train, y_train)

# Evaluate Final Models

## Random Forest

In [16]:
y_pred_rf = rf_best.predict(X_test)

print("\nModel: Random Forest")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))


Model: Random Forest
Accuracy: 0.9986
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     67995
         1.0       1.00      1.00      1.00     68361
         2.0       1.00      1.00      1.00     68156
         3.0       1.00      1.00      1.00     67557
         4.0       1.00      1.00      1.00     67997

    accuracy                           1.00    340066
   macro avg       1.00      1.00      1.00    340066
weighted avg       1.00      1.00      1.00    340066



##  Decision Tree

In [17]:
y_pred_dt = dt_best.predict(X_test)

print("\nModel: Decision Tree")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")
print(classification_report(y_test, y_pred_dt))


Model: Decision Tree
Accuracy: 0.9976
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     67995
         1.0       1.00      1.00      1.00     68361
         2.0       1.00      1.00      1.00     68156
         3.0       1.00      1.00      1.00     67557
         4.0       1.00      1.00      1.00     67997

    accuracy                           1.00    340066
   macro avg       1.00      1.00      1.00    340066
weighted avg       1.00      1.00      1.00    340066



## Gaussian Naive Bayes

In [18]:
y_pred_gnb = gnb_best.predict(X_test)

print("\nModel: Gaussian Naive Bayes")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gnb):.4f}")
print(classification_report(y_test, y_pred_gnb))


Model: Gaussian Naive Bayes
Accuracy: 0.8589
              precision    recall  f1-score   support

         0.0       0.65      1.00      0.79     67995
         1.0       0.93      1.00      0.96     68361
         2.0       0.96      0.60      0.74     68156
         3.0       0.94      0.97      0.95     67557
         4.0       1.00      0.72      0.84     67997

    accuracy                           0.86    340066
   macro avg       0.90      0.86      0.86    340066
weighted avg       0.90      0.86      0.86    340066



## AdaBoost

In [19]:
y_pred_ab = ab_best.predict(X_test)

print("\nModel: AdaBoost")
print(f"Accuracy: {accuracy_score(y_test, y_pred_ab):.4f}")
print(classification_report(y_test, y_pred_ab))


Model: AdaBoost
Accuracy: 0.9546
              precision    recall  f1-score   support

         0.0       0.96      0.91      0.94     67995
         1.0       1.00      0.99      1.00     68361
         2.0       0.89      0.97      0.93     68156
         3.0       0.94      0.97      0.95     67557
         4.0       1.00      0.92      0.96     67997

    accuracy                           0.95    340066
   macro avg       0.96      0.95      0.95    340066
weighted avg       0.96      0.95      0.95    340066



## Logistic Regression

In [20]:
y_pred_lr = lr_best.predict(X_test)

print("\nModel: Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(classification_report(y_test, y_pred_lr))


Model: Logistic Regression
Accuracy: 0.8997
              precision    recall  f1-score   support

         0.0       0.71      1.00      0.83     67995
         1.0       1.00      1.00      1.00     68361
         2.0       0.96      0.60      0.73     68156
         3.0       0.94      0.97      0.95     67557
         4.0       0.99      0.93      0.96     67997

    accuracy                           0.90    340066
   macro avg       0.92      0.90      0.90    340066
weighted avg       0.92      0.90      0.90    340066

