<a href="https://www.kaggle.com/code/dhaks13/rscv-pcc-on-drone-ids-dataset?scriptVersionId=225614042" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drone-ids-dataset/GPS_Dataset_3D_8_Channels_Authentic_and_Simulated.xlsx
/kaggle/input/drone-ids-dataset/WSN-DS.csv


# WSN dataset

# Pre-processing

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
wsn_df = pd.read_csv('/kaggle/input/drone-ids-dataset/WSN-DS.csv')

# Drop irrelevant columns
wsn_df.drop(columns=[' id', ' Time', ' send_code '], inplace=True)

# Encode categorical target variable
encoder = LabelEncoder()
wsn_df['Attack type'] = encoder.fit_transform(wsn_df['Attack type'])
print(encoder.classes_)

wsn_df = wsn_df.astype(np.float32)

['Blackhole' 'Flooding' 'Grayhole' 'Normal' 'TDMA']


## Optimize Feature Selection

In [3]:
# Compute correlation with target
correlation_matrix = wsn_df.corr()['Attack type'].abs().sort_values(ascending=False)

# Select features with highest correlation (excluding the target itself)
selected_features = correlation_matrix.index[1:10]  # Top 10 features

print("Selected Features:", selected_features)


Selected Features: Index([' Is_CH', ' ADV_R', ' JOIN_S', ' SCH_R', ' ADV_S', ' DATA_S',
       ' Dist_To_CH', ' DATA_R', ' who CH'],
      dtype='object')


## Normalization

In [4]:
from scipy.stats import zscore

# Apply Z-score normalization
X_selected = wsn_df[selected_features]
X_normalized = X_selected.apply(zscore)

print(X_normalized.head())

      Is_CH     ADV_R    JOIN_S     SCH_R     ADV_S    DATA_S  Dist_To_CH  \
0  2.763714 -0.985272 -1.882417 -1.720360  0.355289 -1.053636   -1.029314   
1 -0.361832 -0.417438  0.531232  0.581274 -0.129878 -0.161081    2.401377   
2 -0.361832 -0.417438  0.531232  0.581274 -0.129878 -0.090616    1.109283   
3 -0.361832 -0.417438  0.531232  0.581274 -0.129878 -0.161081    1.924457   
4 -0.361832 -0.417438  0.531232  0.581274 -0.129878 -0.090616   -0.809171   

     DATA_R    who CH  
0  4.890899 -0.446206  
1 -0.320918 -0.446093  
2 -0.320918 -0.446180  
3 -0.320918 -0.446093  
4 -0.320918 -0.446180  


## Handle Class Imbalance with SMOTE

In [5]:
from imblearn.over_sampling import SMOTE

y = wsn_df['Attack type']
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_normalized, y)

## Splitting Data for training-80% testing-20%

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model

1. Random Forest(RF)
2. Decision Tree(DT)
3. Gaussian Naive Bayes (GNB)
4. Adaptive Boosting (AdaBoost)
5. Logistics Regression (LR)

In [7]:
# Import Required Modules
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report


# Define hyperparameter grids for each model
rf_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [3, 5, 10],
    'max_depth': [50, 100, 150, 200, None],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

dt_param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 3, 5],
    'criterion': ['gini', 'entropy']
}

gnb_param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
}

ab_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R']
}

lr_param_grid = {
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300],
    'C': [0.01, 0.1, 1.0, 10]
}


## Haperparameter Tuning

In [8]:
print("Running RF")
rf = RandomForestClassifier(random_state=42)
rf_random = RandomizedSearchCV(rf, rf_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)
print("RF fitted")

Running RF




RF fitted


In [9]:
print("Running DT")
dt = DecisionTreeClassifier(random_state=42)
dt_random = RandomizedSearchCV(dt, dt_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
dt_random.fit(X_train, y_train)
print("DT fitted")

Running DT
DT fitted


In [10]:
print("Running GBN")
gnb = GaussianNB()
gnb_random = RandomizedSearchCV(gnb, gnb_param_grid, n_iter=5, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
gnb_random.fit(X_train, y_train)
print("GBN fitted")

Running GBN




GBN fitted


In [11]:
print("Running AB")
ab = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), random_state=42)
ab_random = RandomizedSearchCV(ab, ab_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
ab_random.fit(X_train, y_train)
print("AB fitted")

Running AB




AB fitted


In [12]:
print("Running LR")
lr = LogisticRegression(random_state=42)
lr_random = RandomizedSearchCV(lr, lr_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
lr_random.fit(X_train, y_train)
print("LR fitted")

Running LR


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.7824959  0.78365744 0.78254515     

LR fitted


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
print("Best Parameters for Random Forest:", rf_random.best_params_)
print("Best Parameters for Decision Tree:", dt_random.best_params_)
print("Best Parameters for Gaussian Naive Bayes:", gnb_random.best_params_)
print("Best Parameters for AdaBoost:", ab_random.best_params_)
print("Best Parameters for Logistic Regression:", lr_random.best_params_)

Best Parameters for Random Forest: {'n_estimators': 50, 'min_samples_split': 15, 'min_samples_leaf': 2, 'max_features': 5, 'max_depth': None, 'criterion': 'gini', 'bootstrap': True}
Best Parameters for Decision Tree: {'min_samples_split': 2, 'min_samples_leaf': 3, 'max_depth': 30, 'criterion': 'entropy'}
Best Parameters for Gaussian Naive Bayes: {'var_smoothing': 1e-07}
Best Parameters for AdaBoost: {'n_estimators': 100, 'learning_rate': 0.5, 'algorithm': 'SAMME'}
Best Parameters for Logistic Regression: {'solver': 'lbfgs', 'penalty': 'l2', 'max_iter': 200, 'C': 0.1}


## Train Final Models Using Best Hyperparameters

In [14]:
# model defining
rf_best = RandomForestClassifier(**rf_random.best_params_, random_state=42)
dt_best = DecisionTreeClassifier(**dt_random.best_params_, random_state=42)
gnb_best = GaussianNB(**gnb_random.best_params_)
ab_best = AdaBoostClassifier(**ab_random.best_params_, random_state=42)
lr_best = LogisticRegression(**lr_random.best_params_, random_state=42)

# Train models
rf_best.fit(X_train, y_train)
dt_best.fit(X_train, y_train)
gnb_best.fit(X_train, y_train)
ab_best.fit(X_train, y_train)
lr_best.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Evaluate Final Models

## Random Forest

In [15]:
y_pred_rf = rf_best.predict(X_test)

print("\nModel: Random Forest")
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}")
print(classification_report(y_test, y_pred_rf))


Model: Random Forest
Accuracy: 0.9046
              precision    recall  f1-score   support

         0.0       0.78      0.77      0.78     67995
         1.0       1.00      1.00      1.00     68361
         2.0       0.77      0.79      0.78     68156
         3.0       0.99      0.98      0.99     67557
         4.0       0.98      0.98      0.98     67997

    accuracy                           0.90    340066
   macro avg       0.91      0.90      0.90    340066
weighted avg       0.90      0.90      0.90    340066



##  Decision Tree

In [16]:
y_pred_dt = dt_best.predict(X_test)

print("\nModel: Decision Tree")
print(f"Accuracy: {accuracy_score(y_test, y_pred_dt):.4f}")
print(classification_report(y_test, y_pred_dt))


Model: Decision Tree
Accuracy: 0.9013
              precision    recall  f1-score   support

         0.0       0.77      0.77      0.77     67995
         1.0       1.00      1.00      1.00     68361
         2.0       0.77      0.78      0.77     68156
         3.0       0.99      0.98      0.98     67557
         4.0       0.98      0.98      0.98     67997

    accuracy                           0.90    340066
   macro avg       0.90      0.90      0.90    340066
weighted avg       0.90      0.90      0.90    340066



## Gaussian Naive Bayes

In [17]:
y_pred_gnb = gnb_best.predict(X_test)

print("\nModel: Gaussian Naive Bayes")
print(f"Accuracy: {accuracy_score(y_test, y_pred_gnb):.4f}")
print(classification_report(y_test, y_pred_gnb))


Model: Gaussian Naive Bayes
Accuracy: 0.6130


  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

         0.0       0.56      0.25      0.34     67995
         1.0       0.97      1.00      0.99     68361
         2.0       0.34      0.85      0.48     68156
         3.0       0.94      0.97      0.96     67557
         4.0       0.00      0.00      0.00     67997

    accuracy                           0.61    340066
   macro avg       0.56      0.61      0.55    340066
weighted avg       0.56      0.61      0.55    340066



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## AdaBoost

In [18]:
y_pred_ab = ab_best.predict(X_test)

print("\nModel: AdaBoost")
print(f"Accuracy: {accuracy_score(y_test, y_pred_ab):.4f}")
print(classification_report(y_test, y_pred_ab))


Model: AdaBoost
Accuracy: 0.7731
              precision    recall  f1-score   support

         0.0       0.52      0.40      0.45     67995
         1.0       1.00      1.00      1.00     68361
         2.0       0.52      0.64      0.57     68156
         3.0       0.94      0.97      0.96     67557
         4.0       0.88      0.86      0.87     67997

    accuracy                           0.77    340066
   macro avg       0.77      0.77      0.77    340066
weighted avg       0.77      0.77      0.77    340066



## Logistic Regression

In [19]:
y_pred_lr = lr_best.predict(X_test)

print("\nModel: Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(classification_report(y_test, y_pred_lr))


Model: Logistic Regression
Accuracy: 0.7835
              precision    recall  f1-score   support

         0.0       0.59      0.34      0.43     67995
         1.0       1.00      1.00      1.00     68361
         2.0       0.55      0.74      0.63     68156
         3.0       0.94      0.97      0.96     67557
         4.0       0.84      0.87      0.85     67997

    accuracy                           0.78    340066
   macro avg       0.78      0.78      0.77    340066
weighted avg       0.78      0.78      0.77    340066

