In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/drone-ids-dataset/GPS_Dataset_3D_8_Channels_Authentic_and_Simulated.xlsx
/kaggle/input/drone-ids-dataset/WSN-DS.csv


# WSN dataset

# Pre-processing

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load dataset
wsn_df = pd.read_csv('/kaggle/input/drone-ids-dataset/WSN-DS.csv')

# Drop irrelevant columns
wsn_df.drop(columns=[' id', ' Time', ' send_code '], inplace=True)

# Encode categorical target variable
encoder = LabelEncoder()
wsn_df['Attack type'] = encoder.fit_transform(wsn_df['Attack type'])
print(encoder.classes_)

wsn_df = wsn_df.astype(np.float32)

['Blackhole' 'Flooding' 'Grayhole' 'Normal' 'TDMA']


## Optimize Feature Selection

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE

# Take a sample (reduce memory usage)
sample_size = min(30000, len(wsn_df))
wsn_sample = wsn_df.sample(n=sample_size, random_state=42)

X_sample = wsn_sample.drop(columns=['Attack type'])
y_sample = wsn_sample['Attack type']

# Apply RFE
rfe = RFE(RandomForestClassifier(n_estimators=50, random_state=42), n_features_to_select=10)
rfe.fit(X_sample, y_sample)

# Get selected features
selected_features = X_sample.columns[rfe.support_]
print("Selected Features:", selected_features)

Selected Features: Index([' Is_CH', ' who CH', ' ADV_S', ' JOIN_R', ' SCH_S', 'Rank', ' DATA_S',
       ' Data_Sent_To_BS', ' dist_CH_To_BS', 'Expaned Energy'],
      dtype='object')


## Normalization

In [4]:
from scipy.stats import zscore

# Apply Z-score normalization
X_selected = wsn_df[selected_features]
X_normalized = X_selected.apply(zscore)

print(X_normalized.head())

      Is_CH    who CH     ADV_S    JOIN_R     SCH_S      Rank    DATA_S  \
0  2.763714 -0.446206  0.355289  5.171597  0.258106 -0.659800 -1.053636   
1 -0.361832 -0.446093 -0.129878 -0.157198 -0.104904 -0.523578 -0.161081   
2 -0.361832 -0.446180 -0.129878 -0.157198 -0.104904  0.634312 -0.090616   
3 -0.361832 -0.446093 -0.129878 -0.157198 -0.104904  0.429979 -0.161081   
4 -0.361832 -0.446180 -0.129878 -0.157198 -0.104904  1.042979 -0.090616   

    Data_Sent_To_BS   dist_CH_To_BS  Expaned Energy  
0          2.206935        2.139262        3.232062  
1         -0.232198       -0.448907       -0.352658  
2         -0.232198       -0.448907       -0.353539  
3         -0.232198       -0.448907       -0.356900  
4         -0.232198       -0.448907       -0.358977  


## Handle Class Imbalance with SMOTE

In [5]:
from imblearn.over_sampling import SMOTE

y = wsn_df['Attack type']
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X_normalized, y)

## Splitting Data for training-80% testing-20%

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Model

1. Random Forest(RF)
2. Decision Tree(DT)
3. Gaussian Naive Bayes (GNB)
4. Adaptive Boosting (AdaBoost)
5. Logistics Regression (LR)

In [7]:
# Import Required Modules
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report


# Define hyperparameter grids for each model
rf_param_grid = {
    'n_estimators': [50, 100, 150, 200],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10],
    'max_features': [3, 5, 10],
    'max_depth': [50, 100, 150, 200, None],
    'criterion': ['gini', 'entropy'],
    'bootstrap': [True, False]
}

dt_param_grid = {
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 3, 5],
    'criterion': ['gini', 'entropy']
}

gnb_param_grid = {
    'var_smoothing': [1e-9, 1e-8, 1e-7, 1e-6]
}

ab_param_grid = {
    'n_estimators': [50, 100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R']
}

lr_param_grid = {
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l1', 'l2'],
    'max_iter': [100, 200, 300],
    'C': [0.01, 0.1, 1.0, 10]
}


## Haperparameter Tuning

In [8]:
print("Running AB")
ab = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1), random_state=42)
ab_random = RandomizedSearchCV(ab, ab_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
ab_random.fit(X_train, y_train)
print("AB fitted")

Running AB




AB fitted


In [9]:
print("Running LR")
lr = LogisticRegression(random_state=42)
lr_random = RandomizedSearchCV(lr, lr_param_grid, n_iter=10, cv=5, scoring='accuracy', random_state=42, n_jobs=-1)
lr_random.fit(X_train, y_train)
print("LR fitted")

Running LR


10 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

 0.90042227 0.89911444 0.90010763     

LR fitted


In [10]:
#print("Best Parameters for Random Forest:", rf_random.best_params_)
#print("Best Parameters for Decision Tree:", dt_random.best_params_)
#print("Best Parameters for Gaussian Naive Bayes:", gnb_random.best_params_)
print("Best Parameters for AdaBoost:", ab_random.best_params_)
print("Best Parameters for Logistic Regression:", lr_random.best_params_)

Best Parameters for AdaBoost: {'n_estimators': 50, 'learning_rate': 1, 'algorithm': 'SAMME'}
Best Parameters for Logistic Regression: {'solver': 'liblinear', 'penalty': 'l2', 'max_iter': 100, 'C': 1.0}


## Train Final Models Using Best Hyperparameters

In [11]:
# model defining
#rf_best = RandomForestClassifier(**rf_random.best_params_, random_state=42)
#dt_best = DecisionTreeClassifier(**dt_random.best_params_, random_state=42)
#gnb_best = GaussianNB(**gnb_random.best_params_)
ab_best = AdaBoostClassifier(**ab_random.best_params_, random_state=42)
lr_best = LogisticRegression(**lr_random.best_params_, random_state=42)

# Train models
#rf_best.fit(X_train, y_train)
#dt_best.fit(X_train, y_train)
#gnb_best.fit(X_train, y_train)
ab_best.fit(X_train, y_train)
lr_best.fit(X_train, y_train)

# Evaluate Final Models

## AdaBoost

In [12]:
y_pred_ab = ab_best.predict(X_test)

print("\nModel: AdaBoost")
print(f"Accuracy: {accuracy_score(y_test, y_pred_ab):.4f}")
print(classification_report(y_test, y_pred_ab))


Model: AdaBoost
Accuracy: 0.9416
              precision    recall  f1-score   support

         0.0       0.96      0.89      0.92     67995
         1.0       1.00      0.99      1.00     68361
         2.0       0.84      0.97      0.90     68156
         3.0       0.94      0.97      0.96     67557
         4.0       1.00      0.88      0.94     67997

    accuracy                           0.94    340066
   macro avg       0.95      0.94      0.94    340066
weighted avg       0.95      0.94      0.94    340066



## Logistic Regression

In [13]:
y_pred_lr = lr_best.predict(X_test)

print("\nModel: Logistic Regression")
print(f"Accuracy: {accuracy_score(y_test, y_pred_lr):.4f}")
print(classification_report(y_test, y_pred_lr))


Model: Logistic Regression
Accuracy: 0.9012
              precision    recall  f1-score   support

         0.0       0.72      1.00      0.84     67995
         1.0       1.00      1.00      1.00     68361
         2.0       0.96      0.60      0.74     68156
         3.0       0.94      0.97      0.96     67557
         4.0       0.99      0.93      0.96     67997

    accuracy                           0.90    340066
   macro avg       0.92      0.90      0.90    340066
weighted avg       0.92      0.90      0.90    340066

