In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from numba import jit
from numba import njit
from numba import cuda
from numba import types
from numba.typed import Dict

# error rate
@jit(nopython=True,target_backend='cuda')
def error_rate(xtrain, ytrain, x, opts):
    # parameters
    k     = opts['k']
    fold  = opts['fold']
    xt    = fold['xt']
    yt    = fold['yt']
    xv    = fold['xv']
    yv    = fold['yv']

    # Number of instances
    num_train = np.size(xt, 0)
    num_valid = np.size(xv, 0)
    # Define selected features
    xtrain  = xt[:, x == 1]
    ytrain  = yt.reshape(num_train)  # Solve bug
    xvalid  = xv[:, x == 1]
    yvalid  = yv.reshape(num_valid)  # Solve bug
    # Training
    mdl     = KNeighborsClassifier(n_neighbors = k)
    mdl.fit(xtrain, ytrain)
    # Prediction
    ypred   = mdl.predict(xvalid)
    acc     = np.sum(yvalid == ypred) / num_valid
    error   = 1 - acc

    return error


# Error rate & Feature size
@jit(nopython=True,target_backend='cuda')
def Fun(xtrain, ytrain, x, opts):
    # Parameters
    alpha    = 0.99
    beta     = 1 - alpha
    # Original feature size
    max_feat = len(x)
    # Number of selected features
    num_feat = np.sum(x == 1)
    # Solve if no feature selected
    if num_feat == 0:
        cost  = 1
    else:
        # Get error rate
        error = error_rate(xtrain, ytrain, x, opts)
        # Objective function
        cost  = alpha * error + beta * (num_feat / max_feat)

    return cost

In [2]:
import numpy as np
from numpy.random import rand
#from FS.functionHO import Fun

@jit(nopython=True,target_backend='cuda')
def init_position(lb, ub, N, dim):
    X = np.zeros([N, dim], dtype='float')
    for i in range(N):
        for d in range(dim):
            X[i,d] = lb[0,d] + (ub[0,d] - lb[0,d]) * rand()

    return X

@jit(nopython=True,target_backend='cuda')
def binary_conversion(X, thres, N, dim):
    Xbin = np.zeros([N, dim], dtype='int')
    for i in range(N):
        for d in range(dim):
            if X[i,d] > thres:
                Xbin[i,d] = 1
            else:
                Xbin[i,d] = 0

    return Xbin

@jit(nopython=True,target_backend='cuda')
def boundary(x, lb, ub):
    if x < lb:
        x = lb
    if x > ub:
        x = ub

    return x


#--- Opposition based learning (7)
@jit(nopython=True,target_backend='cuda')
def opposition_based_learning(X, lb, ub, thres, N, dim):
    Xo = np.zeros([N, dim], dtype='float')
    for i in range(N):
        for d in range(dim):
            Xo[i,d] = lb[0,d] + ub[0,d] - X[i,d]

    return Xo

# @jit(target_backend='cuda')
@jit(nopython=True,target_backend='cuda')
def jfs(xtrain, ytrain, opts):
    # Parameters
    ub             = 1
    lb             = 0
    thres          = 0.5
    max_local_iter = 10     # maximum iteration for local search

    N              = opts['N']
    max_iter       = opts['T']
    if 'maxLt' in opts:
        max_local_iter = opts['maxLt']

    # Dimension
    dim = np.size(xtrain, 1)
    if np.size(lb) == 1:
        ub = ub * np.ones([1, dim], dtype='float')
        lb = lb * np.ones([1, dim], dtype='float')

    # Initialize position
    X    = init_position(lb, ub, N, dim)

    # Pre
    fit  = np.zeros([N, 1], dtype='float')
    Xf   = np.zeros([1, dim], dtype='float')
    fitF = float('inf')

    # Binary conversion
    Xbin = binary_conversion(X, thres, N, dim)

    # Fitness
    for i in range(N):
        fit[i,0] = Fun(xtrain, ytrain, Xbin[i,:], opts)
        if fit[i,0] < fitF:
            Xf[0,:] = X[i,:]
            fitF    = fit[i,0]

    #--- Opposition based learning
    Xo    = opposition_based_learning(X, lb, ub, thres, N, dim)
    #--- Binary conversion
    Xobin = binary_conversion(Xo, thres, N, dim)

    #--- Fitness
    fitO  = np.zeros([N, 1], dtype='float')
    for i in range(N):
        fitO[i,0] = Fun(xtrain, ytrain, Xobin[i,:], opts)
        if fitO[i,0] < fitF:
            Xf[0,:] = Xo[i,:]
            fitF    = fitO[i,0]

    #--- Merge opposite & current population, and select best N
    XX  = np.concatenate((X, Xo), axis=0)
    FF  = np.concatenate((fit, fitO), axis=0)
    #--- Sort in ascending order
    ind = np.argsort(FF, axis=0)
    for i in range(N):
        X[i,:]   = XX[ind[i,0],:]
        fit[i,0] = FF[ind[i,0]]

    curve = np.zeros([1, max_iter], dtype='float')
    t     = 0

    # Store result
    curve[0,t] = fitF.copy()
    print("Iteration:", t + 1)
    print("Best (ISSA):", curve[0,t])
    t += 1

    while t < max_iter:
 	    # Compute coefficient, c1 (2)
        c1 = 2 * np.exp(-(4 * t / max_iter) ** 2)

        for i in range(N):
            # First leader update
            if i == 0:
                for d in range(dim):
                    # Coefficient c2 & c3 [0 ~ 1]
                    c2 = rand()
                    c3 = rand()
              	    # Leader update (1)
                    if c3 >= 0.5:
                        X[i,d] = Xf[0,d] + c1 * ((ub[0,d] - lb[0,d]) * c2 + lb[0,d])
                    else:
                        X[i,d] = Xf[0,d] - c1 * ((ub[0,d] - lb[0,d]) * c2 + lb[0,d])

                    # Boundary
                    X[i,d] = boundary(X[i,d], lb[0,d], ub[0,d])

            # Salp update
            elif i >= 1:
                for d in range(dim):
                    # Salp update by following front salp (3)
                    X[i,d] = (X[i,d] + X[i-1, d]) / 2
                    # Boundary
                    X[i,d] = boundary(X[i,d], lb[0,d], ub[0,d])

        # Binary conversion
        Xbin = binary_conversion(X, thres, N, dim)

        # Fitness
        for i in range(N):
            fit[i,0] = Fun(xtrain, ytrain, Xbin[i,:], opts)
            if fit[i,0] < fitF:
                Xf[0,:] = X[i,:]
                fitF    = fit[i,0]

        #--- Local search algorithm
        Lt        = 0
        temp      = np.zeros([1, dim], dtype='float')
        temp[0,:] = Xf[0,:]

        while Lt < max_local_iter:
            #--- Random three features
            RD = np.random.permutation(dim)
            for d in range(3):
                index = RD[d]
                #--- Flip the selected three features
                if temp[0,index] > thres:
                    temp[0,index] = temp[0,index] - thres
                else:
                    temp[0,index] = temp[0,index] + thres

            #--- Binary conversion
            temp_bin = binary_conversion(temp, thres, 1, dim)

            #--- Fitness
            Fnew = Fun(xtrain, ytrain, temp_bin[0,:], opts)
            if Fnew < fitF:
                fitF    = Fnew
                Xf[0,:] = temp[0,:]

            Lt += 1


        # Store result
        curve[0,t] = fitF.copy()
        print("Iteration:", t + 1)
        print("Best (ISSA):", curve[0,t])
        t += 1


    # Best feature subset
    Gbin       = binary_conversion(Xf, thres, 1, dim)
    Gbin       = Gbin.reshape(dim)
    pos        = np.asarray(range(0, dim))
    sel_index  = pos[Gbin == 1]
    num_feat   = len(sel_index)
    # Create dictionary
    issa_data = {'sf': sel_index, 'c': curve, 'nf': num_feat}
        


    return issa_data

In [3]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
data0  = pd.read_csv('H:\Hybrid Metaheuristic Feature Selection\Dataset\IoT ID 20_StatisticalFS.csv',low_memory=False)
#read dataset. Use your own dataset path here
data0

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Bwd_Pkts/b_Avg,Bwd_Blk_Rate_Avg,Init_Fwd_Win_Byts,Init_Bwd_Win_Byts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Idle_Std,Label,Cat
0,12682,25883,8002,203,447,1,2716,122,1,0,...,0,0,0,748,0,0,0,0,1,3
1,12682,25883,8002,203,447,1,2796,483,3,0,...,0,0,0,748,0,0,0,20002,1,3
2,47157,25889,9404,10,9,1,2763,256,0,2,...,0,0,0,297,0,0,0,8601,1,3
3,12684,25886,9281,200,70,1,2893,152,0,1,...,0,0,0,748,0,0,0,0,1,3
4,12682,25883,8002,203,447,1,2750,342,0,1,...,0,0,0,3278,0,0,0,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42014,296,25889,8406,188,9,1,21,143,2,0,...,0,0,0,1236,0,0,0,253,0,1
42015,47156,7,4,205,276,1,3488,81,1,0,...,0,0,0,187,0,0,0,0,0,2
42016,13703,25883,10853,233,67,2,3263,31,6,0,...,0,0,0,0,0,0,0,3178,0,2
42017,12446,25883,8006,203,76,2,3486,202,2,0,...,0,0,0,0,0,0,0,11269,0,2


In [5]:
data0['Label'].value_counts()

Label
1    40073
0     1946
Name: count, dtype: int64

In [6]:
data1 = data0[(data0['Label']==1)]
data1 = data1.sample(n=None, frac=1,replace=False,weights=None,random_state=42,axis=0)

data2 = data0[(data0['Label']==0)]
data2 = data2.sample(n=None, frac=0.7,replace=False,weights=None,random_state=42,axis=0)

data0 = pd.concat([data1,data2])
data0

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Bwd_Pkts/b_Avg,Bwd_Blk_Rate_Avg,Init_Fwd_Win_Byts,Init_Bwd_Win_Byts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Idle_Std,Label,Cat
21626,12682,25883,8002,203,447,1,2698,352,2,0,...,0,0,0,748,0,0,0,13532,1,3
24751,12682,25883,8002,203,447,1,2625,195,1,0,...,0,0,0,748,0,0,0,0,1,3
32910,12682,25883,8002,203,447,1,2707,295,3,0,...,0,0,0,748,0,0,0,11610,1,3
34257,12682,25883,8002,203,447,1,2840,201,2,0,...,0,0,0,748,0,0,0,12618,1,3
2907,12682,25883,8002,203,447,1,2641,258,2,0,...,0,0,0,748,0,0,0,8144,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40571,12695,25886,9866,200,70,1,2300,120,0,1,...,0,0,0,748,0,0,0,0,0,4
41349,13738,25883,10888,233,67,2,3270,198,1,0,...,0,0,0,0,0,0,0,0,0,2
41398,922,379,125,200,10,1,3695,1459,0,1,...,0,0,0,2341,0,0,0,0,0,0
40084,47156,7,4,205,276,1,3460,76,0,1,...,0,0,0,2254,0,0,0,0,0,2


In [7]:
data0['Label'].value_counts()

Label
1    40073
0     1362
Name: count, dtype: int64

In [8]:
# z = data.columns
# for i in z:
#   print('Val: \n',data[i].value_counts(),'\n\n')

In [9]:
printdata = data0 #the dataframe used for displaying feature names since everthing will be in values now
printdata.to_csv('H:\Hybrid Metaheuristic Feature Selection\Dataset\IoT ID 20_StatisticalFS.csv',index=False)

In [10]:
data  = data0.values
print(data)

[[12682 25883  8002 ... 13532     1     3]
 [12682 25883  8002 ...     0     1     3]
 [12682 25883  8002 ... 11610     1     3]
 ...
 [  922   379   125 ...     0     0     0]
 [47156     7     4 ...     0     0     2]
 [46557 25886  9752 ...     0     0     2]]


In [11]:
#seperate the features and labels
feat  = np.asarray(data[:, 0:-1])   # feature vector
feat

array([[12682, 25883,  8002, ...,     0, 13532,     1],
       [12682, 25883,  8002, ...,     0,     0,     1],
       [12682, 25883,  8002, ...,     0, 11610,     1],
       ...,
       [  922,   379,   125, ...,     0,     0,     0],
       [47156,     7,     4, ...,     0,     0,     0],
       [46557, 25886,  9752, ...,     0,     0,     0]], dtype=int64)

In [12]:
label = np.asarray(data[:, -2])     # label vector
label

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [13]:
# split data into train & validation (70 -- 30)
xtrain, xtest, ytrain, ytest = train_test_split(feat, label, test_size=0.3, stratify=label)

fold = {'xt':xtrain, 'yt':ytrain, 'xv':xtest, 'yv':ytest}
    
# parameter
k     = 5     # k-value in KNN
N     = 34    # number of salps
T     = 5     # maximum number of iterations
maxLt = 10     # maximum iteration for local search algorithm
opts = {'k':k, 'fold':fold, 'N':N, 'T':T, 'maxLt':maxLt}
# opts = Dict()
# for a,b in {'k':k, 'fold':fold, 'N':N, 'T':T, 'maxLt':maxLt}.items():
#     opts[a] = b 

# perform feature selection
fmdl  = jfs(feat, label, opts)
sf    = fmdl['sf']

# model with selected features
num_train = np.size(xtrain, 0)
num_valid = np.size(xtest, 0)
x_train   = xtrain[:, sf]
y_train   = ytrain.reshape(num_train)  # Solve bug
x_valid   = xtest[:, sf]
y_valid   = ytest.reshape(num_valid)  # Solve bug

mdl = KNeighborsClassifier(n_neighbors = k)
mdl.fit(x_train, y_train)

# accuracy
y_pred    = mdl.predict(x_valid)
Acc       = np.sum(y_valid == y_pred)  / num_valid
print("Accuracy:", 100 * Acc)

# number of selected features
num_feat = fmdl['nf']
print("Feature Size:", num_feat)
print("Feature selected:", sf)
# plot convergence
curve   = fmdl['c']
curve   = curve.reshape(np.size(curve,1))
x       = np.arange(0, opts['T'], 1.0) + 1.0

fig, ax = plt.subplots()
ax.plot(x, curve, 'o-')
ax.set_xlabel('Number of Iterations')
ax.set_ylabel('Fitness')
ax.set_title('ISSA')
ax.grid()
plt.show()

TypingError: Failed in nopython mode pipeline (step: nopython frontend)
[1m[1mnon-precise type pyobject[0m
[0m[1mDuring: typing of argument at C:\Users\Dhruva\AppData\Local\Temp\ipykernel_17428\690144866.py (47)[0m
[1m
File "C:\Users\Dhruva\AppData\Local\Temp\ipykernel_17428\690144866.py", line 47:[0m
[1mdef opposition_based_learning(X, lb, ub, thres, N, dim):
    <source elided>
# @jit(target_backend='cuda')
[1m@jit(nopython=True,target_backend='cuda')
[0m[1m^[0m[0m 

This error may have been caused by the following argument(s):
- argument 2: [1mCannot determine Numba type of <class 'dict'>[0m


In [None]:
for i in sf:
  print(printdata.columns[i])

In [None]:

import seaborn as sns

x_train_df = pd.DataFrame(x_train)
x_valid_df = pd.DataFrame(x_valid)

# Calculate correlation matrix
corrmat = x_train_df.corr(method='pearson')
plt.figure(figsize=(10,10))
sns.heatmap(corrmat,annot=True,linewidth=.5,fmt=".1f",cmap=plt.cm.Reds)
plt.show()

# with the following function we can select highly correlated features
# it will remove the first feature that is correlated with anything other feature

def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if (corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

corr_features = correlation(pd.DataFrame(x_train), 0.7)
len(set(corr_features))

corr_features

x_train_corr=pd.DataFrame(x_train).drop(corr_features,axis=1)
x_test_corr=pd.DataFrame(x_valid).drop(corr_features,axis=1)

x_train_corr.shape,x_test_corr.shape,y_train.shape,y_valid.shape

In [None]:
selected_columns_index = sf

In [None]:
selected_columns = printdata.iloc[:, selected_columns_index]
selected_columns

In [None]:
label_index = printdata.columns.get_loc('Label')

In [None]:
label_new = printdata.iloc[:, label_index]
label_new

In [None]:
selected_columns[len(selected_columns.columns)+1] = label_new
selected_columns

In [None]:
new_df = pd.DataFrame(selected_columns)
new_df.to_csv('H:/Hybrid Metaheuristic Feature Selection/Dataset/IoT ID 20_MHFS.csv',index=False)