In [1]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier


# error rate
def error_rate(xtrain, ytrain, x, opts):
    """Validation error of a KNN classifier restricted to the selected features.

    The data actually used comes from ``opts['fold']`` (keys 'xt', 'yt', 'xv',
    'yv'); the ``xtrain`` / ``ytrain`` arguments are not read.

    x    : mask over the feature columns; columns where ``x == 1`` are kept.
    opts : dict providing 'k' (number of neighbours) and 'fold'.

    Returns 1 minus the fraction of validation rows predicted correctly.
    """
    n_neighbors = opts['k']
    split       = opts['fold']
    train_x, train_y = split['xt'], split['yt']
    valid_x, valid_y = split['xv'], split['yv']

    # Row counts, also used to flatten the label arrays below
    n_train = np.size(train_x, 0)
    n_valid = np.size(valid_x, 0)

    # Keep only the selected feature columns; flatten labels to 1-D
    fit_x   = train_x[:, x == 1]
    fit_y   = train_y.reshape(n_train)
    check_x = valid_x[:, x == 1]
    check_y = valid_y.reshape(n_valid)

    # Fit on the training fold, predict the validation fold
    knn = KNeighborsClassifier(n_neighbors = n_neighbors)
    knn.fit(fit_x, fit_y)
    predicted = knn.predict(check_x)

    hit_ratio = np.sum(check_y == predicted) / n_valid
    return 1 - hit_ratio


# Error rate & Feature size
def Fun(xtrain, ytrain, x, opts):
    """Fitness of a feature mask: weighted error rate plus weighted subset size.

    x is the 0/1 feature mask. An empty selection gets the fixed cost 1;
    otherwise the cost is 0.99 * error + 0.01 * (selected / total), where the
    error comes from ``error_rate``.
    """
    total_features    = len(x)
    selected_features = np.sum(x == 1)

    # Nothing selected: no model can be trained, return the fixed cost
    if selected_features == 0:
        return 1

    # Trade-off weights between classification error and subset size
    weight_error = 0.99
    weight_size  = 1 - weight_error

    err = error_rate(xtrain, ytrain, x, opts)
    return weight_error * err + weight_size * (selected_features / total_features)

In [2]:
import numpy as np
from numpy.random import rand
#from FS.functionHO import Fun


def init_position(lb, ub, N, dim):
    """Draw N random positions uniformly between the per-dimension bounds.

    lb, ub : 2-D arrays; only row 0 and the first ``dim`` columns are read.
    N      : number of positions (rows).
    dim    : number of dimensions (columns).

    Returns a float array of shape (N, dim).
    """
    lower = lb[0, :dim]
    upper = ub[0, :dim]
    # One vectorised draw; rand(N, dim) fills row by row from the same global
    # stream as N * dim successive rand() calls, so seeded runs are unchanged.
    return lower + (upper - lower) * rand(N, dim)


def binary_conversion(X, thres, N, dim):
    """Threshold real-valued positions into a 0/1 integer mask.

    X     : 2-D array; the leading N rows and dim columns are read.
    thres : entries strictly greater than this become 1, all others 0.

    Returns an int array of shape (N, dim).
    """
    return (X[:N, :dim] > thres).astype('int')


def boundary(x, lb, ub):
    """Clamp a scalar to the bounds: raise to lb first, then cap at ub."""
    raised = lb if x < lb else x
    return ub if raised > ub else raised


#--- Opposition based learning (7)
def opposition_based_learning(X, lb, ub, thres, N, dim):
    """Mirror every position inside its bounds: ``lb + ub - X``.

    X      : 2-D array; the leading N rows and dim columns are read.
    lb, ub : 2-D arrays; only row 0 and the first ``dim`` columns are read.
    thres  : accepted for signature compatibility, not used.

    Returns a float array of shape (N, dim).
    """
    # Row 0 of the bounds broadcasts across all N rows of X
    return (lb[0, :dim] + ub[0, :dim] - X[:N, :dim]).astype('float')


def jfs(xtrain, ytrain, opts):
    """Run the ISSA feature-selection search and return the best subset found.

    A population of N real-valued positions in [0, 1] is maintained. Each
    position is turned into a 0/1 feature mask with ``binary_conversion``
    (threshold 0.5) and scored with ``Fun``; lower scores are better.

    Parameters
    ----------
    xtrain : array with at least two dimensions
        Only the size of its second axis is read here (number of features);
        it is otherwise forwarded to ``Fun`` unchanged.
    ytrain : forwarded to ``Fun`` unchanged.
    opts : dict
        'N'     -- population size,
        'T'     -- number of iterations (length of the convergence curve),
        'maxLt' -- optional number of local-search iterations per outer
                   iteration (default 10).
        The whole dict is also forwarded to ``Fun``.

    Returns
    -------
    dict
        'sf' -- indices of the selected features,
        'c'  -- array of shape (1, T) with the best fitness after each iteration,
        'nf' -- number of selected features.
    """
    # Parameters
    ub             = 1
    lb             = 0
    thres          = 0.5
    max_local_iter = 10     # maximum iteration for local search

    N              = opts['N']
    max_iter       = opts['T']
    if 'maxLt' in opts:
        max_local_iter = opts['maxLt']

    # Dimension
    dim = np.size(xtrain, 1)
    if np.size(lb) == 1:
        # Expand the scalar bounds to one value per feature
        ub = ub * np.ones([1, dim], dtype='float')
        lb = lb * np.ones([1, dim], dtype='float')

    # Initialize position
    X    = init_position(lb, ub, N, dim)

    # Pre-allocate fitness and the best-so-far position / fitness
    fit  = np.zeros([N, 1], dtype='float')
    Xf   = np.zeros([1, dim], dtype='float')
    fitF = float('inf')

    # Binary conversion
    Xbin = binary_conversion(X, thres, N, dim)

    # Fitness
    for i in range(N):
        fit[i,0] = Fun(xtrain, ytrain, Xbin[i,:], opts)
        if fit[i,0] < fitF:
            Xf[0,:] = X[i,:]
            fitF    = fit[i,0]

    #--- Opposition based learning
    Xo    = opposition_based_learning(X, lb, ub, thres, N, dim)
    #--- Binary conversion
    Xobin = binary_conversion(Xo, thres, N, dim)

    #--- Fitness
    fitO  = np.zeros([N, 1], dtype='float')
    for i in range(N):
        fitO[i,0] = Fun(xtrain, ytrain, Xobin[i,:], opts)
        if fitO[i,0] < fitF:
            Xf[0,:] = Xo[i,:]
            fitF    = fitO[i,0]

    #--- Merge opposite & current population, and select best N
    XX    = np.concatenate((X, Xo), axis=0)
    FF    = np.concatenate((fit, fitO), axis=0)
    #--- Sort in ascending order and keep the N lowest-fitness rows.
    # Indexing FF with two indices yields true scalars / 2-D slices instead of
    # assigning a 1-element array into a scalar slot (deprecated in NumPy).
    order = np.argsort(FF, axis=0)[:N, 0]
    X     = XX[order, :]
    fit   = FF[order, :]

    curve = np.zeros([1, max_iter], dtype='float')
    t     = 0

    # Store result. fitF may still be a plain Python number here, so it is
    # stored directly rather than through a NumPy-only .copy() call.
    curve[0,t] = fitF
    print("Iteration:", t + 1)
    print("Best (ISSA):", curve[0,t])
    t += 1

    # The local search flips up to three features; never more than exist
    num_flip = min(3, dim)

    while t < max_iter:
        # Compute coefficient, c1 (2)
        c1 = 2 * np.exp(-(4 * t / max_iter) ** 2)

        for i in range(N):
            # First leader update
            if i == 0:
                for d in range(dim):
                    # Coefficient c2 & c3 [0 ~ 1]
                    c2 = rand()
                    c3 = rand()
                    # Leader update (1)
                    if c3 >= 0.5:
                        X[i,d] = Xf[0,d] + c1 * ((ub[0,d] - lb[0,d]) * c2 + lb[0,d])
                    else:
                        X[i,d] = Xf[0,d] - c1 * ((ub[0,d] - lb[0,d]) * c2 + lb[0,d])

                    # Boundary
                    X[i,d] = boundary(X[i,d], lb[0,d], ub[0,d])

            # Salp update
            elif i >= 1:
                for d in range(dim):
                    # Salp update by following front salp (3); X[i-1] has
                    # already been updated in this iteration
                    X[i,d] = (X[i,d] + X[i-1, d]) / 2
                    # Boundary
                    X[i,d] = boundary(X[i,d], lb[0,d], ub[0,d])

        # Binary conversion
        Xbin = binary_conversion(X, thres, N, dim)

        # Fitness
        for i in range(N):
            fit[i,0] = Fun(xtrain, ytrain, Xbin[i,:], opts)
            if fit[i,0] < fitF:
                Xf[0,:] = X[i,:]
                fitF    = fit[i,0]

        #--- Local search algorithm
        # temp starts from the current best position once per outer iteration;
        # the flips below accumulate across the local iterations.
        Lt        = 0
        temp      = np.zeros([1, dim], dtype='float')
        temp[0,:] = Xf[0,:]

        while Lt < max_local_iter:
            #--- Random three features
            RD = np.random.permutation(dim)
            for d in range(num_flip):
                index = RD[d]
                #--- Flip the selected three features
                # NOTE(review): a value of exactly 0 becomes thres, which still
                # binarises to 0 because the conversion uses a strict '>' --
                # confirm whether that is intended.
                if temp[0,index] > thres:
                    temp[0,index] = temp[0,index] - thres
                else:
                    temp[0,index] = temp[0,index] + thres

            #--- Binary conversion
            temp_bin = binary_conversion(temp, thres, 1, dim)

            #--- Fitness
            Fnew = Fun(xtrain, ytrain, temp_bin[0,:], opts)
            if Fnew < fitF:
                fitF    = Fnew
                Xf[0,:] = temp[0,:]

            Lt += 1


        # Store result
        curve[0,t] = fitF
        print("Iteration:", t + 1)
        print("Best (ISSA):", curve[0,t])
        t += 1


    # Best feature subset
    Gbin       = binary_conversion(Xf, thres, 1, dim)
    Gbin       = Gbin.reshape(dim)
    pos        = np.asarray(range(0, dim))
    sel_index  = pos[Gbin == 1]
    num_feat   = len(sel_index)
    # Create dictionary
    issa_data = {'sf': sel_index, 'c': curve, 'nf': num_feat}

    return issa_data

In [3]:
import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [4]:
# Read the dataset. Use your own dataset path here.
# The path is a raw string: '\H', '\D' and '\I' are not valid escape sequences,
# and recent Python versions warn about them in ordinary string literals.
# The resulting path text is unchanged.
data0  = pd.read_csv(r'\Hybrid Metaheuristic Feature Selection\Dataset\IoT ID 20_StatisticalFS.csv',low_memory=False)
data0

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Init_Bwd_Win_Byts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Cat,Label
0,12446,25883,2,203,2,1,3496,10117,1,0,...,0,0,0,0,15418,0,10017,9144,2,0
1,22760,34617,1278,200,727,2,3664,7491,1,14,...,210,0,0,0,6857,10148,6217,316,0,0
2,12691,25886,6121,200,1027,2,2082,1790,0,21,...,392,0,0,0,14878,253,9636,8714,4,0
3,12704,25886,6207,200,1027,2,791,2084,0,14,...,392,0,0,0,3092,0,1995,1912,2,0
4,611,25881,3113,317,10,1,1040,2125,31,0,...,0,0,0,0,15529,253,10191,9219,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
625778,62081,25889,6869,233,1015,1,3443,4328,1,0,...,0,0,0,0,7091,0,4255,3977,2,0
625779,18760,30623,4523,200,727,2,3637,2435,0,14,...,210,0,0,0,3639,0,2348,2229,0,0
625780,12695,25886,6134,200,1027,2,2285,10334,1,0,...,1797,0,0,0,15620,0,10191,9299,4,0
625781,12682,25883,11200,203,392,2,2872,3793,31,0,...,392,0,0,0,1526,20579,1205,758,3,1


In [5]:
# Number of rows per value of the Label column
data0.Label.value_counts()

Label
0    585710
1     40073
Name: count, dtype: int64

In [6]:
# Rows with Label == 1: keep all of them, shuffled with a fixed seed
data1 = data0.loc[data0['Label'] == 1].sample(frac=1, random_state=42, axis=0)

# Rows with Label == 0: keep a 10% random sample (without replacement)
data2 = data0.loc[data0['Label'] == 0].sample(frac=0.1, random_state=42, axis=0)

# Stack both parts and rebind data0 to the reduced frame
data0 = pd.concat([data1, data2])
data0

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Init_Bwd_Win_Byts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Cat,Label
336733,12682,25883,11200,203,392,2,2816,3110,0,21,...,1722,0,0,0,17759,12006,932,9299,3,1
385404,47157,25889,5495,10,236,2,2910,4858,0,14,...,3079,0,0,0,8244,0,4802,4485,3,1
513047,12683,25886,5250,200,1027,2,2696,6305,1,0,...,1888,0,0,0,10469,0,6317,5869,3,1
534342,12682,25883,11200,203,392,2,2762,11260,0,14,...,1722,0,0,0,16651,0,10965,10009,3,1
44163,12682,25883,11200,203,392,2,2845,4130,1,0,...,392,0,0,0,6812,0,4047,3793,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276094,12144,25883,3535,69,1008,2,1218,9904,1,0,...,338,0,0,0,15196,0,9832,8980,2,0
240069,12446,25883,2,203,2,1,3506,2040,0,14,...,0,0,0,0,2943,0,1956,1874,2,0
246571,50657,6279,9222,205,12,2,515,4528,0,14,...,1,0,0,0,7396,0,4455,4165,0,0
288972,47156,7,4278,205,217,2,3223,3613,31,0,...,695,0,0,0,1151,18423,2065,9066,2,0


In [7]:
# Number of rows per Label value after the subsampling above
data0.Label.value_counts()

Label
0    58571
1    40073
Name: count, dtype: int64

In [8]:
# printdata refers to the same frame as data0 (not a copy). It is used for
# displaying feature names, since everything will be in values from now on.
printdata = data0
# Raw string: '\H', '\D' and '\I' are not valid escape sequences and trigger a
# warning in ordinary string literals. The resulting path text is unchanged.
printdata.to_csv(r'\Hybrid Metaheuristic Feature Selection\Dataset\IoT ID 20_GetColumnNames.csv',index=False)

In [9]:
# Feature columns (everything except Label) and the target column
X = data0.drop(columns=['Label'])
y = data0['Label']

In [10]:
# Hold out 30% of the rows as a test set with sklearn's train_test_split(),
# using a fixed random_state so the split is repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [11]:
# Shapes of the four parts produced by the split
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((69050, 43), (69050,), (29594, 43), (29594,))

In [12]:
from sklearn.preprocessing import RobustScaler

# Fit a RobustScaler on the full feature frame and keep the scaled values.
# NOTE(review): `features` is not referenced by any later cell visible here,
# so the following steps appear to run on the unscaled X -- confirm intent.
scaler = RobustScaler().fit(X)
features = scaler.transform(X)

In [13]:
from collections import Counter
from imblearn.over_sampling import SMOTE

In [14]:
# Class distribution of y before SMOTE resampling
Counter(y)

Counter({0: 58571, 1: 40073})

In [15]:
# Resample X / y with SMOTE, using a fixed random_state.
# NOTE(review): the resampling is applied to the full X / y, and the
# train/validation split used for feature selection is drawn later from the
# resampled data -- confirm that resampling before splitting is intended.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

In [16]:
# Class distribution of y_res after SMOTE resampling
Counter(y_res)

Counter({1: 58571, 0: 58571})

In [22]:
# From here on X / y refer to the SMOTE-resampled data
X, y = X_res, y_res

In [23]:
# Display the resampled feature frame
X

Unnamed: 0,Flow_ID,Src_IP,Src_Port,Dst_IP,Dst_Port,Protocol,Timestamp,Flow_Duration,Tot_Fwd_Pkts,Tot_Bwd_Pkts,...,Init_Fwd_Win_Byts,Init_Bwd_Win_Byts,Fwd_Seg_Size_Min,Active_Mean,Active_Std,Idle_Mean,Idle_Std,Idle_Max,Idle_Min,Cat
0,12682,25883,11200,203,392,2,2816,3110,0,21,...,0,1722,0,0,0,17759,12006,932,9299,3
1,47157,25889,5495,10,236,2,2910,4858,0,14,...,0,3079,0,0,0,8244,0,4802,4485,3
2,12683,25886,5250,200,1027,2,2696,6305,1,0,...,0,1888,0,0,0,10469,0,6317,5869,3
3,12682,25883,11200,203,392,2,2762,11260,0,14,...,0,1722,0,0,0,16651,0,10965,10009,3
4,12682,25883,11200,203,392,2,2845,4130,1,0,...,0,392,0,0,0,6812,0,4047,3793,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117137,12683,25886,5250,200,1027,2,2678,5535,1,0,...,0,2586,0,0,0,9219,0,5505,5125,3
117138,12682,25883,11200,203,392,2,2820,4067,1,0,...,0,392,0,0,0,6696,0,3977,3729,3
117139,12682,25883,11200,203,392,2,2820,9904,1,0,...,0,392,0,0,0,15196,0,9832,8980,3
117140,12682,25883,11200,203,392,2,2755,7288,31,0,...,0,392,0,0,0,6689,19224,4483,3208,3


In [26]:
# Replace the pandas objects by their underlying value arrays.
# (Re-running this cell fails, because the results no longer have `.values`.)
X, y = X.values, y.values

In [27]:
# Display the feature values after the conversion from the pandas frame
X

array([[12682, 25883, 11200, ...,   932,  9299,     3],
       [47157, 25889,  5495, ...,  4802,  4485,     3],
       [12683, 25886,  5250, ...,  6317,  5869,     3],
       ...,
       [12682, 25883, 11200, ...,  9832,  8980,     3],
       [12682, 25883, 11200, ...,  4483,  3208,     3],
       [12686, 25886,  5261, ...,  7322,  6756,     3]], dtype=int64)

In [None]:
# data  = data0.values
# print(data)

In [None]:
# #separate the features and labels
# feat  = np.asarray(data[:, 0:-1])   # feature vector
# feat

In [None]:
# label = np.asarray(data[:, -2])     # label vector
# label

In [28]:
# Fix the seeds so the split and the stochastic search give the same result on
# every run. jfs() draws from NumPy's global generator (rand() and
# np.random.permutation()), so that generator is seeded here as well.
SEED = 42
np.random.seed(SEED)

# split data into train & validation (70 -- 30)
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, stratify=y, random_state=SEED)
fold = {'xt':xtrain, 'yt':ytrain, 'xv':xtest, 'yv':ytest}

# real parameter
k     = 5     # k-value in KNN
N     = 44    # number of salps
T     = 5     # maximum number of iterations
maxLt = 10    # maximum iteration for local search algorithm
opts  = {'k':k, 'fold':fold, 'N':N, 'T':T, 'maxLt':maxLt}

#trial parameters
# k     = 5     # k-value in KNN
# N     = 44    # number of salps
# T     = 5   # maximum number of iterations
# maxLt = 4    # maximum iteration for local search algorithm
# opts  = 10
# {'k':k, 'fold':fold, 'N':N, 'T':T, 'maxLt':maxLt}

# perform feature selection
fmdl  = jfs(X, y, opts)
sf    = fmdl['sf']

# model with selected features
num_train = np.size(xtrain, 0)
num_valid = np.size(xtest, 0)
x_train   = xtrain[:, sf]
y_train   = ytrain.reshape(num_train)  # flatten labels to 1-D
x_valid   = xtest[:, sf]
y_valid   = ytest.reshape(num_valid)   # flatten labels to 1-D

mdl = KNeighborsClassifier(n_neighbors = k)
mdl.fit(x_train, y_train)

# accuracy on the validation part, using only the selected features
y_pred    = mdl.predict(x_valid)
Acc       = np.sum(y_valid == y_pred)  / num_valid
print("Accuracy:", 100 * Acc)

# number of selected features
num_feat = fmdl['nf']
print("Feature Size:", num_feat)
print("Feature selected:", sf)
# plot convergence: best fitness stored by jfs after each iteration
curve   = fmdl['c']
curve   = curve.reshape(np.size(curve,1))
x       = np.arange(0, opts['T'], 1.0) + 1.0

fig, ax = plt.subplots()
ax.plot(x, curve, 'o-')
ax.set_xlabel('Number of Iterations')
ax.set_ylabel('Fitness')
ax.set_title('ISSA')
ax.grid()
plt.show()

KeyboardInterrupt: 

In [None]:
# Print the column name behind each selected feature index
for feature_name in printdata.columns[sf]:
  print(feature_name)

In [None]:
import seaborn as sns

# Wrap the selected-feature arrays in DataFrames
x_train_df = pd.DataFrame(x_train)
x_valid_df = pd.DataFrame(x_valid)

# Pearson correlation between the selected training features, as a heatmap
corrmat = x_train_df.corr(method='pearson')
plt.figure(figsize=(10, 10))
sns.heatmap(
    corrmat,
    annot=True,
    linewidth=.5,
    fmt=".1f",
    cmap=plt.cm.Reds,
)
plt.show()

# With the following function we can select highly correlated features.
# It will remove the first feature that is correlated with any other feature.

def correlation(dataset, threshold):
    """Return the names of columns strongly correlated with an earlier column.

    dataset   : DataFrame whose pairwise column correlations are computed
                with ``dataset.corr()``.
    threshold : a column is flagged when the absolute value of its correlation
                with any column to its left is greater than this.

    Returns a set of column names.
    """
    col_corr = set()  # names of the flagged columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            # Compare the magnitude so strong negative correlations count too
            if abs(corr_matrix.iloc[i, j]) > threshold:
                col_corr.add(corr_matrix.columns[i])
                break  # column i is already flagged, no need to look further
    return col_corr

# Columns of the selected training features flagged by correlation() at 0.7
corr_features = correlation(pd.DataFrame(x_train), 0.7)
# The next two bare expressions are evaluated but not shown: a cell only
# displays its last expression.
len(set(corr_features))

corr_features

# Drop the flagged columns from the training and validation features
x_train_corr = pd.DataFrame(x_train).drop(columns=corr_features)
x_test_corr = pd.DataFrame(x_valid).drop(columns=corr_features)

# Shapes after the drop, next to the label shapes
(x_train_corr.shape, x_test_corr.shape, y_train.shape, y_valid.shape)

In [None]:
# Positions of the features selected by the search (alias of sf)
selected_columns_index = sf

In [None]:
# Pick the selected feature columns from printdata by position
selected_columns = printdata.iloc[:, selected_columns_index]
selected_columns

In [None]:
# Integer position of the 'Label' column in printdata
label_index = printdata.columns.get_loc('Label')

In [None]:
# Label column of printdata, selected by position
label_new = printdata.iloc[:, label_index]
label_new

In [None]:
# Add the label series as a new column whose key is the integer
# (current number of columns + 1).
# NOTE(review): selected_columns was sliced from printdata, so pandas may emit
# a SettingWithCopyWarning on this assignment -- confirm and, if so, take an
# explicit copy where selected_columns is created.
selected_columns[selected_columns.shape[1] + 1] = label_new
selected_columns

In [None]:
# Write the selected feature columns plus the label column to disk.
# Raw string: '\H', '\D' and '\I' are not valid escape sequences and trigger a
# warning in ordinary string literals. The resulting path text is unchanged.
new_df = pd.DataFrame(selected_columns)
new_df.to_csv(r'\Hybrid Metaheuristic Feature Selection\Dataset\IoT ID 20_MHFS.csv',index=False)