In [12]:
# libraries:
# General
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Preprocessing, metric, etc...
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix

# Existing methods
from sklearn.ensemble import RandomForestClassifier #NEW
from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score,StratifiedKFold

In [13]:
# Load the UCI Spambase table. The file has no header row, so pandas assigns
# integer column labels; `add_prefix` turns them into 'Column_0', 'Column_1', ...
# (`read_csv(prefix=...)` was deprecated in pandas 1.4 and removed in 2.0.)
df = pd.read_csv(
    'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data',
    header=None,
).add_prefix('Column_')
display(df.sample(5))
# display(df.info())



  df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data', header=None,


Unnamed: 0,Column_0,Column_1,Column_2,Column_3,Column_4,Column_5,Column_6,Column_7,Column_8,Column_9,...,Column_48,Column_49,Column_50,Column_51,Column_52,Column_53,Column_54,Column_55,Column_56,Column_57
4573,0.0,0.0,0.18,0.0,0.18,0.18,0.0,0.0,0.0,0.0,...,0.033,0.033,0.0,0.099,0.0,0.0,1.489,11,137,0
859,0.09,0.0,0.09,0.0,0.39,0.09,0.09,0.0,0.19,0.29,...,0.0,0.14,0.0,0.326,0.155,0.0,6.813,494,1458,1
1589,0.08,0.0,0.32,0.0,0.24,0.32,0.0,0.16,0.16,0.0,...,0.0,0.045,0.0,0.36,0.03,0.0,1.42,10,196,1
2885,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.764,6,30,0
3266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.204,0.068,0.0,0.0,0.0,2.108,15,97,0


In [14]:
# Separate features (every column but the last) from the label (last column),
# then hold out 30% of the rows as the test split.
x_set = df.iloc[:, :-1].to_numpy()
y_set = df.iloc[:, -1].to_numpy()
xtrain, xtest, ytrain, ytest = train_test_split(
    x_set,
    y_set,
    test_size=0.3,
    random_state=10,
)

In [15]:
# Standardise the features: the scaler is fitted on the training rows only
# and the same fitted transform is then applied to the test rows.
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.transform(xtest)

# Calculating accuracy (cross validation)

In [16]:
# file: functionHO (accuracy calculation)

def error_rate(xtrain, ytrain, x, opts, model):
    """Return the 5-fold stratified cross-validation error of `model` on the selected features.

    Parameters
    ----------
    xtrain, ytrain : ndarray
        Training features / labels. Only used when `opts` has no 'fold'
        entry; when 'fold' is present its 'xt' / 'yt' arrays take precedence.
    x : ndarray
        Feature mask; columns where ``x == 1`` are kept.
    opts : dict
        May contain 'fold', a dict whose 'xt' and 'yt' entries hold the
        training features and labels.
    model : estimator
        Passed unchanged to ``cross_val_score``.

    Returns
    -------
    float
        ``1 - mean(cross-validation scores)``.
    """
    fold = opts.get('fold')
    if fold is not None:
        xt, yt = fold['xt'], fold['yt']
    else:
        # No fold supplied: use the data handed in by the caller
        xt, yt = np.asarray(xtrain), np.asarray(ytrain)

    # Number of instances
    num_train = np.size(xt, 0)

    # Keep only the selected feature columns; flatten the labels to 1-D
    x_selected = xt[:, x == 1]
    y_flat = yt.reshape(num_train)

    # Cross validation
    splitter = StratifiedKFold(n_splits=5)
    scores = cross_val_score(model, x_selected, y_flat, cv=splitter)

    return 1 - scores.mean()


# Error rate & Feature size
def Fun(xtrain, ytrain, x, opts, model):
    """Cost of a feature mask: 0.99 * error rate + 0.01 * (selected / total features).

    `x` is the feature mask (entries equal to 1 are selected). A mask with no
    selected feature gets the fixed cost 1 without evaluating the model.
    The remaining arguments are forwarded to `error_rate`.
    """
    error_weight = 0.99
    size_weight = 1 - error_weight

    total_features = len(x)
    selected = np.sum(x == 1)

    # Nothing selected: the model cannot be evaluated, return the fixed cost
    if selected == 0:
        return 1

    err = error_rate(xtrain, ytrain, x, opts, model)
    return error_weight * err + size_weight * (selected / total_features)

# Genetic Algorithm for Feature Selection:

In [17]:
# Genetic Algorithm for Feature Selection:

import numpy as np
from numpy.random import rand

def init_position(lb, ub, N, dim):
    """Draw an (N, dim) float array with column d uniform between lb[0, d] and ub[0, d].

    `lb` and `ub` are indexed as 2-D arrays (row 0). Random numbers come from
    numpy's global generator, one draw per element in row-major order.
    """
    lower = lb[0, :dim]
    span = ub[0, :dim] - lower
    return lower + span * rand(N, dim)

def binary_conversion(X, thres, N, dim):
    """Return an (N, dim) int array that is 1 where X[i, d] > thres and 0 elsewhere."""
    Xbin = np.zeros([N, dim], dtype='int')
    Xbin[X[:N, :dim] > thres] = 1
    return Xbin

def roulette_wheel(prob):
    """Pick an index by roulette-wheel selection over `prob`.

    A single uniform draw P in [0, 1) is compared against the cumulative sum
    of `prob`; the first index whose cumulative value exceeds P is returned.

    Parameters
    ----------
    prob : array-like
        Selection weights. ``np.cumsum`` flattens the input, so an (N, 1)
        column works the same as a length-N vector.

    Returns
    -------
    int
        Selected index. If no cumulative value exceeds P (for example when
        rounding leaves the total slightly below P, or all weights are zero),
        the last index is returned instead of failing on an unbound variable.

    Raises
    ------
    ValueError
        If `prob` is empty.
    """
    num = len(prob)
    if num == 0:
        raise ValueError("roulette_wheel() requires at least one probability")
    C = np.cumsum(prob)
    P = rand()
    for i in range(num):
        if C[i] > P:
            return i
    # No cumulative value exceeded the draw: fall back to the last candidate
    return num - 1

def jfsGA(xtrain, ytrain, opts, model):
    """Genetic-algorithm feature selection.

    Parameters
    ----------
    xtrain : ndarray
        Only its column count is read here (number of candidate features);
        the array itself is forwarded to `Fun`.
    ytrain : ndarray
        Forwarded to `Fun`.
    opts : dict
        Required: 'N' (population size) and 'T' (number of generations).
        Optional: 'CR' (crossover rate, default 0.8) and 'MR' (mutation rate,
        default 0.01). The whole dict is also forwarded to `Fun`.
    model : estimator
        Forwarded to `Fun`.

    Returns
    -------
    dict
        'sf': indices of the selected features, 'c': (1, T) array holding the
        best cost after each generation, 'nf': number of selected features.
    """
    # Parameters
    ub       = 1
    lb       = 0
    thres    = 0.5

    N        = opts['N']
    max_iter = opts['T']
    CR       = opts.get('CR', 0.8)     # crossover rate
    MR       = opts.get('MR', 0.01)    # mutation rate

    # Dimension
    dim = np.size(xtrain, 1)
    ub  = ub * np.ones([1, dim], dtype='float')
    lb  = lb * np.ones([1, dim], dtype='float')

    # Random initial population, thresholded to 0/1 chromosomes
    X = init_position(lb, ub, N, dim)
    X = binary_conversion(X, thres, N, dim)

    # Fitness at first iteration
    fit  = np.zeros([N, 1], dtype='float')
    Xgb  = np.zeros([1, dim], dtype='int')
    fitG = float('inf')
    for i in range(N):
        fit[i, 0] = Fun(xtrain, ytrain, X[i, :], opts, model)
        if fit[i, 0] < fitG:
            Xgb[0, :] = X[i, :]
            fitG      = fit[i, 0]

    curve = np.zeros([1, max_iter], dtype='float')
    t     = 0

    # Plain assignment: fitG may still be a built-in float here, which has no .copy()
    curve[0, t] = fitG
    print("Generation:", t + 1)
    print("Best (GA):", curve[0, t])
    t += 1

    while t < max_iter:
        # Selection probability: lower cost -> higher probability
        inv_fit = 1 / (1 + fit)
        prob    = inv_fit / np.sum(inv_fit)

        # Number of crossovers: one Bernoulli(CR) trial per chromosome
        Nc = int(np.sum(rand(N) < CR))

        x1 = np.zeros([Nc, dim], dtype='int')
        x2 = np.zeros([Nc, dim], dtype='int')
        for i in range(Nc):
            # Parent selection
            P1 = X[roulette_wheel(prob), :]
            P2 = X[roulette_wheel(prob), :]
            # Single cut point in 1 .. dim-1. randint's `high` is exclusive,
            # so `high=dim` is needed to reach dim-1; with a single feature
            # there is nothing to cut, so the children are copies of the parents.
            index = np.random.randint(low=1, high=dim) if dim > 1 else 0
            # Crossover
            x1[i, :] = np.concatenate((P1[:index], P2[index:]))
            x2[i, :] = np.concatenate((P2[:index], P1[index:]))
            # Mutation: flip each gene of each child independently with probability MR
            flip = rand(dim, 2) < MR
            x1[i, flip[:, 0]] = 1 - x1[i, flip[:, 0]]
            x2[i, flip[:, 1]] = 1 - x2[i, flip[:, 1]]

        # Merge two group into one
        Xnew = np.concatenate((x1, x2), axis=0)

        # Fitness of the offspring
        Fnew = np.zeros([2 * Nc, 1], dtype='float')
        for i in range(2 * Nc):
            Fnew[i, 0] = Fun(xtrain, ytrain, Xnew[i, :], opts, model)
            if Fnew[i, 0] < fitG:
                Xgb[0, :] = Xnew[i, :]
                fitG      = Fnew[i, 0]

        # Store result
        curve[0, t] = fitG
        print("Generation:", t + 1)
        print("Best (GA):", curve[0, t])
        t += 1

        # Elitism: keep the N lowest-cost chromosomes among parents and offspring
        XX   = np.concatenate((X, Xnew), axis=0)
        FF   = np.concatenate((fit, Fnew), axis=0)
        keep = np.argsort(FF, axis=0)[:N, 0]
        X    = XX[keep, :]
        fit  = FF[keep, :]

    # Best feature subset
    Gbin      = Xgb[0, :].reshape(dim)
    pos       = np.asarray(range(0, dim))
    sel_index = pos[Gbin == 1]
    num_feat  = len(sel_index)

    # Create dictionary
    ga_data = {'sf': sel_index, 'c': curve, 'nf': num_feat}
    return ga_data

# Particle Swarm Optimization for Feature Selection:

In [18]:
# Particle Swarm Optimization for Feature Selection:

def init_position(lb, ub, N, dim):
    """Build an (N, dim) float array of starting positions.

    Element [i, d] is lb[0, d] + (ub[0, d] - lb[0, d]) * u with u drawn
    uniformly from [0, 1) using numpy's global generator.
    """
    low = lb[0, :dim]
    high = ub[0, :dim]
    X = np.zeros([N, dim], dtype='float')
    for row in range(N):
        # One draw per column, in column order
        X[row, :] = low + (high - low) * rand(dim)
    return X

def init_velocity(lb, ub, N, dim):
    """Return random initial velocities together with their per-dimension limits.

    Returns
    -------
    V : (N, dim) float array, element [i, d] uniform between Vmin[0, d] and Vmax[0, d]
    Vmax : (1, dim) float array, half of (ub - lb) for each dimension
    Vmin : (1, dim) float array, the negation of Vmax
    """
    # Maximum & minimum velocity
    Vmax = np.zeros([1, dim], dtype='float')
    Vmax[0, :] = (ub[0, :dim] - lb[0, :dim]) / 2
    Vmin = -Vmax
    # One uniform draw per element, row by row
    V = Vmin + (Vmax - Vmin) * rand(N, dim)
    return V, Vmax, Vmin

def binary_conversion(X, thres, N, dim):
    """Threshold the first N rows and dim columns of X: 1 where the value exceeds `thres`, else 0."""
    above = X[:N, :dim] > thres
    Xbin = np.zeros([N, dim], dtype='int')
    Xbin[above] = 1
    return Xbin

def boundary(x, lb, ub):
    """Clamp x: values below lb become lb, then values above ub become ub."""
    floored = lb if x < lb else x
    return ub if floored > ub else floored
    
def jfsPSO(xtrain, ytrain, opts, model):
    """Particle-swarm feature selection.

    Parameters
    ----------
    xtrain : ndarray
        Only its column count is read here (number of candidate features);
        the array itself is forwarded to `Fun`.
    ytrain : ndarray
        Forwarded to `Fun`.
    opts : dict
        Required: 'N' (number of particles) and 'T' (number of iterations).
        Optional: 'w' (inertia weight, default 0.9), 'c1' and 'c2'
        (acceleration factors, default 2). The dict is also forwarded to `Fun`.
    model : estimator
        Forwarded to `Fun`.

    Returns
    -------
    dict
        'sf': indices of the selected features, 'c': (1, T) array holding the
        best cost after each iteration, 'nf': number of selected features.
    """
    # Parameters
    ub    = 1
    lb    = 0
    thres = 0.5

    N        = opts['N']
    max_iter = opts['T']
    w        = opts.get('w', 0.9)   # inertia weight
    c1       = opts.get('c1', 2)    # acceleration factor
    c2       = opts.get('c2', 2)    # acceleration factor

    # Dimension: number of columns (features)
    dim = np.size(xtrain, 1)
    ub  = ub * np.ones([1, dim], dtype='float')
    lb  = lb * np.ones([1, dim], dtype='float')

    # Initialize position & velocity
    X             = init_position(lb, ub, N, dim)
    V, Vmax, Vmin = init_velocity(lb, ub, N, dim)

    # Pre
    fit   = np.zeros([N, 1], dtype='float')
    Xgb   = np.zeros([1, dim], dtype='float')               # global best position
    fitG  = float('inf')                                    # global best cost
    Xpb   = np.zeros([N, dim], dtype='float')               # personal best positions
    fitP  = float('inf') * np.ones([N, 1], dtype='float')   # personal best costs
    curve = np.zeros([1, max_iter], dtype='float')

    for t in range(max_iter):
        # Binary conversion
        Xbin = binary_conversion(X, thres, N, dim)

        # Fitness, personal bests and global best
        for i in range(N):
            fit[i, 0] = Fun(xtrain, ytrain, Xbin[i, :], opts, model)
            if fit[i, 0] < fitP[i, 0]:
                Xpb[i, :]  = X[i, :]
                fitP[i, 0] = fit[i, 0]
            if fitP[i, 0] < fitG:
                Xgb[0, :]  = Xpb[i, :]
                fitG       = fitP[i, 0]

        # Store result (plain assignment: fitG may still be a built-in float)
        curve[0, t] = fitG
        print("Iteration:", t + 1)
        print("Best (PSO):", curve[0, t])

        # Velocity / position update for the whole swarm at once.
        # r[i, d, 0] and r[i, d, 1] are the two uniform draws for element (i, d);
        # each element depends only on its own previous value, so this matches
        # the element-by-element update.
        r = rand(N, dim, 2)
        V = w * V + c1 * r[:, :, 0] * (Xpb - X) + c2 * r[:, :, 1] * (Xgb - X)
        V = np.clip(V, Vmin, Vmax)
        X = np.clip(X + V, lb, ub)

    # Best feature subset
    Gbin       = binary_conversion(Xgb, thres, 1, dim)
    Gbin       = Gbin.reshape(dim)
    pos        = np.asarray(range(0, dim))
    sel_index  = pos[Gbin == 1]
    num_feat   = len(sel_index)

    # Create dictionary
    pso_data = {'sf': sel_index, 'c': curve, 'nf': num_feat}
    return pso_data

# file: GA_implementation

In [None]:
# file: GA_implementation

def GA_implementation(model):
    """Evaluate `model` before and after GA feature selection and plot the convergence curve.

    Reads the notebook-level arrays `xtrain`, `ytrain`, `xtest`, `ytest`.
    `model` is fitted in place: first on all features (baseline accuracy),
    then, after `jfsGA` has chosen a subset, on the selected columns only.
    Prints both accuracies, the confusion matrix, the number of selected
    features and their indices, then shows the best cost per generation.
    """
    fold = {'xt': xtrain, 'yt': ytrain, 'xv': xtest, 'yv': ytest}

    # Baseline: all features
    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    print("Accuracy:", accuracy_score(ytest, y_pred))

    # parameter
    k    = 3     # stored in opts under 'k'; not read by the code in this notebook
    N    = 10    # number of chromosomes
    T    = 20    # maximum number of generations
    CR   = 0.8
    MR   = 0.01
    opts = {'k': k, 'fold': fold, 'N': N, 'T': T, 'CR': CR, 'MR': MR}

    # perform feature selection on the training split
    # (not on x_set / y_set, which are unscaled and also contain the test rows)
    fmdl = jfsGA(xtrain, ytrain, opts, model)
    sf   = fmdl['sf']

    # model with selected features
    num_train = np.size(xtrain, 0)
    num_valid = np.size(xtest, 0)
    x_train   = xtrain[:, sf]
    y_train   = ytrain.reshape(num_train)  # flatten labels to 1-D
    x_valid   = xtest[:, sf]
    y_valid   = ytest.reshape(num_valid)   # flatten labels to 1-D

    model.fit(x_train, y_train)

    # accuracy
    y_pred = model.predict(x_valid)
    print("Accuracy:", accuracy_score(y_valid, y_pred))
    print("Confusion Matrix:= \n", confusion_matrix(y_valid, y_pred) )

    # number of selected features
    num_feat = fmdl['nf']
    print("Feature Size:", num_feat)
    print("Features Indexes:", fmdl["sf"])

    # plot convergence
    curve = fmdl['c']
    curve = curve.reshape(np.size(curve, 1))
    x     = np.arange(0, opts['T'], 1.0) + 1.0

    fig, ax = plt.subplots()
    ax.plot(x, curve, 'o-')
    ax.set_xlabel('Number of Iterations')
    ax.set_ylabel('Fitness')
    ax.set_title('GA')
    ax.grid()
    plt.show()

# file: PSO_implementation

In [None]:
# file: PSO_implementation (works with any classifier passed in as `model`)

def PSO_implementation(model):
    """Evaluate `model` before and after PSO feature selection and plot the convergence curve.

    Reads the notebook-level arrays `xtrain`, `ytrain`, `xtest`, `ytest`.
    `model` is fitted in place: first on all features (baseline accuracy),
    then, after `jfsPSO` has chosen a subset, on the selected columns only.
    Prints both accuracies, the confusion matrix, the number of selected
    features and their indices, then shows the best cost per iteration.
    """
    fold = {'xt': xtrain, 'yt': ytrain, 'xv': xtest, 'yv': ytest}

    # Baseline: all features
    model.fit(xtrain, ytrain)
    y_pred = model.predict(xtest)
    print("Accuracy:", accuracy_score(ytest, y_pred))

    # parameter
    k    = 5     # stored in opts under 'k'; not read by the code in this notebook
    N    = 10    # number of particles
    T    = 20    # maximum number of iterations
    w    = 0.9
    c1   = 2
    c2   = 2
    opts = {'k': k, 'fold': fold, 'N': N, 'T': T, 'w': w, 'c1': c1, 'c2': c2}

    # perform feature selection on the training split
    # (not on x_set / y_set, which are unscaled and also contain the test rows)
    fmdl = jfsPSO(xtrain, ytrain, opts, model)
    sf   = fmdl['sf']

    # model with selected features
    num_train = np.size(xtrain, 0)
    num_valid = np.size(xtest, 0)
    x_train   = xtrain[:, sf]
    y_train   = ytrain.reshape(num_train)  # flatten labels to 1-D
    x_valid   = xtest[:, sf]
    y_valid   = ytest.reshape(num_valid)   # flatten labels to 1-D

    model.fit(x_train, y_train)

    # accuracy (confusion matrix printed as well, matching GA_implementation)
    y_pred = model.predict(x_valid)
    print("Accuracy:", accuracy_score(y_valid, y_pred))
    print("Confusion Matrix:= \n", confusion_matrix(y_valid, y_pred) )

    # number of selected features
    num_feat = fmdl['nf']
    print("Feature Size:", num_feat)
    print("Features Indexes:", fmdl["sf"])

    # plot convergence
    curve = fmdl['c']
    curve = curve.reshape(np.size(curve, 1))
    x     = np.arange(0, opts['T'], 1.0) + 1.0

    fig, ax = plt.subplots()
    ax.plot(x, curve, 'o-')
    ax.set_xlabel('Number of Iterations')
    ax.set_ylabel('Fitness')
    ax.set_title('PSO')
    ax.grid()
    plt.show()

# Decision Tree Classifier

In [27]:
# Train
# Entropy-criterion decision tree with a fixed seed, run through the PSO
# feature-selection wrapper. Uncomment the last line to run the GA wrapper instead.
dtClf = tree.DecisionTreeClassifier(random_state=0,criterion='entropy',splitter='best')
PSO_implementation(dtClf)
# GA_implementation(dtClf)


Accuracy: 0.9174511223750905
Generation: 1
Best (GA): 0.10474604990737707


KeyboardInterrupt: 

## Random Forest Classifier

In [28]:
# Random forest with a fixed seed, run through the PSO feature-selection wrapper.
# Uncomment the last line to run the GA wrapper instead.
rfClf = RandomForestClassifier(random_state=0)
PSO_implementation(rfClf)
# GA_implementation(rfClf)


Accuracy: 0.9572773352643013
Iteration: 1
Best (PSO): 0.06389816933638433


KeyboardInterrupt: 

# Naive Bayes

In [13]:
# Gaussian naive Bayes, run through the PSO feature-selection wrapper.
# Uncomment the last line to run the GA wrapper instead.
NBClf = GaussianNB()   
PSO_implementation(NBClf)
# GA_implementation(NBClf)

Class 1:= 0 	 Class 2:= 1
NB for Numerical Data: 

Accuracy:= 0.8211440984793628
Confusion Matrix:= 
 [[595 226]
 [ 21 539]]


# Support Vector Machines -- *Linear*

In [14]:
# SVM model (linear kernel), run through the PSO feature-selection wrapper.
# Uncomment the last line to run the GA wrapper instead.
clf = svm.SVC(kernel='linear') 
PSO_implementation(clf)
# GA_implementation(clf)

Linear SVM: 

Accuracy:= 0.9355539464156408
Confusion Matrix:= 
 [[781  40]
 [ 49 511]]


# Support Vector Machines -- *Kernel*

In [15]:
# SVM model (RBF kernel) wrapped in a grid search over C and gamma.
# Alternative without tuning: clf = svm.SVC(kernel='rbf')
# NOTE(review): the feature-selection wrappers cross-validate the estimator
# once per fitness evaluation, so the whole 5 x 6 grid search below is
# repeated every time — expect this cell to be very slow.
param_grid = {
    "C": [1e3, 5e3, 1e4, 5e4, 1e5],
    "gamma": [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(svm.SVC(kernel="rbf", class_weight="balanced"), param_grid)
PSO_implementation(clf)
# GA_implementation(clf)

Kernel SVM with CV: 

Accuracy:= 0.9283128167994207
Confusion Matrix:= 
 [[770  51]
 [ 48 512]]


## KNN

In [29]:
# KNN model (3 neighbours), run through the GA feature-selection wrapper.
# Uncomment the PSO line to run the PSO wrapper instead.
neigh = KNeighborsClassifier(n_neighbors=3)
# PSO_implementation(neigh)
GA_implementation(neigh)

Accuracy: 0.9051412020275162
Generation: 1
Best (GA): 0.09420578620464189


KeyboardInterrupt: 

## Multi Layer Perceptron Classifier (MLPC)

In [17]:
from sklearn.neural_network import MLPClassifier

# ANN model: multi-layer perceptron with a fixed seed, run through the PSO
# feature-selection wrapper. Uncomment the last line to run the GA wrapper instead.
ANNclf = MLPClassifier(random_state=1)
PSO_implementation(ANNclf)
# GA_implementation(ANNclf)

ANN for Numerical Data: 

Accuracy:= 0.9514844315713251
Confusion Matrix:= 
 [[786  35]
 [ 32 528]]




# AdaBoosted Classifiers

In [18]:
# Create an AdaBoosted decision tree: 200 depth-1 trees (stumps) boosted with
# the SAMME algorithm, then run it through the PSO feature-selection wrapper.
# Uncomment the last line to run the GA wrapper instead.
from sklearn.ensemble import AdaBoostClassifier
dta = AdaBoostClassifier(
    tree.DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200
)

PSO_implementation(dta)
# GA_implementation(dta)

Decision Tree: 

Accuracy:= 0.9362780593772628
Confusion Matrix:= 
 [[788  33]
 [ 55 505]]


In [19]:
# Create an AdaBoosted SVM: 200 linear-kernel SVCs boosted with the SAMME
# algorithm, then run it through the PSO feature-selection wrapper.
# Uncomment the last line to run the GA wrapper instead.
# NOTE(review): AdaBoostClassifier is also imported by the AdaBoosted decision
# tree cell; presumably repeated here so this cell can run on its own.
from sklearn.ensemble import AdaBoostClassifier
svmla = AdaBoostClassifier(
     svm.SVC(kernel='linear'), algorithm="SAMME", n_estimators=200
)
PSO_implementation(svmla)
# GA_implementation(svmla)

Decision Tree: 

Accuracy:= 0.8769007965242578
Confusion Matrix:= 
 [[739  82]
 [ 88 472]]
