In [1]:
import numpy as np
# Import backend modules
import pyswarms.backend as P
from pyswarms.backend.topology import Star
import numpy as np
import pandas as pd
from scipy.io import arff

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
def data_prepration(file):
    data = arff.loadarff(file)
    X=[]
    y=[]
    for i in data[0]:

        X.append(list(i)[0:len(i)-2])
        if(i[len(i)-1] == b'Y'):
            y.append(1)
        else:
            y.append(0)
        
    X=np.array(X)
    y=np.array(y)
    return X,y

In [4]:
from sklearn.naive_bayes import GaussianNB

In [5]:
def f_per_particle(m):
    classifier = GaussianNB()
    """Computes for the fitness function per particle

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO, will
        be used to mask features.

    Returns
    -------
    numpy.ndarray
        Computed finess function
    """
    count_non_zero=np.count_nonzero(m)
    # Get the subset of the features from the binary mask
    if np.count_nonzero(m) == 0:
        X_subset = X_train
        X_subset_test= X_test
        count_non_zero=len(m)
    else:
        
        X_subset = X_train[:,m==1]
        X_subset_test= X_test[:,m==1]
        
    
    # Perform classification and compute the fitness function
    classifier.fit(X_subset, y_train)
    error_rate=(classifier.predict(X_subset_test) != y_test).mean()

    j =  error_rate
#     j=np.count_nonzero(m)

    return j

In [6]:
def f(x):
    """Higher-level method to do classification in the
    whole swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        The swarm that will perform the search

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle
    """
    n_particles = x.shape[0]
    j = [f_per_particle(x[i]) for i in range(n_particles)]
    
    return np.array(j)

In [7]:
def _compute_position(swarm):
        """Update the position matrix of the swarm
        This computes the next position in a binary swarm. It compares the
        sigmoid output of the velocity-matrix and compares it with a randomly
        generated matrix.
        Parameters
        ----------
        swarm: pyswarms.backend.swarms.Swarm
            a Swarm class
        """
        
        temp=np.random.random_sample(size=(swarm.n_particles,swarm.dimensions)) < swarm.velocity
        postion=np.empty((my_swarm.n_particles, my_swarm.dimensions))
        postion1=swarm.position

        for j in range(swarm.n_particles):
            for i in range(len(temp[0])):
                if (temp[j][i] == True ):                  
                    postion[j][i]=1-postion1[j][i]
                else:
                    postion[j][i]=postion1[j][i]
            
                
        
        return postion

In [8]:
def compute_velocity(my_swarm, stickness,Is ,bounds=None):
    """Update the velocity matrix
    This method updates the velocity matrix using the best and current
    positions of the swarm. The velocity matrix is computed using the
    cognitive and social terms of the swarm. The velocity is handled
    by a :code:`VelocityHandler`.
    A sample usage can be seen with the following:
    .. code-block :: python
        import pyswarms.backend as P
        from pyswarms.swarms.backend import Swarm, VelocityHandler
        my_swarm = P.create_swarm(n_particles, dimensions)
        my_vh = VelocityHandler(strategy="invert")
        for i in range(iters):
            # Inside the for-loop
            my_swarm.velocity = compute_velocity(my_swarm, clamp, my_vh, bounds)
    Parameters
    ----------
    swarm : pyswarms.backend.swarms.Swarm
        a Swarm instance
    clamp : tuple of floats, optional
        a tuple of size 2 where the first entry is the minimum velocity
        and the second entry is the maximum velocity. It
        sets the limits for velocity clamping.
    vh : pyswarms.backend.handlers.VelocityHandler
        a VelocityHandler object with a specified handling strategy.
        For further information see :mod:`pyswarms.backend.handlers`.
    bounds : tuple of numpy.ndarray or list, optional
        a tuple of size 2 where the first entry is the minimum bound while
        the second entry is the maximum bound. Each array must be of shape
        :code:`(dimensions,)`.
    Returns
    -------
    numpy.ndarray
        Updated velocity matrix
    """
    try:
        # Prepare parameters
        swarm_size = my_swarm.position.shape
        alpha = my_swarm.options["c1"]
        gb=my_swarm.best_pos
        Pb= my_swarm.pbest_pos
        updated_velocity= np.empty((my_swarm.n_particles, my_swarm.dimensions))

        for p in range(my_swarm.n_particles):
            p_postion=my_swarm.position[p]
            pb=Pb[p]

            for i in range(len(p_postion)):                
                if (p_postion[i] == pb[i] and   pb[i] == gb[i]):
                    
                    updated_velocity[p][i]=Is*(1-stickness[p][i])
                elif(p_postion[i] == pb[i] and pb[i] != gb[i]):
                    
                    updated_velocity[p][i]=Is*(1-stickness[p][i] - (1/(alpha+1))) +(1/(alpha+1))
                elif(p_postion[i] != pb[i] and p_postion[i]== gb[i]):
                    
                    updated_velocity[p][i]=Is*(1-stickness[p][i] - (alpha/(alpha+1))) +(alpha/(alpha+1))
                elif(p_postion[i] != pb[i] and pb[i]== gb[i]):
                    
                    updated_velocity[p][i]= 1-Is*stickness[p][i]
        

    except AttributeError:
        rep.logger.exception(
            "Please pass a Swarm class. You passed {}".format(type(swarm))
        )
        raise
    except KeyError:
        rep.logger.exception("Missing keyword in swarm.options")
        raise
    else:

        return updated_velocity

In [9]:
def compute_stickness(stickness, old_position, ustkS, notchanged):
    
    new_stickness=np.empty((my_swarm.n_particles, my_swarm.dimensions))
    new_position=my_swarm.position
    
    for j in range(my_swarm.n_particles):

        
        for i in range(len(stickness[j])):
            if(new_position[j][i] == old_position[j][i]):
                notchanged[j][i]+=1
                if(notchanged[j][i] < ustkS):
                    val=stickness[j][i] - (1/ustkS)
                    if(val > 0):
                        new_stickness[j][i]=val
                    else:
                        new_stickness[j][i]=0
                else:
                    new_stickness[j][i]=0   
            else:
                notchanged[j][i]=0
                new_stickness[j][i]=1

    return new_stickness

In [10]:
def AUC(y_true,y_pred):
    
    from sklearn.metrics import roc_curve, auc
    from sklearn.preprocessing import label_binarize
    from sklearn.multiclass import OneVsRestClassifier
    from scipy import interp

    from sklearn.metrics import roc_curve  
    from sklearn.metrics import roc_auc_score ,auc
    from  sklearn.preprocessing import label_binarize 
    
    classes=list(set(y))
    y_test = label_binarize(y_true, classes=classes)
    y_score = label_binarize(y_pred, classes=classes)

    n_classes = len(classes)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    for i in range(n_classes):
        fpr[i], tpr[i], _ = roc_curve(y_test[:,i], y_score[:,i])
        roc_auc[i] = auc(fpr[i], tpr[i])
    
    fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
    roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
    
    #auc = roc_auc_score(y,y_pred,average='micro')  
      
    

    # Compute macro-average ROC curve and ROC area

    # First aggregate all false positive rates
    all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

    # Then interpolate all ROC curves at this points
    mean_tpr = np.zeros_like(all_fpr)
    for i in range(n_classes):
        mean_tpr += interp(all_fpr, fpr[i], tpr[i])

    # Finally average it and compute AUC
    mean_tpr /= n_classes

    fpr["macro"] = all_fpr
    tpr["macro"] = mean_tpr
    roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
    
    
    return roc_auc,tpr,fpr

In [11]:
def run_swarm():
   

    stickness=np.random.random_sample(size=(my_swarm.n_particles,my_swarm.dimensions))
    notchanged=np.zeros((my_swarm.n_particles,my_swarm.dimensions))

    
    for i in range(iterations):
        my_swarm.current_cost = f(my_swarm.position) # Compute current cost
        my_swarm.pbest_cost = f(my_swarm.pbest_pos)  # Compute personal best pos
        my_swarm.pbest_pos, my_swarm.pbest_cost = P.compute_pbest(my_swarm) # Update and store



        Is=Isu-(i/iterations) *(Isu-Isl)
        ustkS = ustkSL + (i/iterations) *(ustkSU- ustkSL)

        # Part 2: Update global best
        # Note that gbest computation is dependent on your topology
        if np.min(my_swarm.pbest_cost) < my_swarm.best_cost:
            my_swarm.best_pos, my_swarm.best_cost = my_topology.compute_gbest(my_swarm)
        if i%20==0:
                print('Iteration: {} | my_swarm.best_cost: {:.4f}'.format(i+1, my_swarm.best_cost))

        my_swarm.velocity = compute_velocity(my_swarm, stickness, Is)
        old_position= my_swarm.position
        my_swarm.position = _compute_position(my_swarm)
        stickness= compute_stickness(stickness,old_position, ustkS, notchanged)

In [12]:
import os
root_folder="D:\\SDP\\Afnan"
files = [os.path.join(root_folder, x) for x in os.listdir(root_folder)]

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix, multilabel_confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
import math 

In [15]:
my_topology = Star() # The Topology Class
my_options = {'c1': 0.6} # arbitrarily set
n_particles= 20

filexs=pd.DataFrame(columns=["Dataset", "precision", "Accuracy","Recall","F-measure","AUC","G-mean"])
fnum=-1
for file in files:
    fnum+=1
    
    ds=file.split("\\")[3][0:-5]

    print("{} dataset...".format(ds))
    
#     ds=file
    fr = open("C:/Users/afnan/SDP/SDP/Test Result/SBPSO DS2/NB/{}.txt".format(ds), "a")
    fr.write('parameters: '+str(my_options)+"\n")
    
    # read the dataset and prepare it X for features and y for lables:
    
    X,y=data_prepration(file)
    
    # split dataset for training and testing DS
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
    
    # create swarm based on num of features and num of particles and the options 
    my_swarm = P.create_swarm(n_particles=n_particles,
                              dimensions=X.shape[1], options=my_options, discrete=True, binary=True) # The Swarm Class
    
    # set swarm parameters 
    
    iterations=100
    ustkSL= 1*(iterations/100)
    ustkSU= 8*(iterations/100)
    Isu=0
    Isl=10/my_swarm.dimensions
    
    # run swarm 
    run_swarm()
    
    # if the best pos no features select all features 
    if( np.count_nonzero( my_swarm.best_pos) == 0):
        my_swarm.best_pos= np.ones(my_swarm.dimensions)
    
    # train the classfier on the selected features 
    classifier =GaussianNB()
    X_selected_features = X_train[:,my_swarm.best_pos==1]
    X_selected_features_test= X_test[:,my_swarm.best_pos==1]
    classifier.fit(X_selected_features, y_train)
    
    # test it and evaluate 
    y_pred= classifier.predict(X_selected_features_test)    
    report=classification_report(y_test,y_pred)
    subset_performance = (y_pred== y_test).mean()
    
    print(len(list(set(y))))
    if(len(list(set(y)))>2):
        print('multi')
        matrix = multilabel_confusion_matrix(y_test, y_pred)
        tn = matrix[:, 0, 0]
        tp = matrix[:, 1, 1]
        fn = matrix[:, 1, 0]
        fp = matrix[:, 0, 1]
        G_mean= math.sqrt((tp.mean()/(tp.mean()+fn.mean()))*(tn.mean()/(fp.mean()+tn.mean())))
        auc=AUC(y_test,y_pred)[0]["micro"]
    else:
        print('binary')
        matrix= confusion_matrix(y_test,y_pred, labels=list(set(y)))
        tn, fp, fn, tp = matrix.ravel()
        G_mean= math.sqrt((tp/(tp+fn))*(tn/(fp+tn)))
        auc=roc_auc_score(y_test,y_pred)
        
    ac=accuracy_score(y_test,y_pred)
    prec=precision_score(y_test,y_pred,average='weighted')
    rc=recall_score(y_test,y_pred,average='weighted')
    F_Score = (2 * rc * prec) / (rc + prec)  
    
    
    # store the result 
    print("subset performance= {}".format(subset_performance))
    fr.write('number of particle {}:\n'.format(n_particles))
    fr.write("best pos cost= "+str(my_swarm.best_cost)+"\nNumber of selected features:"+str(len([i for i in my_swarm.best_pos if i==1]))+"\nSelected features:"+str(my_swarm.best_pos))
    fr.write("\nclassification_report:"+str(report)+"\n")
    fr.write("subset performance = {}".format(subset_performance))
    
    fr.write("accuracy_score= {}\n".format(ac))
    fr.write("precision_score= {}\n".format(prec))
    fr.write("recall_score= {}\n".format(rc))
    fr.write("F_Score= {}\n".format(F_Score))
    fr.write("auc= {}\n".format(auc))
    fr.write("G_mean= {}\n".format(G_mean))
    
    
    
    fr.write("confusion_matrix=\n{}\n".format(matrix))
    fr.write("labels {}\n".format(set(y)))
    fr.write("True Negative= {}\n".format(tn))
    fr.write("True Positive= {}\n".format(tp))
    fr.write("Flase Negative= {}\n".format(fn))
    fr.write("False Positive= {}\n".format(fp))
    fr.close()
    #["Dataset", "precision", "Accuracy","Recall","F-measure","AUC","G-mean"]

    filexs.loc[fnum] =[ds,prec,ac,rc,F_Score,auc,G_mean]
filexs.to_excel("SBPSO NB DS2.xlsx")


CM1 dataset...
Iteration: 1 | my_swarm.best_cost: 0.2222
Iteration: 21 | my_swarm.best_cost: 0.2130
Iteration: 41 | my_swarm.best_cost: 0.2037
Iteration: 61 | my_swarm.best_cost: 0.1944
Iteration: 81 | my_swarm.best_cost: 0.1944
2
binary
subset performance= 0.8148148148148148
JM1 dataset...
Iteration: 1 | my_swarm.best_cost: 0.2067
Iteration: 21 | my_swarm.best_cost: 0.2044
Iteration: 41 | my_swarm.best_cost: 0.2036
Iteration: 61 | my_swarm.best_cost: 0.2009
Iteration: 81 | my_swarm.best_cost: 0.2001
2
binary
subset performance= 0.8007006617360841
KC1 dataset...
Iteration: 1 | my_swarm.best_cost: 0.2609
Iteration: 21 | my_swarm.best_cost: 0.2430
Iteration: 41 | my_swarm.best_cost: 0.2430
Iteration: 61 | my_swarm.best_cost: 0.2430
Iteration: 81 | my_swarm.best_cost: 0.2430
2
binary
subset performance= 0.7570332480818415
KC3 dataset...
Iteration: 1 | my_swarm.best_cost: 0.2154
Iteration: 21 | my_swarm.best_cost: 0.2154
Iteration: 41 | my_swarm.best_cost: 0.2154
Iteration: 61 | my_swarm.b