In [1]:
# Import modules
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn import model_selection, preprocessing
from sklearn.model_selection import GridSearchCV

# Import PySwarms
# !pip3 install pyswarms
import pyswarms as ps

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
data = pd.read_csv('parkinsons.csv', delimiter=',')

In [3]:
X = data.drop(columns=['name', 'status'])
y = data['status']
X = np.array(X)
y = np.array(y)
y.dtype

dtype('int64')

In [4]:
# num_features = X.shape[1]

In [5]:
from sklearn.svm import SVC
# rameters = {'C':(1,2,3,4,5), 'degree':(1,2,3,4,5)}
classifier = SVC(random_state=0, kernel='linear', gamma='scale', C=2, degree=3)
# assifier = GridSearchCV(classifier, parameters)

In [6]:
def f_per_particle(m, alpha):
    """Computes for the objective function per particle

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO, will
        be used to mask features.
    alpha: float (default is 0.5)
        Constant weight for trading-off classifier performance
        and number of features

    Returns
    -------
    numpy.ndarray
        Computed objective function
    """
    total_features = X.shape[1]
    # Get the subset of the features from the binary mask
    if np.count_nonzero(m) == 0:
        X_subset = preprocessing.scale(X)
    else:
        X_subset = preprocessing.scale(X[:,m==1])
    # Perform classification and store performance in P
    classifier.fit(X_subset, y)
    P = (classifier.predict(X_subset) == y).mean()
    # Compute for the objective function
    j = (alpha * (1.0 - P)
        + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))

    return j

In [7]:
def f(x, alpha=0.88):
    """Higher-level method to do classification in the
    whole swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        The swarm that will perform the search

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle
    """
    n_particles = x.shape[0]
    j = [f_per_particle(x[i], alpha) for i in range(n_particles)]
    return np.array(j)

In [8]:
# Initialize swarm, arbitrary
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO
dimensions = X.shape[1] # dimensions should be the number of features

optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)

# Perform optimization
cost, pos = optimizer.optimize(f, iters=1000)

2020-06-22 21:56:41,505 - pyswarms.discrete.binary - INFO - Optimize for 1000 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 30, 'p': 2}
pyswarms.discrete.binary: 100%|██████████|1000/1000, best_cost=0.0686
2020-06-22 22:00:07,565 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.0686340326340326, best pos: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1]


In [9]:
# Get the selected features from the final positions
X_selected_features = X[:,pos==1]  # subset

In [10]:
x_train, x_test, y_train, y_test = model_selection.train_test_split(X_selected_features, y, test_size=0.3, random_state=0)

In [11]:
x_val, x_test, y_val, y_test = model_selection.train_test_split(x_test, y_test, test_size=0.5, random_state=0)

In [12]:
from sklearn.svm import SVC
parameters = {'C':(1,2,3,4,5), 'degree':(1,2,3,4,5)}
clf = SVC(random_state=0, kernel='linear', gamma='scale')
clf = GridSearchCV(clf, parameters)

In [13]:
X_selected_features.shape

(195, 21)

In [14]:
x_train = preprocessing.scale(x_train)
x_train.shape

(136, 21)

In [15]:
clf.fit(x_train, y_train)

GridSearchCV(cv=None, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='linear', max_iter=-1,
                           probability=False, random_state=0, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': (1, 2, 3, 4, 5), 'degree': (1, 2, 3, 4, 5)},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [16]:
clf.best_params_

{'C': 4, 'degree': 1}

In [17]:
x_val = preprocessing.scale(x_val)
clf.score(x_val, y_val)

0.8275862068965517

In [18]:
from sklearn.metrics import confusion_matrix
y_true = y_test
x_test = preprocessing.scale(x_test)
y_pred = clf.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
specificity = tn / (tn+fp)
print("Specificity is : " , specificity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Accuracy is : " , accuracy)
sensitivity = tp / (tp + fn)
print("Sensitivity is : " , sensitivity)

Specificity is :  0.7142857142857143
Accuracy is :  0.8666666666666667
Sensitivity is :  0.9130434782608695
