In [1]:
# Import modules
import numpy as np
import seaborn as sns
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing

# Import PySwarms
import pyswarms as ps

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
data = pd.read_csv('final-dataset.csv')

In [3]:
X = data.drop(columns=['Signal','Label','Hurst Component'])
y = data['Label']
X = np.array(X)
y = np.array(y)

In [4]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'criterion':('gini', 'entropy'), 'splitter':('best', 'random'), 'max_features':('auto', 'sqrt','log2')}
classifier = DecisionTreeClassifier(random_state=0)
classifier = GridSearchCV(classifier, parameters)

In [5]:
def f_per_particle(m, alpha):
    """Computes for the objective function per particle

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO, will
        be used to mask features.
    alpha: float (default is 0.5)
        Constant weight for trading-off classifier performance
        and number of features

    Returns
    -------
    numpy.ndarray
        Computed objective function
    """
    total_features = X.shape[1]
    # Get the subset of the features from the binary mask
    if np.count_nonzero(m) == 0:
        X_subset = preprocessing.scale(X)
    else:
        X_subset = preprocessing.scale(X[:,m==1])
    # Perform classification and store performance in P
    classifier.fit(X_subset, y)
    P = (classifier.predict(X_subset) == y).mean()
    # Compute for the objective function
    j = (alpha * (1.0 - P)
        + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))

    return j

In [6]:
def f(X, alpha=0.88):
    """Higher-level method to do classification in the
    whole swarm.

    Inputs
    ------
    x: numpy.ndarray of shape (n_particles, dimensions)
        The swarm that will perform the search

    Returns
    -------
    numpy.ndarray of shape (n_particles, )
        The computed loss for each particle
    """
    n_particles = X.shape[0]
    j = [f_per_particle(X[i], alpha) for i in range(n_particles)]
    return np.array(j)

In [7]:
# Initialize swarm, arbitrary
options = {'c1': 0.5, 'c2': 0.5, 'w':0.9, 'k': 30, 'p':2}

# Call instance of PSO
dimensions = X.shape[1] # dimensions should be the number of features

optimizer = ps.discrete.BinaryPSO(n_particles=30, dimensions=dimensions, options=options)

# Perform optimization
cost, pos = optimizer.optimize(f, iters=1000)

2020-07-23 17:50:08,015 - pyswarms.discrete.binary - INFO - Optimize for 1000 iters with {'c1': 0.5, 'c2': 0.5, 'w': 0.9, 'k': 30, 'p': 2}
pyswarms.discrete.binary: 100%|█████████████████████████████████████████████████████████████████|1000/1000, best_cost=0
2020-07-23 19:30:52,441 - pyswarms.discrete.binary - INFO - Optimization finished | best cost: 0.0, best pos: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]


In [21]:
# Get the selected features from the final positions
X_selected_features = X[:,pos==1]  # subset
X_selected_features.shape

(1400, 16)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_selected_features, y, test_size=0.3, random_state=0)

In [10]:
x_val, x_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)

In [11]:
from sklearn import preprocessing as pre
X_train = pre.scale(X_train)
x_test = pre.scale(x_test)
x_val = pre.scale(x_val)
X_train.shape

(980, 16)

In [12]:
from sklearn.model_selection import GridSearchCV

In [13]:
from sklearn.tree import DecisionTreeClassifier
parameters = {'criterion':('gini', 'entropy'), 'splitter':('best', 'random'), 'max_features':('auto', 'sqrt','log2')}
clf = DecisionTreeClassifier(random_state=0)
clf = GridSearchCV(clf, parameters)

In [14]:
clf.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
             param_grid={'criterion': ('gini', 'entropy'),
                         'max_features': ('auto', 'sqrt', 'log2'),
                         'splitter': ('best', 'random')})

In [15]:
clf.best_params_

{'criterion': 'entropy', 'max_features': 'auto', 'splitter': 'best'}

In [16]:
clf.score(x_val, y_val)

0.5571428571428572

In [17]:
clf.score(x_test, y_test)

0.5333333333333333

In [18]:
from sklearn.metrics import confusion_matrix
y_true = y_test
y_pred = clf.predict(x_test)
tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
specificity = tn / (tn+fp)
print("Specificity is : " , specificity)
accuracy = (tp + tn) / (tp + tn + fp + fn)
print("Accuracy is : " , accuracy)
sensitivity = tp / (tp + fn)
print("Sensitivity is : " , sensitivity)

Specificity is :  0.29245283018867924
Accuracy is :  0.5333333333333333
Sensitivity is :  0.7788461538461539
