In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif
from sklearn import preprocessing as pre
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [2]:
def dataset():
    data = pd.read_csv('final-dataset.csv')
    X = data.drop(columns=['Signal','Label','Hurst Component'])
    y = data['Label']
    feature_names = X.columns
    return X, y, feature_names

In [3]:
parameters = {'criterion':('gini', 'entropy'), 'class_weight':('balanced', 'balanced_subsample'), 'max_features':('auto', 'sqrt','log2')}

In [4]:
def get_feature_names(selector, feature_names):
    mask = selector.get_support() #list of booleans
    new_features = [] # The list of your K best features
    for bool, feature in zip(mask, feature_names):
        if bool:
            new_features.append(feature)
    return new_features

In [5]:
def feature_scaling_min_max(X):
    min_max_scaler = pre.MinMaxScaler()
    X = min_max_scaler.fit_transform(X)
    return X

In [6]:
def feature_scaling_basic(X):
    return pre.scale(X)

In [7]:
from sklearn.metrics import confusion_matrix
def performance_measures(x_test, y_test):
    y_true = y_test
    y_pred = clf.predict(x_test)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    specificity = tn / (tn+fp)
    sensitivity = tp / (tp + fn)
    return sensitivity, specificity

In [8]:
max_score = 0
best_params = {}
best_k = 2
best_val_score = 0
selected_names = []
best_sensitivity = 0
best_specificity = 0

for k in range(2,10):
    X, y, feature_names = dataset()
    X = feature_scaling_min_max(X)
    selector = SelectKBest(chi2, k=k)
    X_new = selector.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)
    clf = RandomForestClassifier(random_state=0)
    clf = GridSearchCV(clf, parameters)
    clf.fit(X_train, y_train)
    names = get_feature_names(selector, feature_names)
    val_score = clf.score(x_val, y_val)
    score = clf.score(x_test, y_test)
    sensitivity, specificity = performance_measures(x_test, y_test)
    if(score > max_score):
        max_score = score
        best_k = k
        best_params = clf.best_params_
        best_val_score = val_score
        selected_names = names
        best_sensitivity = sensitivity
        best_specificity = specificity
    print("Passed ", k)

print("Test Score is ", max_score)
print("Best params are ", best_params)
print("Best k is ", best_k)
print("names are ", selected_names)
print("Val score is ", best_val_score)
print("Sensitivity ", sensitivity)
print("Specificity ", specificity)

Passed  2
Passed  3
Passed  4
Passed  5
Passed  6
Passed  7
Passed  8
Passed  9
Test Score is  0.8952380952380953
Best params are  {'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'log2'}
Best k is  8
names are  ['Zero Crossings', 'Frequency (Hz)', 'Delta (µV² /Hz)', 'Alpha (µV² /Hz)', 'Beta (µV² /Hz)', 'Gamma (µV² /Hz)', 'Skewness', 'Hjorth Mobility']
Val score is  0.9047619047619048
Sensitivity  0.8942307692307693
Specificity  0.8867924528301887


In [9]:
max_score = 0
best_params = {}
best_k = 2
best_val_score = 0
selected_names = []

for k in range(2,10):
    X, y, feature_names = dataset()
    X = feature_scaling_min_max(X)
    selector = SelectKBest(f_classif, k=k)
    X_new = selector.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)
    clf = RandomForestClassifier(random_state=0)
    clf = GridSearchCV(clf, parameters)
    clf.fit(X_train, y_train)
    val_score = clf.score(x_val, y_val)
    names = get_feature_names(selector, feature_names)
    score = clf.score(x_test, y_test)
    sensitivity, specificity = performance_measures(x_test, y_test)
    if(score > max_score):
        max_score = score
        best_k = k
        best_params = clf.best_params_
        best_val_score = val_score
        selected_names = names
        best_sensitivity = sensitivity
        best_specificity = specificity
    print("Passed ", k)

print("Test Score is ", max_score)
print("Best params are ", best_params)
print("Best k is ", best_k)
print("Val score is ", best_val_score)
print("Names are ", selected_names)
print("Sensitivity ", sensitivity)
print("Specificity ", specificity)

Passed  2
Passed  3
Passed  4
Passed  5
Passed  6
Passed  7
Passed  8
Passed  9
Test Score is  0.9
Best params are  {'class_weight': 'balanced_subsample', 'criterion': 'entropy', 'max_features': 'log2'}
Best k is  8
Val score is  0.8952380952380953
Names are  ['Zero Crossings', 'Frequency (Hz)', 'Alpha (µV² /Hz)', 'Beta (µV² /Hz)', 'Gamma (µV² /Hz)', 'Skewness', 'Hjorth Mobility', 'Spectral Entropy']
Sensitivity  0.8942307692307693
Specificity  0.8867924528301887


In [10]:
max_score = 0
best_params = {}
best_k = 2
best_val_score = 0
selected_names = []

for k in range(2,10):
    X, y, feature_names = dataset()
    X = feature_scaling_min_max(X)
    selector = SelectKBest(mutual_info_classif, k=k)
    X_new = selector.fit_transform(X, y)
    X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.3, random_state=0)
    x_val, x_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=0)
    clf = RandomForestClassifier(random_state=0)
    clf = GridSearchCV(clf, parameters)
    clf.fit(X_train, y_train)
    val_score = clf.score(x_val, y_val)
    names = get_feature_names(selector, feature_names)
    score = clf.score(x_test, y_test)
    sensitivity, specificity = performance_measures(x_test, y_test)
    if(score > max_score):
        max_score = score
        best_k = k
        best_params = clf.best_params_
        best_val_score = val_score
        selected_names = names
        best_sensitivity = sensitivity
        best_specificity = specificity
    print("Passed ", k)

print("Test Score is ", max_score)
print("Best params are ", best_params)
print("Best k is ", best_k)
print("Val score is ", best_val_score)
print("names are ", selected_names)
print("Sensitivity ", sensitivity)
print("Specificity ", specificity)

Passed  2
Passed  3
Passed  4
Passed  5
Passed  6
Passed  7
Passed  8
Passed  9
Test Score is  0.9333333333333333
Best params are  {'class_weight': 'balanced', 'criterion': 'entropy', 'max_features': 'auto'}
Best k is  6
Val score is  0.9238095238095239
names are  ['Zero Crossings', 'Theta (µV² /Hz)', 'Alpha (µV² /Hz)', 'Beta (µV² /Hz)', 'Gamma (µV² /Hz)', 'Variance']
Sensitivity  0.8942307692307693
Specificity  0.9056603773584906
