In [1]:
import numpy as np
import os
import pandas as pd


# Load the sonar dataset from a CSV file located in the working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column next to
# feature_1..feature_60 — presumably a row index saved with the CSV; confirm
# and keep it out of the model features.
data1 = pd.read_csv('sonar dataset.csv')
# Preview the first rows to check that the columns were parsed as expected.
data1.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,feature_58,feature_59,feature_60,Class
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [2]:
# Count the samples per class label to see how balanced the target is.
data1["Class"].value_counts()

M    111
R     97
Name: Class, dtype: int64

In [3]:
# List the distinct raw labels found in the target column before encoding.
data1['Class'].unique()

array(['R', 'M'], dtype=object)

In [4]:
# Import label encoder
from sklearn import preprocessing

# Build an encoder that maps the string class labels to integer codes.
label_encoder = preprocessing.LabelEncoder()

# Learn the label vocabulary from the 'Class' column, then overwrite that
# column with the integer codes (the output below shows 'R' becoming 1 and
# 'M' becoming 0).
label_encoder.fit(data1['Class'])
data1['Class'] = label_encoder.transform(data1['Class'])

# Show the encoded values now stored in the target column.
data1['Class'].unique()


array([1, 0])

In [5]:
# Stratified Sampling using Scikit-learn's Stratified Shuffle Split Class
from sklearn.model_selection import StratifiedShuffleSplit

# Single stratified 75/25 train/test split; the fixed seed keeps it reproducible.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(data1, data1["Class"]))

# split() yields *positional* row indices, so select with .iloc. The previous
# label-based .loc lookup only worked because data1 happens to carry a default
# RangeIndex, and would pick wrong rows (or raise) for any other index.
strat_train_set = data1.iloc[train_index]
strat_test_set = data1.iloc[test_index]

In [6]:
# "Unnamed: 0" shows up in the data1.head() preview next to feature_1..feature_60.
# NOTE(review): it looks like a row index saved with the CSV rather than a sonar
# measurement — confirm. It is dropped here so the models are not trained on a
# row id; errors="ignore" keeps this cell working if the column is absent.
INDEX_LIKE_COLUMNS = ["Unnamed: 0"]

# Features: everything except the target and the index-like column.
train_set = strat_train_set.drop(columns=["Class"]).drop(columns=INDEX_LIKE_COLUMNS, errors="ignore")
train_labels = strat_train_set["Class"].copy()  # target for the training split
test_set = strat_test_set.drop(columns=["Class"]).drop(columns=INDEX_LIKE_COLUMNS, errors="ignore")
test_labels = strat_test_set["Class"].copy()  # target for the test split

KNN Classifier Algorithm

In [7]:
from sklearn.metrics import matthews_corrcoef
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
# Hyper-parameter grid explored for the k-nearest-neighbours classifier.
parameters = {
    'n_neighbors': [1, 3, 5, 7, 10, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}
neigh1 = KNeighborsClassifier()

# 10-fold cross-validated grid search, ranked by balanced accuracy.
clf = GridSearchCV(
    estimator=neigh1,
    param_grid=parameters,
    scoring='balanced_accuracy',
    cv=10,
)
res1 = clf.fit(train_set, train_labels)

# Display the best estimator found by the search.
res1.best_estimator_

In [8]:
# Show the hyper-parameter combination selected by the KNN grid search.
res1.best_params_

{'algorithm': 'auto',
 'metric': 'manhattan',
 'n_neighbors': 1,
 'weights': 'uniform'}

In [9]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
# Predict the held-out test split with the best estimator from the grid search.
y_test_pred = res1.best_estimator_.predict(test_set)

# (label, scorer) pairs; each scorer is evaluated and printed in order.
metric_reports = [
    ("matthews_corrcoef:", lambda: matthews_corrcoef(test_labels, y_test_pred)),
    ("accuracy_score:", lambda: accuracy_score(test_labels, y_test_pred)),
    # average=None reports one value per class instead of a single aggregate.
    ("precision_score:", lambda: precision_score(test_labels, y_test_pred, average=None)),
    ("score:", lambda: f1_score(test_labels, y_test_pred, average=None)),
    ("balanced_accuracy_score:", lambda: balanced_accuracy_score(test_labels, y_test_pred)),
]
for metric_label, compute_metric in metric_reports:
    print(metric_label, compute_metric())

matthews_corrcoef: 0.6531435102866939
accuracy_score: 0.8269230769230769
precision_score: [0.80645161 0.85714286]
score: [0.84745763 0.8       ]
balanced_accuracy_score: 0.8214285714285714


In [10]:
from sklearn.metrics import matthews_corrcoef
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
# SGD-trained linear classifier with hinge loss. Only the penalty type is
# actually searched; every other entry in the grid has a single value.
parameters = {
    'loss': ['hinge'],
    'penalty': ['l2', 'l1', 'elasticnet'],
    'alpha': [0.0001],
    'max_iter': [1000],
    'tol': [0.001],
}

# SGD shuffles the training data between epochs, so without a seed the selected
# penalty and the test metrics change from run to run. Use the same seed as the
# train/test split to make the notebook reproducible under Restart & Run All.
neigh1 = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)

# 10-fold cross-validated grid search, ranked by balanced accuracy.
clf = GridSearchCV(neigh1, parameters, scoring='balanced_accuracy', cv=10)
res1 = clf.fit(train_set, train_labels)

# Display the best estimator found by the search.
res1.best_estimator_

In [11]:
# Show the hyper-parameter combination selected by the SGD grid search.
res1.best_params_

{'alpha': 0.0001,
 'loss': 'hinge',
 'max_iter': 1000,
 'penalty': 'l1',
 'tol': 0.001}

In [12]:
# Predict the held-out test split with the best SGD estimator from the search.
y_test_pred = res1.best_estimator_.predict(test_set)

# Report each test metric in turn; scorers run lazily, one per printed line.
sgd_metric_reports = (
    ("matthews_corrcoef:", lambda: matthews_corrcoef(test_labels, y_test_pred)),
    ("accuracy_score:", lambda: accuracy_score(test_labels, y_test_pred)),
    # average=None yields per-class values rather than one aggregate number.
    ("precision_score:", lambda: precision_score(test_labels, y_test_pred, average=None)),
    ("score:", lambda: f1_score(test_labels, y_test_pred, average=None)),
    ("balanced_accuracy_score:", lambda: balanced_accuracy_score(test_labels, y_test_pred)),
)
for report_label, scorer in sgd_metric_reports:
    print(report_label, scorer())

matthews_corrcoef: 0.32433748657040123
accuracy_score: 0.6538461538461539
precision_score: [0.72727273 0.6       ]
score: [0.64       0.66666667]
balanced_accuracy_score: 0.6607142857142857


Naive Bayes Classifiers

In [13]:
from sklearn.metrics import matthews_corrcoef
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
# The grid holds a single candidate, so the search only cross-validates this
# one Gaussian naive Bayes configuration.
parameters = {
    'priors': [None],
    'var_smoothing': [1e-09],
}
neigh1 = GaussianNB()

# 10-fold cross-validation scored by balanced accuracy.
clf = GridSearchCV(
    estimator=neigh1,
    param_grid=parameters,
    scoring='balanced_accuracy',
    cv=10,
)
res1 = clf.fit(train_set, train_labels)

# Display the best estimator found by the search.
res1.best_estimator_

In [14]:
# Show the parameter combination selected by the naive Bayes grid search.
res1.best_params_

{'priors': None, 'var_smoothing': 1e-09}

In [15]:
# Predict the held-out test split with the best naive Bayes estimator.
y_test_pred = res1.best_estimator_.predict(test_set)

# Label/scorer pairs, evaluated one at a time in the order they are printed.
nb_metric_reports = [
    ("matthews_corrcoef:", lambda: matthews_corrcoef(test_labels, y_test_pred)),
    ("accuracy_score:", lambda: accuracy_score(test_labels, y_test_pred)),
    # average=None returns one value per class.
    ("precision_score:", lambda: precision_score(test_labels, y_test_pred, average=None)),
    ("score:", lambda: f1_score(test_labels, y_test_pred, average=None)),
    ("balanced_accuracy_score:", lambda: balanced_accuracy_score(test_labels, y_test_pred)),
]
for nb_label, nb_scorer in nb_metric_reports:
    print(nb_label, nb_scorer())

matthews_corrcoef: 0.3019518288806687
accuracy_score: 0.6346153846153846
precision_score: [0.73684211 0.57575758]
score: [0.59574468 0.66666667]
balanced_accuracy_score: 0.6458333333333333
