# Binary Classification ML problem with Sonar Mines vs Rocks Dataset


In [53]:
############################## Load Libraries, dataset #####################################

import numpy
from pandas import read_csv
from pandas import set_option
from pandas.tools.plotting import scatter_matrix
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


# Read the Sonar data from a local CSV file. header=None tells pandas the file
# has no header row, so the columns are labelled with integers.
sonar_csv_path = 'C:/Users/Satish/python_files/sonar.csv'
dataset = read_csv(sonar_csv_path, header=None)


############################## Analyse the Data #####################################
# print(dataset.head(10))
# print(dataset.shape)
# print(dataset.describe())
# print(dataset.dtypes)
# print(dataset.groupby(60).size())

############################## Visualize the Data #####################################
########### Unimodal Data Visualizations

# dataset.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1) 
### The histograms show that many attributes have Gaussian-like or exponential-like distributions

# dataset.plot(kind='density', sharex=False, subplots=True, layout=(8,8), legend=False, fontsize=1)
# dataset.plot(kind='box', sharex=False, subplots=True, layout=(8,8), sharey=False, fontsize=1)
# pyplot.show()

########## Multimodal Data Visualizations
# fig = pyplot.figure()
# ax = fig.add_subplot(111)
# cax = ax.matshow(dataset.corr(), vmin=1, vmax=1, interpolation = 'none')
# fig.colorbar(cax)
# pyplot.show()
### This shows the interesting fact that attributes next to each other are generally correlated

############################## Split up dataset into train and test set #####################

# Experiment settings shared by the hold-out split and the cross-validation runs.
test_size, seed = 0.2, 27
num_fold, scoring = 10, 'accuracy'

# Columns 0-59 are the input features; column 60 is the class label.
array = dataset.values
x, y = array[:, :60], array[:, 60]

# Hold out `test_size` of the rows as a final test set; `seed` makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=seed, test_size=test_size
)
############################## Build the model #####################################

# Baseline candidates as (short name, unfitted estimator) pairs; every
# estimator is constructed with its default hyperparameters.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

############################## Evaluate each model #####################################
# results = []
# names = []
# for name, model in models:
#     kfold = KFold(n_splits=num_fold, random_state=seed)
#     cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     print(name, cv_results.mean(), cv_results.std())
### Results suggest that both Logistic Regression and KNN are worth further study


############################## Compare the model #######################################
# fig = pyplot.figure()
# fig.suptitle('Algorithm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(names)
# pyplot.show()

######################## Standardize the dataset & build model ##############################

# The same six algorithms, each placed behind a StandardScaler in a Pipeline so
# that the scaler is fitted together with the estimator on whatever data the
# pipeline is fitted on. Entries are ('Scaled<tag>', pipeline) pairs and the
# estimator step inside each pipeline is named after its tag.
pipelines = [
    ('Scaled' + tag, Pipeline([('Scaler', StandardScaler()), (tag, estimator)]))
    for tag, estimator in (
        ('LR', LogisticRegression()),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier()),
        ('NB', GaussianNB()),
        ('SVM', SVC()),
    )
]

# Fresh accumulators for per-model cross-validation scores and their labels.
results, names = [], []
# for name, model in pipelines:
#     kfold = KFold(n_splits=num_fold, random_state=seed)
#     cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
#     results.append(cv_results)
#     names.append(name)
#     print(name, cv_results.mean(), cv_results.std())
### From the results we can see that standardization has lifted the skill of SVM, making it the most accurate algorithm tested

############################## Compare the Scaled model #######################################
# fig = pyplot.figure()
# fig.suptitle('Scaled Algorithm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(names)
# pyplot.show()

############################## Algorithm Tuning #######################################
########### Tuning KNN
# Grid-search the number of neighbours for KNN on the standardized training set.
#
# NOTE(review): the scaler is fitted on all of x_train before cross-validation,
# so every validation fold has already contributed to the scaling statistics.
# Putting the scaler and classifier in a Pipeline and searching over that would
# avoid this, but it would rename the keys reported in `params` and shift the
# recorded scores, so it is left as-is here.
scaler = StandardScaler().fit(x_train)
scaledx = scaler.transform(x_train)

# Odd neighbour counts from 1 to 21.
kvalues = list(range(1, 22, 2))
param_grid = {'n_neighbors': kvalues}
model = KNeighborsClassifier()

# random_state only has an effect when shuffle=True. Older scikit-learn releases
# silently ignored it here; recent ones raise ValueError. Omitting it keeps the
# same contiguous, deterministic folds and runs on both.
kfold = KFold(n_splits=num_fold)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(scaledx, y_train)
# print("Best: ", grid_result.best_score_, "using: ", grid_result.best_params_)

# Per-candidate mean score, score standard deviation and parameter setting.
mean = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# for mean, std, param in zip(mean, stds, params):
#     print(mean, std, param)
### After tuning KNN over different k values, the best score is 0.843 with k = 1

########### Tuning of SVM with different kernel values and C values
# Grid-search the SVM regularisation strength C and kernel type on the
# standardized training set.
#
# NOTE(review): the scaler is fitted on all of x_train before cross-validation,
# so every validation fold has already contributed to the scaling statistics.
# Searching over a Pipeline(scaler, SVC) would avoid this, but it would rename
# the keys reported in `params` and shift the recorded scores.
scaler = StandardScaler().fit(x_train)
rescaledx = scaler.transform(x_train)

c_values = [0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.3, 1.5, 1.7, 1.9, 2.0]
kernal_values = ['linear', 'poly', 'rbf', 'sigmoid']
# Every (C, kernel) combination is evaluated.
param_grid = {'C': c_values, 'kernel': kernal_values}
model = SVC()

# random_state only has an effect when shuffle=True. Older scikit-learn releases
# silently ignored it here; recent ones raise ValueError. Omitting it keeps the
# same contiguous, deterministic folds and runs on both.
kfold = KFold(n_splits=num_fold)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=kfold, scoring=scoring)
grid_result = grid.fit(rescaledx, y_train)
# print("Best: ", grid_result.best_score_, "using: ", grid_result.best_params_)

# Per-candidate mean score, score standard deviation and parameter setting.
mean = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
# for mean, std, param in zip(mean, stds, params):
#     print(mean, std, param)
### After tuning SVM over different C and kernel values, the best score (0.8313) is for C=1.3 & kernel=rbf

################## Improve the performance of the model using Ensemble methods ##########################
# Ensemble candidates as (short name, unfitted estimator) pairs, all with
# default hyperparameters. They are evaluated on the unscaled training data.
ensembles = [
    ('AB', AdaBoostClassifier()),
    ('GBM', GradientBoostingClassifier()),
    ('RF', RandomForestClassifier()),
    ('ET', ExtraTreesClassifier()),
]
results = []
names = []

# random_state only has an effect when shuffle=True. Older scikit-learn releases
# silently ignored it here; recent ones raise ValueError. Without shuffling the
# folds are the same on every use, so one splitter serves all models.
kfold = KFold(n_splits=num_fold)
for name, model in ensembles:
    # One array of per-fold scores for each model.
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
#     print(name, cv_results.mean(), cv_results.std())
### After trying the ensemble techniques, we can see that the GBM results are worth further study

############################## Compare the Ensemble models #######################################
# fig = pyplot.figure()
# fig.suptitle('Ensemble Algorothm Comparison')
# ax = fig.add_subplot(111)
# pyplot.boxplot(results)
# ax.set_xticklabels(names)
# pyplot.show()

############################## Finalize the model #######################################

# Fit the final model: standardize the training features, then train a KNN
# classifier with its default hyperparameters on them.
scaler = StandardScaler()
rescaledx = scaler.fit_transform(x_train)
model = KNeighborsClassifier().fit(rescaledx, y_train)

# Scale the held-out test features with the statistics learned from the
# training set (the scaler is not re-fitted), then predict.
rescaled_testx = scaler.transform(x_test)
prediction = model.predict(rescaled_testx)

# Report accuracy, the confusion matrix and the per-class report, in that order.
for metric in (accuracy_score, confusion_matrix, classification_report):
    print(metric(y_test, prediction))

############# We achieve an accuracy of 90% ###########################



0.9047619047619048
[[25  0]
 [ 4 13]]
             precision    recall  f1-score   support

          M       0.86      1.00      0.93        25
          R       1.00      0.76      0.87        17

avg / total       0.92      0.90      0.90        42



