### Dataset Information

The dataset contains sonar signals used to distinguish between metal cylinders (mines) and rocks. There are 111 patterns from metal cylinders and 97 from rocks, collected under various conditions and angles. Each pattern includes 60 numbers representing the energy in specific frequency bands, with higher frequencies occurring later. The labels are "R" for rocks and "M" for mines. 

### Imports 

In [5]:
import matplotlib as mpl

# Font settings shared by every figure drawn in this notebook
font = {
    'family': 'Georgia',
    'weight': 'bold',
    'size': 12,
}

# Draw general text, axis labels and both sets of tick marks in the same color
COLOR = 'gray'
mpl.rcParams.update(dict.fromkeys(
    ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color'),
    COLOR,
))

mpl.rc('font', **font)

import numpy as np

import matplotlib.pyplot as plt
import pandas as pd

# Metrics
from sklearn.model_selection import train_test_split, RepeatedKFold, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Preprocessing and classifiers
from sklearn.preprocessing import RobustScaler, MaxAbsScaler, MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C
from sklearn.naive_bayes import GaussianNB


### Importing the Dataset

In [None]:
# Read the sonar CSV into a DataFrame and preview its first five rows
data = pd.read_csv(filepath_or_buffer='sonar.all-data-uci.csv')
data.head(n=5)


### Splitting the data into training and testing sets

In [None]:
# Every column except the last is a feature; the last column is the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 20% of the rows for testing and keep the rest for training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
def cross_validation(classifiers, classifier_names, scalers, scaler_names, X, y):
    """Score every scaler/classifier pairing with repeated k-fold cross-validation.

    Each pairing is evaluated as one pipeline so that the scaler is fitted
    only on the training part of every fold. Scaling all of ``X`` before
    cross-validating would let statistics from the validation folds leak
    into training and bias the scores upwards.

    Parameters
    ----------
    classifiers : sequence of estimators
        Unfitted classifiers to evaluate.
    classifier_names : sequence of str
        Display name for each classifier, in the same order as ``classifiers``.
    scalers : sequence of transformers
        Unfitted scalers to evaluate.
    scaler_names : sequence of str
        Display name for each scaler, in the same order as ``scalers``.
    X : array-like
        Feature matrix.
    y : array-like
        Target labels.

    Returns
    -------
    dict
        Maps ``(scaler_name, clf_name)`` to the array of per-fold accuracy
        scores (10 splits x 5 repeats).

    Notes
    -----
    The estimators passed in are not fitted by this function; the scores
    come from copies fitted inside ``cross_val_score``.
    """
    # Local import keeps this cell runnable without editing the import cell.
    from sklearn.pipeline import make_pipeline

    # One splitter with a fixed seed: every pairing is scored on the same folds.
    cv = RepeatedKFold(n_splits=10, n_repeats=5, random_state=1)

    results = {}
    for scaler, scaler_name in zip(scalers, scaler_names):
        for clf, clf_name in zip(classifiers, classifier_names):
            # Scaling is part of the model, so it is re-fitted per fold.
            model = make_pipeline(scaler, clf)
            results[(scaler_name, clf_name)] = cross_val_score(
                model, X, y, scoring='accuracy', cv=cv, n_jobs=-1
            )

    return results

# Scalers to compare, each paired with the label used in the results
_scaler_pairs = [
    ('MinMax', MinMaxScaler()),
    ('MaxAbs', MaxAbsScaler()),
    ('Robust', RobustScaler()),
]
scaler_names = [label for label, _ in _scaler_pairs]
scalers = [estimator for _, estimator in _scaler_pairs]

# Classifiers to compare, each paired with the label used in the results
_classifier_pairs = [
    ("K Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVC", SVC(kernel="linear", C=0.025)),
    ("RBF SVC", SVC(gamma=2, C=1)),
    ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    ("Random Forest", RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("Gaussian Naive Bayes", GaussianNB()),
    ("Gradient Boost", GradientBoostingClassifier()),
    ("Logistic Regression", LogisticRegression()),
]
classifier_names = [label for label, _ in _classifier_pairs]
classifiers = [estimator for _, estimator in _classifier_pairs]


# Perform cross-validation on the training split only
results = cross_validation(classifiers, classifier_names, scalers, scaler_names, X_train, y_train)

# Collapse each combination's fold scores into a single mean accuracy.
# The list is deliberately not called `data`, so the dataset DataFrame loaded
# earlier is not overwritten and earlier cells can still be re-run.
rows = [
    (scaler_name, clf_name, np.mean(scores))
    for (scaler_name, clf_name), scores in results.items()
]

df = pd.DataFrame(rows, columns=['Scaler', 'Classifier', 'Accuracy'])
df['Combination'] = df['Scaler'] + ' + ' + df['Classifier']

# Filter out combinations with accuracy below 50% since they are worse than taking a random guess
df_filtered = df[df['Accuracy'] >= 0.50]

# Tick positions are computed once; labels use percent formatting, which
# rounds, instead of int(x * 100), which truncates floats such as 0.29 * 100.
tick_positions = np.arange(0.60, 1.01, 0.05)
tick_labels = [f'{tick:.0%}' for tick in tick_positions]

# Plot the results
plt.figure(figsize=(15, 10))
plt.barh(df_filtered['Combination'], df_filtered['Accuracy'], color='skyblue')
plt.xlabel('Accuracy')
plt.title('Classifier and Scaler Combinations vs. Accuracy')
plt.xlim(0.60, 1.0)  # Set x-axis to start at 60%
plt.grid(axis='x')
plt.xticks(ticks=tick_positions, labels=tick_labels)
plt.show()

The chart shows that Gaussian Process is the strongest classifier and that, among the scalers, MaxAbs slightly outperforms MinMax. We therefore continue with MaxAbs scaling and the Gaussian Process classifier.

In [None]:


# Scale the features with MaxAbsScaler: fit on the training split only, then
# apply the same fitted scaler to the test split
scaler = MaxAbsScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the expanded parameter grid
# NOTE(review): 'multi_class' and 'n_jobs' presumably only matter for problems
# with more than two classes; confirm before trimming the grid.
param_grid = {
    'kernel': [
    1.0 * RBF(length_scale) for length_scale in [0.1, 1.0, 10.0]
    ] + [
    C(1.0, (1e-6, 1e2)) * RBF(length_scale, (1e-6, 1e6)) for length_scale in [0.01, 0.1, 1.0]
    ],
    'n_restarts_optimizer': [0, 1, 2],
    'max_iter_predict': [5, 10, 15, 100],
    'multi_class': ['one_vs_rest', 'one_vs_one'],
    'optimizer': ['fmin_l_bfgs_b', None],
    'n_jobs': [-1]  # Use all available cores
}

# Initialize the classifier
gpc = GaussianProcessClassifier()

# Set up the grid search (10-fold cross-validation, scored by accuracy)
grid_search = GridSearchCV(estimator=gpc, param_grid=param_grid, cv=10, scoring='accuracy', n_jobs=-1)

grid_search.fit(X_train_scaled, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best parameters found: ", best_params)

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so best_estimator_ is already trained and does not
# need to be fitted again.
best_gpc = grid_search.best_estimator_

# Evaluate the best model on the test set
y_pred_best_gpc = best_gpc.predict(X_test_scaled)
accuracy_best_gpc = accuracy_score(y_test, y_pred_best_gpc)

# Generate the classification report
report_dict = classification_report(y_test, y_pred_best_gpc, output_dict=True)

# Convert the classification report to a DataFrame
report_df = pd.DataFrame(report_dict).transpose()

print("Best Gaussian Process Classifier")
print(f"Accuracy: {accuracy_best_gpc:.2f}")

report_df

In [7]:
# Read the sonar dataset from its CSV file
data = pd.read_csv(filepath_or_buffer='sonar.all-data-uci.csv')

# Every column except the last is a feature; the last column is the target
X, y = data.iloc[:, :-1], data.iloc[:, -1]

def experiment():
    """Train and score a Gaussian Process classifier on one random split.

    Reads the module-level ``X`` and ``y``, holds out 20% of the rows as a
    test set, scales the features with ``MaxAbsScaler`` and fits a
    ``GaussianProcessClassifier`` with a fixed set of hyper-parameters.

    Returns
    -------
    float
        Accuracy of the fitted classifier on the held-out test split.
    """
    # Split the data into training and testing sets (a new random split per call)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    scaler = MaxAbsScaler()

    # Fit the scaler on the training split only and reuse it for the test
    # split. Re-fitting on the test data would scale the two splits by
    # different factors and use test-set statistics during evaluation.
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Define the kernel with appropriate bounds
    base_kernel = 1.0 * RBF(length_scale=1.0, length_scale_bounds=(1e-5, 1e5))

    # Set up the best parameters for the classifier
    best_params = {
        'kernel': base_kernel,
        'max_iter_predict': 10,
        'multi_class': 'one_vs_rest',
        'n_jobs': -1,
        'n_restarts_optimizer': 0,
        'optimizer': 'fmin_l_bfgs_b'
    }

    # Initialize the Gaussian Process Classifier with the best parameters
    best_gpc = GaussianProcessClassifier(**best_params)

    # Train the classifier with the training data
    best_gpc.fit(X_train_scaled, y_train)

    # Evaluate the model on the test set
    y_pred_best_gpc = best_gpc.predict(X_test_scaled)
    return accuracy_score(y_test, y_pred_best_gpc)

# Repeat the experiment and collect the accuracy returned by each run
num_iterations = 100
accuracies = [experiment() for _ in range(num_iterations)]

# Put the collected accuracies in a one-column DataFrame and summarize them
accuracies_df = pd.DataFrame({'Accuracy': accuracies})

accuracies_df.describe()

# Calculate the y-axis limits dynamically
# y_min = max(0, min(accuracies) - 0.05)  # Adding a little padding
# y_max = min(1, max(accuracies) + 0.05)  # Adding a little padding

# # Plot the accuracies as a box plot with individual points
# plt.figure(figsize=(10, 6))
# plt.boxplot(accuracies, vert=True, patch_artist=True)
# plt.scatter([1] * len(accuracies), accuracies, color='red', zorder=2)
# plt.title('Box Plot of Accuracies over Multiple Iterations')
# plt.ylabel('Accuracy')
# plt.xticks([1], ['Accuracies'])
# plt.ylim(y_min, y_max)  # Dynamically setting y-axis limits
# plt.grid(True)
# plt.show()


Unnamed: 0,Accuracy
count,100.0
mean,0.79619
std,0.066515
min,0.595238
25%,0.738095
50%,0.809524
75%,0.833333
max,0.928571


### Interpretation
- The classifier performs exceptionally well with high precision and recall for both classes.
- The perfect precision for class 'M' and perfect recall for class 'R' mean that there are no false positives for class 'M' and no false negatives for class 'R'.
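- The high overall accuracy (95.24%) and balanced macro and weighted averages suggest that the model is reliable and robust across both classes on the evaluated test split. Note, however, that the 100 repeated random splits summarized above give a mean accuracy of about 79.6% (standard deviation 6.7%, range 59.5%–92.9%), so the 95.24% figure should be read as an optimistic single-split result rather than the expected performance.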