# About Dataset

## NAME: Sonar, Mines vs. Rocks
- This is the data set used by Gorman and Sejnowski in their study of the classification of sonar signals using a neural network. 
- The task is to train a network to discriminate between sonar signals bounced
off a metal cylinder and those bounced off a roughly cylindrical rock.

In [38]:
import numpy as np
import pandas as pd

In [39]:
dataset = pd.read_csv('Sonar.csv')
dataset.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V52,V53,V54,V55,V56,V57,V58,V59,V60,Class
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,1
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,1
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,1
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,1
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,1


In [40]:
dataset.shape

(208, 61)

In [41]:
dataset.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31',
       'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41',
       'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51',
       'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60', 'Class'],
      dtype='object')

In [42]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 208 entries, 0 to 207
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   V1      208 non-null    float64
 1   V2      208 non-null    float64
 2   V3      208 non-null    float64
 3   V4      208 non-null    float64
 4   V5      208 non-null    float64
 5   V6      208 non-null    float64
 6   V7      208 non-null    float64
 7   V8      208 non-null    float64
 8   V9      208 non-null    float64
 9   V10     208 non-null    float64
 10  V11     208 non-null    float64
 11  V12     208 non-null    float64
 12  V13     208 non-null    float64
 13  V14     208 non-null    float64
 14  V15     208 non-null    float64
 15  V16     208 non-null    float64
 16  V17     208 non-null    float64
 17  V18     208 non-null    float64
 18  V19     208 non-null    float64
 19  V20     208 non-null    float64
 20  V21     208 non-null    float64
 21  V22     208 non-null    float64
 22  V2

# About the features of the data set
- There are no null values in the data set it has 61 features
- The first 60 features are the signal reading and the last 61th feature is the target feature
- the target feature is binary in nature and signify the type of object the signal is being received after bouncing back

In [43]:
# how many class and how many samples of each class
dataset["Class"].value_counts()

Class
0    111
1     97
Name: count, dtype: int64

# Mean Classifier
Declared a class with two methods fit and predict. The instance of the class has two variables classes and class_mean.
- fit():
  - the method calculates the target variable classes present and there mean values
- predict()
  - Based on the distance from the mean values from the class this method calculates the distances and assign the minimum distance class to that point.


In [44]:
# Mean-based classifier
class MeanClassifier:
    def fit(self, X, y):
        self.class_means_ = {}
        self.classes_ = np.unique(y)
        for cls in self.classes_:
            self.class_means_[cls] = X[y == cls].mean(axis=0)
    
    def predict(self, X):
        distances = np.array([np.linalg.norm(X - mean, axis=1) for mean in self.class_means_.values()]).T
        return np.array([self.classes_[np.argmin(dist)] for dist in distances])


# Splitting the dataset
The function train_test_spilt_manual() splits the dataset into test and train datasets based on test_size and random_state parameters

In [45]:
# Manual train-test split
def train_test_split_manual(X, y, test_size=0.3, random_state=None):
    np.random.seed(random_state)
    indices = np.arange(X.shape[0])
    np.random.shuffle(indices)
    
    test_size = int(test_size * X.shape[0])
    test_indices = indices[:test_size]
    train_indices = indices[test_size:]
    
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    
    return X_train, X_test, y_train, y_test

# Forward Selection
- The function takes the dataframe and the target feature as argument. 
- The function iterates over the features and selects the feature with highest accuracy score. 
- This function keeps on adding features to selected_features till the accuracy improves if in any for loop iteration if no update took place we break out of the while loop and return the selected_features list

In [46]:

def forward_selection(df, target_col):
    selected_features = []
    remaining_features = list(df.columns)
    remaining_features.remove(target_col)
    best_accuracy = 0
    best_features = []

    while remaining_features:
        feature_to_add = None

        for feature in remaining_features:
            features = selected_features + [feature]
            X = df[features].values
            y = df[target_col].values
                
            X_train, X_test, y_train, y_test = train_test_split_manual(X, y, test_size=0.4, random_state=40)
                
            model = MeanClassifier()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            accuracy = np.mean(predictions == y_test)
                
            print(f"Evaluating features: {features}, Accuracy: {accuracy}")
                
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                feature_to_add = [feature]
                best_features = features
        
        if feature_to_add is not None:
            selected_features.extend(feature_to_add)
            for feat in feature_to_add:
                remaining_features.remove(feat)
        else:
            break

    print(f"Best features: {best_features}, Best accuracy: {best_accuracy}")
    return selected_features, best_accuracy

df = pd.read_csv("Sonar.csv")
best_features, best_accuracy = forward_selection(df, 'Class')



Evaluating features: ['V1'], Accuracy: 0.6385542168674698
Evaluating features: ['V2'], Accuracy: 0.6144578313253012
Evaluating features: ['V3'], Accuracy: 0.6144578313253012
Evaluating features: ['V4'], Accuracy: 0.6385542168674698
Evaluating features: ['V5'], Accuracy: 0.6144578313253012
Evaluating features: ['V6'], Accuracy: 0.6265060240963856
Evaluating features: ['V7'], Accuracy: 0.5903614457831325
Evaluating features: ['V8'], Accuracy: 0.5301204819277109
Evaluating features: ['V9'], Accuracy: 0.6265060240963856
Evaluating features: ['V10'], Accuracy: 0.6024096385542169
Evaluating features: ['V11'], Accuracy: 0.6987951807228916
Evaluating features: ['V12'], Accuracy: 0.7590361445783133
Evaluating features: ['V13'], Accuracy: 0.7228915662650602
Evaluating features: ['V14'], Accuracy: 0.6024096385542169
Evaluating features: ['V15'], Accuracy: 0.5783132530120482
Evaluating features: ['V16'], Accuracy: 0.5301204819277109
Evaluating features: ['V17'], Accuracy: 0.4939759036144578
Evalua

# Backward selection

- The function takes the dataframe and the target feature as argument. 
- The function iterates over the features and removes the feature with lowest accuracy score. 
- This function keeps on removing features from selected_features till the accuracy improves if in any for loop iteration if no update took place we break out of the while loop and return the selected_features list

In [47]:
def backward_selection(df, target_col):
    selected_features = list(df.columns)
    selected_features.remove(target_col)
    best_accuracy = 0
    best_features = []

    while selected_features:
        feature_to_remove = None

        for feature in selected_features:
            features = [f for f in selected_features if f != feature]
            X = df[features].values
            y = df[target_col].values
                
            # split
            X_train, X_test, y_train, y_test = train_test_split_manual(X, y, test_size=0.3, random_state=42)
                
            model = MeanClassifier()
            model.fit(X_train, y_train)
            predictions = model.predict(X_test)
            accuracy = np.mean(predictions == y_test)
                
            print(f"Evaluating features: {features}, Accuracy: {accuracy:.4f}")
                
            if accuracy > best_accuracy:
                best_accuracy = accuracy
                feature_to_remove = [feature]
                best_features = features

        if feature_to_remove is not None:
            for feat in feature_to_remove:
                selected_features.remove(feat)
        else:
            break

    print(f"Best features: {best_features}, Best accuracy: {best_accuracy}")
    return selected_features, best_accuracy

df = pd.read_csv("Sonar.csv")
best_features, best_accuracy = backward_selection(df, 'Class')

Evaluating features: ['V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60'], Accuracy: 0.6774
Evaluating features: ['V1', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'V29', 'V30', 'V31', 'V32', 'V33', 'V34', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V41', 'V42', 'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V55', 'V56', 'V57', 'V58', 'V59', 'V60'], Accuracy: 0.6774
Evaluating features: ['V1', 'V2', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V14', 'V15'