In [42]:
import pandas as pd #DataFrame

from sklearn.preprocessing import LabelEncoder #Encoding our classes (Mines and Rocks)
from sklearn.model_selection import train_test_split, GridSearchCV # Splitting the data for training and testing. GridSearchCV for finding the best combination
from sklearn.model_selection import StratifiedKFold # Stratified K-Fold Cross Validation for stratified sampling
from sklearn.linear_model import LogisticRegression # Model 1
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score, classification_report # For accuracy reports

from imblearn.over_sampling import SMOTE # Over/Under sampling the dataset
import xgboost as xgb # Model 2

In [43]:
df = pd.read_csv(r'E:\datasets\ml\sonar.all-data.csv', header=None)
type(df)
df.head()
df.shape

(208, 61)

In [44]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,50,51,52,53,54,55,56,57,58,59
count,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,...,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0,208.0
mean,0.029164,0.038437,0.043832,0.053892,0.075202,0.10457,0.121747,0.134799,0.178003,0.208259,...,0.016069,0.01342,0.010709,0.010941,0.00929,0.008222,0.00782,0.007949,0.007941,0.006507
std,0.022991,0.03296,0.038428,0.046528,0.055552,0.059105,0.061788,0.085152,0.118387,0.134416,...,0.012008,0.009634,0.00706,0.007301,0.007088,0.005736,0.005785,0.00647,0.006181,0.005031
min,0.0015,0.0006,0.0015,0.0058,0.0067,0.0102,0.0033,0.0055,0.0075,0.0113,...,0.0,0.0008,0.0005,0.001,0.0006,0.0004,0.0003,0.0003,0.0001,0.0006
25%,0.01335,0.01645,0.01895,0.024375,0.03805,0.067025,0.0809,0.080425,0.097025,0.111275,...,0.008425,0.007275,0.005075,0.005375,0.00415,0.0044,0.0037,0.0036,0.003675,0.0031
50%,0.0228,0.0308,0.0343,0.04405,0.0625,0.09215,0.10695,0.1121,0.15225,0.1824,...,0.0139,0.0114,0.00955,0.0093,0.0075,0.00685,0.00595,0.0058,0.0064,0.0053
75%,0.03555,0.04795,0.05795,0.0645,0.100275,0.134125,0.154,0.1696,0.233425,0.2687,...,0.020825,0.016725,0.0149,0.0145,0.0121,0.010575,0.010425,0.01035,0.010325,0.008525
max,0.1371,0.2339,0.3059,0.4264,0.401,0.3823,0.3729,0.459,0.6828,0.7106,...,0.1004,0.0709,0.039,0.0352,0.0447,0.0394,0.0355,0.044,0.0364,0.0439


In [45]:
df[60].value_counts()

60
M    111
R     97
Name: count, dtype: int64

In [46]:
# Preprocessing the data
df.dropna(inplace=True)
X = df.drop(columns=60, axis=1)
Y = df[60]

In [47]:
# Split the dataset into training and testing, 80% for training and 20% for testing, set random state to prevent randomness across different runs
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=42)

### Logistic Regression

In [48]:
# Load logistic Regression
model = LogisticRegression()

In [49]:
# Fitting the data into the model, train it
model.fit(X_train, Y_train)

In [50]:
# Feed in the training data and fetch the accuracy score
training_predict = model.predict(X_train)
training_accuracy = accuracy_score(training_predict, Y_train)

In [51]:
# Print the accuracy on the training dataset
print(f"accuracy when tested with training data = {training_accuracy}")

accuracy when tested with training data = 0.8373493975903614


In [62]:
# Print the classification report
Y_pred_lr = model.predict(X_test)
print(classification_report(Y_test, Y_pred_lr))

              precision    recall  f1-score   support

           M       0.81      0.95      0.88        22
           R       0.94      0.75      0.83        20

    accuracy                           0.86        42
   macro avg       0.87      0.85      0.85        42
weighted avg       0.87      0.86      0.86        42



### Random Forest Classifier

In [52]:
# Import SMOTE with a fixed random state for consistent results across different runs  
sm = SMOTE(random_state=42)

# Apply SMOTE to training data for balanced learning (we have more rocks than mines in the dataset)
X_train_resampled, Y_train_resampled = sm.fit_resample(X_train, Y_train)

In [54]:
# Define hyperparameters for Random Forest
param_grid_rf = {
    'n_estimators': [100, 200, 225, 250, 300],
    'max_depth': [3, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Load the RFC model
rf_model = RandomForestClassifier(random_state=42)

# To prevent random sampling which would result in otherwise bad accuracy, here, recall
stratified_cv = StratifiedKFold(n_splits=10)

# Initialize GridSearchCV with Random Forest and resampled data
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid_rf, cv=stratified_cv, n_jobs=-1, verbose=2)

# Fit the resampled data and train the model
grid_search_rf.fit(X_train_resampled, Y_train_resampled)

Fitting 10 folds for each of 180 candidates, totalling 1800 fits


In [55]:
# Fetch the best params and recall score
best_params_rf = grid_search_rf.best_params_
best_recall_rf = grid_search_rf.best_score_

# Print the params and recall score
print(f"Best parameters (RF): {best_params_rf}")
print(f"Best recall score (RF): {best_recall_rf}")

# Generate the classification report
best_rf_model = grid_search_rf.best_estimator_
Y_pred_rf = best_rf_model.predict(X_test)
print(classification_report(Y_test, Y_pred_rf))

Best parameters (RF): {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Best recall score (RF): 0.888888888888889
              precision    recall  f1-score   support

           M       0.81      1.00      0.90        22
           R       1.00      0.75      0.86        20

    accuracy                           0.88        42
   macro avg       0.91      0.88      0.88        42
weighted avg       0.90      0.88      0.88        42



### XGBoost

In [56]:
# Import SMOTE with a fixed random state for consistent results across different runs  
sm = SMOTE(random_state=42)

# Apply SMOTE to training data for balanced learning (we have more rocks than mines in the dataset)
X_train_resampled, Y_train_resampled = sm.fit_resample(X_train, Y_train)

In [57]:
# Encode classes (M and R)
label_encoder = LabelEncoder()
Y_train_resampled_encoded = label_encoder.fit_transform(Y_train_resampled)
Y_test_encoded = label_encoder.transform(Y_test)

In [58]:
# Print the encoded classes for sanity
for i, class_label in enumerate(label_encoder.classes_):
    print(f"Class {i}: {class_label}")

Class 0: M
Class 1: R


In [63]:
# Init XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False,
                              eval_metric='logloss', # logloss is well suited for classification problems
                              random_state=42)

# Hyperparameters
param_grid_xgb = {
    'n_estimators': [100, 200, 225, 250, 300],
    'max_depth': [3, 5, 10, 20],
    'learning_rate': [0.01, 0.05, 0.1],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.6, 0.8, 1.0],
    'alpha' : [0.0, 0.05, 0.1, 1.0], # L1 regularizer (lasso)
    'lambda' : [0.0, 0.1, 1.0, 5.0, 10.0] # L2 regularizer (Ridge)
}

# To prevent random sampling which would result in otherwise bad accuracy, here, recall
stratified_cv = StratifiedKFold(n_splits=10)

# Initialize GridSearchCV with resampled data and params
grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid_xgb, cv=stratified_cv, n_jobs=-1, verbose=2, scoring='recall')

# Fit the resampl data into XGBoost and train
grid_search_xgb.fit(X_train_resampled, Y_train_resampled_encoded)

Fitting 10 folds for each of 10800 candidates, totalling 108000 fits


Parameters: { "use_label_encoder" } are not used.



In [64]:
# Print the classification report
best_xgb_model = grid_search_xgb.best_estimator_
Y_pred_xgb = best_xgb_model.predict(X_test)
print(classification_report(Y_test_encoded, Y_pred_xgb))

              precision    recall  f1-score   support

           0       0.88      0.95      0.91        22
           1       0.94      0.85      0.89        20

    accuracy                           0.90        42
   macro avg       0.91      0.90      0.90        42
weighted avg       0.91      0.90      0.90        42

