# Rock vs Mine Prediction Model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



In [None]:
sonar_data = pd.read_csv(r"data\sonar.csv", header=None)

print("Shape: ", sonar_data.shape)
print("Label count: ", sonar_data[60].value_counts())

sonar_data.head()


Shape:  (208, 61)
Label count:  60
M    111
R     97
Name: count, dtype: int64


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,0.02,0.0371,0.0428,0.0207,0.0954,0.0986,0.1539,0.1601,0.3109,0.2111,...,0.0027,0.0065,0.0159,0.0072,0.0167,0.018,0.0084,0.009,0.0032,R
1,0.0453,0.0523,0.0843,0.0689,0.1183,0.2583,0.2156,0.3481,0.3337,0.2872,...,0.0084,0.0089,0.0048,0.0094,0.0191,0.014,0.0049,0.0052,0.0044,R
2,0.0262,0.0582,0.1099,0.1083,0.0974,0.228,0.2431,0.3771,0.5598,0.6194,...,0.0232,0.0166,0.0095,0.018,0.0244,0.0316,0.0164,0.0095,0.0078,R
3,0.01,0.0171,0.0623,0.0205,0.0205,0.0368,0.1098,0.1276,0.0598,0.1264,...,0.0121,0.0036,0.015,0.0085,0.0073,0.005,0.0044,0.004,0.0117,R
4,0.0762,0.0666,0.0481,0.0394,0.059,0.0649,0.1209,0.2467,0.3564,0.4459,...,0.0031,0.0054,0.0105,0.011,0.0015,0.0072,0.0048,0.0107,0.0094,R


In [21]:
def BestMLAlgo(x_train, x_test, y_train, y_test):
    """Identify best Algo on given dataset"""
    from prettytable import PrettyTable
    
    table = PrettyTable()
    table.field_names = ["Model", "Accuracy", "Precision", "Recall", "F1Score", "Log loss", "auc_score"]
                        
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.ensemble import GradientBoostingClassifier
    from sklearn.linear_model import SGDClassifier
    from sklearn.calibration import CalibratedClassifierCV
    from sklearn.metrics import confusion_matrix



    import xgboost
    from sklearn import svm
    from sklearn.metrics import precision_score, accuracy_score, roc_auc_score,\
                                roc_curve, auc, log_loss, recall_score
    
    base_models = {
        'kNN': KNeighborsClassifier(),
        'Naive Bayes': MultinomialNB(),
        'Log. Reg.': LogisticRegression(),
        'SVM Linear': SGDClassifier(class_weight='balanced', penalty='l2', loss='hinge', random_state=42),
        'SVM Non-linear': svm.SVC(kernel='rbf'),
        'Decision Tree': DecisionTreeClassifier(),
        'Random Forest': RandomForestClassifier(),
        'Gradient Boost': GradientBoostingClassifier(),
        'Ada Boost': AdaBoostClassifier(),
        'xgboost': xgboost.XGBClassifier(),
    }              
    for model_name, model in base_models.items():
        model.fit(x_train, y_train)
        model = CalibratedClassifierCV(model, method="sigmoid")
        model.fit(x_train, y_train)
        y_pred_proba = model.predict_proba(x_test)
        y_pred = model.predict(x_test)


        # Performance metrics
        accuracy        = round(accuracy_score(y_test, y_pred), 2)
        precision       = round(precision_score(y_test, y_pred, average='micro'), 2)
        recall          = round(recall_score(y_test, y_pred, average='micro'), 2)
        f1_score        = round((2*recall*precision)/(recall+precision), 2)
        loss            = round(log_loss(y_test, y_pred_proba, labels=[0, 1]), 2) # , labels=model.classes 

        fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)

        auc_score = round(auc(fpr, tpr), 2)      

        table.add_row([model_name, accuracy, precision, recall, f1_score, loss, auc_score])
            
    print(table)
    return model

In [7]:
# Seperating data and labels
X = sonar_data.drop(columns=60,axis=1)
Y = sonar_data[60]
Y = [1 if value == "R" else 0 for value in Y]

In [9]:
# Split size = 10
# Caliberated = True

from sklearn import model_selection

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, Y,
                                            test_size = 0.10,
                                            shuffle=False,
                                        )

print(len(X_train), ' ',len(X_test), " ", len(y_train), ' ',len(y_test))

model = BestMLAlgo(X_train, X_test, y_train, y_test)

187   21   187   21




+----------------+----------+-----------+--------+---------+----------+-----------+
|     Model      | Accuracy | Precision | Recall | F1Score | Log loss | auc_score |
+----------------+----------+-----------+--------+---------+----------+-----------+
|      kNN       |   0.19   |    0.19   |  0.19  |   0.19  |   0.79   |    nan    |
|  Naive Bayes   |   0.67   |    0.67   |  0.67  |   0.67  |   0.64   |    nan    |
|   Log. Reg.    |   0.38   |    0.38   |  0.38  |   0.38  |   0.75   |    nan    |
|   SVM Linear   |   0.43   |    0.43   |  0.43  |   0.43  |   0.72   |    nan    |
| SVM Non-linear |   0.38   |    0.38   |  0.38  |   0.38  |   0.8    |    nan    |
| Decision Tree  |   0.19   |    0.19   |  0.19  |   0.19  |   0.77   |    nan    |
| Random Forest  |   0.81   |    0.81   |  0.81  |   0.81  |   0.52   |    nan    |
| Gradient Boost |   0.95   |    0.95   |  0.95  |   0.95  |   0.49   |    nan    |
|   Ada Boost    |   0.62   |    0.62   |  0.62  |   0.62  |   0.63   |    n



In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

def objective(trial):   
    # Suggest hyperparameters
    param = {
        "objective": "binary:logistic",
        "eval_metric": "logloss",
        "verbosity": 0,
        "booster": trial.suggest_categorical("booster", ["gbtree", "dart"]),
        "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
        "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "eta": trial.suggest_loguniform("eta", 1e-8, 1.0),
        "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
        "grow_policy": trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"]),
        "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "n_estimators": trial.suggest_int("n_estimators", 50, 500)
    }
    
    # Train model on training data
    dtrain = xgb.DMatrix(X_train, label=y_train)
    cv_results = xgb.cv(param, dtrain, num_boost_round=100, nfold=3, metrics="f1-micro", early_stopping_rounds=10, seed=42)
    
    # Use the best AUC score
    return cv_results['test-auc-mean'].values[-1]


  from .autonotebook import tqdm as notebook_tqdm


In [27]:
# Split your data
X_train, X_valid, y_train, y_valid = train_test_split(X, Y, test_size=0.1, random_state=42)

# Initialize and run the study
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

# Get the best parameters
print("Best trial:", study.best_trial.params)


[I 2024-11-08 17:19:55,030] A new study created in memory with name: no-name-c2d52823-75cc-46be-b173-2c5cc4f52b04
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.suggest_loguniform("alpha", 1e-8, 1.0),
  "eta": trial.suggest_loguniform("eta", 1e-8, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 1.0),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
[I 2024-11-08 17:19:56,912] Trial 0 finished with value: 0.9020393051066667 and parameters: {'booster': 'dart', 'lambda': 9.911235464888358e-06, 'alpha': 0.06266648394518015, 'max_depth': 6, 'eta': 0.04212751523391623, 'gamma': 5.314634771586566e-07, 'grow_policy': 'lossguide', 'subsample': 0.8562911571314941, 'colsample_bytree': 0.563810638890405, 'min_child_weight': 2, 'n_estimators': 178}. Best is trial 0 with value: 0.9020393051066667.
  "lambda": trial.suggest_loguniform("lambda", 1e-8, 1.0),
  "alpha": trial.sugg

Best trial: {'booster': 'gbtree', 'lambda': 0.00016733845344685155, 'alpha': 0.24509999273384425, 'max_depth': 8, 'eta': 0.3087480585194425, 'gamma': 1.3223557183586e-07, 'grow_policy': 'lossguide', 'subsample': 0.9462893816785719, 'colsample_bytree': 0.5936181272356564, 'min_child_weight': 2, 'n_estimators': 241}


In [29]:
params= {'booster': 'gbtree', 'lambda': 0.00016733845344685155, 'alpha': 0.24509999273384425, 'max_depth': 8, 'eta': 0.3087480585194425, 'gamma': 1.3223557183586e-07, 'grow_policy': 'lossguide', 'subsample': 0.9462893816785719, 'colsample_bytree': 0.5936181272356564, 'min_child_weight': 2, 'n_estimators': 241}

In [33]:
import xgboost as xgb

from prettytable import PrettyTable
from sklearn.metrics import precision_score, accuracy_score, roc_auc_score,\
                            roc_curve, auc, log_loss, recall_score

table = PrettyTable()
table.field_names = ["Model", "Accuracy", "Precision", "Recall", "F1Score", "Log loss", "auc_score"]
                    


# Define your XGBoost model with the optimal parameters
model = xgb.XGBClassifier(**params)

# Fit the model on training data
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
# Calculate accuracy, F1 score, or any other metric based on your requirements


# Performance metrics
accuracy        = round(accuracy_score(y_test, y_pred), 2)
precision       = round(precision_score(y_test, y_pred, average='micro'), 2)
recall          = round(recall_score(y_test, y_pred, average='micro'), 2)
f1_score        = round((2*recall*precision)/(recall+precision), 2)
# loss            = round(log_loss(y_test, y_pred_proba, labels=[0, 1]), 2) # , labels=model.classes 
loss = 0
fpr, tpr, thresholds = roc_curve(y_test, y_pred, pos_label=1)

auc_score = round(auc(fpr, tpr), 2)      

table.add_row(["model", accuracy, precision, recall, f1_score, loss, auc_score])
table




Model,Accuracy,Precision,Recall,F1Score,Log loss,auc_score
model,1.0,1.0,1.0,1.0,0,
