### Models

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import IsolationForest, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from boruta import BorutaPy
from joblib import dump, load
bitter_features = pd.read_csv('output/mean/processed_mean_bitter_train.csv')

replaced_data_mean = bitter_features.iloc[:, 9:]

y = bitter_features['Bitter']

# Features
X = replaced_data_mean

In [2]:
# Load the saved Boruta model
boruta = load('model/boruta/boruta_mean_model.joblib')

# Transform the dataset using the loaded model
X_transformed = boruta.transform(X.values)

In [3]:
significant_feature_names = X.columns[boruta.support_].tolist()

In [4]:
import pickle

file = open('features/bitter/bitter_boruta.p','wb')
pickle.dump(significant_feature_names,file)
file.close()

In [5]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score,
    precision_score, recall_score, f1_score, average_precision_score
)
import numpy as np

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    average_precision_score, confusion_matrix, classification_report
)
# Hyperparameter grid for AdaBoost
adaboost_param_grid = {
    "n_estimators": [300],
    "learning_rate": [0.01, 0.1],
    "algorithm": ["SAMME"]
}

ada = AdaBoostClassifier(random_state=42)

ada_random = RandomizedSearchCV(
    estimator=ada,
    param_distributions=adaboost_param_grid,
    n_iter=20,
    cv=5,
    verbose=2,
    n_jobs=-1,
    random_state=42,
    scoring='roc_auc'
)

ada_random.fit(X_train, y_train)
best_ada = ada_random.best_estimator_

y_pred_ada = best_ada.predict(X_test)
y_pred_proba_ada = best_ada.predict_proba(X_test)[:, 1]

# Evaluation
print("\nAdaBoost:")
tn, fp, fn, tp = confusion_matrix(y_test, y_pred_ada).ravel()
ner = fp / (tn + fp) if (tn + fp) > 0 else 0
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

print("Best Hyperparameters:", ada_random.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred_ada))
print("Precision:", precision_score(y_test, y_pred_ada))
print("Recall:", recall_score(y_test, y_pred_ada))
print("Sensitivity:", recall_score(y_test, y_pred_ada))
print("F1-score:", f1_score(y_test, y_pred_ada))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred_proba_ada))
print("AUPR Score:", average_precision_score(y_test, y_pred_proba_ada))
print("Non Error Rate (NER):", 1 - ner)
print("Specificity (SP):", specificity)
print("Classification Report:\n", classification_report(y_test, y_pred_ada))




Fitting 5 folds for each of 2 candidates, totalling 10 fits

AdaBoost:
Best Hyperparameters: {'n_estimators': 300, 'learning_rate': 0.1, 'algorithm': 'SAMME'}
Accuracy: 0.8451327433628318
Precision: 0.9043478260869565
Recall: 0.6380368098159509
Sensitivity: 0.6380368098159509
F1-score: 0.7482014388489209
ROC-AUC Score: 0.9056297365571995
AUPR Score: 0.8462968836665203
Non Error Rate (NER): 0.9619377162629758
Specificity (SP): 0.9619377162629758
Classification Report:
               precision    recall  f1-score   support

       False       0.82      0.96      0.89       289
        True       0.90      0.64      0.75       163

    accuracy                           0.85       452
   macro avg       0.86      0.80      0.82       452
weighted avg       0.85      0.85      0.84       452



In [8]:
import pickle

file = open('final_model/adaboost.p','wb')
pickle.dump(best_ada,file)
file.close()

In [9]:
import pickle
file = open('final_model/adaboost.p','rb')

best_model =pickle.load(file)

y_pred = best_model.predict(X_test)
y_pred_proba = best_model.predict_proba(X_test)[:, 1]

from sklearn.metrics import confusion_matrix

# Model evaluation
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba)
aupr = average_precision_score(y_test, y_pred_proba)  # AUPR
sensitivity = recall  # Sensitivity (Sn) is the same as recall

# Confusion Matrix
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()

# Calculate NER and Specificity
ner = fp / (tn + fp) if (tn + fp) > 0 else 0  # Avoid division by zero
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0  # Avoid division by zero

# Print results
print("Accuracy:", accuracy)
print("Precision (P):", precision)
print("Recall (R):", recall)
print("Sensitivity (Sn):", sensitivity)
print("F1-score (F1):", f1)
print("ROC-AUC Score (AuROC):", roc_auc)
print("AUPR Score (AuPR):", aupr)
print("Non Error Rate (NER):", 1-ner)
print("Specificity (SP):", specificity)
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.8451327433628318
Precision (P): 0.9043478260869565
Recall (R): 0.6380368098159509
Sensitivity (Sn): 0.6380368098159509
F1-score (F1): 0.7482014388489209
ROC-AUC Score (AuROC): 0.9056297365571995
AUPR Score (AuPR): 0.8462968836665203
Non Error Rate (NER): 0.9619377162629758
Specificity (SP): 0.9619377162629758
Classification Report:
               precision    recall  f1-score   support

       False       0.82      0.96      0.89       289
        True       0.90      0.64      0.75       163

    accuracy                           0.85       452
   macro avg       0.86      0.80      0.82       452
weighted avg       0.85      0.85      0.84       452

