In [None]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error, r2_score, explained_variance_score, median_absolute_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.svm import SVC 
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import folium
import pandas as pd
from datetime import datetime
import requests
from io import StringIO

In [None]:
import numpy as np
import pandas as pd
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss, confusion_matrix
from imblearn.over_sampling import ADASYN
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

In [None]:

# USGS FDSN event web-service endpoint
api_url = "https://earthquake.usgs.gov/fdsnws/event/1/query"

# Upper bound of the query window: the current UTC time, ISO-8601 formatted
current_time = datetime.utcnow().strftime('%Y-%m-%dT%H:%M:%S')

# Query: every event of magnitude >= 3 since 1960 within 500 km of (43.2, 78.7),
# returned as CSV ordered by origin time
params = {
    'format': 'csv',
    'starttime': '1960-01-01',
    'endtime': current_time,
    'latitude': 43.2,
    'longitude': 78.7,
    'maxradiuskm': 500,
    'minmagnitude': 3,
    'orderby': 'time'
}

# A timeout keeps the cell from hanging forever if the service stalls
response = requests.get(api_url, params=params, timeout=120)

# Fail loudly on anything but HTTP 200: the following cells need `data`, so
# merely printing the error would only postpone the failure to a NameError.
if response.status_code != 200:
    raise RuntimeError(f"Error: {response.status_code} - {response.text}")

# Parse the CSV payload into a DataFrame
data = pd.read_csv(StringIO(response.text))

# Keep a local snapshot of the raw catalogue
data.to_csv('earthquake_dataset_shelek.csv', index=False)

# Show the first few rows as a sanity check
print(data.head())

In [None]:
# Parse the origin timestamps once and derive the calendar parts from that result
event_time = pd.to_datetime(data['time'])

# Working frame: location, depth, origin time with its calendar parts, and magnitude
df = pd.DataFrame({
    'latitude': data['latitude'],
    'longitude': data['longitude'],
    'depth': data['depth'],
    'time': event_time,
    'year': event_time.dt.year,
    'month': event_time.dt.month,
    'day': event_time.dt.day,
    'hour': event_time.dt.hour,
    'magnitude': data['mag'],
})

In [None]:
# The catalogue was requested with orderby='time', which the USGS service
# returns newest-first. Put the events in chronological order so that the
# "previous rows" used below really are earlier events.
df = df.sort_values('time').reset_index(drop=True)

# Mean magnitude of up to 10 preceding events. shift(1) keeps the current
# event's own magnitude out of its feature: that magnitude later becomes the
# classification target, so including it would leak the label.
df['rolling_mean_magnitude'] = df['magnitude'].shift(1).rolling(window=10, min_periods=1).mean()

# Hours elapsed since the previous event (NaN for the first event)
df['time_since_last_hour'] = df['time'].diff().dt.total_seconds().div(3600)

In [None]:
# Missing values per column before filling
print(df.isna().sum())

# Replace every missing value with 0
df = df.fillna(value=0)

# Missing values per column after filling
print(df.isna().sum())


In [None]:
from geopy.distance import geodesic


In [None]:
# Events at or above this magnitude are treated as "significant"
significant_threshold = 6.0
is_significant = df['magnitude'] >= significant_threshold
significant_earthquakes = df.loc[is_significant].copy()

# Make sure both frames carry real datetimes in 'time'
df['time'] = pd.to_datetime(df['time'])
significant_earthquakes['time'] = pd.to_datetime(significant_earthquakes['time'])

In [None]:
def find_closest_earthquake(row, significant_df):
    """Find the significant earthquake geographically closest to one event.

    Parameters
    ----------
    row : mapping-like
        One event; its 'latitude', 'longitude' and 'time' entries are read.
    significant_df : pandas.DataFrame
        Reference events with 'latitude', 'longitude' and 'time' columns.

    Returns
    -------
    pandas.Series
        Four positional values: the closest reference event's time, its index
        label in ``significant_df``, the geodesic distance to it in
        kilometres, and the absolute time difference to it in days.
        All four are missing (NaT/NaN) when ``significant_df`` is empty.

    Notes
    -----
    NOTE(review): if ``row`` is itself one of the reference events it matches
    itself (distance 0), and the match may lie later in time than ``row`` -
    confirm that this is intended for the downstream features.
    """
    # Without reference events there is nothing to measure against; return
    # placeholders instead of failing inside idxmin / .loc.
    if len(significant_df) == 0:
        return pd.Series([pd.NaT, np.nan, np.nan, np.nan])

    # The event's own coordinates do not change per reference row
    event_location = (row['latitude'], row['longitude'])

    # Geodesic distance (km) from every reference event to this event
    distances = significant_df.apply(
        lambda x: geodesic((x['latitude'], x['longitude']), event_location).kilometers,
        axis=1,
    )
    min_distance_index = distances.idxmin()
    closest_earthquake = significant_df.loc[min_distance_index]

    # Absolute time gap in days (86400 seconds per day)
    time_difference = abs((closest_earthquake['time'] - row['time']).total_seconds() / 86400)

    return pd.Series([closest_earthquake['time'], min_distance_index, distances[min_distance_index], time_difference])

# For every event, attach the nearest significant earthquake's time, index label,
# distance to it, and time difference to it
closest_eq_columns = ['closest_eq_time', 'closest_eq_index', 'distance_to_closest_eq', 'time_diff_to_closest_eq']
df[closest_eq_columns] = df.apply(
    lambda event: find_closest_earthquake(event, significant_df=significant_earthquakes),
    axis=1,
)

In [None]:
# Missing values per column after adding the closest-earthquake features
print(df.isna().sum(axis=0))


In [None]:
# df = pd.read_csv('dataset with new features.csv')

In [None]:
# Use the event time as the row index (set_index returns a new frame by default)
df = df.set_index('time')
df

In [None]:
# Remove the columns that are not used as model features
df = df.drop(columns=['hour', 'closest_eq_time', 'closest_eq_index'])


In [None]:
# Binary target: 1 when magnitude >= 4.5, otherwise 0
df['magnitude'] = (df['magnitude'] >= 4.5).astype('int64')
df

In [None]:
# Bar chart of how many events fall into each target class
class_counts = df['magnitude'].value_counts()

fig, ax = plt.subplots()
ax.bar(class_counts.index, class_counts.values)
ax.set_xlabel('Class')
ax.set_ylabel('Count')
ax.set_title('Class Distribution')
ax.set_xticks([0, 1])
ax.set_xticklabels(['Magnitude < 4.5', 'Magnitude >= 4.5'])
plt.show()


In [None]:
# Features are every column except the binary target
X = df.drop('magnitude', axis=1)
y = df['magnitude']

# Fit the scaler on training rows only, so that test-set statistics do not
# leak into the features. The row positions come from the same
# train_test_split configuration (test_size=0.3, random_state=0) used for the
# actual split of X and y, which selects the same rows for the same number of
# samples - keep the two calls in sync.
train_positions, _ = train_test_split(np.arange(len(X)), test_size=0.3, random_state=0)

scaler = RobustScaler()
scaler.fit(X.iloc[train_positions])

# Apply the training-set scaling to all rows
X = pd.DataFrame(scaler.transform(X), columns=X.columns)
#print(X)


In [None]:
# Hold out 30% of the events for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Search space for Bayesian optimisation of the k-NN classifier
param_space = {
    'n_neighbors': (5, 300),  # Adjust the upper limit accordingly
    'weights': ['uniform', 'distance'],
    'p': (1, 2),  # power of the Minkowski metric
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}

# Initialize the k-Nearest Neighbors classifier
knn = KNeighborsClassifier()

# Bayesian optimisation over the space above, maximising mean 5-fold CV accuracy.
# random_state makes the sequence of sampled configurations repeatable.
bayes_search = BayesSearchCV(
    knn,
    param_space,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-3,
    random_state=0
)

# NOTE(review): presumably a shim for scikit-optimize versions that still use
# the `np.int` alias, which newer NumPy no longer provides - confirm and drop
# once the installed skopt no longer needs it.
np.int = int

# Run the search on the training split
bayes_search.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
best_params = bayes_search.best_params_
best_score = bayes_search.best_score_

print("Best Hyperparameters:", best_params)
print("Best Train Score:", best_score)

# Estimator refitted on the whole training split with the best configuration
best_model = bayes_search.best_estimator_

# Hard predictions and positive-class probabilities on the test split
y_pred = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# ROC-AUC is a ranking metric: score it on probabilities, not on hard labels
roc_auc = roc_auc_score(y_test, y_pred_prob)

# Log loss likewise needs probabilities
logloss = log_loss(y_test, y_pred_prob)

# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("AUC-ROC:", roc_auc)
print("Log Loss:", logloss)
print(f'True Positives (TP): {TP}')
print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')

# Accuracy on the training split, to compare against the test accuracy
y_pred_train = best_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_pred_train)
print("Train accuracy:", train_accuracy)

# Class balance of the test split
class_counts = y_test.value_counts()
print(class_counts)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Start the results table with the KNN row
df_results = pd.DataFrame({
    "Model": ["KNN"],
    "Best Train Score": [best_score],
    "Train Accuracy": [train_accuracy],
    "Test Accuracy": [accuracy],
    "Precision": [precision],
    "Recall": [recall],
    "F1 Score": [f1],
    "AUC-ROC": [roc_auc],
    "Log Loss": [logloss],
    "TP": [TP],
    "TN": [TN],
    "FP": [FP],
    "FN": [FN],
})

In [None]:
# Search space for Bayesian optimisation of the SVM classifier.
# NOTE(review): 'degree' and 'coef0' look inert here, since the SVC docs
# describe them for the poly/sigmoid kernels and only 'rbf' and 'linear' are
# searched - confirm before removing.
param_space_svm = {
    'C': (0.001, 100),
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto'],
    'degree': (1, 5),
    'coef0': (0, 1)
}

# Initialize SVM Classifier
svm_classifier = SVC()

# Bayesian optimisation maximising mean 5-fold CV accuracy.
# random_state makes the sequence of sampled configurations repeatable.
bayes_search_svm = BayesSearchCV(
    svm_classifier,
    param_space_svm,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-3,
    random_state=0
)

# Run the search on the training split
bayes_search_svm.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
best_params_svm = bayes_search_svm.best_params_
best_score_svm = bayes_search_svm.best_score_

print("Best Hyperparameters (SVM):", best_params_svm)
print("Best Train Score (SVM):", best_score_svm)

# Estimator refitted on the whole training split with the best configuration
best_model_svm = bayes_search_svm.best_estimator_

# Hard predictions on the test split
y_pred_svm = best_model_svm.predict(X_test)
precision_svm = precision_score(y_test, y_pred_svm)
recall_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

# ROC-AUC is a ranking metric, so the signed margins can be used directly
y_score_svm = best_model_svm.decision_function(X_test)
roc_auc_svm = roc_auc_score(y_test, y_score_svm)

# Log loss needs probabilities, and decision_function margins are not
# probabilities. Refit a copy of the tuned model with probability estimates
# enabled, once, so the search itself does not pay for calibration.
proba_model_svm = SVC(**best_model_svm.get_params())
proba_model_svm.set_params(probability=True, random_state=0)
proba_model_svm.fit(X_train, y_train)
y_pred_prob_svm = proba_model_svm.predict_proba(X_test)[:, 1]
logloss_svm = log_loss(y_test, y_pred_prob_svm)

# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid
conf_matrix = confusion_matrix(y_test, y_pred_svm, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("Precision (SVM):", precision_svm)
print("Recall (SVM):", recall_svm)
print("F1 Score (SVM):", f1_svm)
print("AUC-ROC (SVM):", roc_auc_svm)
print("Log Loss (SVM):", logloss_svm)
print(f'True Positives (TP) (SVM): {TP}')
print(f'True Negatives (TN) (SVM): {TN}')
print(f'False Positives (FP) (SVM): {FP}')
print(f'False Negatives (FN) (SVM): {FN}')

# Accuracy on the training split, to compare against the test accuracy
y_pred_train_svm = best_model_svm.predict(X_train)
train_accuracy_svm = accuracy_score(y_train, y_pred_train_svm)
print("Train accuracy (SVM):", train_accuracy_svm)

# Class balance of the test split
class_counts_svm = y_test.value_counts()
print(class_counts_svm)

accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("Accuracy (SVM):", accuracy_svm)

# Append the SVM row to the results table
df_results = pd.concat([df_results, pd.DataFrame({
    "Model": ["SVM"],
    "Best Train Score": [best_score_svm],
    "Train Accuracy": [train_accuracy_svm],
    "Test Accuracy": [accuracy_svm],
    "Precision": [precision_svm],
    "Recall": [recall_svm],
    "F1 Score": [f1_svm],
    "AUC-ROC": [roc_auc_svm],
    "Log Loss": [logloss_svm],
    "TP": [TP],
    "TN": [TN],
    "FP": [FP],
    "FN": [FN],
})], ignore_index=True)

In [None]:
# Search space for Bayesian optimisation of the AdaBoost classifier.
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases -
# confirm the installed version still accepts it.
param_space_ada = {
    'n_estimators': (50, 500),
    'learning_rate': (0.01, 10),
    'algorithm': ['SAMME', 'SAMME.R']
}

# Initialize AdaBoost Classifier (seeded so repeated runs give the same model)
ada_classifier = AdaBoostClassifier(random_state=0)

# Bayesian optimisation maximising mean 5-fold CV accuracy.
# random_state makes the sequence of sampled configurations repeatable.
bayes_search_ada = BayesSearchCV(
    ada_classifier,
    param_space_ada,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-3,
    random_state=0
)

# Run the search on the training split
bayes_search_ada.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
best_params_ada = bayes_search_ada.best_params_
best_score_ada = bayes_search_ada.best_score_

print("Best Hyperparameters (AdaBoost):", best_params_ada)
print("Best Train Score (AdaBoost):", best_score_ada)

# Estimator refitted on the whole training split with the best configuration
best_model_ada = bayes_search_ada.best_estimator_

# Hard predictions and positive-class probabilities on the test split
y_pred_ada = best_model_ada.predict(X_test)
y_pred_prob_ada = best_model_ada.predict_proba(X_test)[:, 1]

precision_ada = precision_score(y_test, y_pred_ada)
recall_ada = recall_score(y_test, y_pred_ada)
f1_ada = f1_score(y_test, y_pred_ada)

# ROC-AUC is a ranking metric: score it on probabilities, not on hard labels
roc_auc_ada = roc_auc_score(y_test, y_pred_prob_ada)

# Log loss likewise needs probabilities
logloss_ada = log_loss(y_test, y_pred_prob_ada)

# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid
conf_matrix = confusion_matrix(y_test, y_pred_ada, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("Precision (AdaBoost):", precision_ada)
print("Recall (AdaBoost):", recall_ada)
print("F1 Score (AdaBoost):", f1_ada)
print("AUC-ROC (AdaBoost):", roc_auc_ada)
print("Log Loss (AdaBoost):", logloss_ada)
print(f'True Positives (TP) (AdaBoost): {TP}')
print(f'True Negatives (TN) (AdaBoost): {TN}')
print(f'False Positives (FP) (AdaBoost): {FP}')
print(f'False Negatives (FN) (AdaBoost): {FN}')

# Accuracy on the training split, to compare against the test accuracy
y_pred_train_ada = best_model_ada.predict(X_train)
train_accuracy_ada = accuracy_score(y_train, y_pred_train_ada)
print("Train accuracy (AdaBoost):", train_accuracy_ada)

# Class balance of the test split
class_counts_ada = y_test.value_counts()
print(class_counts_ada)

accuracy_ada = accuracy_score(y_test, y_pred_ada)
print("Accuracy (AdaBoost):", accuracy_ada)

# Append the AdaBoost row to the results table
df_results = pd.concat([df_results, pd.DataFrame({
    "Model": ["AdaBoost"],
    "Best Train Score": [best_score_ada],
    "Train Accuracy": [train_accuracy_ada],
    "Test Accuracy": [accuracy_ada],
    "Precision": [precision_ada],
    "Recall": [recall_ada],
    "F1 Score": [f1_ada],
    "AUC-ROC": [roc_auc_ada],
    "Log Loss": [logloss_ada],
    "TP": [TP],
    "TN": [TN],
    "FP": [FP],
    "FN": [FN],
})], ignore_index=True)

In [None]:
# Search space for Bayesian optimisation of the Gradient Boosting classifier
param_space_gb = {
    'n_estimators': (50, 500),
    'learning_rate': (0.01, 10),
    'max_depth': (1, 50),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Gradient Boosting Classifier (seeded so repeated runs give the same model)
gb_classifier = GradientBoostingClassifier(random_state=0)

# Bayesian optimisation maximising mean 5-fold CV accuracy.
# random_state makes the sequence of sampled configurations repeatable.
bayes_search_gb = BayesSearchCV(
    gb_classifier,
    param_space_gb,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-3,
    random_state=0
)

# Run the search on the training split
bayes_search_gb.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
best_params_gb = bayes_search_gb.best_params_
best_score_gb = bayes_search_gb.best_score_

print("Best Hyperparameters (Gradient Boosting):", best_params_gb)
print("Best Train Score (Gradient Boosting):", best_score_gb)

# Estimator refitted on the whole training split with the best configuration
best_model_gb = bayes_search_gb.best_estimator_

# Hard predictions and positive-class probabilities on the test split
y_pred_gb = best_model_gb.predict(X_test)
y_pred_prob_gb = best_model_gb.predict_proba(X_test)[:, 1]

precision_gb = precision_score(y_test, y_pred_gb)
recall_gb = recall_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)

# ROC-AUC is a ranking metric: score it on probabilities, not on hard labels
roc_auc_gb = roc_auc_score(y_test, y_pred_prob_gb)

# Log loss likewise needs probabilities
logloss_gb = log_loss(y_test, y_pred_prob_gb)

# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid
conf_matrix = confusion_matrix(y_test, y_pred_gb, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("Precision (Gradient Boosting):", precision_gb)
print("Recall (Gradient Boosting):", recall_gb)
print("F1 Score (Gradient Boosting):", f1_gb)
print("AUC-ROC (Gradient Boosting):", roc_auc_gb)
print("Log Loss (Gradient Boosting):", logloss_gb)
print(f'True Positives (TP) (Gradient Boosting): {TP}')
print(f'True Negatives (TN) (Gradient Boosting): {TN}')
print(f'False Positives (FP) (Gradient Boosting): {FP}')
print(f'False Negatives (FN) (Gradient Boosting): {FN}')

# Accuracy on the training split, to compare against the test accuracy
y_pred_train_gb = best_model_gb.predict(X_train)
train_accuracy_gb = accuracy_score(y_train, y_pred_train_gb)
print("Train accuracy (Gradient Boosting):", train_accuracy_gb)

# Class balance of the test split
class_counts_gb = y_test.value_counts()
print(class_counts_gb)

accuracy_gb = accuracy_score(y_test, y_pred_gb)
print("Accuracy (Gradient Boosting):", accuracy_gb)

# Append the Gradient Boosting row to the results table
df_results = pd.concat([df_results, pd.DataFrame({
    "Model": ["Gradient Boosting"],
    "Best Train Score": [best_score_gb],
    "Train Accuracy": [train_accuracy_gb],
    "Test Accuracy": [accuracy_gb],
    "Precision": [precision_gb],
    "Recall": [recall_gb],
    "F1 Score": [f1_gb],
    "AUC-ROC": [roc_auc_gb],
    "Log Loss": [logloss_gb],
    "TP": [TP],
    "TN": [TN],
    "FP": [FP],
    "FN": [FN],
})], ignore_index=True)

In [None]:
# Search space for Bayesian optimisation of logistic regression.
# NOTE(review): 'multi_class' is deprecated in recent scikit-learn releases -
# confirm the installed version still accepts it.
param_space = {
    'C': (1e-6, 1e+6, 'log-uniform'),  # Regularization parameter
    'penalty': ['l2'],  # Penalty type
    'solver': ['lbfgs'],  # Optimization algorithm
    'class_weight': ['balanced', None],  # Class weights for imbalanced data
    'multi_class': ['ovr', 'multinomial'],  # Multiclass strategy
    'warm_start': [True, False]  # Reuse solution of previous call
}

# Initialize Logistic Regression
logreg = LogisticRegression()

# Bayesian optimisation maximising mean 5-fold CV accuracy.
# random_state makes the sequence of sampled configurations repeatable.
bayes_search_logreg = BayesSearchCV(
    logreg,
    param_space,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-3,
    random_state=0
)

# Run the search on the training split
bayes_search_logreg.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
best_params_logreg = bayes_search_logreg.best_params_
best_score_logreg = bayes_search_logreg.best_score_

print("Best Hyperparameters:", best_params_logreg)
print("Best Train Score:", best_score_logreg)

# Estimator refitted on the whole training split with the best configuration
best_model_logreg = bayes_search_logreg.best_estimator_

# Hard predictions and positive-class probabilities on the test split
y_pred_logreg = best_model_logreg.predict(X_test)
y_pred_prob_logreg = best_model_logreg.predict_proba(X_test)[:, 1]

precision_logreg = precision_score(y_test, y_pred_logreg)
recall_logreg = recall_score(y_test, y_pred_logreg)
f1_logreg = f1_score(y_test, y_pred_logreg)

# ROC-AUC is a ranking metric: score it on probabilities, not on hard labels
roc_auc_logreg = roc_auc_score(y_test, y_pred_prob_logreg)

# Log loss likewise needs probabilities
logloss_logreg = log_loss(y_test, y_pred_prob_logreg)

# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid
conf_matrix = confusion_matrix(y_test, y_pred_logreg, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("Precision:", precision_logreg)
print("Recall:", recall_logreg)
print("F1 Score:", f1_logreg)
print("AUC-ROC:", roc_auc_logreg)
print("Log Loss:", logloss_logreg)
print(f'True Positives (TP): {TP}')
print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')

# Accuracy on the training split, to compare against the test accuracy
y_pred_train_logreg = best_model_logreg.predict(X_train)
train_accuracy_logreg = accuracy_score(y_train, y_pred_train_logreg)
print("Train accuracy:", train_accuracy_logreg)

# Class balance of the test split
class_counts_logreg = y_test.value_counts()
print(class_counts_logreg)

accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Accuracy:", accuracy_logreg)

# Append the Logistic Regression row to the results table
df_results = pd.concat([df_results, pd.DataFrame({
    "Model": ["Logistic Regression"],
    "Best Train Score": [best_score_logreg],
    "Train Accuracy": [train_accuracy_logreg],
    "Test Accuracy": [accuracy_logreg],
    "Precision": [precision_logreg],
    "Recall": [recall_logreg],
    "F1 Score": [f1_logreg],
    "AUC-ROC": [roc_auc_logreg],
    "Log Loss": [logloss_logreg],
    "TP": [TP],
    "TN": [TN],
    "FP": [FP],
    "FN": [FN],
})], ignore_index=True)


In [None]:
# Search space for Bayesian optimisation of the decision tree classifier
param_space_dt = {
    'max_depth': (1, 50),  # Adjust the upper limit accordingly
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 10),
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Decision Tree Classifier (seeded so repeated runs give the same tree)
dt_classifier = DecisionTreeClassifier(random_state=0)

# Bayesian optimisation maximising mean 5-fold CV accuracy.
# random_state makes the sequence of sampled configurations repeatable.
bayes_search_dt = BayesSearchCV(
    dt_classifier,
    param_space_dt,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-3,
    random_state=0
)

# Run the search on the training split
bayes_search_dt.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
best_params_dt = bayes_search_dt.best_params_
best_score_dt = bayes_search_dt.best_score_

print("Best Hyperparameters:", best_params_dt)
print("Best Train Score:", best_score_dt)

# Estimator refitted on the whole training split with the best configuration
best_model_dt = bayes_search_dt.best_estimator_

# Hard predictions and positive-class probabilities on the test split
y_pred_dt = best_model_dt.predict(X_test)
y_pred_prob_dt = best_model_dt.predict_proba(X_test)[:, 1]

precision_dt = precision_score(y_test, y_pred_dt)
recall_dt = recall_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

# ROC-AUC is a ranking metric: score it on probabilities, not on hard labels
roc_auc_dt = roc_auc_score(y_test, y_pred_prob_dt)

# Log loss likewise needs probabilities
logloss_dt = log_loss(y_test, y_pred_prob_dt)

# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid
conf_matrix = confusion_matrix(y_test, y_pred_dt, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("Precision:", precision_dt)
print("Recall:", recall_dt)
print("F1 Score:", f1_dt)
print("AUC-ROC:", roc_auc_dt)
print("Log Loss:", logloss_dt)
print(f'True Positives (TP): {TP}')
print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')

# Accuracy on the training split, to compare against the test accuracy
y_pred_train_dt = best_model_dt.predict(X_train)
train_accuracy_dt = accuracy_score(y_train, y_pred_train_dt)
print("Train accuracy:", train_accuracy_dt)

# Class balance of the test split
class_counts_dt = y_test.value_counts()
print(class_counts_dt)

accuracy_dt = accuracy_score(y_test, y_pred_dt)
print("Accuracy:", accuracy_dt)

# Append the Decision Tree row to the results table
df_results = pd.concat([df_results, pd.DataFrame({
    "Model": ["Decision Tree"],
    "Best Train Score": [best_score_dt],
    "Train Accuracy": [train_accuracy_dt],
    "Test Accuracy": [accuracy_dt],
    "Precision": [precision_dt],
    "Recall": [recall_dt],
    "F1 Score": [f1_dt],
    "AUC-ROC": [roc_auc_dt],
    "Log Loss": [logloss_dt],
    "TP": [TP],
    "TN": [TN],
    "FP": [FP],
    "FN": [FN],
})], ignore_index=True)


In [None]:
# Search space for Bayesian optimisation of the random forest classifier
param_space_rf = {
    'n_estimators': (10, 300),
    'max_depth': (1, 50),
    'min_samples_split': (2, 50),
    'min_samples_leaf': (1, 50),
    'max_features': ['sqrt', 'log2', None]
}

# Initialize Random Forest Classifier (seeded so repeated runs give the same forest)
rf_classifier = RandomForestClassifier(random_state=0)

# Bayesian optimisation maximising mean 5-fold CV accuracy.
# random_state makes the sequence of sampled configurations repeatable.
bayes_search_rf = BayesSearchCV(
    rf_classifier,
    param_space_rf,
    n_iter=50,
    cv=5,
    scoring='accuracy',
    n_jobs=-3,
    random_state=0
)

# Run the search on the training split
bayes_search_rf.fit(X_train, y_train)

# Best configuration and its mean cross-validated accuracy
best_params_rf = bayes_search_rf.best_params_
best_score_rf = bayes_search_rf.best_score_

print("Best Hyperparameters:", best_params_rf)
print("Best Train Score:", best_score_rf)

# Estimator refitted on the whole training split with the best configuration
best_model_rf = bayes_search_rf.best_estimator_

# Hard predictions and positive-class probabilities on the test split
y_pred_rf = best_model_rf.predict(X_test)
y_pred_prob_rf = best_model_rf.predict_proba(X_test)[:, 1]

precision_rf = precision_score(y_test, y_pred_rf)
recall_rf = recall_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

# ROC-AUC is a ranking metric: score it on probabilities, not on hard labels
roc_auc_rf = roc_auc_score(y_test, y_pred_prob_rf)

# Log loss likewise needs probabilities
logloss_rf = log_loss(y_test, y_pred_prob_rf)

# labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid
conf_matrix = confusion_matrix(y_test, y_pred_rf, labels=[0, 1])
TN, FP, FN, TP = conf_matrix.ravel()

print("Precision:", precision_rf)
print("Recall:", recall_rf)
print("F1 Score:", f1_rf)
print("AUC-ROC:", roc_auc_rf)
print("Log Loss:", logloss_rf)
print(f'True Positives (TP): {TP}')
print(f'True Negatives (TN): {TN}')
print(f'False Positives (FP): {FP}')
print(f'False Negatives (FN): {FN}')

# Accuracy on the training split, to compare against the test accuracy
y_pred_train_rf = best_model_rf.predict(X_train)
train_accuracy_rf = accuracy_score(y_train, y_pred_train_rf)
print("Train accuracy:", train_accuracy_rf)

# Class balance of the test split (printed like in the other model cells)
class_counts_rf = y_test.value_counts()
print(class_counts_rf)

accuracy_rf = accuracy_score(y_test, y_pred_rf)
print("Accuracy:", accuracy_rf)

# Append the Random Forest row to the results table
df_results = pd.concat([df_results, pd.DataFrame({
    "Model": ["Random Forest"],
    "Best Train Score": [best_score_rf],
    "Train Accuracy": [train_accuracy_rf],
    "Test Accuracy": [accuracy_rf],
    "Precision": [precision_rf],
    "Recall": [recall_rf],
    "F1 Score": [f1_rf],
    "AUC-ROC": [roc_auc_rf],
    "Log Loss": [logloss_rf],
    "TP": [TP],
    "TN": [TN],
    "FP": [FP],
    "FN": [FN],
})], ignore_index=True)

In [None]:
# Display the accumulated metrics table (one row per model appended to df_results)
df_results