# In [None]:
import numpy as np
from numpy.random import default_rng
import pandas as pd
from hyperopt import hp, fmin, tpe, space_eval, STATUS_OK, Trials
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
import warnings

# `method` names the sub-directory of ../input_data that data.csv is read from.
method = 'TCP_LT_AOF'
data = pd.read_csv(f'../input_data/{method}/data.csv')

# Columns of `data` that form the feature matrix.
parameters = [
    'normal_convergence_rate',
    'subducting_ocean_floor_age',
    'obliquity_of_subduction',
    'migration_rate_x_distance',
]

# Feature matrix and the raw target column.
X = data.loc[:, parameters]
y = data['cu_mt']

# Binary labels: 1 where cu_mt is strictly greater than 2, otherwise 0
# (a NaN target compares False and therefore becomes 0).
y_cat = np.where(y.gt(2), 1, 0)

# Hyperparameter search space for hyperopt.
# The outer hp.choice has a single option, so every trial tunes the same
# XGBClassifier. The option pairs the estimator ('model') with the pipeline
# parameters to sample ('params'); the 'model__' prefix routes each value to
# the pipeline step named 'model'.
space = hp.choice('classifiers', [
    {
        # Fixed seed for reproducibility. `random_state` is the documented
        # scikit-learn-API argument; `seed` is only accepted through
        # XGBClassifier's **kwargs, which XGBoost documents as unsupported by
        # scikit-learn (and this estimator is cloned during cross-validation).
        'model': XGBClassifier(random_state=42),
        'params': {
            # hp.choice over a range: each integer is one categorical option.
            'model__max_depth': hp.choice('xgb.max_depth', range(10, 100, 1)),
            # hp.quniform: uniform draw rounded to a multiple of q (a float).
            'model__learning_rate': hp.quniform('xgb.learning_rate', 0.01, 0.5, 0.01),
            'model__n_estimators': hp.choice('xgb.n_estimators', range(10, 1000, 1)),
            'model__reg_lambda': hp.uniform('xgb.reg_lambda', 0, 10),
            'model__reg_alpha': hp.uniform('xgb.reg_alpha', 0, 10),
            'model__gamma': hp.uniform('xgb.gamma', 0, 10),
            'model__min_child_weight': hp.quniform('xgb.min_child_weight', 1, 10, 1),
            # NOTE(review): XGBoost documents subsample and colsample_bytree
            # as (0, 1]; sampling from 0 allows near-zero fractions. Consider
            # raising the lower bound -- left unchanged here to keep the
            # search space as it was.
            'model__subsample': hp.uniform('xgb.subsample', 0, 1),
            'model__colsample_bytree': hp.uniform('xgb.colsample_bytree', 0, 1),
            'model__scale_pos_weight': hp.quniform('xgb.scale_pos_weight', 1, 20, 1)
        }
    }
])

# Objective minimised by hyperopt: the negative mean cross-validated F1 score.
def objective(args):
    """Evaluate one sampled configuration with repeated stratified CV.

    Parameters
    ----------
    args : dict
        A sampled point from the search space: the estimator under 'model'
        and the pipeline parameters to apply under 'params'.

    Returns
    -------
    dict
        'loss' is the negated mean F1 over all folds and 'status' is
        STATUS_OK.
    """
    # Single-step pipeline wrapping the sampled estimator.
    estimator = Pipeline(steps=[('model', args['model'])])
    estimator.set_params(**args['params'])

    # 5 stratified folds repeated 10 times; the splitter is rebuilt with the
    # same seed on every call, so each trial is scored on the same splits.
    cv_scheme = RepeatedStratifiedKFold(n_splits=5, n_repeats=10, random_state=42)
    fold_f1 = cross_val_score(
        estimator,
        X,
        y_cat,
        cv=cv_scheme,
        scoring='f1',
        n_jobs=-1,
    )

    # hyperopt minimises the loss, so the mean F1 is negated.
    mean_f1 = np.mean(fold_f1)
    return {'loss': -mean_f1, 'status': STATUS_OK}

# Seeded generator passed to fmin as its source of randomness.
rng = default_rng(42)
# Container that fmin fills with every evaluated trial.
trials = Trials()

# Run the TPE search for 8000 evaluations of `objective`.
best_classifier = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=8000,
    trials=trials,
    rstate=rng,
)

# fmin reports hp.choice parameters as option indices; space_eval maps the
# result back onto the search space to recover the actual values.
best_params = space_eval(space, best_classifier)
print(best_params['params'])