In [1]:
import optuna
from optuna.samplers import TPESampler 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import svm, linear_model, ensemble
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score

In [3]:
labels = pd.read_csv("antibacterial_lable.csv", index_col=0 )
data = pd.read_csv("mibig(antismash).csv", index_col=0 )

In [4]:
def get_sample_responses(vectors, responses):
    samples = vectors
    index = list(set(samples.index).intersection(set(labels.index)))
    responses = responses.loc[index]
    samples = samples.loc[index]
    return samples, responses

In [5]:
X , y = get_sample_responses(data, labels)
bgc_ids = pd.Series(y.index)

In [6]:
def objective(trial): 
    param = {
        "n_estimators": trial.suggest_int('n_estimators', 1, 200),
        "criterion": trial.suggest_categorical('criterion', ['gini','entropy']),
        "min_samples_split": trial.suggest_int('min_samples_split', 2, 50),
        "min_samples_leaf": trial.suggest_int('min_samples_leaf', 1, 10),
        "max_features": trial.suggest_categorical('max_features', ['auto','sqrt','log2']) ,
        #"oob_score": trial.suggest_categorical('oob_score', [False, True]),
        "class_weight": trial.suggest_categorical('class_weight', ['balanced','balanced_subsample'])
    }
    
    clf = ensemble.ExtraTreesClassifier(**param,bootstrap=False,n_jobs=-1)
    
    acc_kf = []
    
    cv = KFold(n_splits=5,random_state=0,shuffle=True)
    for i, (id_train, id_val) in enumerate(cv.split(bgc_ids)):
        train_ids, val_ids = bgc_ids[id_train], bgc_ids[id_val]
        X_train, X_val = X.loc[train_ids].values, X.loc[val_ids].values
        y_train, y_val = y.loc[train_ids].values, y.loc[val_ids].values
        #print('Train:', len(X_train), 'Test:', len(X_val))
    
        clf.fit(X_train, y_train)
        pred = clf.predict(X_val)
    
        acc = accuracy_score(pred, y_val)
        acc_kf.append(acc)
        
    acc_kf = np.array(acc_kf).mean()
    
    return acc_kf

In [7]:
study = optuna.create_study(direction="maximize",sampler=TPESampler(),pruner=optuna.pruners.HyperbandPruner())

[I 2024-07-01 04:37:50,343] A new study created in memory with name: no-name-1c6e1968-fa81-4527-854d-08b0ba75f33f


In [8]:
%%time
study.optimize(objective, n_trials=1)

[I 2024-07-01 04:37:50,812] Trial 0 finished with value: 0.6526468253968253 and parameters: {'n_estimators': 46, 'criterion': 'entropy', 'min_samples_split': 12, 'min_samples_leaf': 4, 'max_features': 'log2', 'class_weight': 'balanced'}. Best is trial 0 with value: 0.6526468253968253.


CPU times: user 318 ms, sys: 88.6 ms, total: 406 ms
Wall time: 392 ms
