In [1]:
import optuna
from optuna.samplers import TPESampler 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import svm, linear_model, ensemble
from sklearn import model_selection
import warnings
warnings.filterwarnings("ignore")
from sklearn.metrics import accuracy_score

In [3]:
labels = pd.read_csv("lable(3.1).csv", index_col=0 )
data = pd.read_csv("mibig(3.1).csv", index_col=0 )

In [4]:
def get_sample_responses(vectors, responses):
    samples = vectors
    index = list(set(samples.index).intersection(set(labels.index)))
    responses = responses.loc[index]
    samples = samples.loc[index]
    return samples, responses

In [5]:
X , y = get_sample_responses(data, labels)
bgc_ids = pd.Series(y.index)

In [6]:
def objective(trial): 
    param = {
        "alpha": trial.suggest_uniform('alpha',0.1,10.0),
        "normalize": trial.suggest_categorical('normalize', [False, True]),
        "copy_X": trial.suggest_categorical('copy_X', [False, True]),
        "class_weight": trial.suggest_categorical('class_weight', ['balanced',None]),
        "solver": trial.suggest_categorical('solver', ['auto','svd','cholesky','lsqr','sparse_cg','sag','saga']) 
    }
    
    clf = linear_model.RidgeClassifier(**param)
    
    acc_kf = []
    
    cv = KFold(n_splits=5,random_state=0,shuffle=True)
    for i, (id_train, id_val) in enumerate(cv.split(bgc_ids)):
        train_ids, val_ids = bgc_ids[id_train], bgc_ids[id_val]
        X_train, X_val = X.loc[train_ids].values, X.loc[val_ids].values
        y_train, y_val = y.loc[train_ids].values, y.loc[val_ids].values
        #print('Train:', len(X_train), 'Test:', len(X_val))
    
        clf.fit(X_train, y_train)
        pred = clf.predict(X_val)
    
        acc = accuracy_score(pred, y_val)
        acc_kf.append(acc)
        
    acc_kf = np.array(acc_kf).mean()
    
    return acc_kf

In [7]:
study = optuna.create_study(direction="maximize",sampler=TPESampler(),pruner=optuna.pruners.HyperbandPruner())

[I 2024-07-01 04:39:35,458] A new study created in memory with name: no-name-dd5d284f-2d3e-468b-9ba2-3864dbf7f20e


In [8]:
%%time
study.optimize(objective, n_trials=1)

[I 2024-07-01 04:39:48,560] Trial 0 finished with value: 0.7177182539682538 and parameters: {'alpha': 3.029271990850269, 'normalize': True, 'copy_X': False, 'class_weight': 'balanced', 'solver': 'svd'}. Best is trial 0 with value: 0.7177182539682538.


CPU times: user 43.1 s, sys: 55.8 s, total: 1min 38s
Wall time: 6.57 s
