In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import svm, linear_model, ensemble
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe

In [3]:
# Both tables are read with their first CSV column as the row index; that
# index is what the feature rows and label rows are matched on afterwards.
data = pd.read_csv("mibig(antismash).csv", index_col=0)
labels = pd.read_csv("antibacterial_lable.csv", index_col=0)

In [4]:
def get_sample_responses(vectors, responses):
    """Align a feature table and a response table on their shared index labels.

    Parameters
    ----------
    vectors : pandas.DataFrame or pandas.Series
        Feature rows, indexed by sample ID.
    responses : pandas.DataFrame or pandas.Series
        Response/label rows, indexed by sample ID.

    Returns
    -------
    tuple
        ``(samples, responses)`` restricted to the IDs present in both
        inputs, with rows in the same order in both objects.
    """
    # Intersect with the `responses` argument itself (not a module-level
    # table), and use Index.intersection instead of a Python set so the row
    # order follows `vectors` and is identical on every run.
    shared_ids = vectors.index.intersection(responses.index)
    return vectors.loc[shared_ids], responses.loc[shared_ids]

In [5]:
# Keep only the samples that have both a feature row and a label row.
X, y = get_sample_responses(data, labels)

# IDs of the retained samples as a Series, so the integer positions produced
# by the cross-validation splitter can be translated back into index labels.
bgc_ids = pd.Series(y.index)

In [6]:
def objective(param):
    """Score one RidgeClassifier configuration by 5-fold cross-validation.

    Reads the module-level ``X``, ``y`` and ``bgc_ids`` objects.

    Parameters
    ----------
    param : dict
        Keyword arguments passed straight to ``linear_model.RidgeClassifier``.

    Returns
    -------
    dict
        ``{'loss': -mean_accuracy, 'status': STATUS_OK}``. The mean fold
        accuracy is negated so that a minimiser prefers higher accuracy.
    """
    clf = linear_model.RidgeClassifier(**param)

    # Seeded shuffle: the splitter is configured identically on every call.
    cv = KFold(n_splits=5, random_state=0, shuffle=True)

    fold_accuracies = []
    for id_train, id_val in cv.split(bgc_ids):
        # The splitter yields integer positions; map them to sample IDs first.
        train_ids, val_ids = bgc_ids[id_train], bgc_ids[id_val]
        X_train, X_val = X.loc[train_ids].values, X.loc[val_ids].values
        y_train, y_val = y.loc[train_ids].values, y.loc[val_ids].values

        clf.fit(X_train, y_train)
        pred = clf.predict(X_val)

        # accuracy_score is documented as (y_true, y_pred).
        fold_accuracies.append(accuracy_score(y_val, pred))

    mean_accuracy = np.mean(fold_accuracies)

    return {'loss': -mean_accuracy, 'status': STATUS_OK}

In [7]:
# Hyperparameter search space for RidgeClassifier. Each key is forwarded as a
# keyword argument of the same name by `objective`.
# NOTE(review): `normalize` appears to be accepted only by older scikit-learn
# releases -- confirm against the installed version before re-running.
# NOTE(review): `copy_X` presumably does not change the fitted model; verify
# whether it needs to be searched at all.
space = dict(
    alpha=hp.uniform('alpha', 0.1, 10.0),
    normalize=hp.choice('normalize', [False, True]),
    copy_X=hp.choice('copy_X', [False, True]),
    class_weight=hp.choice('class_weight', ['balanced', None]),
    solver=hp.choice(
        'solver',
        ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
    ),
)

In [8]:
# Container handed to fmin below via `trials=` so the search history stays
# available after the optimisation cell finishes.
trials = Trials()

In [9]:
%%time
# Minimise `objective` over `space` with the TPE algorithm for 1000
# evaluations, recording each one in `trials`.
# NOTE(review): no `rstate` is passed, so the sequence of sampled
# configurations is presumably not reproducible between runs -- confirm.
best = fmin(objective, space, algo=tpe.suggest, max_evals=1000, trials=trials)

100%|█████████████████████████████████████████| 1000/1000 [2:30:05<00:00,  9.01s/trial, best loss: -0.7265753968253968]
CPU times: total: 1h 53min 8s
Wall time: 2h 30min 5s


In [10]:
# `best` stores hp.choice results as positions in the option lists rather than
# the options themselves. Let hyperopt resolve them against `space` instead of
# keeping hand-copied duplicates of every option list here, which would
# silently decode the wrong value if `space` were edited.
best_params = space_eval(space, best)

In [11]:
# Report the decoded hyperparameters of the best trial.
print("Best parameters found: ", best_params)

Best parameters found:  {'alpha': 8.475205712772372, 'normalize': True, 'copy_X': True, 'class_weight': None, 'solver': 'sag'}
