In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import svm, linear_model, ensemble
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK, space_eval

In [3]:
# Both tables use their first CSV column as the row index, which is what the
# later index-based alignment relies on.
data = pd.read_csv("mibig(antismash).csv", index_col=0)
labels = pd.read_csv("antibacterial_lable.csv", index_col=0)

In [4]:
def get_sample_responses(vectors, responses):
    """Align a feature table and a response table on their shared index labels.

    Parameters
    ----------
    vectors : pandas.DataFrame
        Feature rows, indexed by sample id.
    responses : pandas.DataFrame or pandas.Series
        Response values, indexed by sample id.

    Returns
    -------
    tuple
        ``(samples, responses)`` restricted to the ids present in both inputs,
        in the same row order (the order in which the ids appear in
        ``vectors``).
    """
    # Intersect with the `responses` argument (not a notebook global) and keep
    # the order of `vectors`: a set-based intersection has no stable order for
    # string ids, which would make downstream CV splits vary between sessions.
    shared_ids = vectors.index.intersection(responses.index, sort=False)
    samples = vectors.loc[shared_ids]
    responses = responses.loc[shared_ids]
    return samples, responses

In [5]:
# Restrict features and labels to the ids they have in common.
X, y = get_sample_responses(vectors=data, responses=labels)

# Position -> id lookup, so the splitter's integer positions can be mapped
# back to index labels.
bgc_ids = pd.Series(y.index)

In [6]:
def objective(param):
    """Hyperopt objective: negative mean 5-fold CV accuracy of a random forest.

    Parameters
    ----------
    param : dict
        Keyword arguments forwarded to ``RandomForestClassifier``. A
        ``random_state`` entry, if present, overrides the fixed default seed.

    Returns
    -------
    dict
        ``{'loss': -mean_accuracy, 'status': STATUS_OK}``. The accuracy is
        negated so that a lower loss means a better model.

    Notes
    -----
    Reads the notebook-level ``X``, ``y`` and ``bgc_ids``.
    """
    # Fixed seed: identical parameters must give an identical loss, otherwise
    # the search compares candidates that differ only by forest randomness.
    rf_kwargs = {'random_state': 0, **param}
    clf = ensemble.RandomForestClassifier(**rf_kwargs, bootstrap=False, n_jobs=-1)

    cv = KFold(n_splits=5, random_state=0, shuffle=True)
    fold_acc = []
    for id_train, id_val in cv.split(bgc_ids):
        train_ids, val_ids = bgc_ids[id_train], bgc_ids[id_val]
        X_train, X_val = X.loc[train_ids].values, X.loc[val_ids].values
        y_train, y_val = y.loc[train_ids].values, y.loc[val_ids].values

        # A single-column label frame produces (n, 1) arrays; flatten them so
        # the classifier receives the 1-D target it expects. Multi-column
        # targets are passed through unchanged.
        if y_train.ndim == 2 and y_train.shape[1] == 1:
            y_train, y_val = y_train.ravel(), y_val.ravel()

        clf.fit(X_train, y_train)
        pred = clf.predict(X_val)

        # Signature order is (y_true, y_pred).
        fold_acc.append(accuracy_score(y_val, pred))

    return {'loss': -float(np.mean(fold_acc)), 'status': STATUS_OK}

In [7]:
# Hyperparameter search space. Every dimension is an `hp.choice`, so the
# optimiser reports the *index* of the selected option, not the value itself.
# NOTE(review): 'auto' appears to be an alias of 'sqrt' for classifiers in
# older scikit-learn and is rejected by newer releases - confirm against the
# installed version before dropping it, and keep any index-based decoding of
# the result in sync with these option lists.
space = dict(
    n_estimators=hp.choice('n_estimators', range(1, 200)),
    criterion=hp.choice('criterion', ['gini','entropy']),
    min_samples_split=hp.choice('min_samples_split', range(2, 50)),
    min_samples_leaf=hp.choice('min_samples_leaf', range(1, 10)),
    max_features=hp.choice('max_features', ['auto','sqrt','log2']),
    class_weight=hp.choice('class_weight', ['balanced','balanced_subsample']),
)

In [8]:
# Container passed to `fmin` below via `trials=`.
trials = Trials()

In [9]:
%%time
# Run 1000 TPE evaluations of `objective` over `space`, logging into `trials`.
# NOTE(review): no `rstate` is passed, so the search itself is unseeded.
search_kwargs = dict(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=1000,
    trials=trials,
)
best = fmin(**search_kwargs)

100%|███████████████████████████████████████████| 1000/1000 [28:15<00:00,  1.70s/trial, best loss: -0.7506309523809525]
CPU times: total: 43min 18s
Wall time: 28min 15s


In [10]:
# `best` holds the chosen option index for each `hp.choice` dimension, not the
# option value. Let hyperopt map the indices back through `space` rather than
# repeating the option lists and range offsets by hand, which would silently
# mis-decode as soon as `space` is edited.
best_params = space_eval(space, best)

In [11]:
# Show the decoded hyperparameter values (not the raw choice indices in `best`).
print("Best parameters found: ", best_params)

Best parameters found:  {'n_estimators': 122, 'criterion': 'gini', 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'log2', 'class_weight': 'balanced_subsample'}
