In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn import svm, linear_model, ensemble
from sklearn import model_selection
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings("ignore")

In [2]:
from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

In [3]:
labels = pd.read_csv("lable(3.1).csv", index_col=0 )
data = pd.read_csv("mibig(3.1).csv", index_col=0 )

In [4]:
def get_sample_responses(vectors, responses):
    samples = vectors
    index = list(set(samples.index).intersection(set(labels.index)))
    responses = responses.loc[index]
    samples = samples.loc[index]
    return samples, responses

In [5]:
X , y = get_sample_responses(data, labels)
bgc_ids = pd.Series(y.index)

In [6]:
def objective(param):
    
    clf = svm.NuSVC(**param,probability = True)
    
    acc_kf = []
    
    cv = KFold(n_splits=5,random_state=0,shuffle=True)
    for i, (id_train, id_val) in enumerate(cv.split(bgc_ids)):
        train_ids, val_ids = bgc_ids[id_train], bgc_ids[id_val]
        X_train, X_val = X.loc[train_ids].values, X.loc[val_ids].values
        y_train, y_val = y.loc[train_ids].values, y.loc[val_ids].values
        #print('Train:', len(X_train), 'Test:', len(X_val))
    
        clf.fit(X_train, y_train)
        pred = clf.predict(X_val)
    
        acc = accuracy_score(pred, y_val)
        acc_kf.append(acc)
        
    acc_kf = np.array(acc_kf).mean()
    
    return {'loss': -acc_kf, 'status': STATUS_OK}

In [7]:
space = {
    'nu': hp.uniform('nu', 0.0, 0.5),
    'kernel': hp.choice('kernel', ['poly','rbf','sigmoid']),
    'shrinking': hp.choice('shrinking', [False, True]),
    'class_weight': hp.choice('class_weight', ['balanced',None])
}

In [8]:
trials = Trials()

In [9]:
%%time
best = fmin(
    fn=objective,            
    space=space,             
    algo=tpe.suggest,        
    max_evals=1000,           
    trials=trials            
)

100%|████████████████████████████████████████| 1000/1000 [22:47:40<00:00, 82.06s/trial, best loss: -0.7301785714285716]
CPU times: total: 14h 11min 41s
Wall time: 22h 47min 40s


In [10]:
best_params = {
    'nu': best['nu'] + 0.0, 
    'kernel': ['poly','rbf','sigmoid'][best['kernel']],  
    'shrinking': [False, True][best['shrinking']], 
    'class_weight': ['balanced',None][best['class_weight']],   
}

In [11]:
print("Best parameters found: ", best_params)

Best parameters found:  {'nu': 0.39791815898408434, 'kernel': 'rbf', 'shrinking': False, 'class_weight': 'balanced'}
