In [1]:
import pandas as pd, numpy as np
import sqlite3, nltk, string, math, pickle, sys

from sklearn.model_selection import KFold
from sklearn import svm, linear_model, ensemble
from sklearn.model_selection import cross_val_score
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from mlxtend.classifier import StackingClassifier

from pycm import *

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the label table and the feature table from CSV; the first column of
# each file becomes the row index. Both frames are later passed to
# get_sample_responses(data, labels), which aligns them on that index, and the
# index values are used downstream as `bgc_ids`.
labels = pd.read_csv("lable(3.1).csv", index_col=0 )
data = pd.read_csv("mibig(3.1).csv", index_col=0 )

In [3]:
def get_sample_responses(vectors, responses):
    """Align a feature table and a response table on their shared index.

    Parameters
    ----------
    vectors : pandas.DataFrame
        Feature rows, indexed by sample identifier.
    responses : pandas.DataFrame
        Response (label) rows, indexed by sample identifier.

    Returns
    -------
    (samples, responses) : tuple of pandas.DataFrame
        Both inputs restricted to the identifiers present in *both* indexes,
        in the same row order (the order in which they appear in ``vectors``).
    """
    # Intersect against the `responses` argument rather than a notebook-level
    # global, so the function works for any pair of tables it is given.
    # Index.intersection(sort=False) keeps a deterministic order, unlike a
    # Python set, so seeded cross-validation splits stay reproducible.
    shared_index = vectors.index.intersection(responses.index, sort=False)
    return vectors.loc[shared_index], responses.loc[shared_index]

In [4]:
# Base estimators, each wrapped in CalibratedClassifierCV with sigmoid
# calibration. The calibrated instances (c_rf, c_ets, c_nusvm, c_ridge) are the
# module-level building blocks that stacking_layer passes to StackingClassifier
# as base classifiers and meta classifiers.
# NOTE(review): none of the estimators is given a random_state, so results
# differ from run to run.
rf = ensemble.RandomForestClassifier()
c_rf = CalibratedClassifierCV(rf, method='sigmoid')

ets = ensemble.ExtraTreesClassifier()
c_ets = CalibratedClassifierCV(ets, method='sigmoid')

# NuSVC is created with probability=True and then calibrated again by the wrapper.
nusvm = svm.NuSVC(probability=True)
c_nusvm = CalibratedClassifierCV(nusvm, method='sigmoid')

ridge = linear_model.RidgeClassifier()
c_ridge = CalibratedClassifierCV(ridge, method='sigmoid')

In [5]:
class stacking_layer:
    """One level of the cascade: four stacked ensembles fitted on the same data.

    ``classifier0``/``classifier1`` stack (c_rf, c_ets, c_nusvm) under a
    calibrated ridge meta classifier; ``classifier2``/``classifier3`` stack
    (c_rf, c_ets, c_ridge) under a calibrated NuSVC meta classifier.

    NOTE(review): the same module-level estimator instances are handed to all
    four StackingClassifier objects; this relies on StackingClassifier cloning
    its estimators before fitting (mlxtend's ``use_clones`` default) — confirm
    against the installed mlxtend version.

    Parameters
    ----------
    index : int
        Position of this level inside its cascade (stored as ``ind_layer``).
    """

    def __init__(self, index):
        self.ind_layer = index
        self.classifier0 = StackingClassifier(classifiers=[c_rf, c_ets, c_nusvm], meta_classifier=c_ridge)
        self.classifier1 = StackingClassifier(classifiers=[c_rf, c_ets, c_nusvm], meta_classifier=c_ridge)
        self.classifier2 = StackingClassifier(classifiers=[c_rf, c_ets, c_ridge], meta_classifier=c_nusvm)
        self.classifier3 = StackingClassifier(classifiers=[c_rf, c_ets, c_ridge], meta_classifier=c_nusvm)

    def _members(self):
        """Return the four stacked classifiers in a fixed order."""
        return (self.classifier0, self.classifier1, self.classifier2, self.classifier3)

    def fit(self, X, y):
        """Fit every stacked classifier on ``(X, y)``."""
        for member in self._members():
            member.fit(X, y)
        # Remember the sorted label values so predict() can translate the
        # winning probability column back into an actual class label.
        self.classes_ = np.unique(y)

    def predict(self, X):
        """Return the class label with the highest mean probability.

        The per-class probabilities of the four stacked classifiers are
        averaged, and the arg-max column is mapped through ``classes_`` so the
        result is a label from ``y`` (not merely a column position, which is
        only correct when the labels happen to be 0..k-1).
        """
        members = self._members()
        mean_prob = sum(member.predict_proba(X) for member in members) / len(members)
        return np.asarray(self.classes_)[np.argmax(mean_prob, axis=1)]

    def predict_proba(self, X):
        """Return the four classifiers' probability matrices side by side.

        The result has ``4 * n_classes`` columns and is used by the cascade as
        extra input features for the next level; rows do not sum to 1.
        """
        return np.column_stack([member.predict_proba(X) for member in self._members()])

In [6]:
class cascade_layer:
    """A cascade of ``num_layer`` stacking_layer levels.

    Level 0 sees the raw features; every later level sees the raw features
    column-stacked with the previous level's ``predict_proba`` output.

    Parameters
    ----------
    num_layer : int
        Number of stacking levels to build when ``fit`` is called.
    """

    def __init__(self, num_layer):
        self.layers = []
        self.num_layer = num_layer

    def fit(self, X, y):
        """Build and fit the cascade on ``(X, y)``.

        The level list is rebuilt on every call, so refitting replaces the
        previous cascade instead of appending ``num_layer`` more levels to it.
        """
        self.layers = [stacking_layer(depth) for depth in range(self.num_layer)]
        last = len(self.layers) - 1
        level_input = X
        for depth, level in enumerate(self.layers):
            level.fit(level_input, y)
            # Only levels that feed a successor need their outputs computed.
            if depth < last:
                level_input = np.column_stack((X, level.predict_proba(level_input)))

    def predict(self, X):
        """Return the final level's predictions for ``X``.

        Raises
        ------
        RuntimeError
            If the cascade has no fitted levels.
        """
        if not self.layers:
            raise RuntimeError("cascade_layer has no fitted layers; call fit() first")
        level_input = X
        # Intermediate levels only contribute features for the next level.
        for level in self.layers[:-1]:
            level_input = np.column_stack((X, level.predict_proba(level_input)))
        return self.layers[-1].predict(level_input)

In [7]:
class csel_model(object):
    """Cascade model whose depth is chosen on an internal hold-out split.

    Cascades of depth 1, 2, 3, ... are trained on 80% of the data and scored on
    the remaining 20%; growth stops at the first depth that does not improve
    hold-out accuracy.

    Parameters
    ----------
    max_layer : int, default 100
        Largest cascade depth that may be tried (inclusive).
    random_state : int, RandomState or None, default None
        Forwarded to ``train_test_split`` for the internal hold-out split.
        ``None`` keeps the unseeded behaviour.

    Attributes
    ----------
    model : list
        Cascades that improved hold-out accuracy during the latest ``fit``,
        ordered by depth; the last entry is the one used by ``predict``.
    """

    def __init__(self, max_layer=100, random_state=None):
        self.max_layer = max_layer
        self.random_state = random_state
        self.model = []

    def fit(self, X, y):
        """Grow the cascade on ``(X, y)`` until hold-out accuracy stops improving."""
        # Start from a clean slate: cascades left over from an earlier fit()
        # must never be served by predict() (they were trained on other data).
        self.model = []
        # Below any achievable accuracy, so the depth-1 cascade is always kept
        # even when it scores 0.0.
        best_acc = -1.0
        layer = 1

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.random_state)

        print('================================')
        print('Model starts...')
        print('--------------------------------')

        # `<=` so that max_layer is an inclusive bound (max_layer=1 trains one cascade).
        while layer <= self.max_layer:
            candidate = cascade_layer(layer)
            candidate.fit(X_train, y_train)
            temp_acc = accuracy_score(y_test, candidate.predict(X_test))

            print("layer : {} | acc : {:.6f}".format(layer, temp_acc))

            if temp_acc <= best_acc:
                # No improvement: discard this candidate and stop growing.
                break

            best_acc = temp_acc
            self.model.append(candidate)
            layer = layer + 1
        print('================================')

    def predict(self, X):
        """Predict with the best (deepest retained) cascade.

        Raises
        ------
        RuntimeError
            If ``fit`` has not produced any cascade yet.
        """
        if not self.model:
            raise RuntimeError("csel_model is not fitted; call fit() first")
        return self.model[-1].predict(X)

In [8]:
%%time

# Repeated 5-fold cross-validation of csel_model: 10 repetitions, each seeded
# with its repetition number. Per-fold pycm statistics are averaged within a
# repetition, and one row per repetition is written to the output CSV.
X , y = get_sample_responses(data, labels)
bgc_ids = pd.Series(y.index)

metrics = []
for r in range(10):
    all_summary = []
    class_summary = []
    cv = KFold(n_splits=5,random_state=r,shuffle=True)
    for i, (id_train, id_val) in enumerate(cv.split(bgc_ids)):
        train_ids, val_ids = bgc_ids[id_train], bgc_ids[id_val]
        X_train, X_val = X.loc[train_ids].values, X.loc[val_ids].values
        y_train, y_val = y.loc[train_ids].values, y.loc[val_ids].values
        print('Train:', len(X_train), 'Test:', len(X_val))

        # Use a fresh model for every fold. A single shared csel_model keeps
        # the cascades fitted on earlier folds in `model.model`, so later
        # folds could be scored by a cascade trained on another fold's
        # training data (which contains this fold's validation rows).
        model = csel_model()
        model.fit(X_train, y_train)

        y_pred = model.predict(X_val)
        y_true = y_val.flatten()

        cm = ConfusionMatrix(actual_vector=y_true, predict_vector=y_pred)

        # Row 1 of the frame built from the overall statistics dict.
        all_columns = pd.DataFrame(cm.overall_stat).columns
        all_ = pd.DataFrame(cm.overall_stat)[1:2].values
        all_summary.append(all_[0])

        # Row 1 of the per-class statistics frame.
        # NOTE(review): presumably the row for the second class — confirm.
        class_columns = pd.DataFrame(cm.class_stat).columns
        class_ = pd.DataFrame(cm.class_stat)[1:2].values
        class_summary.append(class_[0])

    # Average the five folds of this repetition.
    all_stat = pd.DataFrame(all_summary,columns=[all_columns]).mean()
    all_stat = all_stat.T
    class_stat = pd.DataFrame(class_summary,columns=[class_columns]).mean()
    class_stat = class_stat.T

    metric = pd.concat([all_stat, class_stat], axis=0)

    metrics.append(metric.values)

metrics = pd.DataFrame(metrics,columns=[metric.index.get_level_values(level=0)])

metrics.to_csv("metrics_CSEL_unHPO.csv")

Train: 898 Test: 225
Model starts...
--------------------------------
layer : 1 | acc : 0.672222
layer : 2 | acc : 0.672222
Train: 898 Test: 225
Model starts...
--------------------------------
layer : 1 | acc : 0.688889
layer : 2 | acc : 0.677778
Train: 898 Test: 225
Model starts...
--------------------------------
layer : 1 | acc : 0.688889
layer : 2 | acc : 0.688889
Train: 899 Test: 224
Model starts...
--------------------------------
layer : 1 | acc : 0.744444
layer : 2 | acc : 0.738889
Train: 899 Test: 224
Model starts...
--------------------------------
layer : 1 | acc : 0.666667
layer : 2 | acc : 0.683333
layer : 3 | acc : 0.672222
Train: 898 Test: 225
Model starts...
--------------------------------
layer : 1 | acc : 0.711111
layer : 2 | acc : 0.711111
Train: 898 Test: 225
Model starts...
--------------------------------
layer : 1 | acc : 0.683333
layer : 2 | acc : 0.683333
Train: 898 Test: 225
Model starts...
--------------------------------
layer : 1 | acc : 0.750000
layer : 