In [27]:
import utils.feature_extractors as utils
from utils.evaluation import action_evaluator
import numpy as np
import os
from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeClassifier
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_predict
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from xgboost import XGBClassifier
from scipy import stats
from sklearn.svm import SVC
import joblib

In [2]:
# Notebook-wide settings: input data files, model directories and the random seed.
config = dict(
    train_pos_loc="data/TR_pos_SPIDER.txt",
    train_neg_loc="data/TR_neg_SPIDER.txt",
    test_pos_loc="data/TS_pos_SPIDER.txt",
    test_neg_loc="data/TS_neg_SPIDER.txt",
    model_save_loc="optimized",
    ensemble_loc="ensemble",
    random_seed=9,
)

In [3]:
class SpiderDataGenerator(object):
    """Read a positive and a negative data file and pre-compute every feature block.

    All feature extraction happens eagerly in ``__init__``; the results are
    stored as ``feat_<NAME>`` attributes, one per entry of ``ALL_FEAT``.

    Attributes:
        data: records of the positive file followed by records of the negative file.
        targets: boolean array aligned with ``data`` (True for positive records).
        raw: element ``[1]`` of every record (presumably the sequence string --
            TODO confirm against ``utils.read_fasta``).
    """

    # Canonical feature names; this order also fixes the column order used by
    # get_combination_feature().
    ALL_FEAT = ["AAC","DPC","CTD",
                "PAAC","APAAC","RSacid",
                "RSpolar","RSsecond","RScharge",
                "RSDHP"]

    def __init__(self, pos_data_file, neg_data_file,feat_type=None) -> None:
        """Load both files and compute all feature blocks.

        Args:
            pos_data_file: path passed to ``utils.read_fasta`` for the positive class.
            neg_data_file: path passed to ``utils.read_fasta`` for the negative class.
            feat_type: optional name from ``ALL_FEAT``; only validated and stored.

        Raises:
            AssertionError: if ``feat_type`` is neither None nor in ``ALL_FEAT``.
        """
        # The previous one-argument form, super(SpiderDataGenerator), created an
        # unbound super object and never reached the parent initializer.
        super().__init__()
        self.pos_data_file = pos_data_file
        self.neg_data_file = neg_data_file

        # Kept as an assert so the exception type seen by callers is unchanged.
        assert feat_type is None or feat_type in SpiderDataGenerator.ALL_FEAT, \
            "feat_type must be None or one of %s" % SpiderDataGenerator.ALL_FEAT

        self.feat_type = feat_type

        self.pos_data = utils.read_fasta(self.pos_data_file)
        self.neg_data = utils.read_fasta(self.neg_data_file)

        # Positives first, then negatives; targets follow the same order.
        self.data = self.pos_data+self.neg_data
        self.targets = np.array([True]*len(self.pos_data)+[False]*len(self.neg_data))

        self.raw = [x[1] for x in self.data]

        # Each progress message is printed before its feature is computed, so a
        # failure can be attributed to the feature named in the last message.
        print("Generating AAC Feature .....")
        self.feat_AAC = utils.AAC(self.data)[0]
        print("Generating DPC Feature .....")
        self.feat_DPC = utils.DPC(self.data,0)[0]
        print("Generating CTD Feature .....")
        self.feat_CTD = np.hstack((utils.CTDC(self.data)[0],
                              utils.CTDD(self.data)[0],
                              utils.CTDT(self.data)[0]))
        print("Generating PAAC Feature .....")
        self.feat_PAAC = utils.PAAC(self.data,1)[0]
        print("Generating APAAC Feature .....")
        self.feat_APAAC = utils.APAAC(self.data,1)[0]
        print("Generating reducedACID Feature .....")
        self.feat_RSacid = utils.reducedACID(self.data)
        print("Generating reducedPOLAR Feature .....")
        self.feat_RSpolar = utils.reducedPOLAR(self.data)
        print("Generating reducedSECOND Feature .....")
        self.feat_RSsecond = utils.reducedSECOND(self.data)
        print("Generating reducedCHARGE Feature .....")
        self.feat_RScharge = utils.reducedCHARGE(self.data)
        print("Generating reducedDHP Feature .....")
        self.feat_RSDHP = utils.reducedDHP(self.data)

    def get_combination_feature(self,selected:list = None):
        """Concatenate feature blocks along the last axis.

        Args:
            selected: feature names from ``ALL_FEAT``. ``None`` or an empty list
                selects every feature block.

        Returns:
            The selected blocks joined with ``np.concatenate(..., axis=-1)``.
            Blocks are always laid out in ``ALL_FEAT`` order, regardless of the
            order in which the names appear in ``selected``.

        Raises:
            ValueError: if ``selected`` contains a name that is not in ``ALL_FEAT``.
        """
        known = SpiderDataGenerator.ALL_FEAT

        if selected:
            unknown = [name for name in selected if name not in known]
            if unknown:
                raise ValueError(
                    "Unknown feature type(s) %s; expected names from %s" % (unknown, known))
            # Canonical ordering keeps the column layout independent of how the
            # caller happened to list the names.
            names = sorted(selected, key=known.index)
        else:
            names = known

        return np.concatenate([getattr(self, "feat_" + name) for name in names],axis=-1)

    def __len__(self) -> int:
        """Return the total number of records (positive plus negative)."""
        return len(self.data)

In [4]:
# Compute every feature block for the files configured under test_pos_loc / test_neg_loc.
test_data = SpiderDataGenerator(
    pos_data_file=config["test_pos_loc"],
    neg_data_file=config["test_neg_loc"],
)

Generating AAC Feature .....
Generating DPC Feature .....
Generating CTD Feature .....
Generating PAAC Feature .....
Generating APAAC Feature .....
Generating reducedACID Feature .....
Generating reducedPOLAR Feature .....
Generating reducedSECOND Feature .....
Generating reducedCHARGE Feature .....
Generating reducedDHP Feature .....


In [5]:
# Directories from which the saved ensemble model and the saved single model are loaded.
# Built with os.path.join: a backslash inside a normal string literal ("\D", "\R", "\S")
# is an invalid escape sequence and only acts as a path separator on Windows.
ensemble_model_dir = os.path.join(
    "ensemble",
    "DPC_RSDHP_RSacid_RSpolar_RSsecond_RScharge",
    "RandomForestClassifier",
)
single_model_dir = os.path.join(
    "model_saves",
    "SVC_DPC_RSDHP_RSacid_RSpolar_RSsecond_RScharge",
)

In [6]:
# Test-set feature matrices keyed by feature name, plus one combined matrix
# keyed by the "_"-joined names of the blocks it is built from.
combined_feature_names = ["DPC","RSDHP","RSacid","RSpolar","RSsecond","RScharge"]

X_test = {
    name: getattr(test_data, "feat_" + name)
    for name in SpiderDataGenerator.ALL_FEAT
}
X_test["_".join(combined_feature_names)] = test_data.get_combination_feature(combined_feature_names)

# Name of the saved model sub-directory to use for each feature type.
# Every entry starts as "SVC"; the exceptions are overridden below.
sel_model = dict.fromkeys(
    ["AAC", "DPC", "CTD", "PAAC", "APAAC",
     "RSacid", "RSpolar", "RSsecond", "RScharge", "RSDHP",
     "Combine"],
    "SVC",
)
sel_model["CTD"] = "LGBMClassifier"
sel_model["PAAC"] = "RandomForest"
sel_model["Combine"] = "LGBMClassifier"

## Single Model

In [7]:
# Single-model predictions: apply the saved preprocessing pipeline to the combined
# feature matrix, then predict with the saved classifier.
# NOTE: joblib.load unpickles the file -- only load model files from a trusted source.
y = test_data.targets
single_pipeline = joblib.load(os.path.join(single_model_dir, "pipeline.sav"))
X = single_pipeline.transform(X_test["DPC_RSDHP_RSacid_RSpolar_RSsecond_RScharge"])
single_clf = joblib.load(os.path.join(single_model_dir, "model_save.sav"))
ypred_single_model = single_clf.predict(X)

## Ensemble model

In [8]:
# Ensemble predictions: every base model contributes one column of decision-function
# scores; the saved ensemble model predicts from the stacked columns.
# NOTE: joblib.load unpickles the file -- only load model files from a trusted source.
base_feature_types = "DPC_RSDHP_RSacid_RSpolar_RSsecond_RScharge".split("_")

score_columns = []
for feat_type in base_feature_types:
    # Saved artefacts live under <model_save_loc>/<feature type>/<selected model>/
    model_dir = os.path.join(config["model_save_loc"], feat_type, sel_model[feat_type])
    pipeline = joblib.load(os.path.join(model_dir, "pipeline.sav"))
    clf = joblib.load(os.path.join(model_dir, "model_save.sav"))

    X = pipeline.transform(X_test[feat_type])
    y_pred = clf.decision_function(X).reshape(-1, 1)
    score_columns.append(y_pred)

# Despite the name, these are decision-function scores, one column per base model.
probabilities = np.concatenate(score_columns, axis=-1)

ensemble = joblib.load(os.path.join(ensemble_model_dir, "model_save.sav"))
ypred_ensemble_model = ensemble.predict(probabilities)

In [13]:
# One row per test sample: the true label, both models' predictions, and a
# per-model flag telling whether the prediction matches the label.
significance_test_model_pred = pd.DataFrame({
    "target": y,
    "single_model": ypred_single_model,
    "ensemble_model": ypred_ensemble_model,
})

for model_name in ("single_model", "ensemble_model"):
    significance_test_model_pred[model_name + "_correct"] = (
        significance_test_model_pred["target"] == significance_test_model_pred[model_name]
    )

In [17]:
# Preview the first ten rows of the per-sample correctness table.
significance_test_model_pred.head(10)

Unnamed: 0,target,single_model,ensemble_model,single_model_correct,ensemble_model_correct
0,True,True,True,True,True
1,True,True,True,True,True
2,True,False,False,False,False
3,True,True,True,True,True
4,True,True,True,True,True
5,True,True,True,True,True
6,True,True,True,True,True
7,True,False,False,False,False
8,True,False,False,False,False
9,True,True,True,True,True


In [22]:
# 2x2 contingency table of per-sample correctness (rows: single model,
# columns: ensemble model). Counted with vectorized boolean masks selected by
# column name, so the result does not depend on the position or number of
# columns in significance_test_model_pred.
single_ok = significance_test_model_pred["single_model_correct"].astype(bool)
ensemble_ok = significance_test_model_pred["ensemble_model_correct"].astype(bool)

contigency_table = pd.DataFrame(
    data=[
        [int((single_ok & ensemble_ok).sum()), int((single_ok & ~ensemble_ok).sum())],
        [int((~single_ok & ensemble_ok).sum()), int((~single_ok & ~ensemble_ok).sum())],
    ],
    columns=["ensemble_correct","ensemble_incorrect"],
    index=["single_correct","single_incorrect"])

In [23]:
# Display the 2x2 table; the off-diagonal cells are the cases where exactly one model is correct.
contigency_table

Unnamed: 0,ensemble_correct,ensemble_incorrect
single_correct,379,21
single_incorrect,5,56


In [24]:
# McNemar-style chi-squared statistic (no continuity correction), computed from
# the two discordant cells: (b - c)^2 / (b + c).
only_single_correct = contigency_table.loc["single_correct","ensemble_incorrect"]
only_ensemble_correct = contigency_table.loc["single_incorrect","ensemble_correct"]
statistic = (only_single_correct - only_ensemble_correct)**2 / (only_single_correct + only_ensemble_correct)

In [29]:
# Upper-tail probability of a chi-squared distribution with 1 degree of freedom.
# The survival function sf(x) = 1 - cdf(x) is used directly because subtracting
# the cdf from 1 loses precision when the p-value is very small.
print("X^2 :- ",statistic)
p_value = stats.chi2.sf(statistic, 1)
print("P Value :- ",p_value)

X^2 :-  9.846153846153847
P Value :-  0.001701872092673029


### Therefore, the null hypothesis that the difference in classification accuracy between the two classifiers is insignificant is rejected, as the P value < 0.01 (taking the significance level as 0.01).

Thus the single classifier has better accuracy. This may be because the ensemble model overfits the training data, since each baseline classifier's best results hover close to 0.9 accuracy. The ensemble model may therefore be modelling the noise in the training data rather than the trend.