In [19]:
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import utils.feature_extractors as utils
import optuna
import joblib
import pandas as pd

from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from utils.utils import evaluate_classification
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier
from scipy import stats
from sklearn.metrics import accuracy_score


In [20]:
# Central notebook settings: dataset file locations, model output directories
# and the random seed.
config_map = {
    # FASTA-style sequence files for the training split.
    # NOTE: "postive" is misspelled, but it is a lookup key, so the spelling is kept.
    "train_postive_location": "dataset/TR_pos_SPIDER.txt",
    "train_negative_location": "dataset/TR_neg_SPIDER.txt",
    # Sequence files for the held-out test split.
    "test_positive_location": "dataset/TS_pos_SPIDER.txt",
    "test_negative_location": "dataset/TS_neg_SPIDER.txt",
    # Directories where fitted models are stored.
    "model_save_location": "./optimized_models",
    "feat_combo_model_save_location": "./feature_combo_models",
    "final_model_save_location": "./final_model",
    # Seed for any stochastic step.
    "random_seed": 9,
}

In [21]:
class ProteinFeatureGenerator:
    """Load positive/negative protein records and precompute their feature matrices.

    Both input files are parsed with ``utils.read_fasta`` and concatenated with
    the positive records first. All five descriptors listed in
    ``SELECTED_FEATURES`` are extracted eagerly in the constructor and stored
    as ``<name>_feature`` attributes.

    Attributes:
        feature_type: Optional feature name given by the caller; it is only
            validated and stored here.
        data: Parsed records, positives followed by negatives.
        targets: Boolean label array, ``True`` for every positive record.
        raw_sequences: Element ``[1]`` of each record (presumably the sequence
            string; confirm against ``utils.read_fasta``).
    """

    SELECTED_FEATURES = ["AAC", "DPC", "RScharge", "RSDHP", "RSpolar"]

    def __init__(self, positive_data_file: str, negative_data_file: str, feature_type: str = None) -> None:
        """Read both files and extract every feature.

        Args:
            positive_data_file: Path to the file with positive samples.
            negative_data_file: Path to the file with negative samples.
            feature_type: ``None`` or one of ``SELECTED_FEATURES``.

        Raises:
            AssertionError: If ``feature_type`` is neither ``None`` nor a
                known feature name.
        """
        super().__init__()

        # Only None or a known feature name is accepted.
        known_names = ProteinFeatureGenerator.SELECTED_FEATURES
        assert feature_type is None or feature_type in known_names
        self.feature_type = feature_type

        # Remember where the records came from, then parse them.
        self.positive_data_file = positive_data_file
        self.negative_data_file = negative_data_file
        self.positive_data = utils.read_fasta(self.positive_data_file)
        self.negative_data = utils.read_fasta(self.negative_data_file)

        # Positives come first, so the label vector follows the same layout.
        self.data = self.positive_data + self.negative_data
        n_positive = len(self.positive_data)
        n_negative = len(self.negative_data)
        self.targets = np.array([True] * n_positive + [False] * n_negative)

        self.raw_sequences = [record[1] for record in self.data]

        # Feature extraction. AAC and DPC return a sequence whose first item
        # is kept; the reduced-alphabet extractors are used as returned.
        print("Extracting AAC Feature ...")
        aac_result = utils.AAC(self.data)
        self.AAC_feature = aac_result[0]

        print("Extracting DPC Feature ...")
        dpc_result = utils.DPC(self.data, 0)
        self.DPC_feature = dpc_result[0]

        print("Extracting RScharge Feature ...")
        self.RScharge_feature = utils.reducedCHARGE(self.data)

        print("Extracting RSDHP Feature ...")
        self.RSDHP_feature = utils.reducedDHP(self.data)

        print("Extracting RSpolar Feature ...")
        self.RSpolar_feature = utils.reducedPOLAR(self.data)

    def get_feat_combo(self, selected: list = None):
        """Concatenate feature matrices along the last axis.

        Args:
            selected: Names taken from ``SELECTED_FEATURES``. When omitted or
                empty, every feature is used. The order of the names is
                ignored: blocks are always laid out in ``SELECTED_FEATURES``
                order.

        Returns:
            The result of ``np.concatenate`` over the chosen matrices with
            ``axis=-1``.

        Raises:
            ValueError: If a requested name is not in ``SELECTED_FEATURES``.
        """
        # Same order as SELECTED_FEATURES, so positions line up with names.
        matrices = (
            self.AAC_feature,
            self.DPC_feature,
            self.RScharge_feature,
            self.RSDHP_feature,
            self.RSpolar_feature,
        )

        if not selected:
            chosen = list(matrices)
        else:
            names = ProteinFeatureGenerator.SELECTED_FEATURES
            positions = [names.index(name) for name in selected]
            positions.sort()
            chosen = [matrices[position] for position in positions]

        return np.concatenate(chosen, axis=-1)

    def __len__(self) -> int:
        """Return the total number of records (positives plus negatives)."""
        return len(self.data)

In [22]:
# Parse the held-out test split and extract all of its feature matrices
# (the constructor prints one progress line per feature).
test_data = ProteinFeatureGenerator(
    positive_data_file=config_map["test_positive_location"],
    negative_data_file=config_map["test_negative_location"],
)

Extracting AAC Feature ...
Extracting DPC Feature ...
Extracting RScharge Feature ...
Extracting RSDHP Feature ...
Extracting RSpolar Feature ...


In [23]:
# Test-set feature matrices keyed by the feature (combination) name that the
# matching saved model was trained on.
X_test = {}
X_test["AAC-DPC-RScharge-RSDHP-RSpolar"] = test_data.get_feat_combo(
    ["AAC", "DPC", "RScharge", "RSDHP", "RSpolar"]
)
X_test["AAC"] = test_data.AAC_feature

In [24]:
# Directories of the two saved models being compared. They are assembled with
# os.path.join so the separator is correct on every OS; hard-coded backslashes
# only resolve on Windows and form invalid escape sequences in a normal string
# literal.
single_model_dir = os.path.join("optimized_models", "AAC", "SVC")
combine_model_dir = os.path.join("final_model", "AAC-DPC-RScharge-RSDHP-RSpolar", "SVC")

In [25]:
def predict_with_saved_model(model_dir, features):
    """Score ``features`` with the pipeline and classifier stored in ``model_dir``.

    Args:
        model_dir: Directory containing ``pipeline.sav`` and ``model.sav``.
        features: Feature matrix to be transformed and classified.

    Returns:
        The output of the loaded classifier's ``predict`` on the transformed
        features.
    """
    # NOTE(security): joblib.load unpickles arbitrary objects, so only load
    # artifacts that come from a trusted source.
    preprocessing = joblib.load(os.path.join(model_dir, "pipeline.sav"))
    classifier = joblib.load(os.path.join(model_dir, "model.sav"))
    return classifier.predict(preprocessing.transform(features))


feature_type = "AAC"
feature_combo = "AAC-DPC-RScharge-RSDHP-RSpolar"

# Both models are evaluated against the same test labels.
y = test_data.targets

y_pred_single_model = predict_with_saved_model(single_model_dir, X_test[feature_type])
y_pred_combined_model = predict_with_saved_model(combine_model_dir, X_test[feature_combo])

## McNemar's Test

McNemar's test compares two classifiers that were evaluated on the same test samples of a binary classification task. Each sample is recorded as correctly or incorrectly classified by each model, and these outcomes are summarised in a 2×2 contingency table; only the discordant cells (samples that exactly one of the two models classifies correctly) carry information about a difference in error rates. If the p-value is below 0.05, we conclude that the two classifiers have significantly different error rates (equivalently, accuracies).

The null hypothesis of McNemar's test is that the two models have the same accuracy, while the alternative hypothesis is that their accuracies differ. Let $a$ and $b$ be the numbers of discordant pairs (instances classified correctly by one model but incorrectly by the other). The test statistic is $\chi^2 = \frac{(b - a)^2}{a + b}$, and the p-value is obtained from a chi-squared distribution with 1 degree of freedom.

In [32]:
# Put the test labels and both models' predictions side by side.
data_dict = {
    "target": y,
    "single_model_predictions": y_pred_single_model,
    "combined_model_predictions": y_pred_combined_model
}
significance_test_df = pd.DataFrame(data_dict)

# Flag, per sample, whether each model predicted the label correctly.
significance_test_df["single_model_correct"] = (
    significance_test_df["target"] == significance_test_df["single_model_predictions"]
)
significance_test_df["combined_model_correct"] = (
    significance_test_df["target"] == significance_test_df["combined_model_predictions"]
)

# 2x2 table of correct/incorrect outcomes. Reindexing pins both axes to
# [False, True], so the positional lookups below stay valid even when a model
# is always right or always wrong and crosstab would drop that row/column.
outcomes = [False, True]
contingency_table = pd.crosstab(
    significance_test_df["single_model_correct"],
    significance_test_df["combined_model_correct"],
).reindex(index=outcomes, columns=outcomes, fill_value=0)
print(contingency_table)

# Discordant pairs are the only cells that enter McNemar's statistic.
a = int(contingency_table.iloc[0, 1])  # single model wrong, combined model right
b = int(contingency_table.iloc[1, 0])  # single model right, combined model wrong

if a + b == 0:
    # No discordant pairs: the statistic would be 0/0. The two models made
    # their errors on exactly the same samples, so report no difference.
    mcnemar_statistic = 0.0
    p_value = 1.0
else:
    mcnemar_statistic = ((b - a) ** 2) / (b + a)
    # The survival function is the numerically stable form of 1 - cdf.
    p_value = stats.chi2.sf(mcnemar_statistic, df=1)

# NOTE(review): this is the uncorrected chi-squared form. When the number of
# discordant pairs is small, an exact binomial or continuity-corrected variant
# is commonly recommended; consider reporting it alongside.

# Print the test results
print('\n')
print("McNemar's test statistic:", mcnemar_statistic)
print("p-value:", p_value)





combined_model_correct  False  True 
single_model_correct                
False                      46      7
True                       16    392


McNemar's test statistic: 3.5217391304347827
p-value: 0.060568860202657615


In this case, the p-value obtained from McNemar's test is approximately 0.0606, which is greater than 0.05. Therefore, we fail to reject the null hypothesis at the 5% significance level: there is not sufficient evidence to conclude that the combined model differs significantly from the single model in terms of predictive accuracy.