In [8]:
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import utils.feature_extractors as utils
import optuna
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from utils.utils import evaluate_classification
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier

In [9]:
# Central notebook configuration: dataset locations, model directory and seed.
# NOTE(review): the key "train_postive_location" is misspelled ("postive");
# it is kept as-is because other cells/notebooks may look it up by this name.
config_map = {
    # FASTA-style sequence files for the training split
    "train_postive_location": "dataset/TR_pos_SPIDER.txt",
    "train_negative_location": "dataset/TR_neg_SPIDER.txt",
    # FASTA-style sequence files for the held-out test split
    "test_positive_location": "dataset/TS_pos_SPIDER.txt",
    "test_negative_location": "dataset/TS_neg_SPIDER.txt",
    # Root directory containing one sub-directory per feature/model pair
    "model_save_location": "./optimized_models",
    # Seed passed to every randomised step so that runs are repeatable
    "random_seed": 9,
}

In [10]:
class ProteinFeatureGenerator:
    """Load labelled protein sequences and precompute their feature sets.

    Two sequence files are read with ``utils.read_fasta``; records from the
    positive file are labelled ``True`` and records from the negative file
    ``False``. Every feature family named in ``SELECTED_FEATURES`` is computed
    eagerly in the constructor and stored as ``<name>_feature``.

    Attributes set by the constructor:
        feature_type: the validated ``feature_type`` argument (stored only;
            nothing in this class reads it afterwards).
        positive_data_file / negative_data_file: the paths that were given.
        positive_data / negative_data: records returned by ``utils.read_fasta``.
        data: positive records followed by negative records.
        targets: boolean ``numpy`` array aligned with ``data``.
        raw_sequences: element ``[1]`` of every record in ``data``
            (presumably the sequence string -- confirm against
            ``utils.read_fasta``).
        AAC_feature, DPC_feature, RScharge_feature, RSDHP_feature,
        RSpolar_feature: outputs of the matching ``utils`` extractors.
    """

    SELECTED_FEATURES = ["AAC", "DPC", "RScharge", "RSDHP", "RSpolar"]

    def __init__(self, positive_data_file: str, negative_data_file: str, feature_type: str = None) -> None:
        """Read both sequence files and extract all feature sets.

        Args:
            positive_data_file: path of the file holding positive samples.
            negative_data_file: path of the file holding negative samples.
            feature_type: ``None`` or one of ``SELECTED_FEATURES``.

        Raises:
            AssertionError: if ``feature_type`` is neither ``None`` nor a
                member of ``SELECTED_FEATURES``.
        """
        super().__init__()

        # Validate the requested feature name before doing any file I/O.
        known_features = ProteinFeatureGenerator.SELECTED_FEATURES
        assert feature_type is None or feature_type in known_features
        self.feature_type = feature_type

        # Remember where the data came from, then load it (positives first).
        self.positive_data_file = positive_data_file
        self.negative_data_file = negative_data_file
        self.positive_data = utils.read_fasta(positive_data_file)
        self.negative_data = utils.read_fasta(negative_data_file)

        # Labels follow the same positives-then-negatives order as ``data``.
        n_positive = len(self.positive_data)
        n_negative = len(self.negative_data)
        self.data = self.positive_data + self.negative_data
        self.targets = np.array([True] * n_positive + [False] * n_negative)

        self.raw_sequences = [record[1] for record in self.data]

        # Feature extraction. AAC and DPC return an indexable result of which
        # only element 0 is kept; the second DPC argument (0) is forwarded
        # unchanged -- its meaning is defined by utils.DPC.
        print("Extracting AAC Feature ...")
        self.AAC_feature = utils.AAC(self.data)[0]

        print("Extracting DPC Feature ...")
        self.DPC_feature = utils.DPC(self.data, 0)[0]

        print("Extracting RScharge Feature ...")
        self.RScharge_feature = utils.reducedCHARGE(self.data)

        print("Extracting RSDHP Feature ...")
        self.RSDHP_feature = utils.reducedDHP(self.data)

        print("Extracting RSpolar Feature ...")
        self.RSpolar_feature = utils.reducedPOLAR(self.data)

    def __len__(self) -> int:
        """Return the total number of loaded records (positive + negative)."""
        return len(self.data)

In [11]:
# Build the held-out test set; the constructor extracts every feature family.
test_data = ProteinFeatureGenerator(
    positive_data_file=config_map["test_positive_location"],
    negative_data_file=config_map["test_negative_location"],
)

Extracting AAC Feature ...
Extracting DPC Feature ...
Extracting RScharge Feature ...
Extracting RSDHP Feature ...
Extracting RSpolar Feature ...


In [12]:
# Which saved classifier is evaluated on which feature family.
feature_model_map = {
    "AAC": "SVC",
    "DPC": "SVC",
    "RScharge": "RandomForest",
    "RSDHP": "SVC",
    "RSpolar": "ExtraTreesClassifier",
}

# Feature matrices keyed by feature family, taken from the matching
# ``<name>_feature`` attribute of the test set (same keys, same order as above).
X_data = {
    feature_name: getattr(test_data, f"{feature_name}_feature")
    for feature_name in feature_model_map
}

In [13]:
# Evaluate each saved (pipeline, classifier) pair on its own test feature matrix.
for feature_type, feature_matrix in X_data.items():
    print(f" Feaature Type :- {feature_type} | Model :- {feature_model_map[feature_type]}")

    # Every artefact of one feature/model pair lives in a single directory;
    # it also receives the evaluation outputs, so make sure it exists.
    model_name = feature_model_map[feature_type]
    model_dir = os.path.join(config_map["model_save_location"], feature_type, model_name)
    os.makedirs(model_dir, exist_ok=True)

    # "pipeline.sav" is applied via .transform and "model.sav" via .predict
    # (presumably the fitted preprocessing steps and classifier -- confirm
    # against the training notebook).
    pipeline = joblib.load(os.path.join(model_dir, "pipeline.sav"))
    clf = joblib.load(os.path.join(model_dir, "model.sav"))

    # Shuffle features and labels together with the configured seed, then transform.
    X, y = shuffle(feature_matrix, test_data.targets, random_state=config_map["random_seed"])
    X = pipeline.transform(X)

    y_pred = clf.predict(X)

    result_values = evaluate_classification(
        y_pred,
        y,
        class_names=["Not Druggable", "Druggable"],
        save_outputs=model_dir,
    )

    print(result_values)
    print("\n\n")

 Feaature Type :- AAC | Model :- SVC
{'accuracy': 0.8850325379609545, 'sensitivity': 0.8214285714285714, 'specificity': 0.9451476793248945, 'precision': 0.8912475003845562, 'f1': 0.8841604202521323}



 Feaature Type :- DPC | Model :- SVC
{'accuracy': 0.8611713665943601, 'sensitivity': 0.7723214285714286, 'specificity': 0.9451476793248945, 'precision': 0.8723264907135875, 'f1': 0.8594512195121953}



 Feaature Type :- RScharge | Model :- RandomForest
{'accuracy': 0.8785249457700651, 'sensitivity': 0.8035714285714286, 'specificity': 0.9493670886075949, 'precision': 0.8869656133828996, 'f1': 0.877356339312861}



 Feaature Type :- RSDHP | Model :- SVC
{'accuracy': 0.8720173535791758, 'sensitivity': 0.8125, 'specificity': 0.9282700421940928, 'precision': 0.8771337604050788, 'f1': 0.8711418108083779}



 Feaature Type :- RSpolar | Model :- ExtraTreesClassifier
{'accuracy': 0.8850325379609545, 'sensitivity': 0.8169642857142857, 'specificity': 0.9493670886075949, 'precision': 0.8921631000578