In [1]:
import numpy as np
import os
import json
import matplotlib.pyplot as plt
import utils.feature_extractors as utils
import optuna
import joblib

from sklearn.pipeline import make_pipeline
from sklearn.utils import shuffle
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_predict
from sklearn.ensemble import RandomForestClassifier
from utils.utils import evaluate_classification
from sklearn.svm import SVC
from sklearn.ensemble import ExtraTreesClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
config_map = {
    "train_postive_location":"dataset/TR_pos_SPIDER.txt",
    "train_negative_location":"dataset/TR_neg_SPIDER.txt",
    "test_positive_location":"dataset/TS_pos_SPIDER.txt",
    "test_negative_location":"dataset/TS_neg_SPIDER.txt",
    "model_save_location":"./optimized_models",
    "feat_combo_model_save_location":"./feature_combo_models",
    "final_model_save_location":"./final_model",
    "random_seed":9
}

In [3]:
class ProteinFeatureGenerator:
    SELECTED_FEATURES = ["AAC", "DPC", "RScharge", "RSDHP", "RSpolar"]
    
    def __init__(self, positive_data_file: str, negative_data_file: str, feature_type: str = None) -> None:
        super().__init__()

        # Check feature param
        assert feature_type in ProteinFeatureGenerator.SELECTED_FEATURES or feature_type is None
        self.feature_type = feature_type

        # Data manipulation
        self.positive_data_file = positive_data_file
        self.negative_data_file = negative_data_file

        self.positive_data = utils.read_fasta(self.positive_data_file)
        self.negative_data = utils.read_fasta(self.negative_data_file)
        self.data = self.positive_data + self.negative_data

        self.pos_targets = np.array([True]*len(self.positive_data))
        self.neg_targets = np.array([False]*len(self.negative_data))
        self.targets = np.array([True]*len(self.positive_data) + [False]*len(self.negative_data))
        

        self.raw_sequences = [x[1] for x in self.data]
        
        
        print("Extracting AAC Feature ...")
        self.AAC_feature = utils.AAC(self.data)[0]
        self.AAC_feature_pos = utils.AAC(self.positive_data)[0]
        self.AAC_feature_neg = utils.AAC(self.negative_data)[0]

        print("Extracting DPC Feature ...")
        self.DPC_feature = utils.DPC(self.data, 0)[0]
        self.DPC_feature_pos = utils.DPC(self.positive_data, 0)[0]
        self.DPC_feature_neg = utils.DPC(self.negative_data, 0)[0]

        print("Extracting RScharge Feature ...")
        self.RScharge_feature = utils.reducedCHARGE(self.data)
        self.RScharge_feature_pos = utils.reducedCHARGE(self.positive_data)
        self.RScharge_feature_neg = utils.reducedCHARGE(self.negative_data)
        
        print("Extracting RSDHP Feature ...")
        self.RSDHP_feature = utils.reducedDHP(self.data)
        self.RSDHP_feature_pos = utils.reducedDHP(self.positive_data)
        self.RSDHP_feature_neg = utils.reducedDHP(self.negative_data)
        
        print("Extracting RSpolar Feature ...")
        self.RSpolar_feature = utils.reducedPOLAR(self.data)
        self.RSpolar_feature_pos = utils.reducedPOLAR(self.positive_data)
        self.RSpolar_feature_neg = utils.reducedPOLAR(self.negative_data)

    def get_feat_combo(self,selected:list = None):
        
        features =[self.AAC_feature,self.DPC_feature,self.RScharge_feature,self.RSDHP_feature,self.RSpolar_feature]
        
        if selected:
            select_index = sorted([ProteinFeatureGenerator.SELECTED_FEATURES.index(x) for x in selected])
            features = [features[x] for x in select_index]
            
        return np.concatenate(features,axis=-1)
        
    
    def __len__(self) -> int:
        return len(self.data)

In [4]:
train_data = ProteinFeatureGenerator(positive_data_file=config_map["train_postive_location"],negative_data_file=config_map["train_negative_location"])

Extracting AAC Feature ...
Extracting DPC Feature ...
Extracting RScharge Feature ...
Extracting RSDHP Feature ...
Extracting RSpolar Feature ...


In [5]:
X_data = {
    "AAC-DPC-RScharge-RSDHP-RSpolar":train_data.get_feat_combo(["AAC","DPC","RScharge","RSDHP","RSpolar"]),
    "AAC-RScharge-RSDHP-RSpolar":train_data.get_feat_combo(["AAC","RScharge","RSDHP","RSpolar"]),
    "DPC-RScharge-RSDHP-RSpolar":train_data.get_feat_combo(["DPC","RScharge","RSDHP","RSpolar"]),
    "AAC-DPC-RScharge":train_data.get_feat_combo(["AAC","DPC","RScharge"]),
    "AAC-DPC-RSDHP":train_data.get_feat_combo(["AAC","DPC","RSDHP"]),
    "AAC-RSDHP-RSpolar":train_data.get_feat_combo(["AAC","RSDHP","RSpolar"]),
    "RScharge-RSDHP-RSpolar":train_data.get_feat_combo(["RScharge","RSDHP","RSpolar"]),
    "AAC-RSpolar":train_data.get_feat_combo(["AAC","RSpolar"]),
}


data_pipelines = {
    "AAC-DPC-RScharge-RSDHP-RSpolar":make_pipeline(StandardScaler()),
    "AAC-RScharge-RSDHP-RSpolar":make_pipeline(StandardScaler()),
    "DPC-RScharge-RSDHP-RSpolar":make_pipeline(StandardScaler()),
    "AAC-DPC-RScharge":make_pipeline(StandardScaler()),
    "AAC-DPC-RSDHP":make_pipeline(StandardScaler()),
    "AAC-RSDHP-RSpolar":make_pipeline(StandardScaler()),
    "RScharge-RSDHP-RSpolar":make_pipeline(StandardScaler()),
    "AAC-RSpolar":make_pipeline(StandardScaler()),
}

In [6]:
best_feature_combo = ""
best_acc = 0
for feature_type in X_data.keys():
    print(f"Feature Combo :- {feature_type} | Model :- SVC")
    
    os.makedirs(os.path.join(config_map["feat_combo_model_save_location"],feature_type,"SVC"),exist_ok=True)
    model_dir = os.path.join(config_map["feat_combo_model_save_location"],feature_type,"SVC")
    
    X,y = X_data[feature_type],train_data.targets
    X,y = shuffle(X,y,random_state=config_map["random_seed"])
    X = data_pipelines[feature_type].fit_transform(X,y)
    
    clf = SVC()
    y_pred = cross_val_predict(clf, X, y, cv=5)
    
    result_values = evaluate_classification(y_pred,y,class_names=["Not Druggable","Druggable"],save_outputs=model_dir)
    
    clf.fit(X,y)
    
    print(result_values)
    # Select best combo to optimize
    if float(result_values["accuracy"])>best_acc:
        best_acc = result_values["accuracy"]
        best_feature_combo = feature_type
    joblib.dump(data_pipelines[feature_type], os.path.join(model_dir,"pipeline.sav"))
    joblib.dump(clf, os.path.join(model_dir,"model.sav"))
    print("\n")
    
print("Best Feature Combo:",best_feature_combo)
print("Best Accuracy:",str(best_acc))

Feature Combo :- AAC-DPC-RScharge-RSDHP-RSpolar | Model :- SVC
{'accuracy': 0.9040125885129819, 'sensitivity': 0.8879803761242846, 'specificity': 0.9188779378316907, 'precision': 0.9043767169124066, 'f1': 0.9037761748264941}


Feature Combo :- AAC-RScharge-RSDHP-RSpolar | Model :- SVC
{'accuracy': 0.8941778127458694, 'sensitivity': 0.8765331152902698, 'specificity': 0.910538286580743, 'precision': 0.8945769728009547, 'f1': 0.89390458490862}


Feature Combo :- DPC-RScharge-RSDHP-RSpolar | Model :- SVC
{'accuracy': 0.9016522423288749, 'sensitivity': 0.8855273916598528, 'specificity': 0.9166034874905231, 'precision': 0.902007447694949, 'f1': 0.90141001519108}


Feature Combo :- AAC-DPC-RScharge | Model :- SVC
{'accuracy': 0.8977183320220299, 'sensitivity': 0.8789860997547015, 'specificity': 0.9150871872630781, 'precision': 0.898209694818094, 'f1': 0.8974417861254993}


Feature Combo :- AAC-DPC-RSDHP | Model :- SVC
{'accuracy': 0.9008654602675059, 'sensitivity': 0.8863450531479967, 'specif

## Optimizing Best Model

In [7]:
# Get the clf and best feature combo
model = "SVC"
best_feature_combo = best_feature_combo
    
# Print the current model and feature type being trained
print(f"Feature Type: {best_feature_combo} | Training Model: {model}")
    
# Shuffle the data and apply the data pipeline
X, y = shuffle(X_data[best_feature_combo], train_data.targets, random_state=config_map["random_seed"])
data_pipeline = data_pipelines[best_feature_combo]
X = data_pipeline.fit_transform(X, y)
    
# Define the objective function for the Optuna optimization
def obj_func_svc(trial: optuna.trial) -> SVC:
    c = trial.suggest_float('C', 1e-5, 1e2, log=True)
    kernel = 'rbf'
    gamma = 'auto'
        
    classifier = SVC(
        C=c, 
        kernel=kernel,
        class_weight={1: 0.482, 0: 0.518},
        gamma=gamma,
    )
        
    return classifier
def objective(trial):
    clf = obj_func_svc(trial)
    y_pred = cross_val_predict(clf, X, y, cv=5)
    result_values = evaluate_classification(
        y_pred, y, class_names=["Not Druggable", "Druggable"],
        save_outputs=None
    )
    return result_values["f1"]


# Create an Optuna study and optimize the objective function
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Print the best hyperparameters and f1 score obtained
print('Best F1 Score: {}'.format(study.best_trial.value))
print("Best Hyperparameters: {}".format(study.best_trial.params))


# Save the best hyperparameters, study object, data pipeline object, and trained model
params = study.best_trial.params
model_dir = os.path.join(config_map["final_model_save_location"], best_feature_combo, model)
os.makedirs(model_dir, exist_ok=True)

with open(os.path.join(model_dir, "hyperparams.json"), "w") as f:
    json.dump(params, f)

classifier = SVC(**params)
classifier.fit(X, y)

joblib.dump(classifier, os.path.join(model_dir, "model.sav"))
joblib.dump(study, os.path.join(model_dir, "optuna_study.sav"))
joblib.dump(data_pipeline, os.path.join(model_dir, "pipeline.sav"))

# Print newlines to separate the outputs for each feature type
print("\n")


[32m[I 2023-05-12 13:03:08,405][0m A new study created in memory with name: no-name-5efeb62e-6b04-4f85-b690-a9e76c78f47f[0m


Feature Type: AAC-DPC-RScharge-RSDHP-RSpolar | Training Model: SVC


[32m[I 2023-05-12 13:03:17,389][0m Trial 0 finished with value: 0.3416213416213416 and parameters: {'C': 0.0026400438899086822}. Best is trial 0 with value: 0.3416213416213416.[0m
[32m[I 2023-05-12 13:03:26,113][0m Trial 1 finished with value: 0.3416213416213416 and parameters: {'C': 0.00019463970501194542}. Best is trial 0 with value: 0.3416213416213416.[0m
[32m[I 2023-05-12 13:03:30,351][0m Trial 2 finished with value: 0.9045179675664936 and parameters: {'C': 2.0047704309453547}. Best is trial 2 with value: 0.9045179675664936.[0m
[32m[I 2023-05-12 13:03:34,858][0m Trial 3 finished with value: 0.8951320132013201 and parameters: {'C': 42.37872244247006}. Best is trial 2 with value: 0.9045179675664936.[0m
[32m[I 2023-05-12 13:03:39,247][0m Trial 4 finished with value: 0.8947416089297968 and parameters: {'C': 35.54681789495904}. Best is trial 2 with value: 0.9045179675664936.[0m
[32m[I 2023-05-12 13:03:48,124][0m Trial 5 finished with value: 0.3416213416213416 and parame

Best F1 Score: 0.9100493724613037
Best Hyperparameters: {'C': 5.0754020943221985}




## Test

In [8]:
test_data = ProteinFeatureGenerator(positive_data_file=config_map["test_positive_location"],negative_data_file=config_map["test_negative_location"])

Extracting AAC Feature ...
Extracting DPC Feature ...
Extracting RScharge Feature ...
Extracting RSDHP Feature ...
Extracting RSpolar Feature ...


In [9]:
X_test = {
    "AAC-DPC-RScharge-RSDHP-RSpolar":test_data.get_feat_combo(["AAC","DPC","RScharge","RSDHP","RSpolar"]),
    "AAC-RScharge-RSDHP-RSpolar":test_data.get_feat_combo(["AAC","RScharge","RSDHP","RSpolar"]),
    "DPC-RScharge-RSDHP-RSpolar":test_data.get_feat_combo(["DPC","RScharge","RSDHP","RSpolar"]),
    "AAC-DPC-RScharge":test_data.get_feat_combo(["AAC","DPC","RScharge"]),
    "AAC-DPC-RSDHP":test_data.get_feat_combo(["AAC","DPC","RSDHP"]),
    "AAC-RSDHP-RSpolar":test_data.get_feat_combo(["AAC","RSDHP","RSpolar"]),
    "RScharge-RSDHP-RSpolar":test_data.get_feat_combo(["RScharge","RSDHP","RSpolar"]),
    "AAC-RSpolar":test_data.get_feat_combo(["AAC","RSpolar"]),
}

In [10]:
feature_type = best_feature_combo
print(f" Feature Type :- {feature_type} | Model :- SVC")
    
model_dir = os.path.join(config_map["final_model_save_location"],feature_type,"SVC")
os.makedirs(model_dir,exist_ok=True)

pipeline = joblib.load(os.path.join(config_map["final_model_save_location"],feature_type,"SVC","pipeline.sav"))
clf = joblib.load(os.path.join(config_map["final_model_save_location"],feature_type,"SVC","model.sav"))

X,y = X_test[feature_type],test_data.targets
X,y = shuffle(X,y,random_state=config_map["random_seed"])
X = pipeline.transform(X)

y_pred = clf.predict(X)

result_values = evaluate_classification(y_pred,y,class_names=["Not Druggable","Druggable"],save_outputs=model_dir)

# Save predicted labels for positive and negative data
with open(os.path.join(model_dir, "predictions_pos.txt"), "w") as f:
    for i in range(len(y_pred)):
        if y[i] == True:
            val = "1"
            if y_pred[i]==False: val="0"
            f.write(f"{val}\n")

with open(os.path.join(model_dir, "predictions_neg.txt"), "w") as f:
    for i in range(len(y_pred)):
        if y[i] == False:
            val = "0"
            if y_pred[i]==True: val="1"
            f.write(f"{val}\n")

print(result_values)
print("\n\n")

 Feature Type :- AAC-DPC-RScharge-RSDHP-RSpolar | Model :- SVC
{'accuracy': 0.8655097613882863, 'sensitivity': 0.78125, 'specificity': 0.9451476793248945, 'precision': 0.8756819421713038, 'f1': 0.8639729678279078}



