In [1]:
import os
import tqdm
import pandas as pd
import numpy as np
np.random.seed(1)
import warnings
warnings.filterwarnings('ignore')
from rdkit.Chem import MACCSkeys
from rdkit.Chem import MolFromSmiles
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from sklearn.metrics import average_precision_score,roc_auc_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Change the import and the label below when switching algorithms
from sklearn.neighbors import KNeighborsClassifier
algorithm = 'KNN'

In [2]:
# PRAUC and ROCAUC
def nested_cv(X,y):
    """Nested cross-validation of a KNN classifier, scored by PR AUC and ROC AUC.

    The outer loop is a 3-fold stratified split; inside each outer training
    set a 2-fold stratified grid search over ``n_neighbors`` is run twice,
    once optimising ``average_precision`` and once ``roc_auc``. Each refitted
    search is then scored on the held-out outer fold using the predicted
    probability of the second class (column 1 of ``predict_proba``).

    Parameters
    ----------
    X : array-like, 2-D
        Feature matrix, one row per sample. Converted with ``np.asarray``.
    y : array-like, 1-D
        Class labels, one per row of ``X``. Converted with ``np.asarray`` so
        that a pandas Series with any index is sliced by position.

    Returns
    -------
    tuple
        ``(best_prauc_params, mean_prauc, best_rocauc_params, mean_rocauc)``.
        The two means are averaged over the outer folds. The "best" params are
        the ones selected in the single outer fold with the highest held-out
        score; they are not a consensus across folds.
    """
    # The fold indices produced by StratifiedKFold are positions. Indexing a
    # pandas Series with them would be label-based and only works by accident
    # for a default RangeIndex, so work on plain arrays throughout.
    X = np.asarray(X)
    y = np.asarray(y)

    # Modify here when switching algorithms
    model = KNeighborsClassifier()
    out_fold_number = 3
    # Odd k from 1 to 19, capped at len(X)/3: with 3 outer and 2 inner folds an
    # inner training split holds roughly a third of the samples, and k must
    # not exceed the number of training samples.
    param_grid = {
        "n_neighbors":[x for x in range(1,21,2) if x <= len(X)/out_fold_number]
    }

    # Set up outer cross-validation loop
    outer_cv = StratifiedKFold(n_splits=out_fold_number, shuffle=True, random_state=1)

    # Inner splitter is identical for every outer fold (fixed integer seed),
    # so it is created once instead of once per iteration.
    inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

    prauc_scores = []
    rocauc_scores = []
    prauc_best_params = []
    rocauc_best_params = []

    # Execute outer splits
    for train_index, test_index in outer_cv.split(X, y):
        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        # One grid search per metric; error_score='raise' surfaces fit
        # failures immediately instead of recording NaN scores.
        clf1 = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, scoring='average_precision', error_score='raise')
        clf2 = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, scoring='roc_auc', error_score='raise')

        # Fit GridSearchCV on the outer training set
        clf1.fit(X_train, y_train)
        clf2.fit(X_train, y_train)

        # Predict probabilities on the held-out outer fold
        y_pred_proba1 = clf1.predict_proba(X_test)
        y_pred_proba2 = clf2.predict_proba(X_test)

        # Score the fold with the probability of the second class
        prauc_score = average_precision_score(y_test, y_pred_proba1[:, 1])
        rocauc_score = roc_auc_score(y_test, y_pred_proba2[:, 1])

        prauc_scores.append(prauc_score)
        prauc_best_params.append(clf1.best_params_)
        rocauc_scores.append(rocauc_score)
        rocauc_best_params.append(clf2.best_params_)

    # Average each metric across the outer folds
    average_prauc_score = np.mean(prauc_scores)
    average_rocauc_score = np.mean(rocauc_scores)

    # Report the params picked in the outer fold with the highest held-out score
    best_prauc_params_overall = prauc_best_params[np.argmax(prauc_scores)]
    best_rocauc_params_overall = rocauc_best_params[np.argmax(rocauc_scores)]

    return best_prauc_params_overall, average_prauc_score, best_rocauc_params_overall, average_rocauc_score

In [3]:
# Read file information in a folder
def get_file_list(file_folder):
    """Return the names of the files located directly inside ``file_folder``.

    Only the top level is inspected (the first triple produced by
    ``os.walk``); files in sub-directories are not included.

    Parameters
    ----------
    file_folder : str or path-like
        Directory to list.

    Returns
    -------
    list of str
        File names (without directory part). An empty list is returned when
        ``os.walk`` yields nothing, e.g. the path does not exist or is not a
        directory, so callers can always iterate over the result.
    """
    # Take only the first (top-level) entry; the default covers the case where
    # the walk is empty, which previously made this function return None.
    _root, _dirs, file_list = next(os.walk(file_folder), (file_folder, [], []))
    return file_list


# SMILES to fingerprint feature conversion / Different combinations of fingerprint features
def batchECFP(smiles, radius=3, nBits=2048, fingerprint=None):
    """Encode SMILES strings as a fingerprint feature matrix.

    Up to three component fingerprints are computed per molecule and
    concatenated column-wise in the order given by the combination name:

    * ``ecfp6`` - Morgan bit vector (``radius``, ``nBits``)
    * ``fcfp6`` - Morgan bit vector with ``useFeatures=True``
    * ``MACCS`` - MACCS keys bit vector

    Parameters
    ----------
    smiles : sequence of str
        SMILES strings, one per molecule.
    radius : int, default 3
        Morgan fingerprint radius.
    nBits : int, default 2048
        Length of each Morgan bit vector.
    fingerprint : str, optional
        One of ``"ecfp6"``, ``"fcfp6"``, ``"MACCS"``, ``"ecfp6fcfp6"``,
        ``"ecfp6MACCS"``, ``"fcfp6MACCS"``, ``"ecfp6fcfp6MACCS"``. When
        omitted, the module-level variable ``FingerPrint`` is used, which is
        how existing callers select the combination.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per input SMILES.

    Raises
    ------
    ValueError
        If the fingerprint name is not one of the supported combinations, or
        if a SMILES string cannot be parsed by RDKit.
    """
    # Column order of every supported combination.
    layouts = {
        "ecfp6fcfp6MACCS": ("ecfp6", "fcfp6", "MACCS"),
        "ecfp6": ("ecfp6",),
        "fcfp6": ("fcfp6",),
        "MACCS": ("MACCS",),
        "ecfp6fcfp6": ("ecfp6", "fcfp6"),
        "ecfp6MACCS": ("ecfp6", "MACCS"),
        "fcfp6MACCS": ("fcfp6", "MACCS"),
    }
    if fingerprint is None:
        # Backward compatibility: fall back to the notebook-level loop variable.
        fingerprint = FingerPrint
    if fingerprint not in layouts:
        # Previously an unknown name surfaced as an UnboundLocalError at return.
        raise ValueError(
            "Unknown fingerprint %r; expected one of %s"
            % (fingerprint, sorted(layouts)))
    parts = layouts[fingerprint]
    want_ecfp = "ecfp6" in parts
    want_fcfp = "fcfp6" in parts
    want_maccs = "MACCS" in parts

    smiles = np.array(smiles)
    n = len(smiles)
    # Allocate only the components that end up in the output.
    fingerprints_0 = np.zeros((n, nBits), dtype=int) if want_ecfp else None
    fingerprints_1 = np.zeros((n, nBits), dtype=int) if want_fcfp else None
    MACCSArray = []
    for i in range(n):
        mol = MolFromSmiles(smiles[i])
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None; fail
            # here with the offending input instead of inside the RDKit call.
            raise ValueError(
                "Could not parse SMILES at position %d: %r" % (i, smiles[i]))
        if want_ecfp:
            # ecfp6
            fp = GetMorganFingerprintAsBitVect(mol, radius, nBits=nBits)
            fingerprints_0[i] = np.array(list(fp.ToBitString()))
        if want_fcfp:
            # fcfp6
            fp_1 = GetMorganFingerprintAsBitVect(
                mol, radius, nBits=nBits, useFeatures=True)
            fingerprints_1[i] = np.array(list(fp_1.ToBitString()))
        if want_maccs:
            # MACCS
            MACCSArray.append(MACCSkeys.GenMACCSKeys(mol))

    blocks = {"ecfp6": fingerprints_0, "fcfp6": fingerprints_1}
    if want_maccs:
        blocks["MACCS"] = np.array(MACCSArray)

    # Concatenate the requested components in their documented column order.
    fingerprints_out = np.hstack([blocks[part] for part in parts])

    return fingerprints_out

In [4]:
# Fingerprint combinations to evaluate. Defined once: it drives both the
# results sub-directories and the benchmark loop below.
FingerPrintSet = ['ecfp6','ecfp6fcfp6','ecfp6fcfp6MACCS','ecfp6MACCS','fcfp6','fcfp6MACCS','MACCS']

# Prepare file paths (everything is resolved relative to the working directory)
basePath = os.getcwd()
resultPath = basePath+'/results'
training_path = basePath+'/training_data'

# Make one results directory per fingerprint combination
for FingerPrint in FingerPrintSet:
    FingerPrint_path = resultPath+'/'+FingerPrint
    os.makedirs(FingerPrint_path, exist_ok=True)

# Each sub-directory of training_data is one data set. Stray files (e.g.
# editor or OS metadata) are skipped because they cannot be walked for targets.
training_list = sorted(
    entry for entry in os.listdir(training_path)
    if os.path.isdir(training_path+'/'+entry)
)


for training_list_set in training_list:

    pertarget_files = training_path+'/'+training_list_set

    # Modify here when testing (e.g. to restrict the run to a few files).
    # Guard against a None result so the loop below can always iterate.
    files_list = get_file_list(pertarget_files) or []

    # Iterate over all fingerprint combinations.
    # NOTE: the loop variable must stay named FingerPrint - batchECFP reads
    # this module-level name to decide which fingerprint to return.
    for FingerPrint in FingerPrintSet:

        targets = []
        prauc_params = []
        prauc_score = []
        rocauc_params = []
        rocauc_score = []

        # For every target file: encode the features and run nested_cv
        print(training_list_set, FingerPrint)
        for tar_id in tqdm.tqdm(files_list):
            target_data = pd.read_csv(pertarget_files+'/'+tar_id, header=0,index_col=False)
            # Column 2 is passed to batchECFP as the SMILES input and column 4
            # to nested_cv as the label (positions are fixed by the file layout).
            features = batchECFP(target_data.iloc[:,2])
            # Hand over a plain array so fold indices are applied by position.
            y = target_data.iloc[:,4].to_numpy()
            result = nested_cv(features,y)

            # Record the results for the output file
            chembl_id = tar_id.split('.')[0]
            targets.append(chembl_id)
            prauc_params.append(result[0])
            prauc_score.append(result[1])
            rocauc_params.append(result[2])
            rocauc_score.append(result[3])

        # Assemble one row per target and write the summary CSV
        summary = {'targets':targets,'prauc_params':prauc_params, 'prauc_score':prauc_score,
            'rocauc_params':rocauc_params, 'rocauc_score':rocauc_score}
        roc_data = pd.DataFrame(summary)
        roc_data.to_csv(resultPath+'/'+FingerPrint+'/'+FingerPrint+'_'+algorithm+'_'+training_list_set+'_rocmean.csv',index=False)


TCM2000_100t ecfp6


100%|██████████| 30/30 [00:26<00:00,  1.13it/s]


TCM2000_100t ecfp6fcfp6


100%|██████████| 30/30 [00:30<00:00,  1.02s/it]


TCM2000_100t ecfp6fcfp6MACCS


100%|██████████| 30/30 [00:31<00:00,  1.04s/it]


TCM2000_100t ecfp6MACCS


100%|██████████| 30/30 [00:27<00:00,  1.10it/s]


TCM2000_100t fcfp6


100%|██████████| 30/30 [00:27<00:00,  1.10it/s]


TCM2000_100t fcfp6MACCS


100%|██████████| 30/30 [00:27<00:00,  1.10it/s]


TCM2000_100t MACCS


100%|██████████| 30/30 [00:22<00:00,  1.33it/s]


TCM2000_30t ecfp6


100%|██████████| 128/128 [01:01<00:00,  2.07it/s]


TCM2000_30t ecfp6fcfp6


100%|██████████| 128/128 [01:09<00:00,  1.84it/s]


TCM2000_30t ecfp6fcfp6MACCS


100%|██████████| 128/128 [01:08<00:00,  1.86it/s]


TCM2000_30t ecfp6MACCS


100%|██████████| 128/128 [01:01<00:00,  2.07it/s]


TCM2000_30t fcfp6


100%|██████████| 128/128 [01:00<00:00,  2.10it/s]


TCM2000_30t fcfp6MACCS


100%|██████████| 128/128 [01:01<00:00,  2.09it/s]


TCM2000_30t MACCS


100%|██████████| 128/128 [00:52<00:00,  2.42it/s]


TCM2000_50t ecfp6


100%|██████████| 75/75 [00:46<00:00,  1.61it/s]


TCM2000_50t ecfp6fcfp6


100%|██████████| 75/75 [00:52<00:00,  1.42it/s]


TCM2000_50t ecfp6fcfp6MACCS


100%|██████████| 75/75 [00:53<00:00,  1.41it/s]


TCM2000_50t ecfp6MACCS


100%|██████████| 75/75 [00:46<00:00,  1.60it/s]


TCM2000_50t fcfp6


100%|██████████| 75/75 [00:46<00:00,  1.62it/s]


TCM2000_50t fcfp6MACCS


100%|██████████| 75/75 [00:46<00:00,  1.60it/s]


TCM2000_50t MACCS


100%|██████████| 75/75 [00:39<00:00,  1.88it/s]


TCM2000_80t ecfp6


100%|██████████| 39/39 [00:31<00:00,  1.25it/s]


TCM2000_80t ecfp6fcfp6


100%|██████████| 39/39 [00:37<00:00,  1.05it/s]


TCM2000_80t ecfp6fcfp6MACCS


100%|██████████| 39/39 [00:37<00:00,  1.05it/s]


TCM2000_80t ecfp6MACCS


100%|██████████| 39/39 [00:34<00:00,  1.15it/s]


TCM2000_80t fcfp6


100%|██████████| 39/39 [00:31<00:00,  1.22it/s]


TCM2000_80t fcfp6MACCS


100%|██████████| 39/39 [00:41<00:00,  1.06s/it]


TCM2000_80t MACCS


100%|██████████| 39/39 [00:34<00:00,  1.12it/s]


TCM_100t ecfp6


100%|██████████| 23/23 [00:22<00:00,  1.03it/s]


TCM_100t ecfp6fcfp6


100%|██████████| 23/23 [00:22<00:00,  1.04it/s]


TCM_100t ecfp6fcfp6MACCS


100%|██████████| 23/23 [00:30<00:00,  1.34s/it]


TCM_100t ecfp6MACCS


100%|██████████| 23/23 [00:26<00:00,  1.15s/it]


TCM_100t fcfp6


100%|██████████| 23/23 [00:26<00:00,  1.17s/it]


TCM_100t fcfp6MACCS


100%|██████████| 23/23 [00:21<00:00,  1.09it/s]


TCM_100t MACCS


100%|██████████| 23/23 [00:16<00:00,  1.42it/s]


TCM_30t ecfp6


100%|██████████| 114/114 [00:50<00:00,  2.25it/s]


TCM_30t ecfp6fcfp6


100%|██████████| 114/114 [00:56<00:00,  2.02it/s]


TCM_30t ecfp6fcfp6MACCS


100%|██████████| 114/114 [00:57<00:00,  2.00it/s]


TCM_30t ecfp6MACCS


100%|██████████| 114/114 [00:51<00:00,  2.22it/s]


TCM_30t fcfp6


100%|██████████| 114/114 [00:51<00:00,  2.23it/s]


TCM_30t fcfp6MACCS


100%|██████████| 114/114 [00:51<00:00,  2.20it/s]


TCM_30t MACCS


100%|██████████| 114/114 [00:55<00:00,  2.04it/s]


TCM_50t ecfp6


100%|██████████| 56/56 [00:45<00:00,  1.23it/s]


TCM_50t ecfp6fcfp6


100%|██████████| 56/56 [00:48<00:00,  1.15it/s]


TCM_50t ecfp6fcfp6MACCS


100%|██████████| 56/56 [00:46<00:00,  1.21it/s]


TCM_50t ecfp6MACCS


100%|██████████| 56/56 [00:43<00:00,  1.29it/s]


TCM_50t fcfp6


100%|██████████| 56/56 [00:34<00:00,  1.62it/s]


TCM_50t fcfp6MACCS


100%|██████████| 56/56 [00:34<00:00,  1.61it/s]


TCM_50t MACCS


100%|██████████| 56/56 [00:29<00:00,  1.90it/s]


TCM_80t ecfp6


100%|██████████| 31/31 [00:23<00:00,  1.34it/s]


TCM_80t ecfp6fcfp6


100%|██████████| 31/31 [00:35<00:00,  1.14s/it]


TCM_80t ecfp6fcfp6MACCS


100%|██████████| 31/31 [00:32<00:00,  1.04s/it]


TCM_80t ecfp6MACCS


100%|██████████| 31/31 [00:24<00:00,  1.28it/s]


TCM_80t fcfp6


100%|██████████| 31/31 [00:23<00:00,  1.31it/s]


TCM_80t fcfp6MACCS


100%|██████████| 31/31 [00:24<00:00,  1.29it/s]


TCM_80t MACCS


100%|██████████| 31/31 [00:19<00:00,  1.56it/s]
