In [1]:
import os
import tqdm
import pandas as pd
import numpy as np
# Seed NumPy's global random number generator once, at import time.
np.random.seed(1)
import warnings
# Install an 'ignore' filter so warnings raised after this point are not shown.
warnings.filterwarnings('ignore')
from rdkit.Chem import MACCSkeys
from rdkit.Chem import MolFromSmiles
from rdkit.Chem import rdFingerprintGenerator
# Module-level Morgan fingerprint generator created with radius=3 and fpSize=2048.
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3,fpSize=2048)
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Change the import and the tag below when switching to a different algorithm
from sklearn.ensemble import RandomForestClassifier
# Short algorithm tag kept as a module-level string.
algorithm = 'RF'

In [2]:
# Nested cross-validation scored with F1
def nested_cv(X,y):
    """Run nested stratified cross-validation of a grid-searched random forest.

    Outer loop: 3 stratified, shuffled folds (``random_state=1``). On each
    outer training split a ``GridSearchCV`` with 2 stratified inner folds
    selects hyper-parameters by F1; the search object is then used to predict
    probabilities on the held-out outer fold, which are thresholded at 0.5
    and scored with ``f1_score``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Converted with ``np.asarray`` before splitting.
    y : array-like of shape (n_samples,)
        Class labels. Converted with ``np.asarray`` before splitting.
        NOTE(review): the scoring thresholds column 1 of ``predict_proba``
        and casts to 0/1, so it assumes binary labels encoded as 0 and 1 --
        confirm against the training data.

    Returns
    -------
    tuple
        ``(best_params, mean_f1)``: ``best_params`` is the parameter dict
        selected by the grid search in the outer fold with the highest
        held-out F1; ``mean_f1`` is the mean held-out F1 over all outer folds.
    """
    # StratifiedKFold yields *positional* indices. Indexing a pandas Series
    # with an integer array is label-based, which silently relies on a default
    # RangeIndex; plain NumPy arrays make the positional lookup unambiguous.
    X = np.asarray(X)
    y = np.asarray(y)

    # Change the estimator and its grid when switching to a different algorithm.
    # NOTE: no random_state is passed, so the forest draws from NumPy's global
    # random state (see scikit-learn's `random_state` glossary entry).
    model = RandomForestClassifier()
    param_grid = {
        'n_estimators':[100,300,900],
        'criterion':['gini','entropy'],
        'max_depth':[1,5,9]
    }

    # Set up outer cross-validation loop
    out_fold_number = 3
    outer_cv = StratifiedKFold(n_splits=out_fold_number, shuffle=True, random_state=1)

    f1_scores = []
    f1_best_params = []

    # Execute outer splits
    for train_index, test_index in outer_cv.split(X, y):
        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        # Inner loop used only for hyper-parameter selection
        inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

        # error_score='raise' makes a failing parameter combination abort the
        # search instead of being recorded as a NaN score.
        clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, scoring='f1', error_score='raise')

        # Fit GridSearchCV on the outer training split
        clf.fit(X_train, y_train)

        # Predict class probabilities on the held-out outer fold
        y_pred_proba = clf.predict_proba(X_test)

        # F1 of the held-out fold at a 0.5 probability threshold
        y_pred = (y_pred_proba[:, 1] > 0.5).astype(int)
        f1_scores.append(f1_score(y_test, y_pred))
        f1_best_params.append(clf.best_params_)

    # Mean held-out F1 across the outer folds
    average_f1_score = np.mean(f1_scores)

    # Report the parameters of the single outer fold with the best held-out F1
    # (this is a per-fold maximum, not a maximum over averaged scores).
    best_f1_params_overall = f1_best_params[np.argmax(f1_scores)]

    return best_f1_params_overall, average_f1_score

In [3]:
# List the files located directly inside a folder
def get_file_list(file_folder):
    """Return the names of the files directly inside ``file_folder``.

    Only the top level is inspected: the first triple produced by
    ``os.walk`` is used and sub-directories are not descended into.

    Parameters
    ----------
    file_folder : str or path-like
        Folder to inspect.

    Returns
    -------
    list of str
        Bare file names (no directory part), in the order ``os.walk`` reports
        them. An empty list is returned when ``os.walk`` yields nothing, e.g.
        when the path does not exist or is not a directory, so callers can
        always iterate over the result.
    """
    # next() with a default avoids the implicit `return None` that the loop
    # form produced when os.walk had nothing to yield.
    _root, _dirs, file_list = next(os.walk(file_folder), (file_folder, [], []))
    return file_list


# SMILES to fingerprint feature conversion / Different combinations of fingerprint features
def batchECFP(smiles, radius=3, nBits=2048, fingerprint=None):
    """Encode SMILES strings as a 0/1 fingerprint matrix.

    Three column blocks are available and can be concatenated:

    * ``"ecfp6"`` -- Morgan bit vector built with the default atom invariants.
    * ``"fcfp6"`` -- Morgan bit vector built with feature atom invariants
      (``rdFingerprintGenerator.GetMorganFeatureAtomInvGen()``).
    * ``"MACCS"`` -- ``MACCSkeys.GenMACCSKeys`` bit vector.

    The invariant type of an ``rdFingerprintGenerator`` Morgan generator is
    chosen when the generator is created. The earlier code assigned
    ``fpgen.useFeatures`` on one shared generator, which is not an option of
    that API, so the "fcfp6" block came out identical to "ecfp6". Two separate
    generators are created here instead, and they honour ``radius``/``nBits``.

    Parameters
    ----------
    smiles : sequence of str
        SMILES strings, one per molecule.
    radius : int, default 3
        Morgan radius passed to ``GetMorganGenerator``.
    nBits : int, default 2048
        Morgan bit-vector size (``fpSize``) for the ecfp6 and fcfp6 blocks.
    fingerprint : str, optional
        One of ``"ecfp6"``, ``"fcfp6"``, ``"MACCS"``, ``"ecfp6fcfp6"``,
        ``"ecfp6MACCS"``, ``"fcfp6MACCS"``, ``"ecfp6fcfp6MACCS"``. When
        omitted, the module-level variable ``FingerPrint`` is used, which is
        how existing callers select the layout.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per molecule; the requested blocks are
        stacked horizontally in the order ecfp6, fcfp6, MACCS.

    Raises
    ------
    ValueError
        If the fingerprint name is not recognised, or a SMILES string cannot
        be parsed by ``MolFromSmiles``.
    """
    # Column blocks making up each supported name, in output order.
    layouts = {
        "ecfp6": ("ecfp6",),
        "fcfp6": ("fcfp6",),
        "MACCS": ("MACCS",),
        "ecfp6fcfp6": ("ecfp6", "fcfp6"),
        "ecfp6MACCS": ("ecfp6", "MACCS"),
        "fcfp6MACCS": ("fcfp6", "MACCS"),
        "ecfp6fcfp6MACCS": ("ecfp6", "fcfp6", "MACCS"),
    }
    if fingerprint is None:
        # Backward-compatible fallback: the notebook's loop variable.
        fingerprint = FingerPrint
    # Validate before any chemistry work so a typo fails fast and clearly
    # instead of surfacing as an unbound local at the return statement.
    if fingerprint not in layouts:
        raise ValueError(
            "Unknown fingerprint %r; expected one of %s" % (fingerprint, sorted(layouts))
        )
    parts = layouts[fingerprint]

    # Build only the encoders the requested layout needs.
    encoders = {}
    if "ecfp6" in parts:
        ecfp_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
        encoders["ecfp6"] = ecfp_gen.GetFingerprint
    if "fcfp6" in parts:
        fcfp_gen = rdFingerprintGenerator.GetMorganGenerator(
            radius=radius,
            fpSize=nBits,
            atomInvariantsGenerator=rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
        )
        encoders["fcfp6"] = fcfp_gen.GetFingerprint
    if "MACCS" in parts:
        encoders["MACCS"] = MACCSkeys.GenMACCSKeys

    smiles = np.array(smiles)
    bit_rows = {part: [] for part in parts}
    for position, smi in enumerate(smiles):
        mol = MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError(
                "Could not parse SMILES at position %d: %r" % (position, smi)
            )
        for part in parts:
            # ToBitString() gives one '0'/'1' character per bit.
            bit_rows[part].append(list(encoders[part](mol).ToBitString()))

    # Character matrices -> integer matrices, then concatenate the blocks.
    blocks = [np.array(bit_rows[part]).astype(int) for part in parts]
    return np.hstack(blocks)

In [4]:
# Fingerprint combinations to evaluate; each gets its own results sub-folder
# and one CSV per training set.
FingerPrintSet = ['ecfp6','ecfp6fcfp6','ecfp6fcfp6MACCS','ecfp6MACCS','fcfp6','fcfp6MACCS','MACCS']

# Positional columns read from every per-target CSV: column 2 is handed to
# batchECFP as SMILES, column 4 is used as the label vector.
SMILES_COLUMN = 2
LABEL_COLUMN = 4

# Prepare file paths (all relative to the current working directory)
basePath = os.getcwd()
resultPath = os.path.join(basePath, 'results')
training_path = os.path.join(basePath, '..', '..', 'training_data')

# Make one results directory per fingerprint; exist_ok makes re-runs harmless.
for FingerPrint in FingerPrintSet:
    os.makedirs(os.path.join(resultPath, FingerPrint), exist_ok=True)

# Each sub-directory of training_path is one training set. Stray files and
# hidden entries (e.g. notebook checkpoint folders) are skipped, and the list
# is sorted so the processing order does not depend on the file system.
training_list = sorted(
    entry for entry in os.listdir(training_path)
    if not entry.startswith('.') and os.path.isdir(os.path.join(training_path, entry))
)


for training_list_set in training_list:

    pertarget_files = os.path.join(training_path, training_list_set)

    # Narrow this list when testing. `or []` covers a folder that could not be
    # listed; hidden files are ignored and the order is made deterministic.
    files_list = sorted(
        name for name in (get_file_list(pertarget_files) or [])
        if not name.startswith('.')
    )

    # Iterate over every fingerprint combination.
    # NOTE: the loop variable must be named FingerPrint -- batchECFP reads that
    # module-level name to decide which fingerprint layout to return.
    for FingerPrint in FingerPrintSet:

        targets = []
        data_f1_params = []
        data_f1_score = []

        # For every target file: encode the features, then run nested_cv
        print(training_list_set, FingerPrint)
        for tar_id in tqdm.tqdm(files_list):
            target_data = pd.read_csv(os.path.join(pertarget_files, tar_id), header=0, index_col=False)
            features = batchECFP(target_data.iloc[:, SMILES_COLUMN])
            y = target_data.iloc[:, LABEL_COLUMN]
            result = nested_cv(features, y)

            # Record the outcome for the summary file
            chembl_id = tar_id.split('.')[0]
            targets.append(chembl_id)
            data_f1_params.append(result[0])
            data_f1_score.append(result[1])

        # Assemble the per-target summary and write it out
        data = pd.DataFrame({'targets':targets, 'f1_params':data_f1_params, 'f1_score':data_f1_score})
        output_name = FingerPrint+'_'+algorithm+'_'+training_list_set+'_f1mean.csv'
        data.to_csv(os.path.join(resultPath, FingerPrint, output_name), index=False)


TCM2000_100t ecfp6


100%|██████████| 19/19 [16:15<00:00, 51.34s/it]


TCM2000_100t ecfp6fcfp6


100%|██████████| 19/19 [17:24<00:00, 54.99s/it]


TCM2000_100t ecfp6fcfp6MACCS


100%|██████████| 19/19 [17:29<00:00, 55.22s/it]


TCM2000_100t ecfp6MACCS


100%|██████████| 19/19 [16:48<00:00, 53.10s/it]


TCM2000_100t fcfp6


100%|██████████| 19/19 [15:22<00:00, 48.55s/it]


TCM2000_100t fcfp6MACCS


100%|██████████| 19/19 [15:23<00:00, 48.60s/it]


TCM2000_100t MACCS


100%|██████████| 19/19 [14:52<00:00, 46.98s/it]


TCM2000_30t ecfp6


100%|██████████| 62/62 [43:56<00:00, 42.53s/it]


TCM2000_30t ecfp6fcfp6


100%|██████████| 62/62 [38:46<00:00, 37.53s/it]


TCM2000_30t ecfp6fcfp6MACCS


100%|██████████| 62/62 [38:51<00:00, 37.61s/it]


TCM2000_30t ecfp6MACCS


100%|██████████| 62/62 [38:13<00:00, 36.99s/it]


TCM2000_30t fcfp6


100%|██████████| 62/62 [37:54<00:00, 36.68s/it]


TCM2000_30t fcfp6MACCS


100%|██████████| 62/62 [36:21<00:00, 35.19s/it]


TCM2000_30t MACCS


100%|██████████| 62/62 [35:28<00:00, 34.34s/it]


TCM2000_50t ecfp6


100%|██████████| 41/41 [24:16<00:00, 35.52s/it]


TCM2000_50t ecfp6fcfp6


100%|██████████| 41/41 [25:07<00:00, 36.76s/it]


TCM2000_50t ecfp6fcfp6MACCS


100%|██████████| 41/41 [24:50<00:00, 36.36s/it]


TCM2000_50t ecfp6MACCS


100%|██████████| 41/41 [24:21<00:00, 35.65s/it]


TCM2000_50t fcfp6


100%|██████████| 41/41 [24:20<00:00, 35.62s/it]


TCM2000_50t fcfp6MACCS


100%|██████████| 41/41 [24:26<00:00, 35.78s/it]


TCM2000_50t MACCS


100%|██████████| 41/41 [23:42<00:00, 34.70s/it]


TCM2000_80t ecfp6


100%|██████████| 24/24 [14:30<00:00, 36.28s/it]


TCM2000_80t ecfp6fcfp6


100%|██████████| 24/24 [14:52<00:00, 37.19s/it]


TCM2000_80t ecfp6fcfp6MACCS


100%|██████████| 24/24 [14:56<00:00, 37.36s/it]


TCM2000_80t ecfp6MACCS


100%|██████████| 24/24 [14:39<00:00, 36.64s/it]


TCM2000_80t fcfp6


100%|██████████| 24/24 [14:29<00:00, 36.25s/it]


TCM2000_80t fcfp6MACCS


100%|██████████| 24/24 [14:34<00:00, 36.42s/it]


TCM2000_80t MACCS


100%|██████████| 24/24 [14:05<00:00, 35.22s/it]


TCM_100t ecfp6


100%|██████████| 15/15 [08:59<00:00, 35.99s/it]


TCM_100t ecfp6fcfp6


100%|██████████| 15/15 [09:24<00:00, 37.65s/it]


TCM_100t ecfp6fcfp6MACCS


100%|██████████| 15/15 [09:18<00:00, 37.25s/it]


TCM_100t ecfp6MACCS


100%|██████████| 15/15 [09:10<00:00, 36.68s/it]


TCM_100t fcfp6


100%|██████████| 15/15 [09:03<00:00, 36.23s/it]


TCM_100t fcfp6MACCS


100%|██████████| 15/15 [09:04<00:00, 36.30s/it]


TCM_100t MACCS


100%|██████████| 15/15 [08:47<00:00, 35.17s/it]


TCM_30t ecfp6


100%|██████████| 53/53 [30:49<00:00, 34.89s/it]


TCM_30t ecfp6fcfp6


100%|██████████| 53/53 [31:27<00:00, 35.62s/it]


TCM_30t ecfp6fcfp6MACCS


100%|██████████| 53/53 [31:27<00:00, 35.61s/it]


TCM_30t ecfp6MACCS


100%|██████████| 53/53 [31:02<00:00, 35.15s/it]


TCM_30t fcfp6


100%|██████████| 53/53 [30:49<00:00, 34.89s/it]


TCM_30t fcfp6MACCS


100%|██████████| 53/53 [30:57<00:00, 35.04s/it]


TCM_30t MACCS


100%|██████████| 53/53 [30:34<00:00, 34.61s/it]


TCM_50t ecfp6


100%|██████████| 28/28 [16:51<00:00, 36.11s/it]


TCM_50t ecfp6fcfp6


100%|██████████| 28/28 [17:08<00:00, 36.71s/it]


TCM_50t ecfp6fcfp6MACCS


100%|██████████| 28/28 [17:05<00:00, 36.61s/it]


TCM_50t ecfp6MACCS


100%|██████████| 28/28 [16:46<00:00, 35.93s/it]


TCM_50t fcfp6


100%|██████████| 28/28 [16:52<00:00, 36.18s/it]


TCM_50t fcfp6MACCS


100%|██████████| 28/28 [16:00<00:00, 34.31s/it]


TCM_50t MACCS


100%|██████████| 28/28 [16:11<00:00, 34.68s/it]


TCM_80t ecfp6


100%|██████████| 20/20 [14:01<00:00, 42.06s/it]


TCM_80t ecfp6fcfp6


100%|██████████| 20/20 [14:46<00:00, 44.32s/it]


TCM_80t ecfp6fcfp6MACCS


100%|██████████| 20/20 [15:03<00:00, 45.15s/it]


TCM_80t ecfp6MACCS


100%|██████████| 20/20 [12:29<00:00, 37.50s/it]


TCM_80t fcfp6


100%|██████████| 20/20 [12:31<00:00, 37.56s/it]


TCM_80t fcfp6MACCS


100%|██████████| 20/20 [13:15<00:00, 39.79s/it]


TCM_80t MACCS


100%|██████████| 20/20 [13:41<00:00, 41.05s/it]
