In [1]:
import os
import tqdm
import pandas as pd
import numpy as np
np.random.seed(1)
import warnings
warnings.filterwarnings('ignore')
from rdkit.Chem import MACCSkeys
from rdkit.Chem import MolFromSmiles
from rdkit.Chem import rdFingerprintGenerator
fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=3,fpSize=2048)
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Modify the import and `algorithm` label below when switching to a different classifier
from sklearn.neighbors import KNeighborsClassifier
algorithm = 'KNN'

In [2]:
# Use F1 as the evaluation metric
def nested_cv(X,y):
    """Run nested cross-validation and report the mean outer-fold F1 score.

    The outer loop is a 3-fold stratified split used for scoring; inside each
    outer training set a 2-fold stratified grid search (scored with F1) tunes
    the estimator's hyper-parameters.

    Parameters
    ----------
    X : 2-D feature matrix supporting ``X[row_indices, :]`` indexing.
    y : 1-D array-like of class labels, one per row of ``X``.
        Predictions are produced by thresholding the second probability column
        at 0.5, so this assumes binary labels encoded as 0/1 -- TODO confirm
        against the training data.

    Returns
    -------
    (best_params, mean_f1) : tuple
        ``best_params`` is the parameter dict selected by the inner search of
        the outer fold that achieved the highest test F1; ``mean_f1`` is the
        average test F1 over all outer folds.

    Raises
    ------
    ValueError
        If the data set is too small to leave any ``n_neighbors`` candidate.
    """
    # Index labels positionally: a pandas Series would otherwise be indexed by
    # label, which only matches row positions for a default RangeIndex.
    y = np.asarray(y)

    # Number of outer folds. It must be defined before the parameter grid,
    # which uses it to bound n_neighbors.
    out_fold_number = 3

    # Modify this section when changing the algorithm
    model = KNeighborsClassifier()
    # Keep only odd neighbour counts that do not exceed len(X)/out_fold_number,
    # i.e. roughly the size of an inner training split (2/3 of the data, halved).
    n_neighbors_candidates = [x for x in range(1,21,2) if x <= len(X)/out_fold_number]
    if not n_neighbors_candidates:
        raise ValueError(
            "Too few samples (%d) to build an n_neighbors grid" % len(X)
        )
    param_grid = {
        "n_neighbors": n_neighbors_candidates
    }

    # Set up outer cross-validation loop
    outer_cv = StratifiedKFold(n_splits=out_fold_number, shuffle=True, random_state=1)

    f1_scores = []
    f1_best_params = []

    # Execute outer splits
    for train_index, test_index in outer_cv.split(X, y):
        X_train, X_test = X[train_index, :], X[test_index, :]
        y_train, y_test = y[train_index], y[test_index]

        # Set up inner cross-validation loop
        inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)

        # Grid search over the inner folds; error_score='raise' surfaces any
        # fitting failure instead of silently scoring it as NaN.
        clf = GridSearchCV(estimator=model, param_grid=param_grid, cv=inner_cv, scoring='f1', error_score='raise')

        # Fit GridSearchCV on the outer training set
        clf.fit(X_train, y_train)

        # Predict probabilities on the outer test set
        y_pred_proba = clf.predict_proba(X_test)

        # F1 for this fold, predicting the positive class when P(class 1) > 0.5
        f1_score_fold = f1_score(y_test, (y_pred_proba[:, 1] > 0.5).astype(int))
        f1_scores.append(f1_score_fold)
        f1_best_params.append(clf.best_params_)

    # Average F1 across all outer folds
    average_f1_score = np.mean(f1_scores)

    # Report the parameters chosen in the single outer fold with the highest F1
    best_f1_params_overall = f1_best_params[np.argmax(f1_scores)]

    return best_f1_params_overall, average_f1_score

In [3]:
# Read file information in a folder
def get_file_list(file_folder):
    """Return the names of the files located directly inside ``file_folder``.

    Only the top level is inspected; files in sub-directories are not included.
    Names are returned in the order ``os.walk`` reports them.

    Parameters
    ----------
    file_folder : path of the directory to list.

    Returns
    -------
    list of str
        File names (without the directory part). An empty list is returned when
        ``file_folder`` does not exist or is not a directory, because
        ``os.walk`` yields nothing in that case.
    """
    # The first tuple produced by os.walk describes file_folder itself; fall
    # back to an empty listing instead of implicitly returning None.
    _, _, file_list = next(os.walk(file_folder), (file_folder, [], []))
    return file_list


# SMILES to fingerprint feature conversion / Different combinations of fingerprint features
def _bitvect_to_array(bitvect):
    """Convert an RDKit bit vector into a 1-D integer array of 0/1 values."""
    # ToBitString() yields a string of '0'/'1' characters; subtracting ord('0')
    # from the ASCII codes gives the bit values without a per-character loop.
    raw = np.frombuffer(bitvect.ToBitString().encode("ascii"), dtype=np.uint8)
    return (raw - ord("0")).astype(int)


def batchECFP(smiles, radius=3, nBits=2048, fingerprint=None):
    """Encode SMILES strings as a fingerprint feature matrix.

    Parameters
    ----------
    smiles : sequence of SMILES strings.
    radius : Morgan radius used for the ECFP/FCFP components (3 by default).
    nBits : bit length of each ECFP/FCFP component.
    fingerprint : name of the fingerprint combination to build. One of
        ``"ecfp6"``, ``"fcfp6"``, ``"MACCS"``, ``"ecfp6fcfp6"``,
        ``"ecfp6MACCS"``, ``"fcfp6MACCS"``, ``"ecfp6fcfp6MACCS"``.
        When omitted, the module-level variable ``FingerPrint`` is used, which
        is how existing callers select the combination.

    Returns
    -------
    numpy.ndarray
        Integer array with one row per molecule; the selected components are
        concatenated column-wise in the order ECFP, FCFP, MACCS. An empty
        ``(0, 0)`` array is returned for empty input.

    Raises
    ------
    ValueError
        If the fingerprint name is unknown or a SMILES string cannot be parsed.
    """
    # Ordered components that make up every supported fingerprint name.
    layouts = {
        "ecfp6": ("ecfp",),
        "fcfp6": ("fcfp",),
        "MACCS": ("maccs",),
        "ecfp6fcfp6": ("ecfp", "fcfp"),
        "ecfp6MACCS": ("ecfp", "maccs"),
        "fcfp6MACCS": ("fcfp", "maccs"),
        "ecfp6fcfp6MACCS": ("ecfp", "fcfp", "maccs"),
    }
    if fingerprint is None:
        # Backward compatibility: fall back to the notebook-level selection.
        fingerprint = FingerPrint
    if fingerprint not in layouts:
        raise ValueError(
            "Unknown fingerprint %r; expected one of %s"
            % (fingerprint, sorted(layouts))
        )
    components = layouts[fingerprint]

    # Build the generators from the function arguments so radius/nBits are
    # honoured. The feature-based (FCFP) variant has to be requested through
    # atomInvariantsGenerator; assigning `useFeatures` on an existing generator
    # does not reconfigure it.
    encoders = {
        "ecfp": rdFingerprintGenerator.GetMorganGenerator(
            radius=radius, fpSize=nBits
        ).GetFingerprint,
        "fcfp": rdFingerprintGenerator.GetMorganGenerator(
            radius=radius,
            fpSize=nBits,
            atomInvariantsGenerator=rdFingerprintGenerator.GetMorganFeatureAtomInvGen(),
        ).GetFingerprint,
        "maccs": MACCSkeys.GenMACCSKeys,
    }

    smiles = np.array(smiles)
    rows = []
    for i, smi in enumerate(smiles):
        mol = MolFromSmiles(smi)
        if mol is None:
            # MolFromSmiles signals a parse failure by returning None.
            raise ValueError("Invalid SMILES at position %d: %r" % (i, smi))
        # Only the requested components are computed, in their fixed order.
        rows.append(
            np.concatenate(
                [_bitvect_to_array(encoders[name](mol)) for name in components]
            )
        )

    if not rows:
        return np.zeros((0, 0), dtype=int)
    return np.vstack(rows)

In [4]:
# Fingerprint combinations to evaluate. Each one gets its own results
# sub-directory and one CSV per training set.
FingerPrintSet = ['ecfp6','ecfp6fcfp6','ecfp6fcfp6MACCS','ecfp6MACCS','fcfp6','fcfp6MACCS','MACCS']

# Prepare file paths
basePath = os.getcwd()
resultPath = basePath+'/results'
training_path = basePath+'/../../training_data'

# Make the results directories (no error if they already exist)
for FingerPrint in FingerPrintSet:
    os.makedirs(os.path.join(resultPath, FingerPrint), exist_ok=True)

# Only sub-directories are training sets; stray files next to them (which
# os.listdir also reports) are skipped. Sorting makes the run order repeatable.
training_list = sorted(
    entry for entry in os.listdir(training_path)
    if os.path.isdir(os.path.join(training_path, entry))
)


for training_list_set in training_list:

    pertarget_files = training_path+'/'+training_list_set

    # Modify when testing (e.g. slice the list to a few targets).
    # `or []` guards against a helper that returns None for an unreadable folder.
    files_list = get_file_list(pertarget_files) or []

    # Iterate over every fingerprint combination.
    # NOTE: the loop variable must stay the module-level name `FingerPrint`,
    # because batchECFP reads it to decide which features to build.
    for FingerPrint in FingerPrintSet:

        targets = []
        data_f1_params = []
        data_f1_score = []

        # For each target file: encode the features and run nested_cv
        print(training_list_set, FingerPrint)
        for tar_id in tqdm.tqdm(files_list):
            target_data = pd.read_csv(os.path.join(pertarget_files, tar_id), header=0, index_col=False)
            # Column 2 is used as the SMILES input and column 4 as the label
            # (positional selection; presumably fixed by the training-data
            # layout -- verify against the CSV files).
            features = batchECFP(target_data.iloc[:,2])
            # Pass a plain array so the cross-validation indexes rows by position.
            y = target_data.iloc[:,4].to_numpy()
            result = nested_cv(features,y)

            # Record the information needed for the output file
            chembl_id = tar_id.split('.')[0]
            targets.append(chembl_id)
            data_f1_params.append(result[0])
            data_f1_score.append(result[1])


        # Assemble the per-target results and write them out
        data = pd.DataFrame({'targets':targets, 'f1_params':data_f1_params, 'f1_score':data_f1_score})
        data.to_csv(
            os.path.join(
                resultPath,
                FingerPrint,
                FingerPrint+'_'+algorithm+'_'+training_list_set+'_f1mean.csv',
            ),
            index=False,
        )


TCM2000_100t ecfp6


100%|██████████| 19/19 [00:12<00:00,  1.47it/s]


TCM2000_100t ecfp6fcfp6


100%|██████████| 19/19 [00:14<00:00,  1.34it/s]


TCM2000_100t ecfp6fcfp6MACCS


100%|██████████| 19/19 [00:14<00:00,  1.33it/s]


TCM2000_100t ecfp6MACCS


100%|██████████| 19/19 [00:12<00:00,  1.46it/s]


TCM2000_100t fcfp6


100%|██████████| 19/19 [00:12<00:00,  1.48it/s]


TCM2000_100t fcfp6MACCS


100%|██████████| 19/19 [00:12<00:00,  1.48it/s]


TCM2000_100t MACCS


100%|██████████| 19/19 [00:12<00:00,  1.57it/s]


TCM2000_30t ecfp6


100%|██████████| 62/62 [00:25<00:00,  2.48it/s]


TCM2000_30t ecfp6fcfp6


100%|██████████| 62/62 [00:26<00:00,  2.34it/s]


TCM2000_30t ecfp6fcfp6MACCS


100%|██████████| 62/62 [00:26<00:00,  2.32it/s]


TCM2000_30t ecfp6MACCS


100%|██████████| 62/62 [00:25<00:00,  2.48it/s]


TCM2000_30t fcfp6


100%|██████████| 62/62 [00:24<00:00,  2.49it/s]


TCM2000_30t fcfp6MACCS


100%|██████████| 62/62 [00:26<00:00,  2.36it/s]


TCM2000_30t MACCS


100%|██████████| 62/62 [00:23<00:00,  2.59it/s]


TCM2000_50t ecfp6


100%|██████████| 41/41 [00:20<00:00,  2.03it/s]


TCM2000_50t ecfp6fcfp6


100%|██████████| 41/41 [00:22<00:00,  1.86it/s]


TCM2000_50t ecfp6fcfp6MACCS


100%|██████████| 41/41 [00:22<00:00,  1.85it/s]


TCM2000_50t ecfp6MACCS


100%|██████████| 41/41 [00:21<00:00,  1.92it/s]


TCM2000_50t fcfp6


100%|██████████| 41/41 [00:21<00:00,  1.91it/s]


TCM2000_50t fcfp6MACCS


100%|██████████| 41/41 [00:20<00:00,  2.04it/s]


TCM2000_50t MACCS


100%|██████████| 41/41 [00:19<00:00,  2.10it/s]


TCM2000_80t ecfp6


100%|██████████| 24/24 [00:14<00:00,  1.61it/s]


TCM2000_80t ecfp6fcfp6


100%|██████████| 24/24 [00:16<00:00,  1.46it/s]


TCM2000_80t ecfp6fcfp6MACCS


100%|██████████| 24/24 [00:16<00:00,  1.45it/s]


TCM2000_80t ecfp6MACCS


100%|██████████| 24/24 [00:15<00:00,  1.60it/s]


TCM2000_80t fcfp6


100%|██████████| 24/24 [00:14<00:00,  1.60it/s]


TCM2000_80t fcfp6MACCS


100%|██████████| 24/24 [00:15<00:00,  1.57it/s]


TCM2000_80t MACCS


100%|██████████| 24/24 [00:14<00:00,  1.68it/s]


TCM_100t ecfp6


100%|██████████| 15/15 [00:09<00:00,  1.57it/s]


TCM_100t ecfp6fcfp6


100%|██████████| 15/15 [00:10<00:00,  1.45it/s]


TCM_100t ecfp6fcfp6MACCS


100%|██████████| 15/15 [00:10<00:00,  1.41it/s]


TCM_100t ecfp6MACCS


100%|██████████| 15/15 [00:09<00:00,  1.59it/s]


TCM_100t fcfp6


100%|██████████| 15/15 [00:09<00:00,  1.59it/s]


TCM_100t fcfp6MACCS


100%|██████████| 15/15 [00:09<00:00,  1.58it/s]


TCM_100t MACCS


100%|██████████| 15/15 [00:08<00:00,  1.67it/s]


TCM_30t ecfp6


100%|██████████| 53/53 [00:19<00:00,  2.67it/s]


TCM_30t ecfp6fcfp6


100%|██████████| 53/53 [00:21<00:00,  2.47it/s]


TCM_30t ecfp6fcfp6MACCS


100%|██████████| 53/53 [00:22<00:00,  2.34it/s]


TCM_30t ecfp6MACCS


100%|██████████| 53/53 [00:20<00:00,  2.60it/s]


TCM_30t fcfp6


100%|██████████| 53/53 [00:19<00:00,  2.68it/s]


TCM_30t fcfp6MACCS


100%|██████████| 53/53 [00:20<00:00,  2.62it/s]


TCM_30t MACCS


100%|██████████| 53/53 [00:19<00:00,  2.78it/s]


TCM_50t ecfp6


100%|██████████| 28/28 [00:14<00:00,  1.95it/s]


TCM_50t ecfp6fcfp6


100%|██████████| 28/28 [00:15<00:00,  1.80it/s]


TCM_50t ecfp6fcfp6MACCS


100%|██████████| 28/28 [00:15<00:00,  1.77it/s]


TCM_50t ecfp6MACCS


100%|██████████| 28/28 [00:14<00:00,  1.92it/s]


TCM_50t fcfp6


100%|██████████| 28/28 [00:14<00:00,  1.93it/s]


TCM_50t fcfp6MACCS


100%|██████████| 28/28 [00:14<00:00,  1.92it/s]


TCM_50t MACCS


100%|██████████| 28/28 [00:14<00:00,  1.97it/s]


TCM_80t ecfp6


100%|██████████| 20/20 [00:11<00:00,  1.72it/s]


TCM_80t ecfp6fcfp6


100%|██████████| 20/20 [00:12<00:00,  1.60it/s]


TCM_80t ecfp6fcfp6MACCS


100%|██████████| 20/20 [00:13<00:00,  1.48it/s]


TCM_80t ecfp6MACCS


100%|██████████| 20/20 [00:13<00:00,  1.46it/s]


TCM_80t fcfp6


100%|██████████| 20/20 [00:13<00:00,  1.49it/s]


TCM_80t fcfp6MACCS


100%|██████████| 20/20 [00:11<00:00,  1.69it/s]


TCM_80t MACCS


100%|██████████| 20/20 [00:11<00:00,  1.79it/s]
