## Importing libraries

In [1]:
from sys import argv
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import PandasTools
from statsmodels.stats.proportion import proportions_ztest

## Defining functions

In [2]:
def LoadDatasetFromCSV(CSV, ecfp4=True, maccs=True, label="ACTIVE"):
    """Load a compound table from a CSV file and attach fingerprint columns.

    Only the first two columns of the file are read; any additional columns
    are ignored. The first column (the compound ID) becomes the row index and
    the second column must be named "SMILES".

    Parameters
    ----------
    CSV : str or file-like
        Passed unchanged to ``pandas.read_csv``.
    ecfp4 : bool, default True
        When true, add an "ECFP4FP" column holding Morgan bit vectors
        (radius 2, 2048 bits), one per row.
    maccs : bool, default True
        When true, add a "MACCSFP" column holding MACCS keys, one per row.
    label : object, default "ACTIVE"
        Value written to every row of the "LABEL" column, which is always
        added last.

    Returns
    -------
    pandas.DataFrame
        Indexed by the first CSV column, with the columns "SMILES",
        optionally "ECFP4FP" and "MACCSFP", and "LABEL".

    Raises
    ------
    ValueError
        If RDKit returns no molecule (``None``) for one or more SMILES strings.
    """
    # Keep the ID column as the index so every fingerprint stays traceable to
    # its compound; everything after the SMILES column is skipped.
    Dataset = pd.read_csv(CSV, usecols=[0, 1], index_col=0)

    if ecfp4 or maccs:
        # Parse every SMILES once and reuse the molecules for both fingerprints.
        smiles = list(Dataset["SMILES"])
        molecules = [Chem.MolFromSmiles(entry) for entry in smiles]

        # MolFromSmiles signals a parse failure by returning None; report the
        # offending strings instead of failing later inside the fingerprinter.
        unparsed = [entry for entry, mol in zip(smiles, molecules) if mol is None]
        if unparsed:
            raise ValueError(
                "Could not parse {} SMILES string(s): {!r}".format(len(unparsed), unparsed)
            )

        if ecfp4:
            Dataset["ECFP4FP"] = [
                AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
                for mol in molecules
            ]
        if maccs:
            Dataset["MACCSFP"] = [MACCSkeys.GenMACCSKeys(mol) for mol in molecules]

    # One label entry per row (a list, so non-scalar labels are stored as-is).
    Dataset["LABEL"] = [label] * len(Dataset)
    return Dataset

def DFP_Calc(DF, FP="ECFP4", FORMAT="RDKit"):
    """Build the majority-vote fingerprint (DFP) of a dataset.

    A bit of the result is "1" when strictly more than half of the rows have
    that bit set in the selected fingerprint column, otherwise "0".

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain the column "ECFP4FP" (for FP="ECFP4") or "MACCSFP"
        (for FP="MACCS") holding RDKit bit vectors, e.g. the output of
        LoadDatasetFromCSV.
    FP : {"ECFP4", "MACCS"}
        Which fingerprint column to summarise.
    FORMAT : {"RDKit", "TEXT"}
        "RDKit" returns the object built by DataStructs.CreateFromBitString,
        "TEXT" returns the "0"/"1" string.

    Raises
    ------
    ValueError
        If FP or FORMAT is not one of the supported values, or DF has no rows.
    """
    fp_columns = {"ECFP4": "ECFP4FP", "MACCS": "MACCSFP"}
    if FP not in fp_columns:
        raise ValueError("FP must be 'ECFP4' or 'MACCS', got {!r}".format(FP))
    if FORMAT not in ("RDKit", "TEXT"):
        raise ValueError("FORMAT must be 'RDKit' or 'TEXT', got {!r}".format(FORMAT))

    n_rows = DF.shape[0]
    if n_rows == 0:
        raise ValueError("Cannot compute a DFP from an empty dataset")

    # One "0"/"1" string per compound.
    bit_strings = [DataStructs.BitVectToText(fp) for fp in DF[fp_columns[FP]]]

    # Transpose so each tuple holds one bit position across all compounds,
    # then count how many compounds have that bit switched on.
    on_counts = [position.count("1") for position in zip(*bit_strings)]

    # Majority vote: a tie (exactly half) does not set the bit.
    DFP = "".join(
        "1" if float(count) / n_rows > 0.5 else "0" for count in on_counts
    )

    if FORMAT == "TEXT":
        return DFP
    return DataStructs.CreateFromBitString(DFP)

def SBDFP_Calc(DF, FP="ECFP4", FORMAT="RDKit"):
    """Build the statistics-based fingerprint (SB-DFP) of a dataset.

    For each bit, a one-sided two-proportion z-test compares the reference
    "on" count (read from a counts file) with the dataset "on" count. The bit
    is "1" when the test, run with alternative='smaller' on
    [reference, dataset], yields a p-value below 0.01, otherwise "0".

    Parameters
    ----------
    DF : pandas.DataFrame
        Must contain the column "ECFP4FP" (for FP="ECFP4") or "MACCSFP"
        (for FP="MACCS") holding RDKit bit vectors, e.g. the output of
        LoadDatasetFromCSV.
    FP : {"ECFP4", "MACCS"}
        Which fingerprint to use. The reference counts are read from
        "ECFP4.counts" or "MACCS.counts" respectively, opened relative to the
        current working directory; the first line must be a comma-separated
        list of integers.
    FORMAT : {"RDKit", "TEXT"}
        "RDKit" returns the object built by DataStructs.CreateFromBitString,
        "TEXT" returns the "0"/"1" string.

    Raises
    ------
    ValueError
        If FP or FORMAT is not one of the supported values, or DF has no rows.
    """
    fp_sources = {
        "ECFP4": ("ECFP4FP", "ECFP4.counts"),
        "MACCS": ("MACCSFP", "MACCS.counts"),
    }
    if FP not in fp_sources:
        raise ValueError("FP must be 'ECFP4' or 'MACCS', got {!r}".format(FP))
    if FORMAT not in ("RDKit", "TEXT"):
        raise ValueError("FORMAT must be 'RDKit' or 'TEXT', got {!r}".format(FORMAT))

    n_rows = DF.shape[0]
    if n_rows == 0:
        raise ValueError("Cannot compute an SB-DFP from an empty dataset")

    column, counts_file = fp_sources[FP]

    # One "0"/"1" string per compound, transposed so each tuple holds one bit
    # position across all compounds; count the compounds with the bit on.
    bit_strings = [DataStructs.BitVectToText(fp) for fp in DF[column]]
    dataset_counts = [position.count("1") for position in zip(*bit_strings)]

    # The context manager closes the reference file even if parsing fails.
    with open(counts_file) as ref_handle:
        ref_counts = [int(field) for field in ref_handle.readline().split(",")]

    # Number of observations behind the reference counts.
    # NOTE(review): presumably the size of the compound collection the
    # .counts files were derived from - confirm against those files.
    reference_size = 15403690
    # Significance threshold applied to every per-bit test.
    alpha = 0.01

    # The result has one bit per reference count, so the counts file
    # determines the output length.
    bits = []
    for i in range(len(ref_counts)):
        # 'smaller' tests whether the reference proportion is below the
        # dataset proportion, i.e. the bit is over-represented in the dataset.
        _, pval = proportions_ztest(
            [ref_counts[i], dataset_counts[i]],
            [reference_size, n_rows],
            alternative='smaller',
        )
        bits.append("1" if pval < alpha else "0")
    SBDFP = "".join(bits)

    if FORMAT == "TEXT":
        return SBDFP
    return DataStructs.CreateFromBitString(SBDFP)

## Examples
The `SBDFP_Calc` function reads the reference bit-count files MACCS.counts and ECFP4.counts from the working directory.
The examples below additionally need DNMT1.csv as a sample dataset.
All of these files are available in the GitHub repository.

In [10]:
# Read the example compounds; the defaults add both fingerprint columns
# and label every row "ACTIVE"
Dataset = LoadDatasetFromCSV("DNMT1.csv")
# Preview the first five rows (the default for head)
Dataset.head()

Unnamed: 0,SMILES,ECFP4FP,MACCSFP,LABEL
SBSM154331,S1(C(=NC2(=C1C=CC=C2))C(=CC4(=CC([N+](=O)[O-])=C(N(C3(=CC=CC=C3))C)C=C4))CCC(=O)O),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 1, ...]",ACTIVE
SBSM142655,S(CC1(=C(OC)C=CC(=C1)C(=O)C=CC3(=CC=C(C2(=CC=CC=C2))C=C3)))CC(=O)O,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, ...]",ACTIVE
SBSM22553,S(C(C2(=CC=C(C1(=CC=CC=C1))C=C2))(C3(=CC=CC=C3))C4(=CC=CC=C4))CC(N)C(=O)O,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...]",ACTIVE
SBSM122239,S(=O)(=O)(NC2(=CC=C(C(=O)OCC(=O)NCC=1(OC=CC=1))C=C2))C=3(SC=CC=3),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, ...]",ACTIVE
SBSM19046,S(=O)(=O)(NC1(=NC(=CC=N1)C))C3(=CC=C(NC(=O)CC=2(SC=CC=2))C=C3),"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, ...]",ACTIVE


In [17]:
# Majority-vote fingerprint of the loaded dataset from its MACCS keys,
# requested as a plain "0"/"1" string so it can be printed
Dataset_DFP_MACCS = DFP_Calc(Dataset, FP="MACCS", FORMAT="TEXT")
# Display the resulting DFP
print("Dataset_DFP_MACCS: " + Dataset_DFP_MACCS)

Dataset_DFP_MACCS: 00000000000000000000000000000000000000000000000000000000000000100000000000000000100101001000100111000100010000110000010011110101000111011101101111101011111111111111110


In [16]:
# Statistics-based fingerprint of the loaded dataset from its ECFP4 bits,
# requested as a plain "0"/"1" string so it can be printed
# (reads ECFP4.counts from the working directory)
Dataset_SBDFP_ECFP4 = SBDFP_Calc(Dataset, FP="ECFP4", FORMAT="TEXT")
# Display the resulting SB-DFP
print("Dataset_SBDFP_ECFP4: " + Dataset_SBDFP_ECFP4)

Dataset_SBDFP_ECFP4: 0000110000000000000000000100000100000000000000000000000000000000000100000001000000000000000100010100010100100000100010000100000010000000011100000100000000000001000000010000000000000000100000010010001000100000110000000001000000000000010000000000000000010000100000000000001001000000010000000100000000000000000001000010001100000011000000000000100000000010001000000010000000000000000000100000010000000000000000000000001000100000000101001000000000000000000000101010000000000010001000010000000001000000000000100000010000011000010000000001001000000010000000000100000000001000000000001100000000001000000000000010000000110010000010000000100000001100001000000010000010000000000000000000000100000000000000100000100100001000000110000000011010000000100000010000000001000000000000000001000000000000001100000000000100000001001000000000000000000000000000100000100000000000000000011000100000010011000000000000000010000000000001000010000000001000001010001000100000000001010000000110001000000000000