In [1]:
import os
from rdkit import Chem, DataStructs, RDLogger
from rdkit.Chem import AllChem, MACCSkeys, rdFingerprintGenerator
import pandas as pd

# Suppressing Warnings

In [2]:
# Raise the RDKit logger threshold to CRITICAL so that lower-severity
# messages are not emitted while the molecules are processed.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)

# Directory Setup

In [3]:
# Root folder holding the 'Preprocessed' input and 'Fingerprinted' output
# folders. It defaults to the original location; set the NON_CURATED_DATA_DIR
# environment variable to run the notebook against a copy of the data stored
# somewhere else.
non_curated_data_directory = os.environ.get(
    'NON_CURATED_DATA_DIR', '/Users/Avi/Dissertation/Data/Non_Curated'
)
preprocessed_data_directory = os.path.join(non_curated_data_directory, 'Preprocessed')
fingerprinted_data_directory = os.path.join(non_curated_data_directory, 'Fingerprinted')

# The output folder is created up front so the saving step cannot fail on a
# missing directory; exist_ok makes re-running this cell harmless.
os.makedirs(fingerprinted_data_directory, exist_ok=True)

# Defining Functions

In [4]:
def generate_ecfp(mol, radius=2, n_bits=1024):
    """Return the Morgan (ECFP-style) bit fingerprint of ``mol`` as text.

    The fingerprint is built with the ``rdFingerprintGenerator`` API, the same
    one ``generate_atom_pair`` uses, instead of the deprecated
    ``AllChem.GetMorganFingerprintAsBitVect`` helper.

    Args:
        mol: RDKit molecule to fingerprint.
        radius: Morgan radius. The default of 2 keeps the original setting.
        n_bits: Size of the bit vector. The default of 1024 keeps the
            original setting.

    Returns:
        The result of ``DataStructs.BitVectToText`` applied to the fingerprint.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    return DataStructs.BitVectToText(generator.GetFingerprint(mol))

def generate_maccs_keys(mol):
    """Return the MACCS keys fingerprint of ``mol`` as text.

    Args:
        mol: RDKit molecule to fingerprint.

    Returns:
        The result of ``DataStructs.BitVectToText`` applied to the MACCS keys.
    """
    maccs_keys = MACCSkeys.GenMACCSKeys(mol)
    return DataStructs.BitVectToText(maccs_keys)

def generate_atom_pair(mol, min_distance, max_distance):
    """Return a 1024-bit atom-pair fingerprint of ``mol`` as text.

    Args:
        mol: RDKit molecule to fingerprint.
        min_distance: Passed to the generator as ``minDistance``.
        max_distance: Passed to the generator as ``maxDistance``.

    Returns:
        The result of ``DataStructs.BitVectToText`` applied to the fingerprint.
    """
    fingerprint = rdFingerprintGenerator.GetAtomPairGenerator(
        minDistance=min_distance,
        maxDistance=max_distance,
        fpSize=1024,
    ).GetFingerprint(mol)
    return DataStructs.BitVectToText(fingerprint)

def combine_ap2_ap3(mol):
    """Return the distance-2 atom-pair fingerprint followed by the distance-3 one.

    Args:
        mol: RDKit molecule to fingerprint.

    Returns:
        The two ``generate_atom_pair`` results joined with ``+``, with the
        distance-2 fingerprint first.
    """
    # Each fingerprint is restricted to a single distance (min == max).
    first, second = (generate_atom_pair(mol, distance, distance) for distance in (2, 3))
    return first + second

def process_molecule(smiles):
    """Compute every fingerprint for a single SMILES entry.

    Args:
        smiles: SMILES text. Any non-string value (for example the NaN that
            pandas yields for an empty CSV cell) is treated as unparseable
            instead of being handed to RDKit, so one missing entry cannot
            abort the whole run.

    Returns:
        A five-element list ordered as ECFP, MACCS keys, AP2, AP3, AP2+AP3.
        Every element is ``None`` when no molecule could be built.
    """
    if not isinstance(smiles, str):
        return [None] * 5

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * 5

    # Compute each atom-pair fingerprint once and reuse it for the combined
    # entry; this is the same AP2-then-AP3 concatenation that
    # combine_ap2_ap3 produces, without fingerprinting the molecule twice.
    ap2 = generate_atom_pair(mol, 2, 2)
    ap3 = generate_atom_pair(mol, 3, 3)
    return [
        generate_ecfp(mol),
        generate_maccs_keys(mol),
        ap2,
        ap3,
        ap2 + ap3,
    ]

# Listing Preprocessed Data Files

In [5]:
# File names of the preprocessed CSVs to fingerprint, one per ChEMBL target.
# Each name is joined onto preprocessed_data_directory when it is read, and
# the output name is derived by replacing 'Preprocessed' with 'Fingerprinted'.
target_files = [
    'Target_CHEMBL4078_Non_Curated_Preprocessed.csv',
    'Target_CHEMBL279_Non_Curated_Preprocessed.csv',
    'Target_CHEMBL5763_Non_Curated_Preprocessed.csv',
    'Target_CHEMBL240_Non_Curated_Preprocessed.csv',
    'Target_CHEMBL4005_Non_Curated_Preprocessed.csv'
]

# Generating and Saving Fingerprinted Data

In [6]:
# Output column names, in the order process_molecule returns its values.
fingerprint_columns = ['ECFP', 'MACCS Keys', 'AP2', 'AP3', 'AP2+AP3']

for file in target_files:
    target = pd.read_csv(os.path.join(preprocessed_data_directory, file))

    # Processing molecules sequentially
    results = [process_molecule(smi) for smi in target['canonical_smiles']]

    # Assigning results to dataframe. The rows go through a DataFrame with
    # explicit column names instead of unpacking zip(*results), which raises
    # when a file contains no rows.
    fingerprints = pd.DataFrame(results, columns=fingerprint_columns, index=target.index)
    for column in fingerprint_columns:
        target[column] = fingerprints[column]

    # process_molecule returns None for every fingerprint when a molecule
    # cannot be built; report how many rows are affected rather than writing
    # them out silently.
    failed_count = int(fingerprints['ECFP'].isna().sum())
    if failed_count:
        print(f"Warning: {failed_count} of {len(target)} molecules in {file} could not be fingerprinted")

    # Saving fingerprinted data
    output_file = file.replace('Preprocessed', 'Fingerprinted')
    target.to_csv(os.path.join(fingerprinted_data_directory, output_file), index=False)
    print(f"Saved fingerprinted data for {file}")

print("Fingerprinting complete. Files saved in:", fingerprinted_data_directory)

Saved fingerprinted data for Target_CHEMBL4078_Non_Curated_Preprocessed.csv
Saved fingerprinted data for Target_CHEMBL279_Non_Curated_Preprocessed.csv
Saved fingerprinted data for Target_CHEMBL5763_Non_Curated_Preprocessed.csv
Saved fingerprinted data for Target_CHEMBL240_Non_Curated_Preprocessed.csv
Saved fingerprinted data for Target_CHEMBL4005_Non_Curated_Preprocessed.csv
Fingerprinting complete. Files saved in: /Users/Avi/Dissertation/Data/Non_Curated/Fingerprinted
