In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2023.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.4/34.4 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2023.9.4


In [None]:
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
from rdkit import DataStructs

In [None]:
# Load the accepted SMILES from a file, one entry per line
with open('Enamine_MiniFrag.txt') as f:
    # Strip surrounding whitespace (including the trailing newline) and skip
    # blank lines so every entry is a bare, non-empty SMILES string.
    SMILES = [line.strip() for line in f if line.strip()]

print(f"Total SMILES: {len(SMILES)}")

Total SMILES: 80


In [None]:
# Reference molecule CHEMBL5090394: keep the SMILES string separate from the
# parsed RDKit molecule that the similarity search compares against.
CHEMBL5090394_SMILES = 'CNc1ncc2cc(-c3ccc(-c4ncccc4F)cc3Cl)c(=O)n(C[C@H]3OC[C@H](N)CO3)c2n1'
CHEMBL5090394 = Chem.MolFromSmiles(CHEMBL5090394_SMILES)

In [None]:
# Parse every SMILES exactly once; Chem.MolFromSmiles returns None for
# strings it cannot parse, which is how invalid entries are detected.
parsed_smiles = [(smile, Chem.MolFromSmiles(smile)) for smile in SMILES]

# SMILES strings that parsed successfully
mols = [smile for smile, parsed in parsed_smiles if parsed is not None]

# The corresponding RDKit molecules, in the same order as `mols`
rdkit_mols = [parsed for _, parsed in parsed_smiles if parsed is not None]

print(f"Valid SMILES: {len(rdkit_mols)}")

Valid SMILES: 80


In [None]:
# Define a Tanimoto similarity function
def tanimoto(reference_molecule, researched_molecule):
    """Return the Tanimoto similarity of two molecules' Morgan fingerprints.

    Both molecules are encoded as radius-2, 2048-bit Morgan bit vectors. The
    score is the number of bits set in both fingerprints divided by the number
    of bits set in at least one of them.

    Args:
        reference_molecule: RDKit molecule used as the reference.
        researched_molecule: RDKit molecule compared against the reference.

    Returns:
        float: the shared-bits / any-bits ratio, between 0 and 1. Returns 0.0
        when neither fingerprint has any bit set, where the ratio is undefined.
    """
    fp = AllChem.GetMorganFingerprintAsBitVect(reference_molecule, 2, nBits=2048)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(researched_molecule, 2, nBits=2048)

    # Build each on-bit set once instead of re-reading it for every set operation.
    reference_bits = set(fp.GetOnBits())
    researched_bits = set(fp2.GetOnBits())

    union_size = len(reference_bits | researched_bits)
    if union_size == 0:
        # Two empty fingerprints would otherwise raise ZeroDivisionError.
        return 0.0

    return len(reference_bits & researched_bits) / union_size

In [None]:
# # Alternative (disabled): Tanimoto similarity on MACCS keys instead of Morgan fingerprints
# def tanimoto(reference_molecule, researched_molecule):

#     fp = MACCSkeys.GenMACCSKeys(reference_molecule)
#     fp2 = MACCSkeys.GenMACCSKeys(researched_molecule)

#     same_parts = set(fp.GetOnBits()) & set(fp2.GetOnBits())
#     different_parts = set(fp.GetOnBits()) | set(fp2.GetOnBits())

#     return len(same_parts) / len(different_parts)

In [None]:
# # Simplified MQN-style fingerprint (disabled): counts atoms per atomic number into 42 bins, not the full MQN descriptor set
# def calculate_mqn_fingerprint(molecule):
#     num_bins = 42
#     mqn = np.zeros(num_bins, dtype=int)

#     for atom in molecule.GetAtoms():
#         atom_type = atom.GetAtomicNum()
#         if atom_type < num_bins:
#             mqn[atom_type] += 1

#     return mqn

In [None]:
# # Alternative (disabled): Tanimoto similarity on the atom-count fingerprint above (min/max form for count vectors)
# def tanimoto(reference_molecule, researched_molecule):

#     fp = calculate_mqn_fingerprint(reference_molecule)
#     fp2 = calculate_mqn_fingerprint(researched_molecule)

#     same_parts = np.sum(np.minimum(fp, fp2))
#     different_parts = np.sum(np.maximum(fp, fp2))

#     return same_parts / different_parts

In [None]:
# Map every parsed library molecule to its Tanimoto similarity against the
# reference molecule CHEMBL5090394.
CHEMBL5090394_similarity = {
    candidate: tanimoto(CHEMBL5090394, candidate)
    for candidate in rdkit_mols
}

In [None]:
# Tanimoto coefficient window (lower bound, upper bound) used to select molecules
min_threshold, max_threshold = 0.01, 0.95

In [None]:
# Keep only the molecules whose similarity lies inside the inclusive
# [min_threshold, max_threshold] window, then report how many remain.
filtered_molecules = {
    candidate: score
    for candidate, score in CHEMBL5090394_similarity.items()
    if not (score < min_threshold or score > max_threshold)
}
print(f"Molecules within Tanimoto coefficient range ({min_threshold} - {max_threshold}): {len(filtered_molecules)}")

Molecules within Tanimoto coefficient range (0.01 - 0.95): 80


In [None]:
# # save the SMILES of filtered molecules to a file
# filtered_smiles = [Chem.MolToSmiles(mol) for mol in filtered_molecules.keys()]

# with open('filtered_smiles_Zinc.txt', 'w', encoding='utf-8') as f:
#     for item in filtered_smiles:
#         f.write("%s\n" % item)

In [None]:
# Sort molecules by similarity (most similar first) and keep at most the top
# `max_selected` whose score lies inside the Tanimoto coefficient range.
max_selected = 250
sorted_molecules = sorted(CHEMBL5090394_similarity.items(), key=lambda x: x[1], reverse=True)
selected_molecules = [
    mol
    for mol, similarity in sorted_molecules
    if min_threshold <= similarity <= max_threshold
][:max_selected]

# Save the SMILES of selected molecules to a file, one per line
selected_smiles = [Chem.MolToSmiles(mol) for mol in selected_molecules]

with open('best_250_frag_Enamine_MiniFrag.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{item}\n" for item in selected_smiles)

print("file created")

file created'
