In [1]:
import rdkit
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import MACCSkeys, AllChem
from rdkit.Chem import rdFingerprintGenerator
from scipy.spatial.distance import pdist

In [2]:
#from google.colab import drive
#drive.mount('/content/drive')

In [3]:
# Print the installed RDKit version alongside the results (useful when
# comparing runs made with different RDKit releases).
print(f"rdkit_version: {rdkit.__version__}")

rdkit_version: 2024.03.5


In [4]:
# LANaPDB RO3 fragments: read the descriptor table, keep the identifier and
# chiral SMILES columns, and tag every row with its collection name.
url_data = "https://raw.githubusercontent.com/DIFACQUIM/Fragment-libraries-from-large-synthetic-compounds-and-natural-products-collections/refs/heads/main/DATA_SET/DATA_FRAGMENTS_RO3/LANaPDB_RO3_Moleculardescriptors.csv"
lanapdb_RO3 = (
    pd.read_csv(url_data)
    .loc[:, ["ID", "SMILES_chiral"]]
    .assign(Database="LANaPDB")
)
lanapdb_RO3.tail(2)

Unnamed: 0,ID,SMILES_chiral,Database
1830,LANaPDB_fragments_73605,CC1(C)[C@H]2CCC[C@]3(CO3)[C@]2(C)C[C@H](O)[C@]...,LANaPDB
1831,LANaPDB_fragments_74033,CCC(C)=CC(=O)O,LANaPDB


In [5]:
# COCONUT RO3 fragments: read the descriptor table and keep the identifier,
# chiral SMILES and source-database columns.
url_data = "https://raw.githubusercontent.com/DIFACQUIM/Fragment-libraries-from-large-synthetic-compounds-and-natural-products-collections/refs/heads/main/DATA_SET/DATA_FRAGMENTS_RO3/COCONUT_RO3_Moleculardescriptors.csv"
coconut_RO3 = pd.read_csv(url_data).loc[:, ["ID", "SMILES_chiral", "Database"]]
coconut_RO3.tail(2)

Unnamed: 0,ID,SMILES_chiral,Database
38745,COCONUT_fragments_2583030,O=c1cco[nH]1,COCONUT
38746,COCONUT_fragments_2583083,O=CC1C=CCC1,COCONUT


In [6]:
# Enamine RO3
url_data = "https://raw.githubusercontent.com/DIFACQUIM/Fragment-libraries-from-large-synthetic-compounds-and-natural-products-collections/refs/heads/main/DATA_SET/DATA_FRAGMENTS_RO3/Enamine_RO3_Moleculardescriptors.csv"
Enamine_RO3 = pd.read_csv(url_data)
Enamine_RO3 = Enamine_RO3[["ID", "SMILES_chiral"]]
Enamine_RO3["Database"] = "Enamine"
Enamine_RO3.tail(2)

Unnamed: 0,ID,SMILES_chiral,Database
8384,Z212848872,CC1CN(C(=O)Cc2cccc(F)c2)CCO1,Enamine
8385,Z228588126,CC(C(=O)N1CCCCCC1)N1CCNCC1,Enamine


In [7]:
# ChemDiv RO3
url_data = "https://raw.githubusercontent.com/DIFACQUIM/Fragment-libraries-from-large-synthetic-compounds-and-natural-products-collections/refs/heads/main/DATA_SET/DATA_FRAGMENTS_RO3/ChemDiv_RO3_Moleculardescriptors.csv"
ChemDiv_RO3 = pd.read_csv(url_data)
ChemDiv_RO3 = ChemDiv_RO3[["ID", "SMILES_chiral","Database"]]
ChemDiv_RO3.tail(2)

Unnamed: 0,ID,SMILES_chiral,Database
16721,BB01-0446,CC(C)N1CCC2(CC1)NC(=O)c1ccccc1O2,ChemDiv
16722,BB01-0451,CC(=O)N1C(C(=O)O)CSC1c1ccc(C)cc1,ChemDiv


In [8]:
# Maybridge RO3
url_data = "https://raw.githubusercontent.com/DIFACQUIM/Fragment-libraries-from-large-synthetic-compounds-and-natural-products-collections/refs/heads/main/DATA_SET/DATA_FRAGMENTS_RO3/Maybridge_RO3_Moleculardescriptors.csv"
Maybridge_RO3 = pd.read_csv(url_data)
Maybridge_RO3 = Maybridge_RO3[["ID", "SMILES_chiral", "Database"]]
Maybridge_RO3.tail(2)

Unnamed: 0,ID,SMILES_chiral,Database
5910,KM08807,O=C1CC2CCCC(C1)N2Cc1ccccc1,Maybridge
5911,SEW04324,O=C(c1ccccc1)C1CCN(C(=O)C(F)(F)F)CC1,Maybridge


In [9]:

# Life Chemicals RO3 fragments: read the descriptor table, keep the identifier
# and chiral SMILES columns, and tag every row with its collection name.
url_data = "https://raw.githubusercontent.com/DIFACQUIM/Fragment-libraries-from-large-synthetic-compounds-and-natural-products-collections/refs/heads/main/DATA_SET/DATA_FRAGMENTS_RO3/LifeChemicals_RO3_Moleculardescriptors.csv"
LifeChemicals_RO3 = (
    pd.read_csv(url_data)
    .loc[:, ["ID", "SMILES_chiral"]]
    .assign(Database="Life Chemicals")
)
LifeChemicals_RO3.tail(2)

Unnamed: 0,ID,SMILES_chiral,Database
14732,F6782-0679,Cn1cc(C(=O)c2ccccc2)c(=O)c2cc(F)ccc21,Life Chemicals
14733,F0788-0005,c1cc[n+]2cc3n(c2c1)Cc1c[n+]2ccccc2n1C3,Life Chemicals


In [10]:
# CRAFT RO3
url_data = "https://raw.githubusercontent.com/DIFACQUIM/Fragment-libraries-from-large-synthetic-compounds-and-natural-products-collections/refs/heads/main/DATA_SET/DATA_FRAGMENTS_RO3/CRAFT_RO3_Moleculardescriptors.csv"
CRAFT_RO3 = pd.read_csv(url_data)
CRAFT_RO3 = CRAFT_RO3[["ID", "SMILES_chiral", "Database"]]
CRAFT_RO3.tail(2)

Unnamed: 0,ID,SMILES_chiral,Database
174,QHM-0000005,O=C1C=C(Cl)C(=O)c2ccccc21,CRAFT
175,QHM-0000002,CC1(C)CCC2=C(O1)c1ccccc1C(=O)C2=O,CRAFT


In [11]:
# Pull the chiral SMILES column of each fragment collection out as a plain list
lanapdb_RO3_smi = lanapdb_RO3["SMILES_chiral"].tolist()
coconut_RO3_smi = coconut_RO3["SMILES_chiral"].tolist()
Enamine_RO3_smi = Enamine_RO3["SMILES_chiral"].tolist()
ChemDiv_RO3_smi = ChemDiv_RO3["SMILES_chiral"].tolist()
Maybridge_RO3_smi = Maybridge_RO3["SMILES_chiral"].tolist()
LifeChemicals_RO3_smi = LifeChemicals_RO3["SMILES_chiral"].tolist()
CRAFT_RO3_smi = CRAFT_RO3["SMILES_chiral"].tolist()

#### Functions

In [12]:
def _fingerprint_bits(smi, fingerprint):
    """Turn SMILES strings into rows of fingerprint bits.

    Args:
        smi: iterable of SMILES strings.
        fingerprint: callable taking an RDKit molecule and returning a bit
            vector that exposes ``ToBitString()``.

    Returns:
        A list with one list of booleans per input SMILES.

    Raises:
        ValueError: if ``Chem.MolFromSmiles`` returns ``None`` for an entry.
    """
    rows = []
    for position, smiles in enumerate(smi):
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Name the offending entry instead of letting the fingerprint call
            # fail later on a None molecule.
            raise ValueError(
                f"Could not parse SMILES at position {position}: {smiles!r}"
            )
        rows.append([bit == "1" for bit in fingerprint(mol).ToBitString()])
    return rows


def _median_pairwise_similarity(bit_rows):
    """Median of ``1 - Jaccard distance`` over all pairs of bit rows, rounded to 3 decimals.

    For binary fingerprints this is the median pairwise Tanimoto similarity.
    Returns NaN when fewer than two rows are given, since there is no pair.
    """
    bits = np.asarray(bit_rows, dtype=bool)
    if bits.ndim != 2 or bits.shape[0] < 2:
        return float("nan")
    # pdist returns n*(n-1)/2 values, which is large for big collections.
    # Convert distance -> similarity and take the median in place so that no
    # additional full-size temporaries are allocated.
    pairwise = pdist(bits, metric="jaccard")
    np.subtract(1.0, pairwise, out=pairwise)
    return round(np.median(pairwise, overwrite_input=True), 3)


def ECFP(smi, r, fp_size=1024):
    """Median pairwise similarity of Morgan bit fingerprints.

    Args:
        smi: iterable of SMILES strings.
        r: Morgan radius passed to ``GetMorganGenerator``.
        fp_size: number of fingerprint bits (defaults to 1024).

    Returns:
        The median of ``1 - Jaccard distance`` over all molecule pairs,
        rounded to 3 decimals; NaN if fewer than two molecules are given.

    Raises:
        ValueError: if a SMILES string cannot be parsed.
    """
    # Build the generator once; it does not depend on the molecule.
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=r, fpSize=fp_size)
    return _median_pairwise_similarity(_fingerprint_bits(smi, generator.GetFingerprint))


def MACCSkeys_fp(smi):
    """Median pairwise similarity of MACCS-keys fingerprints.

    Args:
        smi: iterable of SMILES strings.

    Returns:
        The median of ``1 - Jaccard distance`` over all molecule pairs,
        rounded to 3 decimals; NaN if fewer than two molecules are given.

    Raises:
        ValueError: if a SMILES string cannot be parsed.
    """
    return _median_pairwise_similarity(_fingerprint_bits(smi, MACCSkeys.GenMACCSKeys))

### ECFP4 (Morgan, radius 2), ECFP6 (Morgan, radius 3) and MACCS keys

In [13]:
# Fragment collections in reporting order: (label, SMILES list, source table).
# Every result list below is derived from this one table, so they stay aligned.
ro3_collections = [
    ("coconut_RO3", coconut_RO3_smi, coconut_RO3),
    ("lanapdb_RO3", lanapdb_RO3_smi, lanapdb_RO3),
    ("CRAFT_RO3", CRAFT_RO3_smi, CRAFT_RO3),
    ("EnamineSolWat_RO3", Enamine_RO3_smi, Enamine_RO3),
    ("ChemDiv_RO3", ChemDiv_RO3_smi, ChemDiv_RO3),
    ("Maybridge_RO3", Maybridge_RO3_smi, Maybridge_RO3),
    ("LifeChemicals_RO3", LifeChemicals_RO3_smi, LifeChemicals_RO3),
]

# Median pairwise similarity with Morgan fingerprints of radius 2
ecfp_2 = [ECFP(smiles, 2) for label, smiles, table in ro3_collections]
print(ecfp_2)

# Median pairwise similarity with Morgan fingerprints of radius 3
ecfp_3 = [ECFP(smiles, 3) for label, smiles, table in ro3_collections]
print(ecfp_3)

# Median pairwise similarity with MACCS keys
MACCS_keys = [MACCSkeys_fp(smiles) for label, smiles, table in ro3_collections]
print(MACCS_keys)

# Row labels and number of fragments per collection
Collection = [label for label, smiles, table in ro3_collections]
Fragments = [len(table) for label, smiles, table in ro3_collections]

[0.085, 0.094, 0.108, 0.121, 0.096, 0.1, 0.109]
[0.072, 0.079, 0.087, 0.099, 0.08, 0.082, 0.09]
[0.222, 0.282, 0.257, 0.333, 0.255, 0.231, 0.326]


In [14]:
# Stack the five per-collection lists as rows, then flip the result so that each
# row describes one collection. Mixing text and numbers in a single NumPy array
# produces an array of strings.
arr = np.array([Collection, Fragments, MACCS_keys, ecfp_2, ecfp_3]).T
arr

array([['coconut_RO3', '38747', '0.222', '0.085', '0.072'],
       ['lanapdb_RO3', '1832', '0.282', '0.094', '0.079'],
       ['CRAFT_RO3', '176', '0.257', '0.108', '0.087'],
       ['EnamineSolWat_RO3', '8386', '0.333', '0.121', '0.099'],
       ['ChemDiv_RO3', '16723', '0.255', '0.096', '0.08'],
       ['Maybridge_RO3', '5912', '0.231', '0.1', '0.082'],
       ['LifeChemicals_RO3', '14734', '0.326', '0.109', '0.09']],
      dtype='<U32')

In [15]:
# Assemble the summary table directly from the per-collection lists so that each
# column keeps its own dtype (text labels, integer counts, float similarities).
# Passing the values through a single mixed NumPy array would turn every column
# into strings, which breaks numeric sorting, comparison and plotting.
FINGERPRINTS = pd.DataFrame(
    {
        "Collection": Collection,
        "Fragments RO3": Fragments,
        "MACCS keys": MACCS_keys,
        "Morgan2": ecfp_2,
        "Morgan3": ecfp_3,
    }
)
FINGERPRINTS

Unnamed: 0,Collection,Fragments RO3,MACCS keys,Morgan2,Morgan3
0,coconut_RO3,38747,0.222,0.085,0.072
1,lanapdb_RO3,1832,0.282,0.094,0.079
2,CRAFT_RO3,176,0.257,0.108,0.087
3,EnamineSolWat_RO3,8386,0.333,0.121,0.099
4,ChemDiv_RO3,16723,0.255,0.096,0.08
5,Maybridge_RO3,5912,0.231,0.1,0.082
6,LifeChemicals_RO3,14734,0.326,0.109,0.09


In [16]:
# Write the summary table to a comma-separated file (relative path, so it lands
# in the current working directory) without the row index.
# NOTE(review): the file name spells "similarty"; left unchanged because other
# code may read this exact path — confirm before renaming.
FINGERPRINTS.to_csv("Fingerprints_median_similarty_Fragments_RO3.csv", sep=",", index=False)