In [3]:
import pandas as pd
from rdkit import Chem
from chembl_structure_pipeline import standardizer as sdz


# Leer Archivo con Fármacos Orales

In [4]:
# Read the semicolon-separated drug table into a DataFrame.
file_path = "/workspaces/Interacciones-Proteina-Farmaco/drugs.csv"
df = pd.read_csv(file_path, sep=";")

# Preview the first rows to check that the file was parsed as expected.
df.head()

Unnamed: 0,hmdb_id,status,name,ccl,source,bioab,sol,logs,pkasa,pkasb,...,hbd,hba,mw,qed,nring,naring,fsp3,nat,icl,comp_set
0,DB14505,,Sodium borate,Other,,0,586.0,0.23,11.254374,-2.981084,...,2,7,158.017274,0.362805,2,0,0.0,13,Neutral,DrugBank
1,DB11326,,Boric acid,Other,,1,47.4,-0.6,,0.485975,...,3,3,62.017524,0.282794,0,0,0.0,7,Neutral,DrugBank
2,DB06119,,Cenobamate,Other,,1,0.0254,-3.87,,8.673062,...,1,6,267.052302,0.896686,2,2,0.2,28,Basic,DrugBank
3,HMDB0251697|DB12243,detected,Edaravone,Organoheterocyclic compounds,Unknown,1,0.939,-2.27,13.44572,-1.477147,...,0,2,174.079313,0.638544,2,1,0.2,23,Neutral,DrugBank
4,DB00359,,Sulfadiazine,Benzenoids,,1,147.0,-0.03,4.54344,,...,2,5,250.052447,0.78714,2,2,0.0,27,Acid,DrugBank


In [5]:
# List the column labels of the loaded table (the "inchi" column is used below).
df.columns

Index(['hmdb_id', 'status', 'name', 'ccl', 'source', 'bioab', 'sol', 'logs',
       'pkasa', 'pkasb', 'pchar', 'inchi', 'set', 'tpsa', 'logp', 'rb', 'hbd',
       'hba', 'mw', 'qed', 'nring', 'naring', 'fsp3', 'nat', 'icl',
       'comp_set'],
      dtype='object')

## Estandarización de los datos

In [6]:
# Build an RDKit Mol object from each InChI string.
# Chem.MolFromInchi returns None (it does not raise) when an InChI cannot be
# parsed, so failures must be handled explicitly.
df["mol"] = df["inchi"].apply(Chem.MolFromInchi)

# Report and drop rows whose InChI could not be parsed: the standardisation and
# SMILES steps that follow cannot work on None. `.copy()` makes the filtered
# frame independent so that adding columns later does not raise
# SettingWithCopyWarning.
unparsed_mask = df["mol"].isna()
if unparsed_mask.any():
    print(f"Se descartan {int(unparsed_mask.sum())} compuestos cuyo InChI no se pudo interpretar.")
df = df[~unparsed_mask].copy()

In [7]:
def _standardized_parent(mol):
    """Standardise ``mol`` and return the first item of ``sdz.get_parent_mol``.

    The notebook treats that first item as the "parent" molecule.
    """
    standardized = sdz.standardize_mol(mol)
    return sdz.get_parent_mol(standardized)[0]


# Standardise every structure and keep its parent molecule.
df["pmol"] = df["mol"].apply(_standardized_parent)

[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Rule applied: Badamidetautomer1
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running No

[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Rule applied: Badamidetautomer2
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Uncharger
[14:34:58] Running Normalizer
[14:34:58] Running Uncharger
[14:34:58] Running Un

-   sdz.standardize_mol(x): Estandariza la molécula.
-   sdz.get_parent_mol(...): Obtiene la estructura "padre" de la molécula estandarizada.

In [8]:
def _is_single_fragment(mol):
    """Return True when the SMILES of ``mol`` contains no "." separator."""
    return "." not in Chem.MolToSmiles(mol)


# Keep only compounds made of a single molecule: a "." in the SMILES string
# marks several disconnected fragments.
single_fragment_mask = df["pmol"].apply(_is_single_fragment)
df = df[single_fragment_mask]

-   Chem.MolToSmiles(x): Convierte la molécula en una cadena SMILES.
-   "." not in ...: Conserva solo las moléculas cuyo SMILES no contiene ningún punto (.); el punto separa fragmentos no enlazados, por lo que su ausencia indica que el compuesto es una única molécula.

In [9]:
# Remove duplicated compounds using the InChIKey of the standardised parent
# molecule.
# `assign` returns a new DataFrame instead of writing a column into a frame
# that came from boolean filtering, which avoids SettingWithCopyWarning.
df = df.assign(inchi_key=df["pmol"].apply(Chem.MolToInchiKey))

# drop_duplicates keeps the first row found for each InChIKey.
df = df.drop_duplicates(subset=["inchi_key"])

-   Chem.MolToInchiKey(x): Convierte la molécula en una cadena InChIKey.
-   df.drop_duplicates: Elimina filas duplicadas basándose en la columna inchi_key.

-   El InChI es una cadena que describe la estructura de una molécula. Se genera a partir de la estructura química utilizando software especializado.
-   El InChIKey es una cadena más corta y fija (27 caracteres) que permite una búsqueda más sencilla y rápida en bases de datos.

In [10]:
# Helper used to keep only compounds that have more than six carbon atoms.
def has_more_than_six_carbons(mol):
    """Return True when ``mol`` contains strictly more than six carbon atoms.

    ``mol`` must expose ``GetAtoms()``, yielding atoms that expose
    ``GetSymbol()``; an atom counts as carbon when its symbol is exactly "C".
    """
    symbols = [atom.GetSymbol() for atom in mol.GetAtoms()]
    return symbols.count("C") > 6

# Keep only the rows whose parent molecule passes the carbon-count check.
enough_carbons_mask = df["pmol"].apply(has_more_than_six_carbons)
df = df[enough_carbons_mask]

-   has_more_than_six_carbons(mol): Función que cuenta los átomos de carbono de la molécula y devuelve True solo si hay más de 6.
-   df[df.pmol.apply(...)]: Filtra el DataFrame para conservar solo las moléculas con más de 6 carbonos.

In [11]:
# Preview the filtered table, which now includes the mol, pmol and inchi_key columns.
df.head()

Unnamed: 0,hmdb_id,status,name,ccl,source,bioab,sol,logs,pkasa,pkasb,...,qed,nring,naring,fsp3,nat,icl,comp_set,mol,pmol,inchi_key
2,DB06119,,Cenobamate,Other,,1,0.0254,-3.87,,8.673062,...,0.896686,2,2,0.2,28,Basic,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x7d51ee8758c0>,<rdkit.Chem.rdchem.Mol object at 0x7d51ee6817e0>,GFHAXPJGXSQLPT-VIFPVBQESA-N
3,HMDB0251697|DB12243,detected,Edaravone,Organoheterocyclic compounds,Unknown,1,0.939,-2.27,13.44572,-1.477147,...,0.638544,2,1,0.2,23,Neutral,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x7d51ee875930>,<rdkit.Chem.rdchem.Mol object at 0x7d51ee6810c0>,QELUYTUMUWHWMC-UHFFFAOYSA-N
4,DB00359,,Sulfadiazine,Benzenoids,,1,147.0,-0.03,4.54344,,...,0.78714,2,2,0.0,27,Acid,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x7d51ee8759a0>,<rdkit.Chem.rdchem.Mol object at 0x7d51ee6814e0>,SEEPANYCNGTZFQ-UHFFFAOYSA-N
5,DB00432,,Trifluridine,"Nucleosides, nucleotides, and analogues",,1,0.000149,-6.37,4.070792,,...,0.662292,2,1,0.6,31,Acid,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x7d51ee875a10>,<rdkit.Chem.rdchem.Mol object at 0x7d51ee681900>,VSQQQLOSPVPRAZ-RRKCRQDMSA-N
6,DB01015,,Sulfamethoxazole,Benzenoids,,0,0.627,-2.93,8.009664,10.028458,...,0.804737,2,2,0.1,28,Basic,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x7d51ee875a80>,<rdkit.Chem.rdchem.Mol object at 0x7d51ee681780>,JLKIGFTWXXRPMT-UHFFFAOYSA-N


# Comparación con Ligand-Expo PDB

## Inchi

In [12]:
# Download the Ligand Expo InChI dictionary (tab-separated, no header row).
inchi_file_url = "http://ligand-expo.rcsb.org/dictionaries/Components-inchi.ich"

# Read every field as plain text and turn off pandas' default NA markers:
# otherwise a component identifier such as "NA" is parsed as a missing value,
# and code-like identifiers are subject to numeric type inference.
inchi_data = pd.read_csv(
    inchi_file_url,
    sep="\t",
    header=None,
    dtype=str,
    keep_default_na=False,
)

# The file has no header row, so name the three columns explicitly.
inchi_data.columns = ['InChI', 'PDB_ID', 'Name']

# Preview the first rows.
inchi_data.head()

Unnamed: 0,InChI,PDB_ID,Name
0,"InChI=1S/C2H4O3/c1-5-2(3)4/h1H3,(H,3,4)",0,methyl hydrogen carbonate
1,InChI=1S/C35H42F2N2O6/c1-42-30-22-27(23-31(43-...,1,"1-[2,2-difluoro-2-(3,4,5-trimethoxy-phenyl)-ac..."
2,InChI=1S/C23H35N3O6/c1-5-15(4)20(22(29)24-18(2...,2,n-[(2r)-2-benzyl-4-(hydroxyamino)-4-oxobutanoy...
3,InChI=1S/C26H25N5O2/c1-17(2)15-30-24-22(25(32)...,3,5-methyl-7-(2-methylpropyl)-2-(naphthalen-1-yl...
4,InChI=1S/C8H9NO2/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,4,(2s)-amino(phenyl)ethanoic acid


In [13]:
# Collect the InChI strings of the curated compounds in a set, which gives
# fast membership checks.
inchi_set = set(df["inchi"])

# Keep the Ligand Expo rows whose InChI is one of the curated compounds.
in_drug_set = inchi_data["InChI"].isin(inchi_set)
matching_inchis = inchi_data[in_drug_set]

El DataFrame matching_inchis contiene solo las filas del archivo del PDB que tienen InChIs que coinciden con los de tus compuestos de interés.

In [14]:
# Display the Ligand Expo rows whose InChI matched one of the curated compounds.
matching_inchis

Unnamed: 0,InChI,PDB_ID,Name
35,"InChI=1S/C7H8O/c8-6-7-4-2-1-3-5-7/h1-5,8H,6H2",010,phenylmethanol
42,"InChI=1S/C27H37N3O7S/c1-18(2)15-30(38(33,34)21...",017,"(3r,3as,6ar)-hexahydrofuro[2,3-b]furan-3-yl(1s..."
107,"InChI=1S/C23H18ClF2N3O3S/c1-2-9-33(31,32)29-19...",032,"n-(3-{[5-(4-chlorophenyl)-1h-pyrrolo[2,3-b]pyr..."
206,InChI=1S/C19H18F3N3O2/c1-25-7-5-11(6-8-25)18(2...,05X,"2,4,6-tris(fluoranyl)-n-[6-(1-methylpiperidin-..."
263,InChI=1S/C26H31Cl2N7O3/c1-5-34-10-12-35(13-11-...,07J,"3-(2,6-dichloro-3,5-dimethoxyphenyl)-1-(6-{[4-..."
...,...,...,...
43097,InChI=1S/C21H29FO5/c1-18-7-5-13(24)9-12(18)3-4...,ZK5,9alpha-fluorocortisol
43130,InChI=1S/C16H20FN3O4/c1-11(21)18-9-13-10-20(16...,ZLD,n-{[(5s)-3-(3-fluoro-4-morpholin-4-ylphenyl)-2...
43137,InChI=1S/C31H33N3O6S/c1-20-8-4-7-11-29(20)41(3...,ZLK,zafirlukast
43172,InChI=1S/C12H20N4O7/c1-4(18)15-8-5(16-12(13)14...,ZMR,zanamivir


In [15]:
# Report the number of InChI matches next to the size of the curated table.
# NOTE(review): matching_inchis.shape[0] counts rows of the Ligand Expo table,
# so it equals the number of distinct compounds only if no InChI appears under
# more than one PDB_ID -- confirm before reading it as a compound count.
print(f"Se han detectado {matching_inchis.shape[0]} compuestos que tienen entradas del Protein Data Bank. En el dataset original se distinguían {df.shape[0]}.")

Se han detectado 504 compuestos que tienen entradas del Protein Data Bank. En el dataset original se distinguían 1306.


## InchiKeys

La misma comparación también se puede realizar con el archivo del PDB que, en lugar de cadenas InChI, contiene InChIKeys.

In [16]:
# Download the Ligand Expo InChIKey dictionary (tab-separated, no header row).
inchi_key_url = "http://ligand-expo.rcsb.org/dictionaries/Components-inchikey.ich"

# Read every field as plain text and turn off pandas' default NA markers:
# otherwise a component identifier such as "NA" is parsed as a missing value,
# and code-like identifiers are subject to numeric type inference.
inchi_key = pd.read_csv(
    inchi_key_url,
    sep="\t",
    header=None,
    dtype=str,
    keep_default_na=False,
)

# The file has no header row, so name the three columns explicitly.
inchi_key.columns = ['inchi_key', 'PDB_ID', 'Name']

# Preview the first rows to check the data.
inchi_key.head()

Unnamed: 0,inchi_key,PDB_ID,Name
0,CXHHBNMLPJOKQD-UHFFFAOYSA-N,0,methyl hydrogen carbonate
1,NBYCDVVSYOMFMS-VMPREFPWSA-N,1,"1-[2,2-difluoro-2-(3,4,5-trimethoxy-phenyl)-ac..."
2,MWZOULASPWUGJJ-NFBUACBFSA-N,2,n-[(2r)-2-benzyl-4-(hydroxyamino)-4-oxobutanoy...
3,NNZDBCPMOOEFTE-UHFFFAOYSA-N,3,5-methyl-7-(2-methylpropyl)-2-(naphthalen-1-yl...
4,ZGUNAGUHMKGQNY-ZETCQYMHSA-N,4,(2s)-amino(phenyl)ethanoic acid


In [17]:
# Collect the InChIKeys computed for the standardised parent molecules.
inchi_key_set = set(df["inchi_key"])

# Keep the Ligand Expo rows whose InChIKey is one of the curated compounds.
key_in_drug_set = inchi_key["inchi_key"].isin(inchi_key_set)
matching_inchis_keys = inchi_key[key_in_drug_set]

# Display the filtered rows.
matching_inchis_keys


Unnamed: 0,inchi_key,PDB_ID,Name
35,WVDDGKGOMKODPV-UHFFFAOYSA-N,010,phenylmethanol
42,CJBJHOAVZSMMDJ-HEXNFIEUSA-N,017,"(3r,3as,6ar)-hexahydrofuro[2,3-b]furan-3-yl(1s..."
107,GPXBXXGIAQBQNI-UHFFFAOYSA-N,032,"n-(3-{[5-(4-chlorophenyl)-1h-pyrrolo[2,3-b]pyr..."
206,XEDHVZKDSYZQBF-UHFFFAOYSA-N,05X,"2,4,6-tris(fluoranyl)-n-[6-(1-methylpiperidin-..."
263,QADPYRIHXKWUSV-UHFFFAOYSA-N,07J,"3-(2,6-dichloro-3,5-dimethoxyphenyl)-1-(6-{[4-..."
...,...,...,...
43097,AAXVEMMRQDVLJB-BULBTXNYSA-N,ZK5,9alpha-fluorocortisol
43130,TYZROVQLWOKYKF-ZDUSSCGKSA-N,ZLD,n-{[(5s)-3-(3-fluoro-4-morpholin-4-ylphenyl)-2...
43137,YEEZWCHGZNKEEK-UHFFFAOYSA-N,ZLK,zafirlukast
43172,ARAIBEBZBOPLMB-UFGQHTETSA-N,ZMR,zanamivir


In [18]:
# Report the number of InChIKey matches next to the size of the curated table.
# NOTE(review): matching_inchis_keys.shape[0] counts rows of the Ligand Expo
# table, so it equals the number of distinct compounds only if no InChIKey
# appears under more than one PDB_ID -- confirm before reading it as a
# compound count.
print(f"Se han detectado {matching_inchis_keys.shape[0]} compuestos que tienen entradas del Protein Data Bank. En el dataset original se distinguían {df.shape[0]}.")

Se han detectado 506 compuestos que tienen entradas del Protein Data Bank. En el dataset original se distinguían 1306.


In [19]:
# Combine the two match tables: keep only the Ligand Expo entries found by
# both the InChI and the InChIKey comparison, pairing rows that share the same
# PDB_ID and Name (inner join, which is the pandas default).
result = matching_inchis.merge(
    matching_inchis_keys,
    how="inner",
    on=["PDB_ID", "Name"],
)

result

Unnamed: 0,InChI,PDB_ID,Name,inchi_key
0,"InChI=1S/C7H8O/c8-6-7-4-2-1-3-5-7/h1-5,8H,6H2",010,phenylmethanol,WVDDGKGOMKODPV-UHFFFAOYSA-N
1,"InChI=1S/C27H37N3O7S/c1-18(2)15-30(38(33,34)21...",017,"(3r,3as,6ar)-hexahydrofuro[2,3-b]furan-3-yl(1s...",CJBJHOAVZSMMDJ-HEXNFIEUSA-N
2,"InChI=1S/C23H18ClF2N3O3S/c1-2-9-33(31,32)29-19...",032,"n-(3-{[5-(4-chlorophenyl)-1h-pyrrolo[2,3-b]pyr...",GPXBXXGIAQBQNI-UHFFFAOYSA-N
3,InChI=1S/C19H18F3N3O2/c1-25-7-5-11(6-8-25)18(2...,05X,"2,4,6-tris(fluoranyl)-n-[6-(1-methylpiperidin-...",XEDHVZKDSYZQBF-UHFFFAOYSA-N
4,InChI=1S/C26H31Cl2N7O3/c1-5-34-10-12-35(13-11-...,07J,"3-(2,6-dichloro-3,5-dimethoxyphenyl)-1-(6-{[4-...",QADPYRIHXKWUSV-UHFFFAOYSA-N
...,...,...,...,...
498,InChI=1S/C21H29FO5/c1-18-7-5-13(24)9-12(18)3-4...,ZK5,9alpha-fluorocortisol,AAXVEMMRQDVLJB-BULBTXNYSA-N
499,InChI=1S/C16H20FN3O4/c1-11(21)18-9-13-10-20(16...,ZLD,n-{[(5s)-3-(3-fluoro-4-morpholin-4-ylphenyl)-2...,TYZROVQLWOKYKF-ZDUSSCGKSA-N
500,InChI=1S/C31H33N3O6S/c1-20-8-4-7-11-29(20)41(3...,ZLK,zafirlukast,YEEZWCHGZNKEEK-UHFFFAOYSA-N
501,InChI=1S/C12H20N4O7/c1-4(18)15-8-5(16-12(13)14...,ZMR,zanamivir,ARAIBEBZBOPLMB-UFGQHTETSA-N
