In [3]:
import pandas as pd
from rdkit import Chem
from chembl_structure_pipeline import standardizer as sdz


In [9]:
# Cargar el archivo CSV en un DataFrame
file_path = "/workspaces/Interacciones-Proteina-Farmaco/drugs.csv"
df = pd.read_csv(file_path, delimiter=";")

# Mostrar las primeras filas del DataFrame
df.head()

Unnamed: 0,hmdb_id,status,name,ccl,source,bioab,sol,logs,pkasa,pkasb,...,hbd,hba,mw,qed,nring,naring,fsp3,nat,icl,comp_set
0,DB14505,,Sodium borate,Other,,0,586.0,0.23,11.254374,-2.981084,...,2,7,158.017274,0.362805,2,0,0.0,13,Neutral,DrugBank
1,DB11326,,Boric acid,Other,,1,47.4,-0.6,,0.485975,...,3,3,62.017524,0.282794,0,0,0.0,7,Neutral,DrugBank
2,DB06119,,Cenobamate,Other,,1,0.0254,-3.87,,8.673062,...,1,6,267.052302,0.896686,2,2,0.2,28,Basic,DrugBank
3,HMDB0251697|DB12243,detected,Edaravone,Organoheterocyclic compounds,Unknown,1,0.939,-2.27,13.44572,-1.477147,...,0,2,174.079313,0.638544,2,1,0.2,23,Neutral,DrugBank
4,DB00359,,Sulfadiazine,Benzenoids,,1,147.0,-0.03,4.54344,,...,2,5,250.052447,0.78714,2,2,0.0,27,Acid,DrugBank


In [10]:
# Generar las moléculas a partir de la cadena InChI
# Chem.MolFromInchi -> Convierte la cadena Inchi en un objeto mol de RDKit
df["mol"] = df.inchi.apply(Chem.MolFromInchi)

In [11]:
# Estandarizar las estructuras y obtener la molécula "padre"
df["pmol"] = df.mol.apply(lambda x: sdz.get_parent_mol(sdz.standardize_mol(x))[0])

[14:59:34] Running Normalizer
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Normalizer
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Normalizer
[14:59:34] Rule applied: Badamidetautomer1
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Normalizer
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Normalizer
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Normalizer
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Normalizer
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Normalizer
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running Uncharger
[14:59:34] Running No

-   sdz.standardize_mol(x): Estandariza la molécula.
-   sdz.get_parent_mol(...): Obtiene la estructura "padre" de la molécula estandarizada.

In [12]:
# Asegurarse de que no quedan compuestos con varias moléculas comprobando la ausencia de puntos "." en su SMILES
df = df[df.pmol.apply(lambda x: "." not in Chem.MolToSmiles(x))]

-   Chem.MolToSmiles(x): Convierte la molécula en una cadena SMILES.
-   "." not in ...: Filtra moléculas que no contienen el punto (.), indicando que son moléculas únicas.

In [13]:
# Eliminar compuestos duplicados usando la cadena InChIKey del compuesto estandarizado y parentizado
df["inchi_key"] = df.pmol.apply(lambda x: Chem.MolToInchiKey(x))
df = df.drop_duplicates(subset=["inchi_key"])

-   Chem.MolToInchiKey(x): Convierte la molécula en una cadena InChIKey.
-   df.drop_duplicates: Elimina filas duplicadas basándose en la columna inchi_key.

In [14]:
# Filtrar los compuestos que tengan más de 6 carbonos
def has_more_than_six_carbons(mol):
    num_carbons = sum(1 for atom in mol.GetAtoms() if atom.GetSymbol() == 'C')
    return num_carbons > 6

df = df[df.pmol.apply(has_more_than_six_carbons)]

-   has_more_than_six_carbons(mol): Función que cuenta los átomos de carbono en la molécula.
-   df[df.pmol.apply(...)]: Filtra el DataFrame para conservar solo las moléculas con más de 6 carbonos.

In [15]:
df.head()

Unnamed: 0,hmdb_id,status,name,ccl,source,bioab,sol,logs,pkasa,pkasb,...,qed,nring,naring,fsp3,nat,icl,comp_set,mol,pmol,inchi_key
2,DB06119,,Cenobamate,Other,,1,0.0254,-3.87,,8.673062,...,0.896686,2,2,0.2,28,Basic,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x749b17692e30>,<rdkit.Chem.rdchem.Mol object at 0x749b1740e140>,GFHAXPJGXSQLPT-VIFPVBQESA-N
3,HMDB0251697|DB12243,detected,Edaravone,Organoheterocyclic compounds,Unknown,1,0.939,-2.27,13.44572,-1.477147,...,0.638544,2,1,0.2,23,Neutral,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x749b17693370>,<rdkit.Chem.rdchem.Mol object at 0x749b1740e080>,QELUYTUMUWHWMC-UHFFFAOYSA-N
4,DB00359,,Sulfadiazine,Benzenoids,,1,147.0,-0.03,4.54344,,...,0.78714,2,2,0.0,27,Acid,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x749b17693530>,<rdkit.Chem.rdchem.Mol object at 0x749b1740e380>,SEEPANYCNGTZFQ-UHFFFAOYSA-N
5,DB00432,,Trifluridine,"Nucleosides, nucleotides, and analogues",,1,0.000149,-6.37,4.070792,,...,0.662292,2,1,0.6,31,Acid,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x749b17693450>,<rdkit.Chem.rdchem.Mol object at 0x749b1740e200>,VSQQQLOSPVPRAZ-RRKCRQDMSA-N
6,DB01015,,Sulfamethoxazole,Benzenoids,,0,0.627,-2.93,8.009664,10.028458,...,0.804737,2,2,0.1,28,Basic,DrugBank,<rdkit.Chem.rdchem.Mol object at 0x749b176935a0>,<rdkit.Chem.rdchem.Mol object at 0x749b1740e0e0>,JLKIGFTWXXRPMT-UHFFFAOYSA-N


In [16]:
import requests
import zipfile
import io

In [23]:
# Leer el archivo InChI
inchi_file_url = "http://ligand-expo.rcsb.org/dictionaries/Components-inchi.ich"
inchi_data = pd.read_csv(inchi_file_url, sep='\t')

# Mostrar primeras filas
inchi_data.head()

Unnamed: 0,"InChI=1S/C2H4O3/c1-5-2(3)4/h1H3,(H,3,4)",000,methyl hydrogen carbonate
0,InChI=1S/C35H42F2N2O6/c1-42-30-22-27(23-31(43-...,1,"1-[2,2-difluoro-2-(3,4,5-trimethoxy-phenyl)-ac..."
1,InChI=1S/C23H35N3O6/c1-5-15(4)20(22(29)24-18(2...,2,n-[(2r)-2-benzyl-4-(hydroxyamino)-4-oxobutanoy...
2,InChI=1S/C26H25N5O2/c1-17(2)15-30-24-22(25(32)...,3,5-methyl-7-(2-methylpropyl)-2-(naphthalen-1-yl...
3,InChI=1S/C8H9NO2/c9-7(8(10)11)6-4-2-1-3-5-6/h1...,4,(2s)-amino(phenyl)ethanoic acid
4,InChI=1S/C10H13NO3/c11-8(9(12)10(13)14)6-7-4-2...,5,"(2s,3s)-3-amino-2-hydroxy-4-phenylbutanoic acid"


In [22]:
# Leer el archivo InChIKey
inchikey_file_url = "http://ligand-expo.rcsb.org/dictionaries/Components-inchikey.ich"
inchikey_data = pd.read_csv(inchikey_file_url, sep='\t')

# Mostrar primeras filas
inchikey_data.head()

Unnamed: 0,CXHHBNMLPJOKQD-UHFFFAOYSA-N,000,methyl hydrogen carbonate
0,NBYCDVVSYOMFMS-VMPREFPWSA-N,1,"1-[2,2-difluoro-2-(3,4,5-trimethoxy-phenyl)-ac..."
1,MWZOULASPWUGJJ-NFBUACBFSA-N,2,n-[(2r)-2-benzyl-4-(hydroxyamino)-4-oxobutanoy...
2,NNZDBCPMOOEFTE-UHFFFAOYSA-N,3,5-methyl-7-(2-methylpropyl)-2-(naphthalen-1-yl...
3,ZGUNAGUHMKGQNY-ZETCQYMHSA-N,4,(2s)-amino(phenyl)ethanoic acid
4,LDSJMFGYNFIFRK-IUCAKERBSA-N,5,"(2s,3s)-3-amino-2-hydroxy-4-phenylbutanoic acid"
