In [None]:
# Mount Google Drive at /content/drive; the paths used by the later cells
# all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title Instalar librerías

# Install the third-party packages that the runtime did not already provide.
# Versions are not pinned; the logged run below resolved rdkit 2024.9.6,
# biopython 1.85 and pandarallel 1.6.5.
!pip install rdkit
!pip install Biopython
!pip install pandarallel
!pip install py3Dmol

Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl (34.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.3/34.3 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.9.6
Collecting Biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Biopython
Successfully installed Biopython-1.85
Collecting pandarallel
  Downloading pandarallel-1.6.5.tar.gz (14 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting dill>=0.3.1 (from pandarallel)
  Downloading dill-0.3.9-py3-none-a

In [None]:
#@title Abrir el dataframe

import os
import pandas as pd

# Folder on the mounted Drive that holds the input CSV.
input_folder = "/content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition"
input_file = os.path.join(input_folder, "df_final_familias.csv")
# Load the comma-separated table into a DataFrame.
df_harm = pd.read_csv(input_file, sep = ',')
# Report the number of rows and the column index.
print(df_harm.shape[0],df_harm.columns)

40181 Index(['PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id', 'Ligand_id',
       'Ligand_InChi', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas', 'Count_general',
       'Count_diff', 'Ligand_smiles'],
      dtype='object')


In [None]:
#@title Expander el Dataframe por la columna "Coordenadas"

import pandas as pd
import ast

# Parse the string-encoded 'Coordenadas' values into Python objects.
# Only str values are parsed: after the first execution the column already
# holds parsed objects, and ast.literal_eval raises on non-string input, so
# the isinstance guard makes this cell safe to re-run.
df_harm['Coordenadas'] = df_harm['Coordenadas'].apply(
    lambda valor: ast.literal_eval(valor) if isinstance(valor, str) else valor
)

# One output row per element of each 'Coordenadas' value; the index is rebuilt
# so it is unique and contiguous after the explode.
df_expandido = df_harm.explode('Coordenadas').reset_index(drop=True)
df_expandido.shape[0]

96717

El script está en este enlace: https://gist.github.com/anku255/03dc35c5233a3fc59d60fdf62c3cda24

In [None]:
#@title Función Ultrafast Shape Recognition

import math, os
from tqdm import tqdm

# --------- LECTOR PDB ---------
class Atom:
    """Cartesian position of a single atom."""

    def __init__(self, x, y, z):
        # Every component is coerced to float, so numeric strings are accepted.
        self.x, self.y, self.z = float(x), float(y), float(z)

    def getXCoordinate(self):
        """Return the x component."""
        return self.x

    def getYCoordinate(self):
        """Return the y component."""
        return self.y

    def getZCoordinate(self):
        """Return the z component."""
        return self.z

class Molecule:
    """Ordered, growable collection of atoms."""

    def __init__(self):
        # Atoms are kept in insertion order; indices used elsewhere refer to
        # positions in this list.
        self.atoms = list()

    def addAtom(self, atom):
        """Append ``atom`` to the end of the molecule."""
        self.atoms.append(atom)

    def getAtoms(self):
        """Return the internal atom list (not a copy)."""
        return self.atoms

    def getAtom(self, i):
        """Return the atom stored at position ``i``."""
        return self.atoms[i]

def readPDBLigand(filepath):
    """Read the atom coordinates of a PDB file into a ``Molecule``.

    Only records starting with ``HETATM`` or ``ATOM`` are considered. The x, y
    and z values are taken from the fixed character ranges 30-38, 38-46 and
    46-54 of each record.

    Args:
        filepath: Path of the PDB file to read.

    Returns:
        A ``Molecule`` with one ``Atom`` per parseable record, in file order.
        The molecule is empty when no record could be parsed.

    Raises:
        OSError: If the file cannot be opened.
    """
    molecule = Molecule()
    with open(filepath, 'r') as f:
        for line in f:
            if not line.startswith(("HETATM", "ATOM")):
                continue
            try:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
            except ValueError:
                # Truncated or non-numeric coordinate fields: skip the record.
                # Only ValueError is caught so that unrelated failures (and
                # KeyboardInterrupt) are no longer silently swallowed.
                continue
            molecule.addAtom(Atom(x, y, z))
    return molecule

# --------- CÓDIGO USR ---------
class Point():
    """Mutable 3-D point; defaults to the origin."""

    def __init__(self, x=0, y=0, z=0):
        # Values are stored exactly as given (no float conversion).
        self.xCoordinate, self.yCoordinate, self.zCoordinate = x, y, z

class AtomIndexAndDistance():
    """Pairs an atom's position in the molecule with a distance value."""

    def __init__(self, index, distance):
        self.index, self.distance = index, distance

class USR():
    """Ultrafast Shape Recognition (USR) moment calculator.

    Computes moments of atom-distance distributions measured from four
    reference locations: the centroid, the atom closest to the centroid, the
    atom furthest from the centroid, and the atom furthest from that atom.

    Molecules must expose ``getAtoms()`` and ``getAtom(i)``; atoms must expose
    ``getXCoordinate()``, ``getYCoordinate()`` and ``getZCoordinate()``.
    """

    def _distances_from(self, aMolecule, x, y, z):
        """Return the Euclidean distance of every atom to the point (x, y, z)."""
        distances = []
        for atom in aMolecule.getAtoms():
            dx = atom.getXCoordinate() - x
            dy = atom.getYCoordinate() - y
            dz = atom.getZCoordinate() - z
            distances.append(math.sqrt(dx*dx + dy*dy + dz*dz))
        return distances

    def CalculateCentroid(self, aMolecule):
        """Return the mean atom position as a ``Point``.

        An empty molecule yields a default ``Point()`` (the origin).
        """
        centroid = Point()
        atoms = aMolecule.getAtoms()
        if not atoms:
            return centroid
        sx, sy, sz = 0.0, 0.0, 0.0
        for atom in atoms:
            sx += atom.getXCoordinate()
            sy += atom.getYCoordinate()
            sz += atom.getZCoordinate()
        n = len(atoms)
        centroid.xCoordinate = sx / n
        centroid.yCoordinate = sy / n
        centroid.zCoordinate = sz / n
        return centroid

    def EuclideanDistanceMeasure(self, aMolecule, point, i):
        """Return the distance between atom ``i`` of the molecule and ``point``."""
        atom = aMolecule.getAtom(i)
        dx = atom.getXCoordinate() - point.xCoordinate
        dy = atom.getYCoordinate() - point.yCoordinate
        dz = atom.getZCoordinate() - point.zCoordinate
        return math.sqrt(dx*dx + dy*dy + dz*dz)

    def ClosestAndFurthestAtomToCentroid(self, aMolecule):
        """Return ``[closest, furthest]`` atoms relative to the centroid.

        Both entries are ``AtomIndexAndDistance`` objects.

        Raises:
            ValueError: If the molecule has no atoms (previously this surfaced
                as an opaque ``IndexError: list index out of range``).
        """
        if not aMolecule.getAtoms():
            raise ValueError("Cannot select USR reference atoms: molecule has no atoms")
        centroid = self.CalculateCentroid(aMolecule)
        per_atom = self._distances_from(
            aMolecule, centroid.xCoordinate, centroid.yCoordinate, centroid.zCoordinate
        )
        ranked = [AtomIndexAndDistance(i, d) for i, d in enumerate(per_atom)]
        # Stable ascending sort: ties keep atom order, so the first element is
        # the lowest-index minimum and the last is the highest-index maximum.
        ranked.sort(key=lambda entry: entry.distance)
        return [ranked[0], ranked[-1]]

    def FurthestAtomFromFurthestAtom(self, aMolecule, furthestAtom):
        """Return the index of the atom furthest from ``furthestAtom``.

        ``furthestAtom`` only needs an ``index`` attribute identifying the
        reference atom inside ``aMolecule``.
        """
        target = aMolecule.getAtom(furthestAtom.index)
        per_atom = self._distances_from(
            aMolecule,
            target.getXCoordinate(),
            target.getYCoordinate(),
            target.getZCoordinate(),
        )
        # Stable ascending sort of the indices; the last one is the furthest
        # atom (highest index among ties).
        order = sorted(range(len(per_atom)), key=per_atom.__getitem__)
        return order[-1]

    def MomentToCentroid(self, momentId, aMolecule, centroid):
        """Return moment ``momentId`` of the atom distances to ``centroid``."""
        distances = self._distances_from(
            aMolecule, centroid.xCoordinate, centroid.yCoordinate, centroid.zCoordinate
        )
        return self._moment(momentId, distances)

    def MomentToX(self, momentId, aMolecule, refIndex):
        """Return moment ``momentId`` of the atom distances to atom ``refIndex``."""
        ref = aMolecule.getAtom(refIndex)
        distances = self._distances_from(
            aMolecule, ref.getXCoordinate(), ref.getYCoordinate(), ref.getZCoordinate()
        )
        return self._moment(momentId, distances)

    def _moment(self, momentId, distances):
        """Return one moment of a list of distances.

        ``momentId`` selects the value:
            1 -> mean,
            2 -> population variance (divides by n),
            3 -> mean of ``abs(d - mean)**3`` divided by ``variance**1.5``,
                 or 0.0 when the variance is zero.

        NOTE(review): moment 3 uses absolute deviations, so it is never
        negative and differs from a signed skewness. Confirm this is intended.

        Raises:
            ValueError: If ``distances`` is empty or ``momentId`` is not 1, 2
                or 3.
        """
        n = len(distances)
        if n == 0:
            raise ValueError("Cannot compute a moment of an empty distance list")
        mean = sum(distances) / n
        if momentId == 1:
            return mean
        variance = sum((d - mean)**2 for d in distances) / n
        if momentId == 2:
            return variance
        if momentId == 3:
            skewness = sum(abs(d - mean)**3 for d in distances) / n
            return skewness / (variance ** 1.5) if variance > 0 else 0.0
        raise ValueError(f"Unsupported momentId {momentId!r}; expected 1, 2 or 3")

def getUSRDescriptor(aMolecule):
    """Build the 12-value USR descriptor of a molecule.

    The descriptor holds moments 1, 2 and 3 of the atom-distance distribution
    for four reference locations, in this order: the centroid, the atom closest
    to the centroid, the atom furthest from the centroid, and the atom furthest
    from that furthest atom.

    Args:
        aMolecule: Object exposing ``getAtoms()`` and ``getAtom(i)``.

    Returns:
        A list of 12 values.

    Raises:
        ValueError: If the molecule has no atoms. Without this guard the
            failure surfaced as ``IndexError: list index out of range``.
    """
    if not aMolecule.getAtoms():
        raise ValueError("Cannot compute USR descriptor: molecule has no atoms")
    usr = USR()
    centroid = usr.CalculateCentroid(aMolecule)
    closest, furthest = usr.ClosestAndFurthestAtomToCentroid(aMolecule)
    ffaIndex = usr.FurthestAtomFromFurthestAtom(aMolecule, furthest)
    moment_ids = (1, 2, 3)
    descriptor = [usr.MomentToCentroid(m, aMolecule, centroid) for m in moment_ids]
    for refIndex in (closest.index, furthest.index, ffaIndex):
        descriptor.extend(usr.MomentToX(m, aMolecule, refIndex) for m in moment_ids)
    return descriptor


In [None]:
#@title Función USR para los ligandos agrupados

import os
import glob
from collections import defaultdict
from tqdm import tqdm
import math

#############################
# LECTOR PDB Y DESCRIPTORES #
#############################

# --------- LECTOR PDB ---------
class Atom:
    """A single atom described only by its Cartesian coordinates."""

    def __init__(self, x, y, z):
        # float() accepts numbers and numeric strings alike.
        self.x, self.y, self.z = float(x), float(y), float(z)

    def getXCoordinate(self):
        """Return the x coordinate as a float."""
        return self.x

    def getYCoordinate(self):
        """Return the y coordinate as a float."""
        return self.y

    def getZCoordinate(self):
        """Return the z coordinate as a float."""
        return self.z

class Molecule:
    """Container that stores atoms in the order they were added."""

    def __init__(self):
        self.atoms = list()

    def addAtom(self, atom):
        """Add ``atom`` after the atoms already stored."""
        self.atoms.append(atom)

    def getAtoms(self):
        """Return the underlying list of atoms (the same object, not a copy)."""
        return self.atoms

    def getAtom(self, i):
        """Return the atom at list position ``i``."""
        return self.atoms[i]

def readPDBLigand(filepath):
    """Read the atom coordinates of a PDB file into a ``Molecule``.

    Only records starting with ``HETATM`` or ``ATOM`` are considered. The x, y
    and z values are taken from the fixed character ranges 30-38, 38-46 and
    46-54 of each record.

    Args:
        filepath: Path of the PDB file to read.

    Returns:
        A ``Molecule`` with one ``Atom`` per parseable record, in file order.
        The molecule is empty when no record could be parsed.

    Raises:
        OSError: If the file cannot be opened.
    """
    molecule = Molecule()
    with open(filepath, 'r') as f:
        for line in f:
            if not line.startswith(("HETATM", "ATOM")):
                continue
            try:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
            except ValueError:
                # Coordinate fields that cannot be converted: skip the record.
                # ValueError is the only error float() raises for a string
                # slice, so nothing else is swallowed here.
                continue
            molecule.addAtom(Atom(x, y, z))
    return molecule

# --------- CÓDIGO USR ---------
class Point:
    """Mutable point in 3-D space; all coordinates default to 0."""

    def __init__(self, x=0, y=0, z=0):
        # The arguments are stored unchanged (no type conversion).
        self.xCoordinate, self.yCoordinate, self.zCoordinate = x, y, z

class AtomIndexAndDistance:
    """Record tying an atom index to a distance measured for that atom."""

    def __init__(self, index, distance):
        self.index, self.distance = index, distance

class USR:
    """Ultrafast Shape Recognition (USR) moment calculator.

    Computes moments of atom-distance distributions measured from four
    reference locations: the centroid, the atom closest to the centroid, the
    atom furthest from the centroid, and the atom furthest from that atom.

    Molecules must expose ``getAtoms()`` and ``getAtom(i)``; atoms must expose
    ``getXCoordinate()``, ``getYCoordinate()`` and ``getZCoordinate()``.
    """

    def _distances_from(self, aMolecule, x, y, z):
        """Return the Euclidean distance of every atom to the point (x, y, z)."""
        distances = []
        for atom in aMolecule.getAtoms():
            dx = atom.getXCoordinate() - x
            dy = atom.getYCoordinate() - y
            dz = atom.getZCoordinate() - z
            distances.append(math.sqrt(dx*dx + dy*dy + dz*dz))
        return distances

    def CalculateCentroid(self, aMolecule):
        """Return the mean atom position as a ``Point``.

        An empty molecule yields a default ``Point()`` (the origin).
        """
        centroid = Point()
        atoms = aMolecule.getAtoms()
        if not atoms:
            return centroid
        sx, sy, sz = 0.0, 0.0, 0.0
        for atom in atoms:
            sx += atom.getXCoordinate()
            sy += atom.getYCoordinate()
            sz += atom.getZCoordinate()
        n = len(atoms)
        centroid.xCoordinate = sx / n
        centroid.yCoordinate = sy / n
        centroid.zCoordinate = sz / n
        return centroid

    def EuclideanDistanceMeasure(self, aMolecule, point, i):
        """Return the distance between atom ``i`` of the molecule and ``point``."""
        atom = aMolecule.getAtom(i)
        dx = atom.getXCoordinate() - point.xCoordinate
        dy = atom.getYCoordinate() - point.yCoordinate
        dz = atom.getZCoordinate() - point.zCoordinate
        return math.sqrt(dx*dx + dy*dy + dz*dz)

    def ClosestAndFurthestAtomToCentroid(self, aMolecule):
        """Return ``[closest, furthest]`` atoms relative to the centroid.

        Both entries are ``AtomIndexAndDistance`` objects.

        Raises:
            ValueError: If the molecule has no atoms (previously this surfaced
                as an opaque ``IndexError: list index out of range``).
        """
        if not aMolecule.getAtoms():
            raise ValueError("Cannot select USR reference atoms: molecule has no atoms")
        centroid = self.CalculateCentroid(aMolecule)
        per_atom = self._distances_from(
            aMolecule, centroid.xCoordinate, centroid.yCoordinate, centroid.zCoordinate
        )
        ranked = [AtomIndexAndDistance(i, d) for i, d in enumerate(per_atom)]
        # Stable ascending sort: ties keep atom order, so the first element is
        # the lowest-index minimum and the last is the highest-index maximum.
        ranked.sort(key=lambda entry: entry.distance)
        return [ranked[0], ranked[-1]]

    def FurthestAtomFromFurthestAtom(self, aMolecule, furthestAtom):
        """Return the index of the atom furthest from ``furthestAtom``.

        ``furthestAtom`` only needs an ``index`` attribute identifying the
        reference atom inside ``aMolecule``.
        """
        target = aMolecule.getAtom(furthestAtom.index)
        per_atom = self._distances_from(
            aMolecule,
            target.getXCoordinate(),
            target.getYCoordinate(),
            target.getZCoordinate(),
        )
        # Stable ascending sort of the indices; the last one is the furthest
        # atom (highest index among ties).
        order = sorted(range(len(per_atom)), key=per_atom.__getitem__)
        return order[-1]

    def MomentToCentroid(self, momentId, aMolecule, centroid):
        """Return moment ``momentId`` of the atom distances to ``centroid``."""
        distances = self._distances_from(
            aMolecule, centroid.xCoordinate, centroid.yCoordinate, centroid.zCoordinate
        )
        return self._moment(momentId, distances)

    def MomentToX(self, momentId, aMolecule, refIndex):
        """Return moment ``momentId`` of the atom distances to atom ``refIndex``."""
        ref = aMolecule.getAtom(refIndex)
        distances = self._distances_from(
            aMolecule, ref.getXCoordinate(), ref.getYCoordinate(), ref.getZCoordinate()
        )
        return self._moment(momentId, distances)

    def _moment(self, momentId, distances):
        """Return one moment of a list of distances.

        ``momentId`` selects the value:
            1 -> mean,
            2 -> population variance (divides by n),
            3 -> mean of ``abs(d - mean)**3`` divided by ``variance**1.5``,
                 or 0.0 when the variance is zero.

        NOTE(review): moment 3 uses absolute deviations, so it is never
        negative and differs from a signed skewness. Confirm this is intended.

        Raises:
            ValueError: If ``distances`` is empty or ``momentId`` is not 1, 2
                or 3.
        """
        n = len(distances)
        if n == 0:
            raise ValueError("Cannot compute a moment of an empty distance list")
        mean = sum(distances) / n
        if momentId == 1:
            return mean
        variance = sum((d - mean)**2 for d in distances) / n
        if momentId == 2:
            return variance
        if momentId == 3:
            skewness = sum(abs(d - mean)**3 for d in distances) / n
            return skewness / (variance ** 1.5) if variance > 0 else 0.0
        raise ValueError(f"Unsupported momentId {momentId!r}; expected 1, 2 or 3")

def getUSRDescriptor(aMolecule):
    """Build the 12-value USR descriptor of a molecule.

    The descriptor holds moments 1, 2 and 3 of the atom-distance distribution
    for four reference locations, in this order: the centroid, the atom closest
    to the centroid, the atom furthest from the centroid, and the atom furthest
    from that furthest atom.

    Args:
        aMolecule: Object exposing ``getAtoms()`` and ``getAtom(i)``.

    Returns:
        A list of 12 values.

    Raises:
        ValueError: If the molecule has no atoms. Without this guard the
            failure surfaced as ``IndexError: list index out of range``.
    """
    if not aMolecule.getAtoms():
        raise ValueError("Cannot compute USR descriptor: molecule has no atoms")
    usr = USR()
    centroid = usr.CalculateCentroid(aMolecule)
    closest, furthest = usr.ClosestAndFurthestAtomToCentroid(aMolecule)
    ffaIndex = usr.FurthestAtomFromFurthestAtom(aMolecule, furthest)
    moment_ids = (1, 2, 3)
    descriptor = [usr.MomentToCentroid(m, aMolecule, centroid) for m in moment_ids]
    for refIndex in (closest.index, furthest.index, ffaIndex):
        descriptor.extend(usr.MomentToX(m, aMolecule, refIndex) for m in moment_ids)
    return descriptor

#################################
# AGRUPAMIENTO Y PROCESAMIENTO  #
#################################

# Input folder containing the ligand PDB files
input_folder = "/content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb"

# Output folder for the per-ligand USR CSV files
output_folder = "/content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR_1"
os.makedirs(output_folder, exist_ok=True)

# 1. Group the PDB files by ligand name (second "_"-separated token of the
#    file name)
ligandos_por_nombre = defaultdict(list)
pdb_files = sorted(glob.glob(os.path.join(input_folder, "*.pdb")))

for filepath in pdb_files:
    base = os.path.basename(filepath)
    parts = base.split("_")
    # File names without at least two tokens carry no ligand name and are
    # left out of every group.
    if len(parts) >= 2:
        ligand_name = parts[1]
        ligandos_por_nombre[ligand_name].append(filepath)

# 2. Write one CSV per ligand with one descriptor row per PDB file
for ligand_name, archivos in ligandos_por_nombre.items():
    output_path = os.path.join(output_folder, f"USR_{ligand_name}.csv")
    with open(output_path, 'w') as out:
        # Header: the file-name column followed by USR_1 .. USR_12.
        out.write("conformation," + ",".join(f"USR_{i+1}" for i in range(12)) + "\n")
        for pdb_file in tqdm(archivos, desc=f"Procesando {ligand_name}"):
            try:
                mol = readPDBLigand(pdb_file)
                descriptor = getUSRDescriptor(mol)
                desc_line = ",".join(f"{d:.5f}" for d in descriptor)
                out.write(f"{os.path.basename(pdb_file)},{desc_line}\n")
            except Exception as e:
                # A failing file is reported and skipped; the rest of the
                # group is still processed.
                print(f"Error en {pdb_file}: {e}")


Procesando NBN: 100%|██████████| 16/16 [00:19<00:00,  1.20s/it]
Procesando GCP: 100%|██████████| 123/123 [01:34<00:00,  1.30it/s]
Procesando AMP: 100%|██████████| 1188/1188 [16:38<00:00,  1.19it/s]
Procesando SAS: 100%|██████████| 13/13 [00:10<00:00,  1.29it/s]
Procesando 3PG: 100%|██████████| 121/121 [01:30<00:00,  1.34it/s]
Procesando GTX: 100%|██████████| 80/80 [00:58<00:00,  1.36it/s]
Procesando BNZ: 100%|██████████| 17/17 [00:13<00:00,  1.25it/s]
Procesando HED: 100%|██████████| 171/171 [02:08<00:00,  1.33it/s]
Procesando IND: 100%|██████████| 46/46 [00:29<00:00,  1.54it/s]
Procesando PXY: 100%|██████████| 4/4 [00:02<00:00,  1.58it/s]
Procesando OXE: 100%|██████████| 11/11 [00:08<00:00,  1.32it/s]
Procesando PMP: 100%|██████████| 215/215 [02:32<00:00,  1.41it/s]
Procesando BEN: 100%|██████████| 518/518 [06:16<00:00,  1.37it/s]
Procesando FAR: 100%|██████████| 18/18 [00:12<00:00,  1.45it/s]
Procesando EST: 100%|██████████| 84/84 [01:59<00:00,  1.43s/it]
Procesando STR: 100%|███████

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/1QOV_BCL_L_1303_ligando.pdb: list index out of range


Procesando BCL: 100%|██████████| 1263/1263 [14:35<00:00,  1.44it/s]
Procesando BPH: 100%|██████████| 212/212 [02:30<00:00,  1.41it/s]
Procesando LDA: 100%|██████████| 1060/1060 [13:00<00:00,  1.36it/s]
Procesando PH2: 100%|██████████| 35/35 [00:23<00:00,  1.48it/s]
Procesando SAN: 100%|██████████| 7/7 [00:05<00:00,  1.34it/s]
Procesando NOV: 100%|██████████| 17/17 [00:12<00:00,  1.34it/s]
Procesando PLA: 100%|██████████| 6/6 [00:04<00:00,  1.39it/s]
Procesando AP5: 100%|██████████| 92/92 [01:06<00:00,  1.38it/s]
Procesando OAA: 100%|██████████| 123/123 [01:32<00:00,  1.33it/s]
Procesando ISA: 100%|██████████| 3/3 [00:02<00:00,  1.07it/s]
Procesando MIC: 100%|██████████| 2/2 [00:02<00:00,  1.32s/it]
Procesando GUA: 100%|██████████| 19/19 [00:13<00:00,  1.44it/s]
Procesando CB3:  37%|███▋      | 11/30 [00:07<00:12,  1.57it/s]

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/1QZF_CB3_D_616_ligando.pdb: list index out of range


Procesando CB3: 100%|██████████| 30/30 [00:18<00:00,  1.61it/s]
Procesando BE2: 100%|██████████| 39/39 [00:27<00:00,  1.40it/s]
Procesando INS: 100%|██████████| 64/64 [00:52<00:00,  1.21it/s]
Procesando PTE: 100%|██████████| 6/6 [00:03<00:00,  1.52it/s]
Procesando OCT: 100%|██████████| 341/341 [04:19<00:00,  1.31it/s]
Procesando STU: 100%|██████████| 100/100 [01:10<00:00,  1.41it/s]
Procesando RTL: 100%|██████████| 27/27 [00:19<00:00,  1.37it/s]
Procesando SFG: 100%|██████████| 206/206 [02:29<00:00,  1.37it/s]
Procesando TCH: 100%|██████████| 18/18 [00:12<00:00,  1.45it/s]
Procesando A3P: 100%|██████████| 167/167 [01:58<00:00,  1.41it/s]
Procesando PDC: 100%|██████████| 58/58 [00:42<00:00,  1.36it/s]
Procesando NPL: 100%|██████████| 2/2 [00:01<00:00,  1.40it/s]
Procesando PCT: 100%|██████████| 12/12 [00:09<00:00,  1.22it/s]
Procesando BOG: 100%|██████████| 692/692 [08:32<00:00,  1.35it/s]
Procesando AGS: 100%|██████████| 360/360 [04:23<00:00,  1.37it/s]
Procesando AP2: 100%|██████████|

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/1PQ2_PLM_A_502_ligando.pdb: list index out of range


Procesando PLM: 100%|██████████| 720/720 [08:57<00:00,  1.34it/s]
Procesando CYH: 100%|██████████| 10/10 [00:07<00:00,  1.35it/s]
Procesando DTT: 100%|██████████| 285/285 [03:25<00:00,  1.39it/s]
Procesando XYP: 100%|██████████| 230/230 [02:56<00:00,  1.30it/s]
Procesando XYS: 100%|██████████| 77/77 [00:56<00:00,  1.36it/s]
Procesando M1A: 100%|██████████| 2/2 [00:01<00:00,  1.34it/s]
Procesando GPS: 100%|██████████| 12/12 [00:08<00:00,  1.44it/s]
Procesando DHB: 100%|██████████| 63/63 [00:45<00:00,  1.38it/s]
Procesando 4IP: 100%|██████████| 38/38 [00:27<00:00,  1.38it/s]
Procesando PGH: 100%|██████████| 41/41 [00:28<00:00,  1.42it/s]
Procesando DCM: 100%|██████████| 51/51 [00:37<00:00,  1.38it/s]
Procesando COD: 100%|██████████| 35/35 [00:23<00:00,  1.47it/s]
Procesando DGN: 100%|██████████| 2/2 [00:01<00:00,  1.48it/s]
Procesando DG2: 100%|██████████| 13/13 [00:09<00:00,  1.38it/s]
Procesando ACO: 100%|██████████| 389/389 [04:42<00:00,  1.38it/s]
Procesando IMH: 100%|██████████| 35/

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/3DZI_N_B_301_ligando.pdb: list index out of range


Procesando N:  83%|████████▎ | 5/6 [00:03<00:00,  1.38it/s]

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/5Z2V_N_A_204_ligando.pdb: list index out of range


Procesando N: 100%|██████████| 6/6 [00:04<00:00,  1.30it/s]


Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/8U0Z_N_A_502_ligando.pdb: list index out of range


Procesando RQ3: 100%|██████████| 4/4 [00:02<00:00,  1.52it/s]
Procesando ISQ: 100%|██████████| 4/4 [00:02<00:00,  1.57it/s]
Procesando HDD: 100%|██████████| 50/50 [00:33<00:00,  1.48it/s]
Procesando 120: 100%|██████████| 3/3 [00:02<00:00,  1.32it/s]
Procesando BMZ: 100%|██████████| 6/6 [00:03<00:00,  1.55it/s]
Procesando 123: 100%|██████████| 2/2 [00:01<00:00,  1.35it/s]
Procesando GYP: 100%|██████████| 21/21 [00:14<00:00,  1.44it/s]
Procesando D5M: 100%|██████████| 29/29 [00:24<00:00,  1.19it/s]
Procesando 132: 100%|██████████| 4/4 [00:02<00:00,  1.47it/s]
Procesando 135: 100%|██████████| 3/3 [00:02<00:00,  1.25it/s]
Procesando FER: 100%|██████████| 39/39 [00:31<00:00,  1.25it/s]
Procesando GTB: 100%|██████████| 33/33 [00:24<00:00,  1.33it/s]
Procesando CAG: 100%|██████████| 3/3 [00:02<00:00,  1.45it/s]
Procesando 3GP: 100%|██████████| 27/27 [00:18<00:00,  1.47it/s]
Procesando DQH: 100%|██████████| 7/7 [00:04<00:00,  1.53it/s]
Procesando GMC: 100%|██████████| 3/3 [00:01<00:00,  1.58it

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/7QOQ_U_A_402_ligando.pdb: list index out of range


Procesando U:  83%|████████▎ | 25/30 [00:17<00:03,  1.41it/s]

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/7QOQ_U_B_402_ligando.pdb: list index out of range


Procesando U:  93%|█████████▎| 28/30 [00:19<00:01,  1.41it/s]

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/9BF2_U_A_601_ligando.pdb: list index out of range


Procesando U:  97%|█████████▋| 29/30 [00:20<00:00,  1.44it/s]

Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/9BF2_U_B_601_ligando.pdb: list index out of range


Procesando U: 100%|██████████| 30/30 [00:20<00:00,  1.44it/s]


Error en /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb/9BF2_U_C_601_ligando.pdb: list index out of range


Procesando BU2: 100%|██████████| 28/28 [00:21<00:00,  1.32it/s]
Procesando XMP: 100%|██████████| 50/50 [00:36<00:00,  1.39it/s]
Procesando FCN: 100%|██████████| 27/27 [00:20<00:00,  1.33it/s]
Procesando PTY: 100%|██████████| 80/80 [01:01<00:00,  1.30it/s]
Procesando CLR: 100%|██████████| 998/998 [13:26<00:00,  1.24it/s]
Procesando NPO: 100%|██████████| 101/101 [01:13<00:00,  1.37it/s]
Procesando U05: 100%|██████████| 3/3 [00:02<00:00,  1.43it/s]
Procesando OAD: 100%|██████████| 7/7 [00:05<00:00,  1.28it/s]
Procesando MTL: 100%|██████████| 28/28 [00:21<00:00,  1.29it/s]
Procesando DEX: 100%|██████████| 22/22 [00:16<00:00,  1.37it/s]
Procesando POA: 100%|██████████| 4/4 [00:02<00:00,  1.45it/s]
Procesando RIO: 100%|██████████| 25/25 [00:18<00:00,  1.36it/s]
Procesando 3PE: 100%|██████████| 53/53 [00:43<00:00,  1.23it/s]
Procesando M6P: 100%|██████████| 19/19 [00:14<00:00,  1.33it/s]
Procesando THG: 100%|██████████| 35/35 [00:26<00:00,  1.30it/s]
Procesando CLW: 100%|██████████| 9/9 [00:0

## Segundo intento

Este código omite los ligandos cuyo archivo CSV de salida ya existe, para no repetir cálculos ya realizados.

In [None]:
import os
import glob
from collections import defaultdict
from tqdm import tqdm
import math

#############################
# LECTOR PDB Y DESCRIPTORES #
#############################

# --------- LECTOR PDB ---------
class Atom:
    """Holds the x, y and z coordinates of one atom."""

    def __init__(self, x, y, z):
        # Inputs are converted with float(), so numeric strings work as well.
        self.x, self.y, self.z = float(x), float(y), float(z)

    def getXCoordinate(self):
        """Return the stored x value."""
        return self.x

    def getYCoordinate(self):
        """Return the stored y value."""
        return self.y

    def getZCoordinate(self):
        """Return the stored z value."""
        return self.z

class Molecule:
    """List-backed set of atoms, kept in insertion order."""

    def __init__(self):
        self.atoms = list()

    def addAtom(self, atom):
        """Store ``atom`` as the new last element."""
        self.atoms.append(atom)

    def getAtoms(self):
        """Return the list of stored atoms itself (no copy is made)."""
        return self.atoms

    def getAtom(self, i):
        """Return the atom found at index ``i``."""
        return self.atoms[i]

def readPDBLigand(filepath):
    """Read the atom coordinates of a PDB file into a ``Molecule``.

    Only records starting with ``HETATM`` or ``ATOM`` are considered. The x, y
    and z values are taken from the fixed character ranges 30-38, 38-46 and
    46-54 of each record.

    Args:
        filepath: Path of the PDB file to read.

    Returns:
        A ``Molecule`` with one ``Atom`` per parseable record, in file order.
        The molecule is empty when no record could be parsed.

    Raises:
        OSError: If the file cannot be opened.
    """
    molecule = Molecule()
    with open(filepath, 'r') as f:
        for line in f:
            if not line.startswith(("HETATM", "ATOM")):
                continue
            try:
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
            except ValueError:
                # Coordinate fields that cannot be converted: skip the record.
                # ValueError is the only error float() raises for a string
                # slice, so nothing else is swallowed here.
                continue
            molecule.addAtom(Atom(x, y, z))
    return molecule

# --------- CÓDIGO USR ---------
class Point:
    """Simple 3-D point whose coordinates default to zero."""

    def __init__(self, x=0, y=0, z=0):
        # Stored as passed in; callers may overwrite the attributes later.
        self.xCoordinate, self.yCoordinate, self.zCoordinate = x, y, z

class AtomIndexAndDistance:
    """Couples the index of an atom with a distance computed for it."""

    def __init__(self, index, distance):
        self.index, self.distance = index, distance

class USR:
    """Ultrafast Shape Recognition (USR) moment calculator.

    Computes moments of atom-distance distributions measured from four
    reference locations: the centroid, the atom closest to the centroid, the
    atom furthest from the centroid, and the atom furthest from that atom.

    Molecules must expose ``getAtoms()`` and ``getAtom(i)``; atoms must expose
    ``getXCoordinate()``, ``getYCoordinate()`` and ``getZCoordinate()``.
    """

    def _distances_from(self, aMolecule, x, y, z):
        """Return the Euclidean distance of every atom to the point (x, y, z)."""
        distances = []
        for atom in aMolecule.getAtoms():
            dx = atom.getXCoordinate() - x
            dy = atom.getYCoordinate() - y
            dz = atom.getZCoordinate() - z
            distances.append(math.sqrt(dx*dx + dy*dy + dz*dz))
        return distances

    def CalculateCentroid(self, aMolecule):
        """Return the mean atom position as a ``Point``.

        An empty molecule yields a default ``Point()`` (the origin).
        """
        centroid = Point()
        atoms = aMolecule.getAtoms()
        if not atoms:
            return centroid
        sx, sy, sz = 0.0, 0.0, 0.0
        for atom in atoms:
            sx += atom.getXCoordinate()
            sy += atom.getYCoordinate()
            sz += atom.getZCoordinate()
        n = len(atoms)
        centroid.xCoordinate = sx / n
        centroid.yCoordinate = sy / n
        centroid.zCoordinate = sz / n
        return centroid

    def EuclideanDistanceMeasure(self, aMolecule, point, i):
        """Return the distance between atom ``i`` of the molecule and ``point``."""
        atom = aMolecule.getAtom(i)
        dx = atom.getXCoordinate() - point.xCoordinate
        dy = atom.getYCoordinate() - point.yCoordinate
        dz = atom.getZCoordinate() - point.zCoordinate
        return math.sqrt(dx*dx + dy*dy + dz*dz)

    def ClosestAndFurthestAtomToCentroid(self, aMolecule):
        """Return ``[closest, furthest]`` atoms relative to the centroid.

        Both entries are ``AtomIndexAndDistance`` objects.

        Raises:
            ValueError: If the molecule has no atoms (previously this surfaced
                as an opaque ``IndexError: list index out of range``).
        """
        if not aMolecule.getAtoms():
            raise ValueError("Cannot select USR reference atoms: molecule has no atoms")
        centroid = self.CalculateCentroid(aMolecule)
        per_atom = self._distances_from(
            aMolecule, centroid.xCoordinate, centroid.yCoordinate, centroid.zCoordinate
        )
        ranked = [AtomIndexAndDistance(i, d) for i, d in enumerate(per_atom)]
        # Stable ascending sort: ties keep atom order, so the first element is
        # the lowest-index minimum and the last is the highest-index maximum.
        ranked.sort(key=lambda entry: entry.distance)
        return [ranked[0], ranked[-1]]

    def FurthestAtomFromFurthestAtom(self, aMolecule, furthestAtom):
        """Return the index of the atom furthest from ``furthestAtom``.

        ``furthestAtom`` only needs an ``index`` attribute identifying the
        reference atom inside ``aMolecule``.
        """
        target = aMolecule.getAtom(furthestAtom.index)
        per_atom = self._distances_from(
            aMolecule,
            target.getXCoordinate(),
            target.getYCoordinate(),
            target.getZCoordinate(),
        )
        # Stable ascending sort of the indices; the last one is the furthest
        # atom (highest index among ties).
        order = sorted(range(len(per_atom)), key=per_atom.__getitem__)
        return order[-1]

    def MomentToCentroid(self, momentId, aMolecule, centroid):
        """Return moment ``momentId`` of the atom distances to ``centroid``."""
        distances = self._distances_from(
            aMolecule, centroid.xCoordinate, centroid.yCoordinate, centroid.zCoordinate
        )
        return self._moment(momentId, distances)

    def MomentToX(self, momentId, aMolecule, refIndex):
        """Return moment ``momentId`` of the atom distances to atom ``refIndex``."""
        ref = aMolecule.getAtom(refIndex)
        distances = self._distances_from(
            aMolecule, ref.getXCoordinate(), ref.getYCoordinate(), ref.getZCoordinate()
        )
        return self._moment(momentId, distances)

    def _moment(self, momentId, distances):
        """Return one moment of a list of distances.

        ``momentId`` selects the value:
            1 -> mean,
            2 -> population variance (divides by n),
            3 -> mean of ``abs(d - mean)**3`` divided by ``variance**1.5``,
                 or 0.0 when the variance is zero.

        NOTE(review): moment 3 uses absolute deviations, so it is never
        negative and differs from a signed skewness. Confirm this is intended.

        Raises:
            ValueError: If ``distances`` is empty or ``momentId`` is not 1, 2
                or 3.
        """
        n = len(distances)
        if n == 0:
            raise ValueError("Cannot compute a moment of an empty distance list")
        mean = sum(distances) / n
        if momentId == 1:
            return mean
        variance = sum((d - mean)**2 for d in distances) / n
        if momentId == 2:
            return variance
        if momentId == 3:
            skewness = sum(abs(d - mean)**3 for d in distances) / n
            return skewness / (variance ** 1.5) if variance > 0 else 0.0
        raise ValueError(f"Unsupported momentId {momentId!r}; expected 1, 2 or 3")

def getUSRDescriptor(aMolecule):
    """Build the 12-value USR descriptor of a molecule.

    The descriptor holds moments 1, 2 and 3 of the atom-distance distribution
    for four reference locations, in this order: the centroid, the atom closest
    to the centroid, the atom furthest from the centroid, and the atom furthest
    from that furthest atom.

    Args:
        aMolecule: Object exposing ``getAtoms()`` and ``getAtom(i)``.

    Returns:
        A list of 12 values.

    Raises:
        ValueError: If the molecule has no atoms. Without this guard the
            failure surfaced as ``IndexError: list index out of range``.
    """
    if not aMolecule.getAtoms():
        raise ValueError("Cannot compute USR descriptor: molecule has no atoms")
    usr = USR()
    centroid = usr.CalculateCentroid(aMolecule)
    closest, furthest = usr.ClosestAndFurthestAtomToCentroid(aMolecule)
    ffaIndex = usr.FurthestAtomFromFurthestAtom(aMolecule, furthest)
    moment_ids = (1, 2, 3)
    descriptor = [usr.MomentToCentroid(m, aMolecule, centroid) for m in moment_ids]
    for refIndex in (closest.index, furthest.index, ffaIndex):
        descriptor.extend(usr.MomentToX(m, aMolecule, refIndex) for m in moment_ids)
    return descriptor

In [None]:
import os
import glob
from collections import defaultdict
from tqdm import tqdm

#################################
# AGRUPAMIENTO Y PROCESAMIENTO  #
#################################

# Input folder containing the ligand PDB files
input_folder = "/content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/ligandos_pdb"

# Output folder for the per-ligand USR CSV files
output_folder = "/content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR"
os.makedirs(output_folder, exist_ok=True)

# 1. Group the PDB files by ligand name, which is the token at index [1] of
#    the "_"-separated file name
ligandos_por_nombre = defaultdict(list)
pdb_files = sorted(glob.glob(os.path.join(input_folder, "*.pdb")))

for filepath in pdb_files:
    base = os.path.basename(filepath)
    parts = base.split("_")
    if len(parts) < 2:
        # The name cannot be split into the expected tokens: ignore the file
        continue
    ligand_name = parts[1]  # Example: "GNP"
    ligandos_por_nombre[ligand_name].append(filepath)

# 2. Process each group and write one CSV per ligand, skipping ligands whose
#    CSV already exists
for ligand_name, archivos in ligandos_por_nombre.items():
    output_path = os.path.join(output_folder, f"USR_{ligand_name}.csv")

    # A finished CSV for this ligand already exists: skip the group
    if os.path.exists(output_path):
        print(f"El archivo {output_path} ya existe. Saltando {ligand_name}.")
        continue

    # Write to a temporary file and rename it only once the whole group has
    # been processed. If the run is interrupted mid-ligand, no final CSV is
    # left behind, so the next run recomputes the group instead of skipping a
    # truncated file.
    tmp_path = output_path + ".tmp"
    with open(tmp_path, 'w') as out:
        out.write("conformation," + ",".join([f"USR_{i+1}" for i in range(12)]) + "\n")
        for pdb_file in tqdm(archivos, desc=f"Procesando {ligand_name}"):
            try:
                mol = readPDBLigand(pdb_file)
                descriptor = getUSRDescriptor(mol)
                desc_line = ",".join([f"{d:.5f}" for d in descriptor])
                out.write(f"{os.path.basename(pdb_file)},{desc_line}\n")
            except Exception as e:
                # A failing file is reported and skipped; the rest of the
                # group is still processed.
                print(f"Error en {pdb_file}: {e}")
    os.replace(tmp_path, output_path)


El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_NBN.csv ya existe. Saltando NBN.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_GCP.csv ya existe. Saltando GCP.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_AMP.csv ya existe. Saltando AMP.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_SAS.csv ya existe. Saltando SAS.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_3PG.csv ya existe. Saltando 3PG.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_GTX.csv ya existe. Saltando GTX.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_BNZ.csv ya existe. Saltando BNZ.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recognition/Output_USR/USR_HED.csv ya existe. Saltando HED.
El archivo /content/drive/MyDrive/TFM/T2/Ultrafast_Shape_Recogni