In [None]:
# Mount Google Drive at /content/drive; the later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Abrir los dataframes

import os
import pandas as pd

# Folder on the mounted Drive that holds the input CSV files for this run.
input_folder = "/content/drive/MyDrive/TFM/T3/Red_Neuronal/intento2"
input_file_df_final = os.path.join(input_folder, "df_FINAL_tot.csv")
# Read the comma-separated table into df_harm.
df_harm = pd.read_csv(input_file_df_final, sep = ',')
# Quick sanity check: number of rows followed by the column index.
print(df_harm.shape[0],df_harm.columns)

70378 Index(['PDB_entry_id', 'Classification', 'Organism', 'Uniprot_id', 'Ligand_id',
       'Ligand_InChi', 'Experimental_method', 'Resolution',
       'Adding_Classification', 'Affinity', 'Coordenadas', 'Ligand_smiles',
       'Mol_Weight', 'n_atoms', 'Nombre', 'Ligand_Class', 'Pfam_Names',
       'SUPFAM_Names', 'intepro_class'],
      dtype='object')


In [None]:
# Read decoys_full.csv from the same input folder into df_neg_dec.
input_neg_dec = os.path.join(input_folder, "decoys_full.csv")
df_neg_dec = pd.read_csv(input_neg_dec, sep = ',')
# Quick sanity check: number of rows followed by the column index.
print(df_neg_dec.shape[0],df_neg_dec.columns)

70378 Index(['instancia', 'Ligand_smiles', 'smiles_neg', 'tanimoto', 'usrr_sim',
       'USR_neg_1', 'USR_neg_2', 'USR_neg_3', 'USR_neg_4', 'USR_neg_5',
       'USR_neg_6', 'USR_neg_7', 'USR_neg_8', 'USR_neg_9', 'USR_neg_10',
       'USR_neg_11', 'USR_neg_12'],
      dtype='object')


In [None]:
# Positives: instance name ('Nombre', exposed as 'instancia') plus the ligand SMILES.
df_pos = df_harm[['Nombre','Ligand_smiles']].rename(columns={'Nombre':'instancia'})
# Negatives: same two-column layout, with the decoy SMILES exposed as 'Ligand_smiles'.
df_neg = df_neg_dec[['instancia','smiles_neg']].rename(columns={'smiles_neg':'Ligand_smiles'})

In [None]:
# Tag every instance name with its class so positives and decoys stay
# distinguishable after concatenation.
# The suffix is only appended when it is not already present, so re-running
# this cell does not produce names such as '..._positive_positive'.
_pos_names = df_pos['instancia'].astype(str)
df_pos['instancia'] = _pos_names.where(_pos_names.str.endswith('_positive'), _pos_names + '_positive')
_neg_names = df_neg['instancia'].astype(str)
df_neg['instancia'] = _neg_names.where(_neg_names.str.endswith('_negative'), _neg_names + '_negative')

In [None]:
# Stack positives and decoys into a single frame; ignore_index=True gives it a fresh 0..n-1 index.
df_total = pd.concat([df_pos, df_neg], ignore_index=True)

In [None]:
# Display the number of rows in the combined frame.
df_total.shape[0]

140756

In [None]:
!pip install torch
!pip install torch_geometric

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Downloading rdkit-2025.3.2-cp311-cp311-manylinux_2_28_x86_64.whl (35.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.2/35.2 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.3.2


In [None]:

import os
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs
import torch
import torch.nn.functional as F
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from sklearn.model_selection import train_test_split

# ----------------------------------------------------------
# 0) Funciones para grafo PyG
# ----------------------------------------------------------
def one_hot_encoding(x, choices):
    """Return a 0/1 list with one entry per element of ``choices``.

    The entry is 1 where the choice equals ``x``. A value that is not in
    ``choices`` is treated as the last choice, which therefore acts as the
    catch-all slot.
    """
    target = x if x in choices else choices[-1]
    encoded = []
    for option in choices:
        encoded.append(int(target == option))
    return encoded

def atom_features(atom):
    """Build the feature list for one RDKit atom.

    The list holds, in order: atomic number, degree, formal charge, total
    hydrogen count, aromatic flag, in-ring flag, a 6-slot one-hot of the
    hybridization and a 3-slot one-hot of the chiral tag (15 values in all).
    A hybridization or chiral tag outside the listed choices lands in the
    last slot of its one-hot.
    """
    hybridization_choices = [
        Chem.HybridizationType.SP,
        Chem.HybridizationType.SP2,
        Chem.HybridizationType.SP3,
        Chem.HybridizationType.SP3D,
        Chem.HybridizationType.SP3D2,
        Chem.HybridizationType.UNSPECIFIED,
    ]
    chirality_choices = [
        Chem.ChiralType.CHI_UNSPECIFIED,
        Chem.ChiralType.CHI_TETRAHEDRAL_CW,
        Chem.ChiralType.CHI_TETRAHEDRAL_CCW,
    ]

    scalar_part = [
        atom.GetAtomicNum(),
        atom.GetDegree(),
        atom.GetFormalCharge(),
        atom.GetTotalNumHs(),
        int(atom.GetIsAromatic()),
        int(atom.IsInRing()),
    ]
    hybridization_part = one_hot_encoding(atom.GetHybridization(), hybridization_choices)
    chirality_part = one_hot_encoding(atom.GetChiralTag(), chirality_choices)
    return scalar_part + hybridization_part + chirality_part

def mol_to_graph(smiles):
    """Convert a SMILES string into a PyG ``Data`` graph.

    Nodes are atoms (features from ``atom_features``). Every bond is stored
    as two directed edges carrying ``[bond order, conjugated, in ring]``.

    Returns:
        Data with ``x`` of shape (num_atoms, num_features), ``edge_index`` of
        shape (2, num_edges) and ``edge_attr`` of shape (num_edges, 3). The
        shapes hold for molecules without bonds too (num_edges == 0).

    Raises:
        ValueError: if RDKit cannot parse the SMILES, or the parsed molecule
            has no atoms (there would be no nodes to embed).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"SMILES inválido: {smiles}")
    if mol.GetNumAtoms() == 0:
        raise ValueError(f"SMILES sin átomos: {smiles}")
    x = torch.tensor([atom_features(a) for a in mol.GetAtoms()], dtype=torch.float)
    ei, ea = [], []
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        bt = b.GetBondTypeAsDouble()
        conj, ring = int(b.GetIsConjugated()), int(b.IsInRing())
        # Undirected bond -> one edge per direction, same attributes on both.
        ei += [[i, j], [j, i]]
        ea += [[bt, conj, ring], [bt, conj, ring]]
    # The explicit reshape keeps the 2-D layout when the lists are empty
    # (single-atom ligands, ions); torch.tensor([]) alone is 1-D with shape (0,).
    edge_index = torch.tensor(ei, dtype=torch.long).reshape(-1, 2).t().contiguous()
    edge_attr  = torch.tensor(ea, dtype=torch.float).reshape(-1, 3)
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

class SMILESDataset(InMemoryDataset):
    """In-memory PyG dataset built directly from a list of ``Data`` objects.

    No transform, pre-transform or pre-filter is configured; the graphs are
    collated once in the constructor and stored on ``data`` / ``slices``.
    """

    def __init__(self, data_list):
        super().__init__(root='.', transform=None, pre_transform=None, pre_filter=None)
        collated = self.collate(data_list)
        self.data, self.slices = collated

# ----------------------------------------------------------
# 1) Cálculo fingerprint de Morgan (128 bits)
# ----------------------------------------------------------
def smiles_to_morgan(smiles, radius=2, nBits=128):
    """Return the Morgan bit fingerprint of a SMILES as a float NumPy array.

    Uses RDKit's fingerprint-generator API rather than the deprecated
    ``AllChem.GetMorganFingerprintAsBitVect``, which emits a deprecation
    warning on every call in recent RDKit releases.
    NOTE(review): the generator is expected to give the same bits as the old
    call with default settings; confirm on a few molecules before comparing
    against weights trained with the old targets.

    Args:
        smiles: SMILES string of the molecule.
        radius: Morgan radius.
        nBits: length of the folded bit vector.

    Returns:
        1-D float array of length ``nBits`` holding 0.0 / 1.0.

    Raises:
        ValueError: if RDKit cannot parse the SMILES.
    """
    # Local import: the notebook's import cell does not bring this module in.
    from rdkit.Chem import rdFingerprintGenerator

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"SMILES inválido: {smiles}")
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)
    fp = generator.GetFingerprint(mol)
    arr = np.zeros((nBits,), dtype=float)
    DataStructs.ConvertToNumpyArray(fp, arr)
    return arr

# ----------------------------------------------------------
# 2) Definición del modelo de pre-entrenamiento
# ----------------------------------------------------------
class GNNPretrain(torch.nn.Module):
    """Two-layer GCN encoder with a linear head used for pre-training.

    ``forward`` applies both graph convolutions (each followed by ReLU),
    mean-pools node states per graph and maps the pooled vector through the
    linear decoder.
    """

    def __init__(self, in_channels, hidden_dim=128, out_dim=128):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)
        self.decoder = torch.nn.Linear(hidden_dim, out_dim)

    def forward(self, data):
        node_state = data.x
        edges = data.edge_index
        graph_ids = data.batch
        for conv in (self.conv1, self.conv2):
            node_state = F.relu(conv(node_state, edges))
        # One row per graph in the batch: (batch, hidden_dim).
        pooled = global_mean_pool(node_state, graph_ids)
        # Project to the prediction size: (batch, out_dim).
        return self.decoder(pooled)

# ----------------------------------------------------------
# 3) Load SMILES and build the list of graphs
# ----------------------------------------------------------
#df = pd.read_csv("/content/drive/MyDrive/TFM/T3/Archivos/df_FINAL_tot.csv")
# Work on the in-memory frame built by the earlier cells.
df = df_total
if 'Ligand_smiles' not in df.columns:
    raise ValueError("Falta columna 'Ligand_smiles' en el CSV")
# Each distinct molecule is used once: missing values and duplicates are dropped.
smiles_list = df['Ligand_smiles'].dropna().unique().tolist()
print(f"{len(smiles_list)} SMILES únicos encontrados")

# Pair every molecule graph with its Morgan fingerprint as the regression target.
data_list = []
for smi in smiles_list:
    try:
        g = mol_to_graph(smi)
        fp = smiles_to_morgan(smi)                     # np.array (128,)
        # unsqueeze(0) adds a leading dimension so each graph carries one target row.
        g.y = torch.tensor(fp, dtype=torch.float).unsqueeze(0)  # shape (1,128)
        data_list.append(g)
    except Exception as e:
        # A molecule that fails at any step is reported and left out of data_list.
        print(f"  - Omitido {smi}: {e}")

print(f"{len(data_list)} grafos + fingerprints listos")

# ----------------------------------------------------------
# 4) Train/validation split and DataLoaders
# ----------------------------------------------------------
# Split by position in data_list: 80% train, 20% validation, fixed seed for the split.
idx = list(range(len(data_list)))
i_train, i_val = train_test_split(idx, test_size=0.2, random_state=42)
train_ds = SMILESDataset([data_list[i] for i in i_train])
val_ds   = SMILESDataset([data_list[i] for i in i_val])
# Only the training loader shuffles; both use batches of 32 graphs.
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=32)

# ----------------------------------------------------------
# 5) Instantiate model and optimizer
# ----------------------------------------------------------
# Input width is read from the node-feature matrix of the first training graph.
in_ch = train_ds[0].x.shape[1]
model = GNNPretrain(in_channels=in_ch, hidden_dim=128, out_dim=128)
optim = torch.optim.Adam(model.parameters(), lr=1e-3)
# Lowest validation loss seen so far; starts at +inf so the first epoch always saves.
best_val = float('inf')
# NOTE(review): no torch seed is set in the visible cells, so weight
# initialisation and batch shuffling presumably differ between runs.

# ----------------------------------------------------------
# 6) Pre-training loop
# ----------------------------------------------------------
# 50 epochs of MSE regression from the graph prediction to the fingerprint
# target, keeping on disk only the weights with the lowest validation loss.
for epoch in range(1, 51):
    model.train()
    lt = 0  # running sum of per-batch training losses
    for batch in train_loader:
        optim.zero_grad()
        pred = model(batch)               # (B,128)
        loss = F.mse_loss(pred, batch.y)  # both (B,128)
        loss.backward()
        optim.step()
        lt += loss.item()
    # Mean over batches (each batch counts equally, whatever its size).
    lt /= len(train_loader)

    model.eval()
    lv = 0  # running sum of per-batch validation losses
    with torch.no_grad():
        for batch in val_loader:
            lv += F.mse_loss(model(batch), batch.y).item()
    # Mean over batches, as for the training loss.
    lv /= len(val_loader)

    print(f"Epoch {epoch:02d} — train_loss {lt:.4f} | val_loss {lv:.4f}")
    # Checkpoint on strict improvement; the file is overwritten each time.
    if lv < best_val:
        best_val = lv
        torch.save(model.state_dict(), "/content/drive/MyDrive/TFM/T3/Red_Neuronal/intento2/gnn_weights.pth")
        print(" → Guardados mejores pesos en gnn_weights.pth")

print("Pre-entrenamiento completado")



5701 SMILES únicos encontrados


[1;30;43mSe han truncado las últimas 5000 líneas del flujo de salida.[0m


5701 grafos + fingerprints listos




Epoch 01 — train_loss 0.1817 | val_loss 0.1710
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 02 — train_loss 0.1692 | val_loss 0.1672
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 03 — train_loss 0.1663 | val_loss 0.1649
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 04 — train_loss 0.1639 | val_loss 0.1629
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 05 — train_loss 0.1622 | val_loss 0.1609
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 06 — train_loss 0.1608 | val_loss 0.1601
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 07 — train_loss 0.1597 | val_loss 0.1593
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 08 — train_loss 0.1588 | val_loss 0.1582
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 09 — train_loss 0.1578 | val_loss 0.1575
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 10 — train_loss 0.1570 | val_loss 0.1568
 → 🔖 Guardados mejores pesos en gnn_weights.pth
Epoch 11 — train_loss 0.1565 | val_loss 0.1562
 → 

In [None]:
#@title Generar Embedding ligandos

import pandas as pd
import torch
import torch.nn.functional as F
from rdkit import Chem
from torch_geometric.data import Data, InMemoryDataset, DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool

# -------------------------------
# 1) Funciones y clases
# -------------------------------
def one_hot_encoding(x, choices):
    """Return a 0/1 list with one entry per element of ``choices``.

    The entry is 1 where the choice equals ``x``. A value that is not in
    ``choices`` is treated as the last choice, which therefore acts as the
    catch-all slot.
    """
    target = x if x in choices else choices[-1]
    encoded = []
    for option in choices:
        encoded.append(int(target == option))
    return encoded

def atom_features(atom: Chem.rdchem.Atom) -> list:
    """Build the feature list for one RDKit atom.

    The list holds, in order: atomic number, degree, formal charge, total
    hydrogen count, aromatic flag, in-ring flag, a 6-slot one-hot of the
    hybridization and a 3-slot one-hot of the chiral tag (15 values in all).
    A hybridization or chiral tag outside the listed choices lands in the
    last slot of its one-hot.
    """
    hybridization_choices = [
        Chem.HybridizationType.SP,
        Chem.HybridizationType.SP2,
        Chem.HybridizationType.SP3,
        Chem.HybridizationType.SP3D,
        Chem.HybridizationType.SP3D2,
        Chem.HybridizationType.UNSPECIFIED,
    ]
    chirality_choices = [
        Chem.ChiralType.CHI_UNSPECIFIED,
        Chem.ChiralType.CHI_TETRAHEDRAL_CW,
        Chem.ChiralType.CHI_TETRAHEDRAL_CCW,
    ]

    scalar_part = [
        atom.GetAtomicNum(),
        atom.GetDegree(),
        atom.GetFormalCharge(),
        atom.GetTotalNumHs(),
        int(atom.GetIsAromatic()),
        int(atom.IsInRing()),
    ]
    hybridization_part = one_hot_encoding(atom.GetHybridization(), hybridization_choices)
    chirality_part = one_hot_encoding(atom.GetChiralTag(), chirality_choices)
    return scalar_part + hybridization_part + chirality_part

def mol_to_graph(smiles: str) -> Data:
    """Convert a SMILES string into a PyG ``Data`` graph.

    Nodes are atoms (features from ``atom_features``). Every bond is stored
    as two directed edges carrying ``[bond order, conjugated, in ring]``.

    Returns:
        Data with ``x`` of shape (num_atoms, num_features), ``edge_index`` of
        shape (2, num_edges) and ``edge_attr`` of shape (num_edges, 3). The
        shapes hold for molecules without bonds too (num_edges == 0).

    Raises:
        ValueError: if RDKit cannot parse the SMILES, or the parsed molecule
            has no atoms (there would be no nodes to embed).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f"Invalid SMILES: {smiles}")
    if mol.GetNumAtoms() == 0:
        raise ValueError(f"SMILES has no atoms: {smiles}")
    x = torch.tensor([atom_features(a) for a in mol.GetAtoms()], dtype=torch.float)
    edge_index, edge_attr = [], []
    for b in mol.GetBonds():
        i, j = b.GetBeginAtomIdx(), b.GetEndAtomIdx()
        bt = b.GetBondTypeAsDouble()
        conj, ring = int(b.GetIsConjugated()), int(b.IsInRing())
        # Undirected bond -> one edge per direction, same attributes on both.
        edge_index += [[i, j], [j, i]]
        edge_attr  += [[bt, conj, ring], [bt, conj, ring]]
    # The explicit reshape keeps the 2-D layout when the lists are empty
    # (single-atom ligands, ions); torch.tensor([]) alone is 1-D with shape (0,).
    edge_index = torch.tensor(edge_index, dtype=torch.long).reshape(-1, 2).t().contiguous()
    edge_attr  = torch.tensor(edge_attr,  dtype=torch.float).reshape(-1, 3)
    return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)

class SMILESDataset(InMemoryDataset):
    """In-memory PyG dataset built directly from a list of ``Data`` objects.

    No transform, pre-transform or pre-filter is configured; the graphs are
    collated once in the constructor and stored on ``data`` / ``slices``.
    """

    def __init__(self, data_list):
        super().__init__(root='.', transform=None, pre_transform=None, pre_filter=None)
        collated = self.collate(data_list)
        self.data, self.slices = collated

class GNNEmbedding(torch.nn.Module):
    """Two-layer GCN encoder that returns one pooled vector per graph.

    ``forward`` applies both graph convolutions (each followed by ReLU) and
    mean-pools the node states of every graph in the batch.
    """

    def __init__(self, in_channels, hidden_dim=128):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, data):
        node_state = data.x
        edges = data.edge_index
        graph_ids = data.batch
        for conv in (self.conv1, self.conv2):
            node_state = F.relu(conv(node_state, edges))
        return global_mean_pool(node_state, graph_ids)

# -------------------------------
# 2) Load data
# -------------------------------
df = df_total  # your in-memory DataFrame
# Both columns are required below: one names the row, the other holds its SMILES.
assert 'instancia' in df.columns and 'Ligand_smiles' in df.columns

# -------------------------------
# 3) Build one graph per instance
# -------------------------------
# data_list[k] and instances[k] always describe the same row; rows whose
# SMILES cannot be converted are left out of both and recorded in
# skipped_instances so the loss of data is visible instead of silent.
data_list = []
instances = []
skipped_instances = []  # (instancia, error message) per rejected row
for inst, smi in zip(df['instancia'], df['Ligand_smiles']):
    try:
        g = mol_to_graph(smi)
    except Exception as e:
        skipped_instances.append((inst, str(e)))
        continue
    g.instancia = inst
    data_list.append(g)
    instances.append(inst)

if skipped_instances:
    print(f"{len(skipped_instances)} instancias omitidas (SMILES no convertible); primeras: {skipped_instances[:5]}")

# -------------------------------
# 4) Dataset and DataLoader
# -------------------------------
dataset = SMILESDataset(data_list)
# shuffle=False keeps the batches in data_list order.
loader  = DataLoader(dataset, batch_size=32, shuffle=False)

# -------------------------------
# 5) Load pre-trained weights (dropping the decoder)
# -------------------------------
gnn_weights = "/content/drive/MyDrive/TFM/T3/Red_Neuronal/intento2/gnn_weights.pth"
in_ch = dataset[0].x.shape[1]
model = GNNEmbedding(in_channels=in_ch, hidden_dim=128)

# The checkpoint is a plain state_dict, so the restricted unpickler
# (weights_only=True) is enough and avoids running arbitrary pickled code.
pretrained = torch.load(gnn_weights, map_location='cpu', weights_only=True)
# Keep only the entries this model has; the checkpoint's extra keys are ignored.
model_dict = model.state_dict()
filtered = {k: v for k, v in pretrained.items() if k in model_dict}
# Every parameter of the encoder must come from the checkpoint. Without this
# check a key mismatch would silently leave layers randomly initialised and
# the saved embeddings would be meaningless.
missing_keys = sorted(set(model_dict) - set(filtered))
if missing_keys:
    raise RuntimeError(
        f"Pre-trained checkpoint {gnn_weights} lacks weights for: {missing_keys}"
    )
model_dict.update(filtered)
model.load_state_dict(model_dict)

model.eval()

# -------------------------------
# 6) Compute embeddings
# -------------------------------
all_emb = []
# Inference only: no gradients are tracked.
with torch.no_grad():
    for batch in loader:
        emb = model(batch)  # (batch_size, 128)
        all_emb.append(emb)

# Stack the per-batch outputs row-wise and convert to a NumPy array.
embeddings = torch.cat(all_emb, dim=0).numpy()

# -------------------------------
# 7) Final DataFrame and save
# -------------------------------
# Rows are paired with instance names by position, so the counts must agree.
if embeddings.shape[0] != len(instances):
    raise RuntimeError(
        f"Embedding rows ({embeddings.shape[0]}) do not match instances ({len(instances)})"
    )
# Column count follows the actual embedding width rather than a fixed 128.
col_names = [f"emb_lig_{i}" for i in range(embeddings.shape[1])]
df_emb = pd.DataFrame(embeddings, columns=col_names)
df_emb.insert(0, 'instancia', instances)

out_path = "/content/drive/MyDrive/TFM/T3/Archivos/df_embeddings_ligandos_instancia_preentrenado.csv"
df_emb.to_csv(out_path, index=False)
print("Embeddings por instancia guardados en:", out_path)



✅ Embeddings por instancia guardados en: /content/drive/MyDrive/TFM/T3/Archivos/df_embeddings_ligandos_instancia_preentrenado.csv
