In [None]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the Drive paths used by the
# later cells all live under this mount point.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
!pip install Biopython
!pip install rdkit

In [None]:
#@title Abrir el dataframe

import os
import pandas as pd

# Build the path of the SMILES table inside the Colab working directory.
input_folder = "/content"
input_file = os.path.join(input_folder, "df_corregido_SMILES.csv")

# The file is comma-separated, which is already the read_csv default.
df_harm = pd.read_csv(input_file)

# Show the number of rows followed by the column labels.
print(len(df_harm), df_harm.columns)

In [None]:
# Restrict the table to this fixed set of columns, in this order.
# A missing column raises KeyError, exactly as plain list indexing would.
df_harm = df_harm.loc[
    :,
    [
        'PDB_entry_id',
        'Classification',
        'Organism',
        'Uniprot_id',
        'Ligand_id',
        'Ligand_InChi',
        'Experimental_method',
        'Resolution',
        'Adding_Classification',
        'Affinity',
        'Coordenadas',
        'Ligand_smiles',
        'Mol_Weight',
        'n_atoms',
        'Nombre',
    ],
]

In [None]:
# Derive one "<pdb>_<chain>" identifier per row of the 'Nombre' column.
# 'Nombre' is split on '_': field 0 is lower-cased and used as the PDB part,
# field 2 is used as the chain part. Names with fewer than three fields are
# skipped, so the resulting list can be shorter than the dataframe.
nombres_instancias = []

for nombre in df_harm['Nombre']:
    partes = nombre.split('_')
    if len(partes) < 3:
        continue
    pdb, cadena = partes[0].lower(), partes[2]
    nuevo_nombre = "_".join((pdb, cadena))
    nombres_instancias.append(nuevo_nombre)

# Preview the first ten identifiers.
print(nombres_instancias[:10])

['4gcp_B', '4gcx_A', '4gcy_A', '4gde_A', '4gde_B', '4gde_C', '4gde_D', '4ge1_A', '4ge1_B', '4ge1_C']


In [None]:
from Bio import SeqIO

# Path to the original FASTA file
fasta_path = "/content/drive/MyDrive/TFM/T3/Red_Neuronal/Embedding_grafo_secuencia_sitio/protport.fasta"

# IDs to keep (kept as the original list so the name stays usable elsewhere)
ids_a_conservar = nombres_instancias

# Membership is tested once per FASTA record; a set makes every lookup O(1)
# instead of scanning the whole ID list for each record.
ids_a_conservar_set = set(ids_a_conservar)

# Read every record of the FASTA file
secuencias = list(SeqIO.parse(fasta_path, "fasta"))

# Keep only the records whose id is in the requested set.
# NOTE(review): records are not de-duplicated here. The recorded run wrote
# 52454 records for 52431 distinct ids, so the source FASTA repeats some ids.
secuencias_filtradas = [seq for seq in secuencias if seq.id in ids_a_conservar_set]

# Write the filtered records to a new FASTA file
salida_path = "/content/drive/MyDrive/TFM/T3/Red_Neuronal/Embedding_grafo_secuencia_sitio/proteinas_filtradas.fasta"
SeqIO.write(secuencias_filtradas, salida_path, "fasta")

print(f"Guardadas {len(secuencias_filtradas)} secuencias en {salida_path}")


Guardadas 52454 secuencias en /content/drive/MyDrive/TFM/T3/Red_Neuronal/Embedding_grafo_secuencia_sitio/proteinas_filtradas.fasta


In [None]:
# Compare the requested identifiers against the ones actually written out.

# Distinct identifiers that were requested
ids_originales = set(nombres_instancias)

# Distinct identifiers present among the filtered FASTA records
ids_encontrados = {seq.id for seq in secuencias_filtradas}

# Requested identifiers with no matching FASTA record
ids_faltantes = ids_originales.difference(ids_encontrados)

print(f"Total IDs originales: {len(ids_originales)}")
print(f"Total secuencias encontradas: {len(ids_encontrados)}")
print(f"Faltan {len(ids_faltantes)} secuencias:")
print(ids_faltantes)

Total IDs originales: 52528
Total secuencias encontradas: 52431
Faltan 97 secuencias:
{'8vdn_C', '4x4v_B', '8fnp_L', '9c9m_O', '4ed3_P', '4ecw_P', '4m2z_D', '4ecx_P', '5yuz_C', '6p0a_C', '5cdp_H', '8j9w_D', '2j6u_P', '8f16_D', '8vuo_F', '7kr3_C', '4m2z_C', '5dto_B', '8f15_F', '4dl3_P', '8ywi_T', '9c9m_F', '8vzl_C', '8f15_D', '8fnl_L', '5yv0_C', '8j9v_D', '8f15_E', '6ig1_H', '2wiw_D', '8fnn_F', '3q8q_T', '4ecq_P', '6ig1_C', '5vzf_P', '4mfq_B', '4ed0_P', '3rae_H', '8f17_D', '5yv3_H', '4m30_D', '5yuy_H', '3rae_F', '4ecr_P', '5ewf_T', '4ecz_P', '8vzm_C', '8f16_C', '5v0a_B', '4tqr_T', '6p0b_C', '5zrf_F', '3avt_T', '6dt1_D', '6dt1_C', '7kr4_C', '3avw_T', '8f10_B', '5kub_B', '2va2_E', '8f14_B', '4ect_P', '4dl2_P', '5yuy_C', '4dl4_P', '4ecu_P', '4ed1_P', '8bek_M', '7r1f_M', '4qza_U', '8fno_L', '6is0_C', '4ed7_P', '8fno_F', '8fnl_F', '8fnp_F', '4ed2_P', '5yv3_C', '5zrf_D', '8vuo_E', '4g0v_D', '8j9v_F', '8f17_C', '4m30_C', '7l35_C', '8oiv_D', '5yuw_C', '8f12_B', '7r0e_M', '4ecv_P', '4ed8_P', '7l

In [None]:
# Instalar huggingface transformers y fair-esm
!pip install torch torchvision torchaudio
!pip install fair-esm
!pip install biopython

In [None]:
import torch
import esm
from Bio import SeqIO
import pandas as pd
from tqdm import tqdm
import os
import glob

# Compute one mean-pooled ESM-2 embedding per protein sequence, writing the
# results to disk in chunks and finally merging the chunks into a single CSV.

# 1. Load the pretrained model and its tokenizer/batch converter
model, alphabet = esm.pretrained.esm2_t6_8M_UR50D()
batch_converter = alphabet.get_batch_converter()
model.eval()

# 2. Pick the device (GPU when available) and move the model there
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# 3. Load the sequences as (id, sequence-string) pairs
fasta_path = '/content/drive/MyDrive/TFM/T3/Red_Neuronal/Embedding_grafo_secuencia_sitio/proteinas_filtradas.fasta'
sequences = list(SeqIO.parse(fasta_path, "fasta"))
data = [(seq_record.id, str(seq_record.seq)) for seq_record in sequences]

# 4. Parameters
initial_batch_size = 32   # starting batch size; halved whenever the device runs out of memory
batch_size = initial_batch_size
save_every = 1000         # flush accumulated embeddings to disk once this many are pending
output_dir = '/content/drive/MyDrive/TFM/T3/Embeddings_proteinas/temp_batches/'
os.makedirs(output_dir, exist_ok=True)


def _save_chunk(chunk_embeddings, chunk_ids, chunk_index):
    """Write one chunk of embeddings to `output_dir` and return the file path.

    The CSV has one row per sequence, indexed by the sequence id.
    """
    chunk_path = os.path.join(output_dir, f"batch_{chunk_index}.csv")
    pd.DataFrame(chunk_embeddings, index=chunk_ids).to_csv(chunk_path)
    return chunk_path


embeddings = []
ids = []
batch_counter = 0
# Chunk files written by THIS run, in write order. Merging from this list
# (instead of globbing the directory) keeps stale files from previous runs out
# of the result and preserves the input order of the sequences.
batch_files = []

i = 0
pbar = tqdm(total=len(data), desc="Processing proteins")

while i < len(data):
    try:
        batch_data = data[i:i+batch_size]
        labels, strs, toks = batch_converter(batch_data)
        toks = toks.to(device)

        with torch.no_grad():
            outputs = model(toks, repr_layers=[6])

        token_representations = outputs["representations"][6]

        # Mean over the residue positions only: position 0 is skipped and the
        # slice stops after len(seq) tokens, so padding is excluded.
        # The batch is pooled into a local list first, so a failure half-way
        # through cannot leave partial rows that would be duplicated on retry.
        batch_embeddings = [
            token_representations[j, 1:len(seq)+1].mean(0).cpu().numpy()
            for j, (_, seq) in enumerate(batch_data)
        ]
        embeddings.extend(batch_embeddings)
        ids.extend(label for label, _ in batch_data)

        # Advance by the number of sequences actually processed; the last
        # batch may be shorter than batch_size.
        i += len(batch_data)
        pbar.update(len(batch_data))

        # Flush to disk every `save_every` sequences
        if len(ids) >= save_every:
            batch_files.append(_save_chunk(embeddings, ids, batch_counter))
            batch_counter += 1
            embeddings = []
            ids = []

    except RuntimeError as e:
        # Only out-of-memory errors are recoverable; anything else propagates
        # with its original traceback.
        if 'out of memory' not in str(e):
            raise
        if batch_size <= 1:
            raise RuntimeError("No se puede reducir más batch_size. Memoria insuficiente.") from e
        # Halve the batch size and retry the same position `i`.
        batch_size = batch_size // 2
        print(f"Reduciendo batch_size a {batch_size} por falta de memoria...")
        torch.cuda.empty_cache()

# Save whatever is left over
if len(ids) > 0:
    batch_files.append(_save_chunk(embeddings, ids, batch_counter))

pbar.close()
print("Todas las batches guardadas por separado.")

# 5. Merge the chunk files of this run into a single table
all_files = batch_files
if not all_files:
    raise ValueError(f"No embeddings were produced: no sequences found in {fasta_path}")
df_final = pd.concat((pd.read_csv(f, index_col=0) for f in all_files))

output_path = '/content/drive/MyDrive/TFM/T3/Embeddings_proteinas/embeddings_proteinas_ESM2.csv'
df_final.to_csv(output_path)

print(f"Embeddings combinados guardados en {output_path}. Shape: {df_final.shape}")


Processing proteins:   2%|▏         | 896/52454 [00:17<16:01, 53.62it/s]

Reduciendo batch_size a 16 por falta de memoria...


Processing proteins:   2%|▏         | 1040/52454 [00:21<20:42, 41.39it/s]

Reduciendo batch_size a 8 por falta de memoria...


Processing proteins:  40%|████      | 21224/52454 [04:50<04:31, 115.04it/s]

Reduciendo batch_size a 4 por falta de memoria...


Processing proteins:  72%|███████▏  | 37592/52454 [07:56<02:58, 83.36it/s]

Reduciendo batch_size a 2 por falta de memoria...


Processing proteins: 100%|██████████| 52454/52454 [10:26<00:00, 83.69it/s] 


Todas las batches guardadas por separado.
Embeddings combinados guardados en /content/drive/MyDrive/TFM/T3/Embeddings_proteinas/embeddings_proteinas_ESM2.csv. Shape: (52454, 320)


In [None]:
# Reload the merged embeddings from Drive using the default comma delimiter.
# No index column is specified, so the sequence ids come back as an ordinary
# first column ("Unnamed: 0") next to the 320 embedding columns.
df_final = pd.read_csv(
    "/content/drive/MyDrive/TFM/T3/Embeddings_proteinas/embeddings_proteinas_ESM2.csv"
)
df_final.shape

(52454, 321)

In [None]:
# Preview the first ten rows of the reloaded embedding table.
df_final.iloc[:10]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,310,311,312,313,314,315,316,317,318,319
9h44_A,0.008862,0.09566,0.171258,-0.03113,0.367787,-0.117803,0.060643,0.121781,-0.169501,0.072652,...,0.168641,-0.175794,0.032702,0.307737,-0.033894,-0.077866,-0.021696,0.322878,0.027995,-0.052928
1v8b_A,-0.05199,-0.103026,0.109812,0.169877,0.162606,-0.055945,0.008946,-0.157162,-0.028174,-0.136981,...,-0.029608,0.079444,-0.067253,0.00799,0.11407,-0.045025,-0.194175,0.049571,0.101347,0.090473
1v8b_B,-0.05199,-0.103026,0.109812,0.169877,0.162606,-0.055945,0.008946,-0.157162,-0.028174,-0.136981,...,-0.029608,0.079444,-0.067253,0.00799,0.11407,-0.045025,-0.194175,0.049571,0.101347,0.090473
1v8b_C,-0.05199,-0.103026,0.109812,0.169877,0.162606,-0.055945,0.008946,-0.157162,-0.028174,-0.136981,...,-0.029608,0.079444,-0.067253,0.00799,0.11407,-0.045025,-0.194175,0.049571,0.101347,0.090473
1v8b_D,-0.05199,-0.103026,0.109812,0.169877,0.162606,-0.055945,0.008946,-0.157162,-0.028174,-0.136981,...,-0.029608,0.079444,-0.067253,0.00799,0.11407,-0.045025,-0.194175,0.049571,0.101347,0.090473
4a9x_A,0.06453,-0.220621,0.084138,0.062814,-0.001588,-0.100163,0.166354,-0.020563,-0.176198,-0.17621,...,0.088554,0.047522,-0.10552,0.073929,0.026492,-0.166805,-0.197511,0.283274,0.007119,-0.251535
1fpj_A,-0.00943,0.005248,0.037468,0.185945,0.154226,-0.089278,0.006685,-0.160373,-0.037681,-0.114708,...,-0.002319,0.156023,-0.091327,-0.032276,0.149683,-0.124877,-0.16081,0.066322,0.051334,0.033752
1fpj_B,-0.00943,0.005248,0.037468,0.185945,0.154226,-0.089278,0.006685,-0.160373,-0.037681,-0.114708,...,-0.002319,0.156023,-0.091327,-0.032276,0.149683,-0.124877,-0.16081,0.066322,0.051334,0.033752
1wxg_A,-0.151978,-0.114026,0.096006,0.136479,0.204621,0.031628,0.073664,-0.244825,-0.055968,-0.112123,...,-0.129122,-0.020047,-0.123501,0.06031,0.178482,-0.196384,-0.063752,0.057802,0.004223,-0.000275
8ji5_A,-0.015532,-0.125557,-0.023856,0.109467,0.040925,0.038663,0.030253,-0.052676,-0.113604,-0.166807,...,0.049122,-0.131522,-0.09729,0.064145,0.172222,-0.198963,-0.153327,-0.013934,-0.1053,-0.049111
