In [None]:
# Mount Google Drive in the Colab runtime at /content/drive; the input and
# output paths used by the rest of the notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#@title Abrir los dataframes

import os
import pandas as pd

# Single source of truth for the data location: every CSV below is resolved
# relative to this folder, so moving the data only requires editing this line.
input_folder = "/content/drive/MyDrive/TFM/T3/Archivos"

# Final dataframe (positive instances)
input_file_df_final = os.path.join(input_folder, "df_FINAL_tot.csv")
df_final_positive = pd.read_csv(input_file_df_final, sep=',')

# Negative decoys; rows containing any missing value are discarded right away
input_file_decoys_negativos = os.path.join(input_folder, "decoys_full.csv")
df_decoys_negativos = pd.read_csv(input_file_decoys_negativos, sep=',')
df_decoys_negativos = df_decoys_negativos.dropna()

# Ligand embeddings (positive and negative ligands)
input_file_emb_ligandos = os.path.join(
    input_folder, "df_embeddings_ligandos_instancia_preentrenado.csv"
)
df_emb_ligandos = pd.read_csv(input_file_emb_ligandos, sep=',')

# Protein embeddings
input_file_emb_proteinas = os.path.join(input_folder, "embeddings_proteinas_ESM2.csv")
df_emb_proteinas = pd.read_csv(input_file_emb_proteinas, sep=',')

# USR descriptors of the binding sites
input_file_usr_sitio = os.path.join(input_folder, "df_usr_sitio.csv")
df_usr_sitio = pd.read_csv(input_file_usr_sitio, sep=',')

# USR descriptors of the ligands
input_file_df_ligand_smiles_USR = os.path.join(input_folder, "df_ligand_smiles_USR.csv")
df_ligand_smiles_USR = pd.read_csv(input_file_df_ligand_smiles_USR, sep=',')

# Display the number of ligand USR rows as the cell output (quick sanity check)
df_ligand_smiles_USR.shape[0]

In [None]:
# Keep only the name, PDB entry id and SMILES columns of the positive set and
# expose 'Nombre' under the column name 'instancia'.
df_pos = (
    df_final_positive
    .loc[:, ['Nombre', 'PDB_entry_id', 'Ligand_smiles']]
    .rename(columns={'Nombre': 'instancia'})
)

In [None]:
import pandas as pd


# Helper that builds the "PDBID_CHAIN" key from an instance name
def extract_pdb_chain(inst):
    """Return ``"<first>_<third>"`` from the underscore-separated fields of ``inst``.

    The second field and every field after the third are ignored.

    Raises:
        ValueError: if ``inst`` has fewer than three underscore-separated fields.
    """
    # maxsplit=3 keeps the first three fields intact; anything left over lands
    # in a fourth element that the slice discards.
    pdb_id, _unused, chain_id = inst.split('_', 3)[:3]
    return '_'.join((pdb_id, chain_id))

# 1) Positives: tag every instance name with '_positive', label it 1 and
#    derive the PDB_chain key from the (already tagged) instance name.
df_pos2 = df_pos.assign(
    instancia=lambda frame: frame['instancia'] + '_positive',
    Label=1,
    PDB_chain=lambda frame: frame['instancia'].apply(extract_pdb_chain),
)

# 2) Negatives (decoys): tag every instance name with '_negative', label it 0,
#    align the USR column names with the ligand naming and derive PDB_chain.
df_neg2 = (
    df_decoys_negativos
    .assign(
        instancia=lambda frame: frame['instancia'] + '_negative',
        Label=0,
    )
    # USR_neg_i -> USR_i_ligando
    .rename(columns={f'USR_neg_{i}': f'USR_{i}_ligando' for i in range(1, 13)})
    .assign(PDB_chain=lambda frame: frame['instancia'].apply(extract_pdb_chain))
)

# 3) Ligand USR descriptors (positives): same '_positive' tag as the positive
#    instances so both tables can be joined; USR_i -> USR_i_ligando.
df_usr_lig_pos = (
    df_ligand_smiles_USR
    .assign(instancia=lambda frame: frame['instancia'] + '_positive')
    .rename(columns={f'USR_{i}': f'USR_{i}_ligando' for i in range(1, 13)})
)

# 4) Binding-site USR descriptors (base instance names, no suffix yet)
df_usr_site = (
    df_usr_sitio
    # 'instacia' is presumably the header as spelled in the CSV -- verify against the file
    .rename(columns={'instacia': 'instancia'})
    # USR_i -> USR_i_sitio
    .rename(columns={f'USR_{i}': f'USR_{i}_sitio' for i in range(1, 13)})
)

# 5) Build the positive and negative base tables and stack them
lig_cols = [f'USR_{i}_ligando' for i in range(1, 13)]
base_cols = ['instancia', 'PDB_chain', 'Ligand_smiles', 'Label', *lig_cols]

# Positives: attach the ligand USR descriptors, matching on instance name and SMILES
df_pos_base = df_pos2.loc[:, ['instancia', 'PDB_chain', 'Ligand_smiles', 'Label']].merge(
    df_usr_lig_pos.loc[:, ['instancia', 'Ligand_smiles', *lig_cols]],
    how='left',
    on=['instancia', 'Ligand_smiles'],
)

# Negatives: the USR_*_ligando columns are already present
df_neg_base = df_neg2.loc[:, base_cols]

# Positives first, then negatives, with a fresh 0..n-1 index
df_main = pd.concat((df_pos_base, df_neg_base), ignore_index=True)

# 6) Attach the ligand embeddings: columns '0'..'127' become
#    emb_lig_0..emb_lig_127 and are left-joined on the instance name.
df_emb_lig = df_emb_ligandos.rename(
    columns={str(i): f'emb_lig_{i}' for i in range(128)}
)
df_main = df_main.merge(
    df_emb_lig.loc[:, ['instancia', *(f'emb_lig_{i}' for i in range(128))]],
    how='left',
    on='instancia',
)

# 7) Attach the protein embeddings: columns '0'..'319' become
#    emb_prot_0..emb_prot_319 and are left-joined on PDB_chain.
df_emb_prot = (
    df_emb_proteinas
    .rename(columns={
        'Unnamed: 0': 'PDB_entry_id',
        **{str(i): f'emb_prot_{i}' for i in range(320)},
    })
    # Join key: first four characters upper-cased + '_' + the character at index 5
    # (assumes ids shaped like '1abc_A...' -- TODO confirm against the CSV)
    .assign(
        PDB_chain=lambda frame: (
            frame['PDB_entry_id'].str[:4].str.upper()
            + '_'
            + frame['PDB_entry_id'].str[5:6]
        )
    )
    .drop(columns=['PDB_entry_id'])
)
df_main = df_main.merge(df_emb_prot, how='left', on='PDB_chain')

# 8) Attach the binding-site USR descriptors, matched on the instance name
#    with its '_positive' / '_negative' tag removed.
usr_site_cols = [f'USR_{i}_sitio' for i in range(1, 13)]

# The site table uses the untagged instance name directly as the join key
df_usr_site['inst_base'] = df_usr_site['instancia']

df_main = (
    df_main
    # Temporary join key on the main table
    .assign(
        inst_base=lambda frame: (
            frame['instancia']
            .str.replace('_positive', '', regex=False)
            .str.replace('_negative', '', regex=False)
        )
    )
    .merge(
        df_usr_site.loc[:, ['inst_base', *usr_site_cols]],
        how='left',
        on='inst_base',
    )
    # The helper key is no longer needed once the join is done
    .drop(columns=['inst_base'])
)

# 9) Strip merge suffixes (_x, then _y) from the column names and keep only
#    the first occurrence of any column name that ends up duplicated.
_clean_cols = df_main.columns.str.replace(r'_x$', '', regex=True)
_clean_cols = _clean_cols.str.replace(r'_y$', '', regex=True)
df_main.columns = _clean_cols
df_main = df_main.loc[:, ~_clean_cols.duplicated()]

# 10) Keep only the expected final columns, in this order; expected columns
#     that are absent from df_main are skipped silently.
final_cols = [
    'instancia',
    'Label',
    *(f'emb_lig_{i}' for i in range(128)),
    *(f'emb_prot_{i}' for i in range(320)),
    *lig_cols,
    *usr_site_cols,
]
df_main = df_main.loc[:, [col for col in final_cols if col in df_main.columns]]

# 11) Final verification
print(f"df_main final: {df_main.shape[0]} filas × {df_main.shape[1]} columnas")
print("NaNs en USR_sitio tras merge:",
      df_main[usr_site_cols].isna().sum())

# Every feature group above was attached with a left merge, so any of them can
# contain gaps -- not only the site descriptors. Report the number of
# incomplete rows per group and overall, so the effect of a later blanket
# dropna() is visible here instead of silently shrinking the dataset.
for _group_name, _pattern in {
    'emb_lig': r'^emb_lig_\d+$',
    'emb_prot': r'^emb_prot_\d+$',
    'USR_ligando': r'^USR_\d+_ligando$',
    'USR_sitio': r'^USR_\d+_sitio$',
}.items():
    # filter() yields zero columns when a group is absent, which counts as 0 rows
    _rows_with_nan = int(df_main.filter(regex=_pattern).isna().any(axis=1).sum())
    print(f"Filas con NaN en {_group_name}: {_rows_with_nan}")
print("Filas con algún NaN (total):", int(df_main.isna().any(axis=1).sum()))


✅ df_main final: 140606 filas × 474 columnas
NaNs en USR_sitio tras merge: USR_1_sitio     28
USR_2_sitio     28
USR_3_sitio     28
USR_4_sitio     28
USR_5_sitio     28
USR_6_sitio     28
USR_7_sitio     28
USR_8_sitio     28
USR_9_sitio     28
USR_10_sitio    28
USR_11_sitio    28
USR_12_sitio    28
dtype: int64


In [None]:
# Discard every row that has at least one missing value in any column
df_main = df_main.dropna(axis=0, how='any')

In [None]:
# Display (rows, columns) of df_main as the cell output
df_main.shape

(140348, 474)

In [None]:
# Persist the assembled dataset to Drive. The DataFrame index is written as
# the first (unnamed) CSV column.
# NOTE(review): if no downstream reader relies on that column, consider
# index=False -- confirm before changing the file layout.
df_main.to_csv(
    path_or_buf="/content/drive/MyDrive/TFM/T3/Archivos/df_tot_pos_neg_usr_emb_prot_liga_preentrenado.csv",
    index=True,
)