# MolForge — Ejecución desde **preprocesado (5 filas, 2ª columna)**

Origen: `data/MolForge_input/fingerprints_test_repo_original/ECFP4.smiles.test`  
Este notebook:
1. Sitúa el *cwd* en la raíz del proyecto (para que `saved_models/` esté en vecindad).  
2. Lee el fichero original.  
3. **Preprocesa**: selecciona **las 5 primeras filas** y **la 2ª columna**.  
4. Muestra esa porción en pantalla.  
5. Crea una tabla con `id` e `indices` y ejecuta MolForge solo sobre esas 5 filas.  
6. Guarda resultados en `data/MolForge_output/molforge_outputs_preprocessed_5.csv`.


In [1]:
# === Bootstrap: cwd en la raíz del proyecto ===
import os
from pathlib import Path

CANDIDATES = [
    Path("/mnt/d/MolForge_Testing"),                 # PC local (WSL)
    Path("/export/home/ddiestre/MolForge_Testing"),  # Laboratorio Linux
]
root = next((p for p in CANDIDATES if p.exists()), None)

if root is None:
    # fallback: subir hasta encontrar scripts/ y saved_models/
    p = Path.cwd()
    for _ in range(8):
        if (p/"scripts").exists() and (p/"saved_models").exists():
            root = p
            break
        if p.parent == p:
            break
        p = p.parent

if root is None:
    root = Path.cwd()
    print("⚠️  No pude detectar automáticamente la raíz del proyecto; uso:", root)
else:
    os.chdir(root)

PROJECT_ROOT = Path.cwd().resolve()
DATA_DIR     = PROJECT_ROOT / "data"
MODELS_DIR   = PROJECT_ROOT / "saved_models"
OUTPUT_DIR   = DATA_DIR / "MolForge_output"
for d in (DATA_DIR, MODELS_DIR, OUTPUT_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("📁 PROJECT_ROOT:", PROJECT_ROOT)
print("📁 DATA_DIR    :", DATA_DIR)
print("📁 MODELS_DIR  :", MODELS_DIR)
print("📁 OUTPUT_DIR  :", OUTPUT_DIR)


📁 PROJECT_ROOT: /mnt/d/MolForge_Testing
📁 DATA_DIR    : /mnt/d/MolForge_Testing/data
📁 MODELS_DIR  : /mnt/d/MolForge_Testing/saved_models
📁 OUTPUT_DIR  : /mnt/d/MolForge_Testing/data/MolForge_output


In [2]:
# === Parámetros ===
FP_NAME        = "ECFP4"
MODEL_TYPE     = "smiles"     # ["smiles", "selfies"]
DECODE         = "greedy"     # ["greedy", "beam"]
CHECKPOINT_NAME = "ECFP4_smiles_checkpoint.pth"  # SOLO nombre

ORIG_PATH = DATA_DIR / "MolForge_input" / "fingerprints_test_repo_original" / "ECFP4.smiles.test"
PREPROC_OUT = OUTPUT_DIR / "preprocessed_indices_5rows_secondcol.csv"
FINAL_OUT   = OUTPUT_DIR / "molforge_outputs_preprocessed_5.csv"

print("Origen:", ORIG_PATH, "| existe:", ORIG_PATH.exists())
print("Checkpoint:", MODELS_DIR / CHECKPOINT_NAME, "| existe:", (MODELS_DIR / CHECKPOINT_NAME).exists())


Origen: /mnt/d/MolForge_Testing/data/MolForge_input/fingerprints_test_repo_original/ECFP4.smiles.test | existe: True
Checkpoint: /mnt/d/MolForge_Testing/saved_models/ECFP4_smiles_checkpoint.pth | existe: True


In [3]:
# === Leer fichero original y preprocesar (5 filas, 2ª columna) ===
import pandas as pd
import os

def read_any(path):
    ext = os.path.splitext(str(path))[-1].lower()
    if ext == ".parquet":
        return pd.read_parquet(path)
    if ext == ".csv":
        return pd.read_csv(path)
    if ext == ".tsv":
        return pd.read_csv(path, sep="\t")
    # .test u otros: intentamos autodetectar separador
    return pd.read_csv(path, sep=None, engine="python", header="infer")

df_raw = read_any(ORIG_PATH)
print("Shape original:", df_raw.shape)
display(df_raw.head(8))

# Selecciona 5 primeras filas, segunda columna (índice 1)
second_col_series = df_raw.iloc[:5, 1]
# Normaliza a string por si vienen números
second_col_series = second_col_series.astype(str)

# Construimos tabla con 'id' e 'indices'
df_pre = pd.DataFrame({
    "id": second_col_series.index.astype(str),
    "indices": second_col_series.values
})

# Guardamos el preprocesado como CSV simple (id,indices)
PREPROC_OUT.parent.mkdir(parents=True, exist_ok=True)
df_pre.to_csv(PREPROC_OUT, index=False)

print("=== Porción preprocesada (5 filas, 2ª columna) ===")
display(df_pre)
print("Guardado preprocesado en:", PREPROC_OUT)


Shape original: (9999, 2)


Unnamed: 0,C C O C 1 = C ( C = C ( C = C 1 ) C ( C ( C ) ( C ) C ) N ) O C C,1 80 94 114 237 241 255 294 392 411 425 695 743 747 786 875 1057 1171 1238 1365 1380 1452 1544 1750 1773 1853 1873 1970
0,C 1 = C C = C ( C = C 1 ) C 2 C = C ( N C 3 = ...,97 101 314 378 389 442 501 650 728 817 896 909...
1,C O C 1 = C C = C ( C = C 1 ) C ( = O ) C 2 = ...,9 45 78 89 145 203 322 548 586 650 695 718 760...
2,C 1 C N ( C C N 1 C C 2 = C C 3 = C ( C = C 2 ...,74 80 116 197 255 265 310 341 378 407 437 441 ...
3,C C 1 = C C = C ( C = C 1 ) S ( = O ) ( = O ) ...,94 186 210 227 235 249 319 335 348 350 389 440...
4,C C 1 C C C C N 1 S ( = O ) ( = O ) C 2 = C C ...,13 80 145 147 319 350 354 374 500 606 650 676 ...
5,C 1 C N ( C C C 1 C C 2 = C C = C ( C = C 2 ) ...,13 29 80 216 230 310 322 458 561 562 573 653 6...
6,C O C 1 = C C = C C ( = C 1 ) C ( = O ) N 2 C ...,14 80 195 196 241 242 246 305 310 322 383 395 ...
7,C C 1 = C C = C C = C 1 N 2 C ( = O ) [C@@] 3 ...,92 119 134 184 249 255 274 314 371 566 599 644...


=== Porción preprocesada (5 filas, 2ª columna) ===


Unnamed: 0,id,indices
0,0,97 101 314 378 389 442 501 650 728 817 896 909...
1,1,9 45 78 89 145 203 322 548 586 650 695 718 760...
2,2,74 80 116 197 255 265 310 341 378 407 437 441 ...
3,3,94 186 210 227 235 249 319 335 348 350 389 440...
4,4,13 80 145 147 319 350 354 374 500 606 650 676 ...


Guardado preprocesado en: /mnt/d/MolForge_Testing/data/MolForge_output/preprocessed_indices_5rows_secondcol.csv


In [4]:
# === Envoltorio MolForge: usar basename del checkpoint ===
import sys, io
from contextlib import redirect_stdout
from MolForge.predict import main as molforge_main

def run_for_fp(indices_str: str, fp_name: str, model_type: str, checkpoint_name: str, decode: str):
    import os
    assert os.path.exists(os.path.join("saved_models", checkpoint_name)), f"No existe saved_models/{checkpoint_name}"
    argv_backup = sys.argv
    sys.argv = [
        "predict.py",
        f"--fp={fp_name}",
        f"--model_type={model_type}",
        f"--input={indices_str}",
        f"--checkpoint={checkpoint_name}",  # <-- solo nombre
        f"--decode={decode}",
    ]
    buf = io.StringIO()
    try:
        with redirect_stdout(buf):
            molforge_main()
    finally:
        sys.argv = argv_backup
    out = buf.getvalue()
    for line in out.splitlines():
        if line.strip().startswith("Result:"):
            return line.split("Result:", 1)[1].strip().replace(" ", "")
    return None

print("✅ run_for_fp listo (basename del checkpoint).")


✅ run_for_fp listo (basename del checkpoint).


In [5]:
# === Ejecutar MolForge sobre la tabla preprocesada (5 filas) y guardar salida ===
smiles_out = []
for i, (idx, indices_str) in enumerate(zip(df_pre["id"], df_pre["indices"]), start=1):
    s = run_for_fp(str(indices_str), FP_NAME, MODEL_TYPE, CHECKPOINT_NAME, DECODE)
    smiles_out.append((idx, s))
    print(f"[{i}/5] id={idx} -> {s}")

out_df = pd.DataFrame(smiles_out, columns=["id", "molforge_smiles"])
final_df = df_pre.merge(out_df, on="id", how="left")
final_df.to_csv(FINAL_OUT, index=False)
print("[OK] Guardado:", FINAL_OUT)
display(final_df)


[1/5] id=0 -> C1=CC=C(C=C1)C2C=C(NC(=O)C2C3=NC(=S)NC(=O)C34C(=NC(=S)NC4=O)C5=CC=C(C=C5)Br)C6=CC=C(C=C6)Br
[2/5] id=1 -> COC1=CC=C(C=C1)C(=O)C2=C(C(=C3N2C4=CC=CC=C4C=C3)C(=O)OC)C(=O)OC
[3/5] id=2 -> C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=CC(=NC(=N4)SCC5=CC(=CC=C5)C(=O)NCC6=CN=CC=C6)Cl
[4/5] id=3 -> CC1=CC=C(C=C1)S(=O)(=O)OC2=C(C=C(C=C2)C3C(C(C3C(=O)O)C4=CC(=C(C=C4)OS(=O)(=O)C5=CC=C(C=C5)C)OC)C(=O)O)OC
[5/5] id=4 -> CC1CCCCN1S(=O)(=O)C2=CC=C(C=C2)C(=O)OCCOC3=CC=CC=C3C(=O)C
[OK] Guardado: /mnt/d/MolForge_Testing/data/MolForge_output/molforge_outputs_preprocessed_5.csv


Unnamed: 0,id,indices,molforge_smiles
0,0,97 101 314 378 389 442 501 650 728 817 896 909...,C1=CC=C(C=C1)C2C=C(NC(=O)C2C3=NC(=S)NC(=O)C34C...
1,1,9 45 78 89 145 203 322 548 586 650 695 718 760...,COC1=CC=C(C=C1)C(=O)C2=C(C(=C3N2C4=CC=CC=C4C=C...
2,2,74 80 116 197 255 265 310 341 378 407 437 441 ...,C1CN(CCN1CC2=CC3=C(C=C2)OCO3)C4=CC(=NC(=N4)SCC...
3,3,94 186 210 227 235 249 319 335 348 350 389 440...,CC1=CC=C(C=C1)S(=O)(=O)OC2=C(C=C(C=C2)C3C(C(C3...
4,4,13 80 145 147 319 350 354 374 500 606 650 676 ...,CC1CCCCN1S(=O)(=O)C2=CC=C(C=C2)C(=O)OCCOC3=CC=...
