# MolForge – preprocess & run (Notebook listo)
Este notebook fija el directorio de trabajo (*cwd*) en la raíz del proyecto, localiza `saved_models/` de forma relativa a esa raíz, y fuerza que el checkpoint se pase solo por **nombre de archivo** (no por ruta absoluta) para que `MolForge/predict.py` lo encuentre.

In [1]:
# === Bootstrap: coloca el cwd en la raíz del proyecto ===
import os
from pathlib import Path

# Known project checkouts, tried in order. Setting the MOLFORGE_ROOT
# environment variable takes priority, so the notebook can run on another
# machine without editing these machine-specific paths.
CANDIDATES = [
    Path("/mnt/d/MolForge_Testing"),                 # local PC (WSL)
    Path("/export/home/ddiestre/MolForge_Testing"),  # Linux lab machine
]

_env_root = os.environ.get("MOLFORGE_ROOT")
_search_order = ([Path(_env_root).expanduser()] if _env_root else []) + CANDIDATES

# is_dir() instead of exists(): os.chdir() below needs a directory, and a
# regular file that happens to have the same name would make it raise.
root = next((p for p in _search_order if p.is_dir()), None)
if root is None:
    # Fallback: walk from the cwd up to the filesystem root looking for a
    # directory that contains both scripts/ and saved_models/.
    _start = Path.cwd()
    root = next(
        (
            p
            for p in (_start, *_start.parents)
            if (p / "scripts").is_dir() and (p / "saved_models").is_dir()
        ),
        None,
    )

if root is None:
    # Nothing matched: stay in the current directory and say so, because the
    # directories created below will then be made under that directory.
    root = Path.cwd()
    print("⚠️  No pude detectar automáticamente la raíz del proyecto; uso:", root)
else:
    os.chdir(root)

# Later cells build their paths from these constants.
PROJECT_ROOT = Path.cwd().resolve()
DATA_DIR     = PROJECT_ROOT / "data"
MODELS_DIR   = PROJECT_ROOT / "saved_models"
OUTPUT_DIR   = DATA_DIR / "MolForge_output"
for d in (DATA_DIR, MODELS_DIR, OUTPUT_DIR):
    d.mkdir(parents=True, exist_ok=True)

print("📁 PROJECT_ROOT:", PROJECT_ROOT)
print("📁 DATA_DIR    :", DATA_DIR)
print("📁 MODELS_DIR  :", MODELS_DIR)
print("📁 OUTPUT_DIR  :", OUTPUT_DIR)


📁 PROJECT_ROOT: /mnt/d/MolForge_Testing
📁 DATA_DIR    : /mnt/d/MolForge_Testing/data
📁 MODELS_DIR  : /mnt/d/MolForge_Testing/saved_models
📁 OUTPUT_DIR  : /mnt/d/MolForge_Testing/data/MolForge_output


In [2]:
# === Parameters ===
FP_NAME = "ECFP4"
MODEL_TYPE = "smiles"  # one of: "smiles", "selfies"
DECODE = "greedy"  # one of: "greedy", "beam"

# Checkpoint file name only (NOT an absolute path); it must exist in saved_models/.
CHECKPOINT_NAME = "ECFP4_smiles_checkpoint.pth"

# Input table and output CSV locations.
INPUT_PATH = DATA_DIR.joinpath("MolForge_input", "test_1.csv")
OUT_PATH = OUTPUT_DIR.joinpath("molforge_outputs_demo.csv")

# Report where each file is expected and whether it is already there.
for _label, _path in (
    ("Checkpoint en:", MODELS_DIR / CHECKPOINT_NAME),
    ("Input en     :", INPUT_PATH),
):
    print(_label, _path, "| existe:", _path.exists())
print("Salida en    :", OUT_PATH)


Checkpoint en: /mnt/d/MolForge_Testing/saved_models/ECFP4_smiles_checkpoint.pth | existe: True
Input en     : /mnt/d/MolForge_Testing/data/MolForge_input/test_1.csv | existe: True
Salida en    : /mnt/d/MolForge_Testing/data/MolForge_output/molforge_outputs_demo.csv


In [3]:
# === Utilidades para leer el fichero de entrada y normalizar a 'indices' ===
import pandas as pd
import os

def _read_any_table(path: str) -> pd.DataFrame:
    """Load a table from ``path``, choosing the reader from the file extension.

    ``.parquet`` goes through ``pd.read_parquet``; ``.tsv`` and ``.csv`` are
    read with a fixed tab / comma separator; every other extension (``.txt``
    included) lets pandas detect the delimiter with the Python engine.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".parquet":
        return pd.read_parquet(path)
    if ext == ".tsv":
        return pd.read_csv(path, sep="\t")
    if ext == ".csv":
        return pd.read_csv(path, sep=",")
    # .txt and unknown extensions: delimiter is auto-detected.
    return pd.read_csv(path, sep=None, engine="python", header="infer")


def _active_indices(row_bits):
    """Return the positions of the entries equal to 1, joined by single spaces."""
    return " ".join(str(i) for i, v in enumerate(row_bits) if int(v) == 1)


def _as_text(values: pd.Series) -> pd.Series:
    """Cast a column to ``str``, mapping missing cells to ``""``."""
    # Missing cells have to be replaced BEFORE the str cast: astype(str) turns
    # NaN into the literal "nan", after which fillna("") has nothing to fill.
    return values.astype(object).where(values.notna(), "").astype(str)


def read_fps_as_indices_df(path):
    """Read a fingerprint file and normalise it to ``id`` / ``indices`` columns.

    Supported layouts, checked in this order:

    1. a column named ``indices`` (used as is);
    2. a table with exactly one column (that column is taken as the indices
       and the row index is used as ``id``);
    3. bit columns named ``fp_<number>``; they are ordered by that number and
       the positions (within that ordering) of the bits equal to 1 become the
       space-separated ``indices`` string.

    For layouts 1 and 3 the ``id`` column is copied from the input when it has
    one, otherwise the row index is used. Missing ``indices`` cells become
    empty strings.

    Parameters
    ----------
    path : str or os.PathLike
        File to read; the reader is picked by ``_read_any_table``.

    Returns
    -------
    pd.DataFrame
        Two string columns, ``id`` and ``indices``, one row per input row,
        with a fresh 0..n-1 index.

    Raises
    ------
    SystemExit
        If the table matches none of the supported layouts.
    """
    df = _read_any_table(str(path))

    if "id" in df.columns:
        ids = df["id"].astype(str)
    else:
        ids = df.index.astype(str)

    if "indices" in df.columns:
        indices = _as_text(df["indices"])
    elif df.shape[1] == 1:
        # A lone column is the indices column whatever its name, so the row
        # index is the only thing left to serve as id.
        ids = df.index.astype(str)
        indices = _as_text(df.iloc[:, 0])
    else:
        # Only keep columns whose name is a string of the form fp_<number>;
        # anything else cannot be ordered and is not a fingerprint bit.
        bit_cols = [
            c
            for c in df.columns
            if isinstance(c, str)
            and c.startswith("fp_")
            and c.split("_")[1].isdecimal()
        ]
        if not bit_cols:
            raise SystemExit("Formato no soportado: aporta 'indices', 1 columna, o 'fp_0000..fp_N'.")
        bit_cols.sort(key=lambda c: int(c.split("_")[1]))
        # Iterate rows positionally: label lookups with 0..n-1 fail as soon as
        # the frame's index is not the default integer range.
        indices = [_active_indices(row) for row in df[bit_cols].to_numpy()]

    # Assemble from plain lists so the two columns are paired by position and
    # never re-aligned on index labels.
    return pd.DataFrame({"id": list(ids), "indices": list(indices)})

# Load the input fingerprints once; the row count is printed as a quick check
# and the first rows are shown as the cell's output.
df_idx = read_fps_as_indices_df(INPUT_PATH)
print("Filas leídas:", df_idx.shape[0])
df_idx.head()


Filas leídas: 3


Unnamed: 0,id,indices
0,0,1 80 94 114 237 241 255 294 392 411 425 695 74...
1,1,97 101 314 378 389 442 501 650 728 817 896 909...
2,2,9 45 78 89 145 203 322 548 586 650 695 718 760...


In [4]:
# === Envoltorio MolForge: pasar SOLO el nombre del checkpoint (basename) ===
import sys, io
from contextlib import redirect_stdout
from pathlib import Path
from MolForge.predict import main as molforge_main

def run_for_fp(indices_str: str, fp_name: str, model_type: str, checkpoint_name: str, decode: str,
               *, models_dir=None, predict_main=None):
    """Run the MolForge predict entry point for one fingerprint and return its result.

    The entry point is driven through ``sys.argv`` as if it had been started
    from the command line; its standard output is captured and the first line
    starting with ``Result:`` is parsed.

    Parameters
    ----------
    indices_str : str
        Value passed as ``--input``.
    fp_name : str
        Value passed as ``--fp``.
    model_type : str
        Value passed as ``--model_type``.
    checkpoint_name : str
        Checkpoint file. Only its final path component is used, so a full
        path is reduced to the file name before it is checked and passed on.
    decode : str
        Value passed as ``--decode``.
    models_dir : path-like, optional
        Directory used for the pre-flight "checkpoint exists" check.
        Defaults to the notebook-level ``MODELS_DIR``.
    predict_main : callable, optional
        Zero-argument entry point to invoke. Defaults to
        ``MolForge.predict.main`` (imported as ``molforge_main``).

    Returns
    -------
    str or None
        The text after ``Result:`` with all spaces removed, or ``None`` when
        the captured output has no such line.

    Raises
    ------
    AssertionError
        If the checkpoint file is not present in the models directory.
    """
    checkpoint_dir = Path(MODELS_DIR if models_dir is None else models_dir)
    entry_point = molforge_main if predict_main is None else predict_main

    # Enforce the "file name only" contract instead of trusting the caller:
    # an absolute path would otherwise pass the check below and be forwarded
    # verbatim to the entry point.
    checkpoint_name = Path(checkpoint_name).name
    checkpoint_path = checkpoint_dir / checkpoint_name
    assert checkpoint_path.is_file(), f"No existe {checkpoint_path}"

    captured = io.StringIO()
    argv_backup = sys.argv
    try:
        sys.argv = [
            "predict.py",
            f"--fp={fp_name}",
            f"--model_type={model_type}",
            f"--input={indices_str}",
            f"--checkpoint={checkpoint_name}",  # key point: file name only
            f"--decode={decode}",
        ]
        with redirect_stdout(captured):
            entry_point()
    finally:
        # Always restore argv, even if the entry point raises or exits.
        sys.argv = argv_backup

    for line in captured.getvalue().splitlines():
        if line.strip().startswith("Result:"):
            return line.split("Result:", 1)[1].strip().replace(" ", "")
    return None

print("✅ run_for_fp listo (usa basename del checkpoint).")


✅ run_for_fp listo (usa basename del checkpoint).


In [5]:
# === Run over every row ===
smiles_out = []
for i, row in enumerate(zip(df_idx["id"], df_idx["indices"]), start=1):
    idx, indices_str = row
    s = run_for_fp(
        indices_str=str(indices_str),
        fp_name=FP_NAME,
        model_type=MODEL_TYPE,
        checkpoint_name=CHECKPOINT_NAME,
        decode=DECODE,
    )
    smiles_out.append((idx, s))
    print(f"[{i}/{len(df_idx)}] id={idx} -> {s}")

# Collect the (id, result) pairs into a table and write it to OUT_PATH,
# creating the output folder first in case it does not exist yet.
out_df = pd.DataFrame(data=smiles_out, columns=["id", "molforge_smiles"])
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
out_df.to_csv(OUT_PATH, index=False)
print("[OK] Guardado:", OUT_PATH, "| filas=", len(out_df))
out_df.head()


[1/3] id=0 -> CCOC1=C(C=C(C=C1)C(C(C)(C)C)N)OCC
[2/3] id=1 -> C1=CC=C(C=C1)C2C=C(NC(=O)C2C3=NC(=S)NC(=O)C34C(=NC(=S)NC4=O)C5=CC=C(C=C5)Br)C6=CC=C(C=C6)Br
[3/3] id=2 -> COC1=CC=C(C=C1)C(=O)C2=C(C(=C3N2C4=CC=CC=C4C=C3)C(=O)OC)C(=O)OC
[OK] Guardado: /mnt/d/MolForge_Testing/data/MolForge_output/molforge_outputs_demo.csv | filas= 3


Unnamed: 0,id,molforge_smiles
0,0,CCOC1=C(C=C(C=C1)C(C(C)(C)C)N)OCC
1,1,C1=CC=C(C=C1)C2C=C(NC(=O)C2C3=NC(=S)NC(=O)C34C...
2,2,COC1=CC=C(C=C1)C(=O)C2=C(C(=C3N2C4=CC=CC=C4C=C...
