# 02 · Ejecutar MolForge (CPU)
Notebook para lanzar MolForge **en CPU** a partir de un fichero de fingerprints.

## Parámetros

In [None]:
# Paths: input fingerprint table, model checkpoint, and where results are written
fps_path = "data/MolForge_input/morgan_2048.parquet"
checkpoint_path = "data/models/your_checkpoint.pth"  # set this to your own .pth checkpoint
out_path = "data/MolForge_output/molforge_outputs.parquet"

# MolForge configuration (forwarded as command-line style options)
fp_name = "ECFP4"           # Must match the trained model
model_type = "smiles"       # or "selfies" if applicable
decode = "greedy"           # or "beam" if your repo supports it

## Dependencias y modo CPU

In [None]:
import os, sys, io, pandas as pd
from contextlib import redirect_stdout

# Force CPU execution (this notebook has no GPU/CPU selector).
# The variable is set before MolForge is imported so it is already in place
# when that package (and whatever it imports) is loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# MolForge entry point; it is driven through sys.argv further below
from MolForge import main as molforge_main


def read_table(path: str):
    """Load a table from ``path`` into a pandas DataFrame.

    The reader is chosen from the file extension (case-insensitive):
    ``.parquet`` uses ``pd.read_parquet``; ``.tsv`` is read as tab-separated
    text; ``.csv`` and ``.txt`` let pandas sniff the delimiter.

    Args:
        path: Location of the file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".parquet":
        return pd.read_parquet(path)
    if ext == ".tsv":
        return pd.read_csv(path, sep="\t")
    if ext in (".csv", ".txt"):
        # sep=None asks pandas to sniff the delimiter, which only the Python
        # engine supports. Requesting that engine explicitly avoids the
        # ParserWarning pandas emits when it has to fall back from the C
        # engine by itself.
        return pd.read_csv(path, sep=None, engine="python")
    raise ValueError(f"Extensión no soportada: {ext}")

def active_indices(row_bits):
    """Return the positions of the set bits as a space-separated string.

    A position counts as set when ``int(value) == 1``. An input with no set
    bits (or no values at all) yields the empty string.
    """
    on_positions = []
    for position, bit in enumerate(row_bits):
        if int(bit) == 1:
            on_positions.append(str(position))
    return " ".join(on_positions)

def extract_result_from_stdout(stdout_str: str):
    """Pull the predicted string out of captured stdout text.

    Scans the text line by line for the first line that, ignoring surrounding
    whitespace, begins with ``Result:``. Everything after that marker is
    returned with its outer whitespace stripped and all space characters
    removed. Returns ``None`` when no such line exists.
    """
    marker = "Result:"
    for raw_line in stdout_str.splitlines():
        if not raw_line.strip().startswith(marker):
            continue
        # Only the first matching line is used; later ones are ignored.
        _, _, payload = raw_line.partition(marker)
        return payload.strip().replace(" ", "")
    return None

def run_for_fp(indices_str: str, fp_name: str, model_type: str, checkpoint: str, decode: str):
    """Run MolForge once for a single fingerprint and capture what it prints.

    ``molforge_main`` is driven through ``sys.argv``: the arguments are
    installed temporarily and the previous ``sys.argv`` is restored afterwards,
    even if the call raises. Everything written to stdout during the call is
    captured.

    Args:
        indices_str: Value passed as ``--input``.
        fp_name: Value passed as ``--fp``.
        model_type: Value passed as ``--model_type``.
        checkpoint: Value passed as ``--checkpoint``.
        decode: Value passed as ``--decode``.

    Returns:
        A ``(smiles, raw_stdout)`` tuple, where ``smiles`` is whatever
        ``extract_result_from_stdout`` finds in the captured text (``None`` if
        nothing was found) and ``raw_stdout`` is the full captured output.
    """
    cli_args = [
        "predict.py",
        f"--fp={fp_name}",
        f"--model_type={model_type}",
        f"--input={indices_str}",
        f"--checkpoint={checkpoint}",
        f"--decode={decode}",
    ]
    captured = io.StringIO()
    # Swap in the fake command line, remembering the real one for restoration.
    saved_argv, sys.argv = sys.argv, cli_args
    try:
        with redirect_stdout(captured):
            molforge_main()
    finally:
        sys.argv = saved_argv
    raw_output = captured.getvalue()
    return extract_result_from_stdout(raw_output), raw_output

## Ejecutar

In [None]:
# Validate the output format up front: an unsupported extension is reported
# before the slow per-row inference rather than after it, when every result
# would be lost.
ext = os.path.splitext(out_path)[1].lower()
if ext not in (".parquet", ".csv", ".tsv", ".txt"):
    raise ValueError(f"Extensión de salida no soportada: {ext}")

df = read_table(fps_path)

# Fingerprint bit columns are named "fp_<n>"; sort them by <n> so that the
# position of a column in bit_cols follows its numeric suffix.
bit_cols = [c for c in df.columns if c.startswith("fp_")]
bit_cols = sorted(bit_cols, key=lambda c: int(c.split("_")[1]))
if not bit_cols:
    # Without bit columns every row would silently be stored as "no result".
    raise ValueError(f"No se encontraron columnas 'fp_*' en: {fps_path}")

# Use the "id" column when present, otherwise the row position as a string.
ids = df["id"].astype(str).tolist() if "id" in df.columns else [str(i) for i in range(len(df))]

results = []
total = len(ids)
# Iterate rows by position. Label-based access (df.loc[df.index[i], ...])
# returns a DataFrame instead of a single row when the index contains
# duplicate labels, and repeats a label lookup for every row.
row_iter = df[bit_cols].itertuples(index=False, name=None)
for i, (idx, row_bits) in enumerate(zip(ids, row_iter)):
    indices_str = active_indices(row_bits)
    if not indices_str:
        # No active bits: nothing to send to the model, record an empty result.
        results.append((idx, None, ""))
        continue
    smiles, raw = run_for_fp(indices_str, fp_name, model_type, checkpoint_path, decode)
    results.append((idx, smiles, raw))
    print(f"[{i+1}/{total}] id={idx} -> {smiles}")

out_df = pd.DataFrame(results, columns=["id", "molforge_smiles", "raw_stdout"])
os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
if ext == ".parquet":
    out_df.to_parquet(out_path, index=False)
else:
    # Text outputs: tab-separated for .tsv, comma-separated for .csv/.txt.
    sep = "\t" if ext == ".tsv" else ","
    out_df.to_csv(out_path, index=False, sep=sep)

print(f"[OK] Guardado: {out_path} | filas={len(out_df)}")