# 01 · SMILES → Fingerprints (CPU)
Convierte una tabla con **SMILES** en **fingerprints** usando RDKit.

**Entrada**: CSV/Parquet con una columna `smiles` (y, opcionalmente, una columna `id`).

**Salida**: Parquet/CSV con las columnas `id`, `smiles` y una columna por bit del fingerprint (`fp_0000`, `fp_0001`, …), guardado en `data/MolForge_input/`.


## Parámetros

In [None]:
# Adjust these parameters before running the conversion cell below.
input_path = "data/SMILES/molecules.csv"  # input table; read as CSV/TSV/TXT or Parquet depending on the extension
smiles_col = "smiles"  # name of the column that holds the SMILES strings
id_col = None  # optional identifier column, e.g. "id"; when None the row index is used as id
fp = "morgan"  # fingerprint type: morgan | maccs | rdkit | atompair | tt
radius = 2     # Morgan radius; only used when fp is "morgan"
nBits = 2048   # fingerprint length in bits; ignored for MACCS
output_path = "data/MolForge_input/morgan_2048.parquet"  # destination file; .parquet or .csv (.tsv/.txt also accepted)

## Dependencias (entorno `molforge-tools`)

In [None]:
import os, pandas as pd, numpy as np
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem, MACCSkeys, rdMolDescriptors as rdDesc

def to_numpy(bitvect):
    """Convert an RDKit bit vector into a one-dimensional NumPy array.

    Args:
        bitvect: Bit vector exposing ``GetNumBits()`` and accepted by
            ``DataStructs.ConvertToNumpyArray``.

    Returns:
        A ``uint8`` array with one entry per bit of ``bitvect``.
    """
    # Allocate the destination first; the RDKit converter fills it in place.
    out = np.zeros(bitvect.GetNumBits(), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(bitvect, out)
    return out

def fp_from_mol(mol, kind: str, nBits: int, radius: int):
    """Compute one fingerprint for ``mol`` and return it as a NumPy array.

    Args:
        mol: RDKit molecule to fingerprint.
        kind: Fingerprint family, case-insensitive. One of ``"morgan"``,
            ``"maccs"``, ``"rdkit"``, ``"atompair"`` or ``"tt"``.
        nBits: Requested fingerprint length. Not passed to the MACCS
            generator, which has a fixed size (167 bits).
        radius: Morgan radius; only used when ``kind`` is ``"morgan"``.

    Returns:
        The fingerprint converted with ``to_numpy``.

    Raises:
        ValueError: If ``kind`` is not one of the supported names.
    """
    kind = kind.lower()

    # Each entry is a zero-argument callable so that only the selected
    # generator is evaluated.
    builders = {
        "morgan": lambda: AllChem.GetMorganFingerprintAsBitVect(
            mol, radius=radius, nBits=nBits
        ),
        "maccs": lambda: MACCSkeys.GenMACCSKeys(mol),
        "rdkit": lambda: Chem.RDKFingerprint(mol, fpSize=nBits),
        "atompair": lambda: rdDesc.GetHashedAtomPairFingerprintAsBitVect(
            mol, nBits=nBits
        ),
        "tt": lambda: rdDesc.GetHashedTopologicalTorsionFingerprintAsBitVect(
            mol, nBits=nBits
        ),
    }

    build = builders.get(kind)
    if build is None:
        raise ValueError(f"Fingerprint desconocido: {kind}")
    return to_numpy(build())

def _detect_separator(path, candidates=(",", "\t", ";", "|", " "), default=","):
    """Pick the field separator of a delimited text file from its header line.

    The first non-blank line is inspected and the candidate that occurs most
    often wins (earlier candidates win ties). When no candidate occurs at
    all -- e.g. a single-column file -- ``default`` is returned.

    Returns:
        The separator string, or ``None`` when ``path`` cannot be opened as a
        local text file.
    """
    try:
        with open(path, encoding="utf-8", errors="replace") as handle:
            header = next((line.strip() for line in handle if line.strip()), "")
    except OSError:
        return None
    best = max(candidates, key=header.count)
    return best if header.count(best) else default


def read_table(path: str, smiles_col: str):
    """Load the input table and check that the SMILES column is present.

    Args:
        path: Input file. ``.parquet`` is read with ``pandas.read_parquet``;
            ``.csv``, ``.tsv`` and ``.txt`` are read with ``pandas.read_csv``.
        smiles_col: Name of the column that must exist in the table.

    Returns:
        The loaded ``pandas.DataFrame``.

    Raises:
        ValueError: If the extension is not supported or ``smiles_col`` is
            not a column of the table.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".parquet":
        df = pd.read_parquet(path)
    elif ext in (".csv", ".tsv", ".txt"):
        # The separator is chosen from the header instead of relying on
        # ``sep=None``: that option makes pandas warn about falling back to
        # the python engine and sniff the delimiter from the first line,
        # which is unreliable for a single-column header such as "smiles".
        sep = "\t" if ext == ".tsv" else _detect_separator(path)
        if sep is None:
            # Path could not be opened locally; let pandas sniff, asking for
            # the python engine explicitly so no fallback warning is raised.
            df = pd.read_csv(path, sep=None, engine="python")
        else:
            df = pd.read_csv(path, sep=sep)
    else:
        raise ValueError(f"Extensión no soportada: {ext}")
    if smiles_col not in df.columns:
        raise ValueError(f"No existe la columna SMILES: {smiles_col}")
    return df

## Ejecutar conversión

In [None]:
# Load the input table and choose the row identifiers.
df_in = read_table(input_path, smiles_col)
if id_col and id_col in df_in.columns:
    ids = df_in[id_col].astype(str).tolist()
else:
    if id_col:
        # An id column was requested but is absent: report it instead of
        # silently substituting the row index.
        print(f"[WARN] La columna de id '{id_col}' no existe; se usa el índice de fila")
    ids = df_in.index.astype(str).tolist()

smiles_list = df_in[smiles_col].astype(str).tolist()
# astype(str) turns missing cells into the literal text "nan", so remember
# which rows were missing before the conversion.
missing_flags = df_in[smiles_col].isna().tolist()

# Fingerprint every parsable molecule, keeping ids/SMILES aligned with the
# fingerprints and counting the rows that had to be discarded.
fps_list, keep_ids, keep_smiles = [], [], []
n_skipped = 0
for _id, smi, is_missing in zip(ids, smiles_list, missing_flags):
    # Missing and blank cells are skipped explicitly rather than being
    # handed to the SMILES parser.
    mol = None if (is_missing or not smi.strip()) else Chem.MolFromSmiles(smi)
    if mol is None:
        n_skipped += 1
        continue
    arr = fp_from_mol(mol, fp, nBits, radius)
    fps_list.append(arr)
    keep_ids.append(_id)
    keep_smiles.append(smi)

if n_skipped:
    print(f"[WARN] Se descartaron {n_skipped} filas con SMILES vacíos o inválidos")

if not fps_list:
    raise SystemExit("No se generó ningún fingerprint (¿SMILES inválidos?)")

# Assemble the output table: id, smiles, then one column per fingerprint bit.
nbits = len(fps_list[0])
col_names = [f"fp_{i:04d}" for i in range(nbits)]
df_fp = pd.DataFrame(np.vstack(fps_list), columns=col_names)
df_fp.insert(0, "smiles", keep_smiles)
df_fp.insert(0, "id", keep_ids)

# Write the result; the format is selected by the output extension.
ext = os.path.splitext(output_path)[1].lower()
# dirname is "" for a bare file name, in which case the current directory is used.
os.makedirs(os.path.dirname(output_path) or ".", exist_ok=True)
if ext == ".parquet":
    df_fp.to_parquet(output_path, index=False)
elif ext in (".csv", ".tsv", ".txt"):
    sep = "\t" if ext == ".tsv" else ","
    df_fp.to_csv(output_path, index=False, sep=sep)
else:
    raise ValueError(f"Extensión de salida no soportada: {ext}")

print(f"[OK] Guardado: {output_path} | filas={len(df_fp)} | bits={nbits}")