In [1]:
import csv
from pathlib import Path

from rdkit import RDLogger
from canonical_featurizers import featurize_molecule

# Silence noisy RDKit warnings like GetValence(getExplicit=False)
RDLogger.DisableLog("rdApp.warning")

# Absolute form of the current working directory.
# NOTE(review): assumes the kernel's working directory is the folder that
# holds this notebook, so the CSV below sits next to it — confirm.
THIS_DIR = Path(".").resolve()
# Input file passed to load_smiles(); its header is expected to be "id,smiles".
SAMPLE_CSV = THIS_DIR / "example_smiles.csv"

# Human-readable names keyed by the exact SMILES text. The table is only
# consulted with dict lookups, so a differently written SMILES for the same
# molecule will not match an entry here.
MOLECULE_NAMES = {
    "CC": "ethane",
    "CCC": "propane",
    "CCCC": "butane",
    "CCO": "ethanol",
    "c1ccccc1": "benzene",
    "CC(=O)O": "acetic acid",
}


def load_smiles(path: Path) -> list:
    """Read ``(id, smiles)`` pairs from a CSV file.

    The file must have a header row containing the columns ``id`` and
    ``smiles``; any other columns are ignored. Values are returned as the
    raw strings found in the file, in file order.

    Args:
        path: Location of the CSV file.

    Returns:
        A list of ``(id, smiles)`` tuples, one per data row. The list is
        empty when the file has no data rows.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        KeyError: If a data row lacks the ``id`` or ``smiles`` column.
    """
    if not path.exists():
        raise FileNotFoundError(
            f"Expected CSV file not found: {path}\n"
            "Put a small 100-SMILES file named 'example_smiles.csv' next to this notebook.\n"
            "CSV header must be: id,smiles"
        )

    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires. "utf-8-sig" makes decoding independent of the
    # platform locale and drops a leading byte-order mark, which would
    # otherwise become part of the first header name and break row["id"].
    with path.open(newline="", encoding="utf-8-sig") as f:
        return [(row["id"], row["smiles"]) for row in csv.DictReader(f)]



In [2]:
# Featurize every SMILES from the sample CSV (same flow as example.py)

smiles_data = load_smiles(SAMPLE_CSV)
print(f"Loaded {len(smiles_data)} SMILES from {SAMPLE_CSV.name}")

# Index-aligned records: entry i of each list belongs to the i-th molecule
# that was featurized without error.
all_atom_feat_shapes = []
all_bond_feat_shapes = []
# Detailed rows for the first few successes only; printed in the summary.
example_details = []

for idx, smi in smiles_data:
    try:
        atom_feats, bond_feats = featurize_molecule(smi)
    except ValueError as err:
        # A ValueError from the featurizer is reported and the row is skipped.
        print(f"[WARN] Skipping row id={idx} due to parsing error: {err}")
        continue

    atom_shape = atom_feats["h"].shape
    bond_shape = bond_feats["e"].shape
    all_atom_feat_shapes.append(atom_shape)
    all_bond_feat_shapes.append(bond_shape)

    # Only the first five successes get a detailed record.
    if len(example_details) >= 5:
        continue

    name = MOLECULE_NAMES.get(smi, "(name not set)")
    example_details.append(
        dict(
            id=idx,
            smiles=smi,
            name=name,
            atom_shape=atom_shape,
            bond_shape=bond_shape,
        )
    )

if all_atom_feat_shapes:
    first_atom_shape = all_atom_feat_shapes[0]
    first_bond_shape = all_bond_feat_shapes[0]

    print("\nExample featurization summary")
    print("-----------------------------------")
    print(f"Number of molecules featurized: {len(all_atom_feat_shapes)}")
    print(f"Atom feature shape of first molecule: {first_atom_shape}  (N_atoms, 74)")
    print(f"Bond feature shape of first molecule: {first_bond_shape}  (N_bonds*2, 12)")

    print("\nExample molecules (up to 5):")
    for ex in example_details:
        print(
            f"  id={ex['id']}, name={ex['name']}, "
            f"smiles={ex['smiles']}, "
            f"atoms_shape={ex['atom_shape']}, bonds_shape={ex['bond_shape']}"
        )
else:
    print("No valid SMILES were featurized.")



Loaded 100 SMILES from example_smiles.csv

Example featurization summary
-----------------------------------
Number of molecules featurized: 100
Atom feature shape of first molecule: (2, 74)  (N_atoms, 74)
Bond feature shape of first molecule: (2, 12)  (N_bonds*2, 12)

Example molecules (up to 5):
  id=0, name=ethane, smiles=CC, atoms_shape=(2, 74), bonds_shape=(2, 12)
  id=1, name=propane, smiles=CCC, atoms_shape=(3, 74), bonds_shape=(4, 12)
  id=2, name=butane, smiles=CCCC, atoms_shape=(4, 74), bonds_shape=(6, 12)
  id=3, name=ethanol, smiles=CCO, atoms_shape=(3, 74), bonds_shape=(4, 12)
  id=4, name=(name not set), smiles=CCN, atoms_shape=(3, 74), bonds_shape=(4, 12)
