In [None]:
from pathlib import Path
# Output directory for the generated Parquet files (relative to the working directory).
DATA_DIR = Path("data")   # tweak as you like

from datasets import load_dataset
import pyarrow.parquet as pq
import pyarrow as pa

REPO = "antoinebcx/smiles-molecules-chembl"     # or "HoangHa/chembl-smiles-pretrain"
SMILES_COL = "smiles"                           # change to "molecule_smiles" for the other repo
SPLITS = ["train", "validation", "test"]        # keep all three so `split='all'` still works
BATCH_ROWS = 1_000_000                          # rows pulled from the stream per Arrow table
# Fixed output schema: keeps every batch type-compatible and lets an empty
# split still produce a valid (zero-row) Parquet file.
OUT_SCHEMA = pa.schema([("SMILES", pa.string())])

# The Parquet writer does not create missing parent directories.
DATA_DIR.mkdir(parents=True, exist_ok=True)

for split in SPLITS:
    print(f"► Writing {split} …")
    # 1) stream the split instead of downloading/materialising it up front
    ds_stream = load_dataset(REPO, split=split, streaming=True)
    # 2) write each streamed batch straight to the Parquet file so that at most
    #    one batch is held in memory at a time (one file per split; each batch
    #    is written by its own write_table call).
    with pq.ParquetWriter(
        DATA_DIR / f"chembl_{split}.parquet",
        OUT_SCHEMA,
        compression="zstd",                     # snappy or gzip are fine too
        version="2.6",                          # Parquet format version (selects available logical types)
    ) as writer:
        for batch in ds_stream.iter(BATCH_ROWS):
            # `batch` maps column name -> list of values; keep only the SMILES
            # column and rename it to "SMILES" in the output.
            writer.write_table(
                pa.Table.from_pydict({"SMILES": batch[SMILES_COL]}, schema=OUT_SCHEMA)
            )