# Converting the Cambridge Structural Database (CSD) to compressed SDF files

Before running this notebook, you'll need to:

1. **Obtain a CSD license** - Contact the Cambridge Crystallographic Data Centre (CCDC) to get access to the Cambridge Structural Database.

2. **Download the CSD data** - After getting your license, download the database files from the CCDC portal.

3. **Set up the environment** - Create a conda environment using the provided environment file:
    ```bash
    conda env create -f env/csd-api-env.yml
    conda activate csd-api-env
    ```

This notebook converts the CSD, which is stored as an SQLite database, into a series of gzip-compressed SDF files written in fixed-size batches.

In [None]:
import gzip
import shutil
from pathlib import Path

from ccdc.io import EntryReader
from ccdc.io import MoleculeWriter

In [None]:
def compress2gzip(input_file: str | Path, output_file: str | Path, remove_original: bool = False) -> None:
    """
    Compress a file to gzip format.

    Parameters
    ----------
    input_file : str or Path
        Path to the input file that will be compressed.
    output_file : str or Path
        Path to the output compressed gzip file.
    remove_original : bool, optional
        If True, the original file will be removed after compression. Default is False.

    Returns
    -------
    None
        Function compresses a file but doesn't return any value.

    Raises
    ------
    TypeError
        If input_file or output_file are not str or Path objects.

    Notes
    -----
    Uses gzip and shutil to efficiently compress the input file.
    The original file is preserved unless remove_original is set to True.
    """
    # check if inputs are str or Path
    if not (isinstance(input_file, (str, Path)) and isinstance(output_file, (str, Path))):
        raise TypeError("input_file and output_file must be str or Path")

    # convert str to Path if needs be
    input_path = Path(input_file) if isinstance(input_file, str) else input_file
    output_path = Path(output_file) if isinstance(output_file, str) else output_file

    # compress file
    with input_path.open("rb") as f_in:
        with gzip.open(output_path, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

    # remove original file if requested
    if remove_original:
        input_path.unlink()

In [None]:
# Export settings and the mutable state shared with the export loop.

# Maximum number of molecules written to one SDF file before a new file is started.
batch_size = 10_000

# Running counters advanced by the export loop:
#   file_count - index of the batch file currently being written
#   mol_count  - total number of molecules written so far
file_count = 0
mol_count = 0

# Reader the export loop iterates over to obtain the CSD entries.
csd_reader = EntryReader("CSD")

# Destination folder for the batch files; created (with parents) if missing.
output_dir = Path("../data/processed/csd_sdf_batches")
output_dir.mkdir(parents=True, exist_ok=True)

# Writer for the first batch file (index 0, zero-padded to three digits).
sdf_writer = MoleculeWriter(output_dir / f"CSD_batch_{file_count:03d}.sdf")

In [None]:
# Stream every CSD entry into SDF batch files of at most `batch_size`
# molecules; each finished batch is gzip-compressed and the plain .sdf removed.
#
# Relies on the state prepared in the setup cell: `csd_reader`, `sdf_writer`
# (already open on batch `file_count`), `output_dir`, `batch_size`,
# `mol_count` and `file_count`. Re-run the setup cell before re-running this
# one, otherwise the counters and the (closed) writer are stale.

# Path of the batch file the current writer is filling.
batch_path = output_dir / f"CSD_batch_{file_count:03d}.sdf"
# Tracks whether `sdf_writer` still needs closing, so that the `finally`
# clause never closes an already-closed writer.
writer_open = True

try:
    for entry in csd_reader:
        mol = entry.molecule

        # A full batch has been written: rotate to the next file before
        # writing this molecule. The `mol_count > 0` guard keeps the very
        # first iteration from rotating an empty file.
        if mol_count % batch_size == 0 and mol_count > 0:
            # close current SDF writer
            sdf_writer.close()
            writer_open = False
            # compress the finished batch and drop the uncompressed copy
            compress2gzip(batch_path, batch_path.with_name(f"{batch_path.name}.gz"), remove_original=True)
            file_count += 1
            batch_path = output_dir / f"CSD_batch_{file_count:03d}.sdf"
            sdf_writer = MoleculeWriter(batch_path)
            writer_open = True

        sdf_writer.write(mol)
        mol_count += 1
finally:
    # Always release the writer, even if reading/writing failed or the cell
    # was interrupted, so the current batch file is not left open.
    if writer_open:
        sdf_writer.close()

# Only reached when the loop finished without error: compress the last
# (possibly partial) batch.
compress2gzip(batch_path, batch_path.with_name(f"{batch_path.name}.gz"), remove_original=True)