In [None]:
# https://pubchemdocs.ncbi.nlm.nih.gov/pubchem3d
# ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound_3D/01_conf_per_cmpd/SDF/
# ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound_3D/10_conf_per_cmpd/SDF/

To minimize disk usage and server load:
1. Sort all CIDs in the dataset
2. Download all .sdf file *names* from the FTP site
3. Group CIDs by the corresponding .sdf file
4. For each group
    - Download the corresponding .sdf file
    - Extract conformers for each CID
    - Delete the .sdf file
    - Save conformers to disk

In [None]:
# from pybel import readstring
import gzip
import re
from ftplib import FTP
from os.path import basename
from tempfile import NamedTemporaryFile

import pandas as pd
from rdkit.Chem.PandasTools import LoadSDF

def _start_session(host):
    """Connect to the FTP server at ``host`` and log in.

    ``login()`` is called without credentials, which ``ftplib`` treats as an
    anonymous login.

    Returns:
        The logged-in ``ftplib.FTP`` session. The caller owns the connection
        and is responsible for closing it.
    """
    session = FTP(host)
    session.login()
    return session

def _construct_path(n10=False):
    return '/pubchem/Compound_3D/' + ('10' if n10 else '01') + '_conf_per_cmpd/SDF'

def _fetch_sdf_paths(n10=False):
    """List the entries of the 3D SDF directory on the PubChem FTP server.

    Args:
        n10: Forwarded to ``_construct_path`` to select the ``10_conf_per_cmpd``
            directory instead of ``01_conf_per_cmpd``.

    Returns:
        The list of names returned by the server's ``NLST`` command for that
        directory.
    """
    sdf_path = _construct_path(n10)
    # Use the session as a context manager so the connection is closed even
    # when the listing fails (the session was previously never closed).
    with _start_session('ftp.ncbi.nlm.nih.gov') as sess:
        return sess.nlst(sdf_path)

def _extract_cid_ranges(fpaths):
    def cid_range(fname):
        groups = re.match(r'([0-9]+)_([0-9]+)', fname)
        rmin = groups.group(1)
        rmax = groups.group(2)
        return [rmin,rmax,fname]
    sdf_files = [basename(p) for p in fpaths]
    cid_ranges = [cid_range(fname) for fname in sdf_files]
    return cid_ranges

def _group_cids(cids, ranges):
    cid_groups = dict()
    for c in cids:
        for r in ranges:
            rmin = r[0]
            rmax = r[1]
            fname = r[2]
            if c >= rmin and c <= rmax:
                cid_groups.setdefault(fname, []).append(c) 
                break
    return cid_groups

def _fetch_sdf(fname, n10=False):
    """Download one SDF archive from the PubChem FTP server and load it.

    The archive is transferred in binary mode into a temporary file, which is
    removed automatically once it has been parsed. Names ending in ``.gz`` are
    decompressed on the fly before being handed to ``LoadSDF``.

    Args:
        fname: Name (or server-side path) of the file to retrieve; it is sent
            as the argument of ``RETR`` after changing into the SDF directory.
        n10: Forwarded to ``_construct_path`` to select the directory.

    Returns:
        Whatever ``LoadSDF(..., embedProps=True)`` returns for the file.
    """
    cmd = 'RETR ' + fname
    fpath = _construct_path(n10)
    # The context manager closes the FTP connection even on failure.
    with _start_session('ftp.ncbi.nlm.nih.gov') as sess:
        sess.cwd(fpath)
        with NamedTemporaryFile(suffix='.sdf.gz') as temp:
            # The archive is gzip-compressed binary data, so it must be
            # fetched with retrbinary; retrlines would corrupt it.
            sess.retrbinary(cmd, temp.write)
            temp.seek(0)
            if fname.lower().endswith('.gz'):
                with gzip.open(temp, 'rb') as sdf:
                    res_sdf = LoadSDF(sdf, embedProps=True)
            else:
                res_sdf = LoadSDF(temp, embedProps=True)
    return res_sdf

def fetch3D(cids, n10=False):
    """Fetch the 3D SDF records for the given CIDs from the PubChem FTP server.

    The CIDs are grouped by the archive that covers them, so each archive is
    downloaded at most once. Rows from every archive are accumulated and
    concatenated at the end; the previous in-loop ``how='right'`` merge dropped
    the rows collected from earlier archives.

    Args:
        cids: Iterable of CIDs, each an int or a string of digits.
        n10: When truthy, use the ``10_conf_per_cmpd`` archives instead of
            ``01_conf_per_cmpd``.

    Returns:
        A ``pandas.DataFrame`` holding the rows of the loaded SDF frames whose
        ``ID`` matches one of the requested CIDs. When nothing matches, or
        ``cids`` is empty, an empty frame with only an ``ID`` column is
        returned.
    """
    cids = list(cids)
    if not cids:
        # Nothing requested: avoid touching the server at all.
        return pd.DataFrame(columns=['ID'])

    sdf_paths = _fetch_sdf_paths(n10)
    cid_ranges = _extract_cid_ranges(sdf_paths)
    cid_groups = _group_cids(sorted(cids, key=int), cid_ranges)

    matches = []
    for sdf, members in cid_groups.items():
        res_sdf = _fetch_sdf(sdf, n10)
        # NOTE(review): assumes the ID column produced by LoadSDF holds the CID
        # as an unpadded decimal string — confirm against a downloaded file.
        wanted = [str(int(m)) for m in members]
        matches.append(res_sdf[res_sdf.ID.isin(wanted)])

    if not matches:
        return pd.DataFrame(columns=['ID'])
    return pd.concat(matches, ignore_index=True)

In [None]:
# Trial run for a single CID; needs network access to the PubChem FTP server.
fetch3D(['1450078'])

In [None]:
# Scratch check: basename() keeps only the final component of an archive path.
from os.path import basename
basename('pubchem/Compound_3D/01_conf_per_cmpd/SDF/04550001_04575000.sdf.gz')

In [None]:
# Scratch check: match the two 8-digit CID bounds at the start of the file name.
import re
mm = re.match(r'(\d{8})_(\d{8})', basename('pubchem/Compound_3D/01_conf_per_cmpd/SDF/04550001_04575000.sdf.gz'))
# group(0) is the whole matched text, i.e. both bounds joined by the underscore.
mm.group(0)

In [None]:
# Scratch check: list concatenation followed by extend(); the last line displays x.
x = [] + ['hi']
x.extend([''])
x

In [None]:
# Scratch check: concatenating a list that contains a list keeps the inner list as one element.
[] + [['hi']]

In [None]:
# Scratch check: str.join() with a single space as the separator.
' '.join(['1,2,3,4', '4567'])

In [None]:
# Load 'temp.sdf' from the working directory.
# NOTE(review): no visible cell creates a file with this exact name — confirm it exists before running.
xx = LoadSDF('temp.sdf')

In [None]:
# Display the value bound to xx.
xx