In [None]:
from utilities import metal_element, write_components, identify_host_guest_multiple, move_need_check_file, \
    format_molecular_formula, get_mol_formula
from utilities import metal_element
import os
import shutil
from rdkit import Chem
from ccdc import io
from tqdm import tqdm

In [None]:
# Root of the working tree; everything produced by step 1.1 lives below `outdir`.
base_dir = '/path/to/your/directory'
outdir = os.path.join(base_dir, '1.1.ConfigAndSplit')
organic_path = os.path.join(outdir, 'Organic')
inorganic_path = os.path.join(outdir, 'Inorganic')
check_dir = os.path.join(outdir, 'CheckDir')

# Create the full output layout up front (existing folders are left alone).
for folder in (
    outdir,
    check_dir,
    organic_path,
    inorganic_path,
    os.path.join(check_dir, 'Organic'),
    os.path.join(check_dir, 'Inorganic'),
    os.path.join(organic_path, 'disorder'),
    os.path.join(organic_path, 'no_disorder'),
    os.path.join(inorganic_path, 'disorder'),
    os.path.join(inorganic_path, 'no_disorder'),
):
    os.makedirs(folder, exist_ok=True)

# Open the default database and report how many entries it holds.
csd_reader = io.EntryReader()
print(len(csd_reader))


In [None]:
"""
Run this cell to screen the whole CSD database automatically: disorder
handling, molecule extraction, chemical structure validation,
and host-guest identification.
The results are then categorized and saved as molecular structure files.

For testing purposes, slice the `refcodes` list, e.g. keep only the first 10,000 crystal structures.
"""

%%time
import warnings
warnings.filterwarnings('ignore')

from rdkit import rdBase
# Silence RDKit's warning and error log channels during the bulk run.
rdBase.DisableLog('rdApp.warning')
rdBase.DisableLog('rdApp.error')

# NOTE(review): this builds every crystal's molecule just to read its identifier;
# the entry's own identifier is presumably equivalent and cheaper -- confirm before changing.
refcodes = [e.crystal.molecule.identifier for e in csd_reader]
print(f'Successfully load {len(refcodes)} refcodes')

# Checkpoint file: one finished refcode per line, so an interrupted run can resume.
breakpoint_txt = os.path.join(base_dir, '1.1.breakpoint.txt')
if os.path.exists(breakpoint_txt):
    with open(breakpoint_txt) as f:
        # A set gives O(1) "already done?" checks; blank lines are ignored.
        finished_refcodes = {line.rstrip() for line in f if line.strip()}
else:
    finished_refcodes = set()

# Append to the checkpoint instead of rewriting it on every iteration: rewriting
# the stripped entries loaded above would glue them together on a single line.
with open(breakpoint_txt, 'a') as checkpoint:
    for refcode in tqdm(refcodes):
        if refcode in finished_refcodes:
            continue
        crystal = csd_reader.crystal(refcode)
        molecule = crystal.molecule
        components = molecule.components
        total_formula = molecule.formula

        # A structure counts as organic when no metal symbol occurs in its formula.
        # NOTE(review): this is a substring test on the formula text, so a symbol
        # that is a prefix of another symbol could match by accident -- verify
        # against the contents of `metal_element`.
        is_organic = not any(ele in total_formula for ele in metal_element)

        category_name = 'Organic' if is_organic else 'Inorganic'
        category_root = organic_path if is_organic else inorganic_path
        disorder_name = 'disorder' if crystal.has_disorder else 'no_disorder'
        work_dir = os.path.join(category_root, disorder_name)

        # Write all the components in the cif structure
        write_components(components, work_dir, refcode)
        try:
            # Identify host and guest mol file from components
            identify_host_guest_multiple(components, work_dir, refcode,
                                         threshold=2., dist_threshold=1.5)
        except Exception:
            # Best effort: anything that cannot be classified automatically is
            # moved aside for manual inspection instead of aborting the run.
            move_need_check_file(components, work_dir,
                                 os.path.join(check_dir, category_name), refcode)

        # Record progress immediately so a crash loses at most the current entry.
        checkpoint.write(f'{refcode}\n')
        checkpoint.flush()
        finished_refcodes.add(refcode)

In [None]:
"""
Incorporating molecular formula checks can further enhance the efficiency of the screening process.
"""

# Output layout of step 2.1: one folder per classification outcome.
target_dir = '/path/to/your/directory/2.1.RefineHostGuest'
inorganic_dir = os.path.join(target_dir, 'Inorganic')
organic_dir = os.path.join(target_dir, 'Organic')
host_check_dir = os.path.join(target_dir, 'HostCheck')
guest_check_dir = os.path.join(target_dir, 'GuestCheck')
metal_guest_dir = os.path.join(target_dir, 'MetalGuest')

for dire in [target_dir, inorganic_dir, organic_dir, host_check_dir, guest_check_dir, metal_guest_dir]:
    os.makedirs(dire, exist_ok=True)

# Prerequisite (manual step): copy all structures from the "Organic" and
# "Inorganic" folders into the "All" folder.
os.chdir('/path/to/your/directory/1.1.ConfigAndSplit/All')
files = os.listdir(os.getcwd())
# The refcode is the part of each file name before the first underscore.
refcodes = {file.split('_')[0] for file in files}
csd_reader = io.EntryReader('CSD')

skipped_refcodes = []
for refcode in tqdm(refcodes):
    host_file = f'{refcode}_1.mol'
    guest_file = f'{refcode}_2.mol'

    # Stray files or incomplete host/guest pairs would make RDKit raise; skip them.
    if not (os.path.isfile(host_file) and os.path.isfile(guest_file)):
        skipped_refcodes.append(refcode)
        continue

    guest_mol = Chem.MolFromMolFile(guest_file, sanitize=True, removeHs=False)
    host_mol = Chem.MolFromMolFile(host_file, sanitize=True, removeHs=False)
    # RDKit returns None (rather than raising) when a mol block cannot be
    # parsed or sanitized; leave such pairs in place for manual inspection.
    if host_mol is None or guest_mol is None:
        skipped_refcodes.append(refcode)
        continue

    host_has_metal = any(atom.GetSymbol() in metal_element for atom in host_mol.GetAtoms())
    guest_has_metal = any(atom.GetSymbol() in metal_element for atom in guest_mol.GetAtoms())

    guest_formula = format_molecular_formula(get_mol_formula(guest_file))
    host_formula = format_molecular_formula(get_mol_formula(host_file))

    # Formulas of the components stored in the database for this refcode.
    crystal = csd_reader.crystal(refcode.upper())
    components = crystal.molecule.components
    component_formulas = {format_molecular_formula(comp.formula) for comp in components}

    # Pick one destination for the pair; the first matching rule wins.
    if guest_formula not in component_formulas:
        destination = guest_check_dir
    elif host_formula not in component_formulas:
        destination = host_check_dir
    elif guest_has_metal:
        destination = metal_guest_dir
    elif host_has_metal:
        destination = inorganic_dir
    else:
        destination = organic_dir

    shutil.move(host_file, destination)
    shutil.move(guest_file, destination)

if skipped_refcodes:
    print(f'Skipped {len(skipped_refcodes)} refcodes with missing or unreadable host/guest mol files')