In [11]:
import os
from glob import glob

# CHANGE THESE TO YOUR MOL AND JDX FOLDERS
MOL_FOLDER = 'mol'
JDX_FOLDER = 'jdx'

mol_files = glob(os.path.join(MOL_FOLDER, '*.mol'))
jdx_files = glob(os.path.join(JDX_FOLDER, '*.jdx'))

# Index every .jdx file by its file stem (base name without the extension),
# mapping that stem to the file's absolute path.
jdx_cas_to_jdx_path = {
    os.path.splitext(os.path.basename(path))[0]: os.path.abspath(path)
    for path in jdx_files
}

# Pair each .mol file with its .jdx files. The .jdx stems are expected to be
# "<mol stem>-IR-0", "<mol stem>-IR-1", ...; the lookup stops at the first
# number that has no matching file.
mol_to_jdx = {}
for mol_path in mol_files:
    stem = os.path.splitext(os.path.basename(mol_path))[0]
    spectra = []

    # len(spectra) is always the next index to probe.
    candidate = stem + ("-IR-%i" % (len(spectra)))
    while candidate in jdx_cas_to_jdx_path:
        spectra.append(jdx_cas_to_jdx_path[candidate])
        candidate = stem + ("-IR-%i" % (len(spectra)))

    mol_to_jdx[os.path.abspath(mol_path)] = spectra

# Show the resulting pairing: one MOL line followed by its JDX lines.
for mol_abs_path, spectrum_paths in mol_to_jdx.items():
    print('\nMOL: %s' % (mol_abs_path))
    for spectrum_path in spectrum_paths:
        print("\tJDX: %s" % (spectrum_path))


MOL: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Prediction-Graph-Models-main\mol\10-05-9.mol

MOL: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Prediction-Graph-Models-main\mol\10-15-1.mol

MOL: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Prediction-Graph-Models-main\mol\10-90-2.mol

MOL: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Prediction-Graph-Models-main\mol\10-94-6.mol

MOL: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Prediction-Graph-Models-main\mol\10-95-7.mol

MOL: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Prediction-Graph-Models-main\mol\10-99-1.mol

MOL: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Prediction-Graph-Models-main\mol\100-00-5.mol
	JDX: C:\Users\20234238\Downloads\IR-Spectra-Prediction-Graph-Models-main\IR-Spectra-Pred

In [12]:
def is_jdx_valid(jdx_file):
    """Tell whether a .jdx file contains measurement data.

    Measurements in these files are introduced by the ``##XYDATA=`` marker;
    a file without that marker carries no measurements and is treated as
    invalid.

    Args:
        jdx_file: Path of the .jdx file to inspect.

    Returns:
        True if the file's text contains ``##XYDATA=``, otherwise False.
        A missing file is reported on stdout and also yields False.
    """
    try:
        with open(jdx_file, 'r') as handle:
            text = handle.read()
    except FileNotFoundError:
        print('File %s does not exist!' % (jdx_file))
        return False

    return '##XYDATA=' in text

# Keep only the .jdx files that contain measurements, and drop every .mol
# file that ends up with no IR data at all. Iterating over a snapshot of the
# keys makes it safe to delete entries while looping.
for mol_file in list(mol_to_jdx):
    valid_jdx = list(filter(is_jdx_valid, mol_to_jdx[mol_file]))
    if valid_jdx:
        mol_to_jdx[mol_file] = valid_jdx
    else:
        del mol_to_jdx[mol_file]

# Total number of spectra that survived the filtering.
jdx_count = sum(len(jdx_list) for jdx_list in mol_to_jdx.values())
print('%i .jdx files corresponding to %i .mol files' % (jdx_count, len(mol_to_jdx)))

2912 .jdx files corresponding to 2124 .mol files


In [23]:
import re
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors


# Compiled substructure queries, keyed by functional-group name.
# To register another group, add a ('group_name', 'smarts_code') pair below;
# each SMARTS string is compiled with Chem.MolFromSmarts('smarts_code').
# NOTE(review): '[OH]' is an oxygen carrying one hydrogen, so it presumably
# also matches hydroxyls that are part of other groups (e.g. acids) — confirm
# this is the intended definition of "alcohol".
functional_group_patterns = {
    group_name: Chem.MolFromSmarts(smarts_code)
    for group_name, smarts_code in (
        ('alcohol', '[OH]'),
    )
}

def determine_functional_groups(mol_file):
    """List the functional groups found in a molecule.

    The molecule is read from ``mol_file`` and tested against every query in
    ``functional_group_patterns``.

    Args:
        mol_file: Path of the .mol file to analyse.

    Returns:
        A list with the name (key of ``functional_group_patterns``) of every
        group whose pattern matches the molecule, in the table's order.
        The list is empty when nothing matches or when the file could not be
        parsed into a molecule.
    """
    mol = Chem.MolFromMolFile(mol_file)

    # MolFromMolFile yields None when the file cannot be parsed.
    if mol is None:
        return []

    # RDKit spells the method HasSubstructMatch (capital H). Return the
    # accumulated list of names, not the last loop variable.
    return [
        group_name
        for group_name, pattern in functional_group_patterns.items()
        if mol.HasSubstructMatch(pattern)
    ]

# Other groups to consider: carbon-carbon double bond, benzene ring, acid group

def get_state(jdx_file):
    """Read the physical state recorded in a .jdx file.

    The first line starting with ``##STATE=`` is used. Its value is stripped
    and lower-cased, then cut at the first run of separator characters
    (``;``, ``,``, space or parentheses), so only the leading word is kept.

    Args:
        jdx_file: Path of the .jdx file to inspect.

    Returns:
        The leading word of the state value (an empty string if the value is
        empty or begins with a separator), ``'Unknown'`` when the file has no
        ``##STATE=`` line, or ``False`` when the file does not exist (the
        problem is also reported on stdout).
    """
    state_prefix = '##STATE='

    try:
        with open(jdx_file, 'r') as handle:
            records = handle.readlines()
    except FileNotFoundError:
        print('File %s does not exist!' % (jdx_file))
        return False

    # Only the first ##STATE= record matters.
    state_record = next(
        (record for record in records if record.startswith(state_prefix)),
        None,
    )
    if state_record is None:
        return 'Unknown'

    value = state_record[len(state_prefix):].strip().lower()
    return re.split(r"[;, ()]+", value)[0]
    
# Tally how many molecules contain an alcohol group and how many spectra were
# recorded in each physical state.
num_alcohols = 0
states = {}

for mol_file, jdx_list in mol_to_jdx.items():
    functional_groups = determine_functional_groups(mol_file)
    # Test for the 'alcohol' entry specifically: checking only that *some*
    # group was found would miscount as soon as the pattern table contains
    # more than one functional group.
    if 'alcohol' in functional_groups:
        num_alcohols += 1
    for jdx_file in jdx_list:
        state = get_state(jdx_file)
        # A state seen for the first time starts from zero.
        states[state] = states.get(state, 0) + 1
print('There are %i alcohols' % num_alcohols)
print(states)
            

There are 0 alcohols
{'gas': 1413, 'solution': 429, 'solid': 575, 'liquid': 377, 'Unknown': 83, 'film': 1, 'vapor': 31, 'salted': 1, 'visc.': 1, 'neat': 1}
