# Calculate characteristics of molecular building blocks

## set up

In [1]:
import os

from rdkit import Chem
from rdkit.Chem import rdchem
from rdkit.Chem import Descriptors

import re
from mendeleev import C, H, O

from scipy import constants
from scipy.constants import N_A

from MDAnalysis.lib.log import ProgressBar

import pandas as pd
import numpy as np

## define functions

In [2]:
def get_mol_data(fname: str):
    """
    Return characteristics of the molecule stored in a .mol file.

    Parameters
    ----------
    fname : str
        Path to the .mol file to read.

    Returns
    -------
    tuple
        ``(n_atoms, chemformula, c, h, o, hc, oc, mw, mass, sssr,
        aromaticity, oh, carbonyl, ether)`` where

        - ``n_atoms``     : value of ``mol.GetNumAtoms()``
        - ``chemformula`` : formula string from ``CalcMolFormula``
        - ``c, h, o``     : C, H and O counts parsed from ``chemformula``
        - ``hc, oc``      : H/C and O/C ratios
        - ``mw``          : molecular weight from ``Descriptors.MolWt``
        - ``mass``        : ``mw`` divided by Avogadro's number
        - ``sssr``        : number of rings reported by ``Chem.GetSSSR``
        - ``aromaticity`` : percentage of carbon atoms flagged aromatic
          (fixed at 100 when ``sssr`` exceeds 120, see below)
        - ``oh, carbonyl, ether`` : number of substructure matches for
          C-O-H, C=O and C-O-C respectively

    Raises
    ------
    ValueError
        If RDKit cannot read a molecule from ``fname``.
    ZeroDivisionError
        If the formula contains no carbon (the ratios are per carbon).
    """

    # load mol; sanitization is deferred until the ring count is known
    mol = Chem.MolFromMolFile(fname, removeHs=False, sanitize=False)
    if mol is None:
        # MolFromMolFile signals an unreadable file by returning None
        raise ValueError(
            'RDKit could not read a molecule from {!r}'.format(fname)
        )
    Chem.Kekulize(mol)

    # calc no of atoms
    n_atoms = mol.GetNumAtoms()

    # calc chemical composition
    # Parse (element, count) pairs instead of splitting on non-digits: an
    # element that occurs once has no digit in the formula (e.g. "C48H36O"),
    # and other elements may sit between H and O, so positions are unreliable.
    chemformula = Chem.rdMolDescriptors.CalcMolFormula(mol)
    counts = {}
    for symbol, number in re.findall(r'([A-Z][a-z]?)(\d*)', chemformula):
        counts[symbol] = counts.get(symbol, 0) + (int(number) if number else 1)
    c = counts.get('C', 0)
    h = counts.get('H', 0)
    o = counts.get('O', 0)

    hc = h / c
    oc = o / c

    # calc mw and mass (mass is per molecule: mw / Avogadro's number)
    mw = Chem.Descriptors.MolWt(mol)
    mass = mw / N_A

    # calc domain size (number of rings in the smallest set of smallest rings)
    # NOTE(review): depending on the RDKit release GetSSSR returns either the
    # ring count or a sequence of rings; normalise both to an int.
    rings = Chem.GetSSSR(mol)
    sssr = rings if isinstance(rings, int) else len(rings)

    # calc aromaticity if small enough molecule
    if sssr > 120:
        # Molecule is not sanitized here, so aromaticity is not perceived;
        # it is assumed to be 100 % (presumably to avoid an expensive
        # sanitization of very large ring systems -- TODO confirm).
        aromaticity = 100

    else:
        Chem.SanitizeMol(mol)

        n_aromatic = sum(
            1
            for atom in mol.GetAtoms()
            if atom.GetSymbol() == 'C' and atom.GetIsAromatic()
        )

        aromaticity = n_aromatic / c * 100

    # count oxygen-containing functional groups via SMARTS patterns
    f_oh = Chem.MolFromSmarts('[#6]-[O]-[H]', mergeHs=True)
    f_carbonyl = Chem.MolFromSmarts('[#6]=[O]')
    f_ether = Chem.MolFromSmarts('[#6][#8][#6]')

    oh = len(mol.GetSubstructMatches(f_oh))
    carbonyl = len(mol.GetSubstructMatches(f_carbonyl))
    ether = len(mol.GetSubstructMatches(f_ether))

    return n_atoms, chemformula, c, h, o, hc, oc, mw, mass, sssr, aromaticity, oh, carbonyl, ether

## collect the names of the .mol files in the working directory

In [3]:
# Names of all .mol files in the current working directory, in sorted order
file_names = sorted(
    entry for entry in os.listdir() if entry.endswith('.mol')
)

In [4]:
# Show the sorted list of .mol file names that will be processed
print(file_names)

['0001.mol', '0002.mol', '0003.mol', '0004.mol', '0005.mol', '0006.mol', '0007.mol', '0008.mol', '0009.mol', '0010.mol', '0011.mol', '0012.mol', '0013.mol', '0014.mol', '0015.mol', '0016.mol', '0017.mol', '0018.mol', '0019.mol', '0020.mol', '0021.mol', '0022.mol', '0023.mol', '0024.mol', '0025.mol', '0026.mol', '0027.mol', '0028.mol', '0029.mol', '0030.mol', '0031.mol', '0032.mol', '0033.mol', '0034.mol', '0035.mol', '0036.mol', '0037.mol', '0038.mol', '0039.mol', '0040.mol', '0041.mol', '0042.mol', '0043.mol', '0044.mol', '0045.mol', '0046.mol', '0047.mol', '0048.mol', '0049.mol', '0050.mol', '0051.mol', '0052.mol', '0053.mol', '0054.mol', '0055.mol', '0056.mol', '0057.mol']


## get mol data and save

In [5]:
# Map every .mol file name to the tuple returned by get_mol_data,
# iterating through ProgressBar so progress is displayed while running
mol_data = dict(
    (mol_file, get_mol_data(mol_file)) for mol_file in ProgressBar(file_names)
)

  0%|          | 0/57 [00:00<?, ?it/s]



In [6]:
# One row per .mol file (file name as index); the column labels follow the
# order of the values in the tuple returned by get_mol_data
mol_df = pd.DataFrame.from_dict(
    mol_data,
    orient='index',
    columns=[
        'n_atoms',
        'chemformula',
        'C',
        'H',
        'O',
        'H/C',
        'O/C',
        'MW',
        'mass /g',
        'SSSR',
        'aromaticity (%)',
        'C-OH',
        'C=O',
        'C-O-C',
    ],
)
# Bare expression so the notebook renders the table
mol_df

Unnamed: 0,n_atoms,chemformula,C,H,O,H/C,O/C,MW,mass /g,SSSR,aromaticity (%),C-OH,C=O,C-O-C
0001.mol,57,C48H36O9,48,36,9,0.75,0.1875,756.807,1.256708e-21,8,87.5,3.0,2.0,4.0
0002.mol,110,C59H40O11,59,40,11,0.677966,0.186441,924.958,1.535929e-21,11,84.745763,4.0,3.0,4.0
0003.mol,180,C150H76O30,150,76,30,0.506667,0.2,2358.228,3.91593e-21,40,90.666667,10.0,7.0,13.0
0004.mol,64,C53H42O11,53,42,11,0.792453,0.207547,854.908,1.419608e-21,7,71.698113,0.0,4.0,7.0
0005.mol,68,C57H38O11,57,38,11,0.666667,0.192982,898.92,1.4926919999999999e-21,9,70.175439,0.0,6.0,5.0
0006.mol,71,C60H50O11,60,50,11,0.833333,0.183333,947.049,1.572612e-21,9,73.333333,3.0,5.0,3.0
0007.mol,89,C74H50O15,74,50,15,0.675676,0.202703,1179.199,1.958106e-21,13,81.081081,5.0,7.0,3.0
0008.mol,93,C79H52O14,79,52,14,0.658228,0.177215,1225.271,2.03461e-21,16,81.012658,3.0,6.0,5.0
0009.mol,146,C83H50O13,83,50,13,0.60241,0.156627,1255.3,2.084475e-21,20,84.337349,2.0,4.0,7.0
0010.mol,360,C228H88O44,228,88,44,0.385965,0.192982,3531.168,5.863642e-21,66,94.736842,12.0,19.0,13.0


In [7]:
# Write the table to a CSV file one directory above the working directory
mol_df.to_csv(path_or_buf='../molecules_info.csv')