In [1]:
import pandas as pd
import rdkit
from collections import Counter

# Read DataFrame

In [2]:
df = pd.read_csv('HoV_results.csv')
oxygenate_condition = (df.smiles.str.contains('O')) | (df.smiles.str.contains('o'))

df_hydrocarbons = df[ ~oxygenate_condition ]
df_oxygenates = df[oxygenate_condition]
print(len(df_hydrocarbons.smiles.unique()), len(df_oxygenates.smiles.unique()))

3153 4247


# Oxygenates

In [3]:
def oxygenates_FG(smiles):
    mol = rdkit.Chem.MolFromSmiles(smiles)
    
    FG_names = ['Acyclic_Ester', 'Cyclic_Ester', 
                'Acyclic_Ether', 'Cyclic_Ether',
                'COOH', '0ry_Alcohol', '1ry_Alcohol', '2ry_Alcohol', '3ry_Alcohol', 
                'Carbonyl1', 'Carbonyl2', 'peroxide', 'phenolics']
    
    SMARTS_list = ['[CX3;!R](=[OX1])O[C,c]', '[CX3;R](=[OX1])[O;R][C,c]',
                   '[OD2;!R]([#6])[#6]', '[OD2;R]([#6;R])[#6;R]',
                   '*-C(=O)[O;D1]', '[CH3]-[O;D1]', '[CH2]-[O;D1]', '[CH1]-[O;D1]', '[CH0]-[O;D1]',
                    '*-C(=O)-[C;D1]', '*=[O;D1]', '[O]-[O]', '[c;R]-[O;D1]'
                  ]
    FG_SMARTS = [ rdkit.Chem.MolFromSmarts(x) for x in SMARTS_list  ]
    
    
    FG_count = [len(mol.GetSubstructMatches(x)) for x in FG_SMARTS]
    
    if sum(FG_count) == 0:
        if smiles == 'O' or smiles == '[C-]#[O+]':
            return 'water or carbon monoxide'
        else:
            return 'Furanics'
    else:
        #FG_names_lumped = ['Acyclic_Ester', 'Cyclic_Ester', 
        #                    'Acyclic_Ether', 'Cyclic_Ether', 'COOH',
        #                    'Alcohol', 'Carbonyls', 'peroxide', 'phenolic']
        FG_names_lumped = ['Ester', 'Ester', 
                            'Ether', 'Ether', 'Carbonyls',
                            'Alcohol', 'Carbonyls', 'peroxide', 'phenolic']        
        
        
        FG_count = FG_count[0:5] + [sum(FG_count[5:9])] + [sum(FG_count[9:11])] + FG_count[11:]
        FG_index = next((i for i, x in enumerate(FG_count) if x), None)
        
        return FG_names_lumped[FG_index]

In [4]:
result = []
for smi in df_oxygenates.smiles.unique():
    fg = oxygenates_FG(smi)
    result.append([smi,fg])

# Hydrocarbons

In [5]:
def hydrocarbons_FG(smiles):
    mol = rdkit.Chem.MolFromSmiles(smiles)
    
    #FG_names = [  'Alkyne', 'Alkene', 'FusedRing-Aromatics', 'FusedRing-Cycloalkanes'  ]
    #FG_names = [  'Alkyne', 'Alkene', 'Aromatics', 'Cycloalkanes'  ]
    FG_names = [  'Alkyne', 'Alkene', 'FusedRings', 'FusedRings'  ]
    SMARTS_list = [ '[C]#[C]', '[C]=[C]', '[cR2]', '[CR2]'  ]
    
    FG_SMARTS = [ rdkit.Chem.MolFromSmarts(x) for x in SMARTS_list  ]
    FG_count = [len(mol.GetSubstructMatches(x)) for x in FG_SMARTS]
 
    FG_index = next((i for i, x in enumerate(FG_count) if x), None)

    if sum(FG_count) != 0:
        return FG_names[FG_index]
    else:
        if '1' in smiles:
            if 'c' in smiles:
                #return 'Aromatics'
                return 'Cyclics'
            else:
                #return 'Cycloalkanes'
                return 'Cyclics'
        else:
            return 'Alkane'

In [6]:
for smi in df_hydrocarbons.smiles.unique():
    fg = hydrocarbons_FG(smi)
    result.append([smi,fg])

In [7]:
pd.DataFrame(result).to_csv('functional_groups.csv',index=False, header = ['smiles','functional_group'])

In [8]:
for x in result:
    smi, fg = x
    df.loc[(df.smiles == smi), 'functional_group'] = fg

In [9]:
import numpy as np
df['AE'] = np.abs(df.NIST - df.Predicted)

In [10]:
for fg, sub_df in df.groupby('functional_group'):
    print(fg, len(sub_df.smiles.unique()), len(sub_df), np.round(len(sub_df) / len(sub_df.smiles.unique()),2),  
          np.round(sub_df.AE.mean(),2), np.round(sub_df.DB_unc.mean(),2))
    #print(len(sub_df[sub_df['Train/Valid/Test'] == 'Train']),
    #     len(sub_df[sub_df['Train/Valid/Test'] == 'Valid']),
    #     len(sub_df[sub_df['Train/Valid/Test'] == 'Test']))

Alcohol 1106 24134 21.82 3.69 3.67
Alkane 454 9708 21.38 2.24 2.82
Alkene 1083 22975 21.21 2.5 2.67
Alkyne 153 3350 21.9 2.66 2.56
Carbonyls 932 19372 20.79 4.46 3.53
Cyclics 876 17108 19.53 4.37 3.27
Ester 1044 21932 21.01 3.12 3.62
Ether 884 18914 21.4 3.53 3.02
Furanics 35 706 20.17 2.24 2.69
FusedRings 587 10012 17.06 5.03 3.53
peroxide 30 652 21.73 4.57 3.52
phenolic 214 4233 19.78 3.7 3.47
water or carbon monoxide 2 9 4.5 4.44 0.66


In [12]:
len(df.functional_group.unique())

13

In [14]:
df_each_molecule = pd.read_csv('functional_groups.csv')
df_each_molecule

Unnamed: 0,smiles,functional_group
0,C#CC(=O)OC,Ester
1,C#CC(=O)OCC,Ester
2,C#CC(=O)Oc1ccc(C(C)(C)c2ccc(OC(=O)C#C)cc2)cc1,Ester
3,C#CC(=O)Oc1ccc(OC(=O)C#C)cc1,Ester
4,C#CC(=O)Oc1ccc(Oc2ccc(OC(=O)C#C)cc2)cc1,Ester
...,...,...
7395,c1ccc2cc3c(ccc4ccccc43)cc2c1,FusedRings
7396,c1ccc2cccc-2cc1,FusedRings
7397,c1ccc2ccccc2c1,FusedRings
7398,c1ccccc#1,Cyclics


In [15]:
from tqdm import tqdm
for smi, sub_df in tqdm(df.groupby('smiles')):
    df_each_molecule.loc[ (df_each_molecule.smiles == smi),'N_data'] = len(sub_df)
    df_each_molecule.loc[ (df_each_molecule.smiles == smi),'MAE'] = sub_df.AE.mean()

100%|██████████| 7400/7400 [00:12<00:00, 608.44it/s]


In [16]:
Outliers = df_each_molecule.sort_values(by=['MAE'], ascending = False).iloc[0:5]

In [17]:
Outliers

Unnamed: 0,smiles,functional_group,N_data,MAE
4247,C,Alkane,3.0,81.400227
4541,C1CCCCCCCCCCCCCCCCCCCCCCCCC1,Cyclics,14.0,46.232156
4036,O=c1c(-c2ccccc2)c1-c1ccccc1,Carbonyls,16.0,45.770905
3987,O=C1c2cc3ccccc3cc2C(=O)c2cc3ccccc3cc21,Carbonyls,17.0,43.627844
102,C1COCCOCCOCCOCCOCCOCCOCCO1,Ether,15.0,43.453448
