In [1]:
import pandas as pd
import rdkit
from collections import Counter

# Read DataFrame

In [2]:
df = pd.read_csv('HoV_results.csv')
oxygenate_condition = (df.smiles.str.contains('O')) | (df.smiles.str.contains('o'))

df_hydrocarbons = df[ ~oxygenate_condition ]
df_oxygenates = df[oxygenate_condition]
print(len(df_hydrocarbons.smiles.unique()), len(df_oxygenates.smiles.unique()))

3153 4247


# Oxygenates

In [3]:
def oxygenates_FG(smiles):
    mol = rdkit.Chem.MolFromSmiles(smiles)
    
    FG_names = ['Acyclic_Ester', 'Cyclic_Ester', 
                'Acyclic_Ether', 'Cyclic_Ether',
                'COOH', '0ry_Alcohol', '1ry_Alcohol', '2ry_Alcohol', '3ry_Alcohol', 
                'Carbonyl1', 'Carbonyl2', 'peroxide', 'phenolics']
    
    SMARTS_list = ['[CX3;!R](=[OX1])O[C,c]', '[CX3;R](=[OX1])[O;R][C,c]',
                   '[OD2;!R]([#6])[#6]', '[OD2;R]([#6;R])[#6;R]',
                   '*-C(=O)[O;D1]', '[CH3]-[O;D1]', '[CH2]-[O;D1]', '[CH1]-[O;D1]', '[CH0]-[O;D1]',
                    '*-C(=O)-[C;D1]', '*=[O;D1]', '[O]-[O]', '[c;R]-[O;D1]'
                  ]
    FG_SMARTS = [ rdkit.Chem.MolFromSmarts(x) for x in SMARTS_list  ]
    
    
    FG_count = [len(mol.GetSubstructMatches(x)) for x in FG_SMARTS]
    
    if sum(FG_count) == 0:
        if smiles == 'O' or smiles == '[C-]#[O+]':
            return 'water or carbon monoxide'
        else:
            return 'Furanics'
    else:
        FG_names_lumped = ['Acyclic_Ester', 'Cyclic_Ester', 
                            'Acyclic_Ether', 'Cyclic_Ether', 'COOH',
                            'Alcohol', 'Carbonyls', 'peroxide', 'phenolic']
        FG_count = FG_count[0:5] + [sum(FG_count[5:9])] + [sum(FG_count[9:11])] + FG_count[11:]
        FG_index = next((i for i, x in enumerate(FG_count) if x), None)
        
        return FG_names_lumped[FG_index]

In [4]:
result = []
for smi in df_oxygenates.smiles.unique():
    fg = oxygenates_FG(smi)
    result.append([smi,fg])
#print('.'.join([ result_smi[i] for i, fg in enumerate(result_fg) if fg == 'Carbonyls' ]))

# Hydrocarbons

In [5]:
def hydrocarbons_FG(smiles):
    mol = rdkit.Chem.MolFromSmiles(smiles)
    
    FG_names = [  'Alkyne', 'Alkene', 'FusedRing-Aromatics', 'FusedRing-Cycloalkanes'  ]
    SMARTS_list = [ '[C]#[C]', '[C]=[C]', '[cR2]', '[CR2]'  ]
    
    FG_SMARTS = [ rdkit.Chem.MolFromSmarts(x) for x in SMARTS_list  ]
    FG_count = [len(mol.GetSubstructMatches(x)) for x in FG_SMARTS]
 
    FG_index = next((i for i, x in enumerate(FG_count) if x), None)

    if sum(FG_count) != 0:
        return FG_names[FG_index]
    else:
        if '1' in smiles:
            if 'c' in smiles:
                return 'Aromatics'
            else:
                return 'Cycloalkanes'
        else:
            return 'Alkane'

In [6]:
for smi in df_hydrocarbons.smiles.unique():
    fg = hydrocarbons_FG(smi)
    result.append([smi,fg])

result

[['C#CC(=O)OC', 'Acyclic_Ester'],
 ['C#CC(=O)OCC', 'Acyclic_Ester'],
 ['C#CC(=O)Oc1ccc(C(C)(C)c2ccc(OC(=O)C#C)cc2)cc1', 'Acyclic_Ester'],
 ['C#CC(=O)Oc1ccc(OC(=O)C#C)cc1', 'Acyclic_Ester'],
 ['C#CC(=O)Oc1ccc(Oc2ccc(OC(=O)C#C)cc2)cc1', 'Acyclic_Ester'],
 ['C#CC(=O)Oc1ccc2ccc(OC(=O)C#C)cc2c1', 'Acyclic_Ester'],
 ['C#CC(=O)Oc1ccc2cccc(OC(=O)C#C)c2c1', 'Acyclic_Ester'],
 ['C#CC(=O)Oc1cccc2c(OC(=O)C#C)cccc12', 'Acyclic_Ester'],
 ['C#CC(C)(C)O', 'Alcohol'],
 ['C#CC(C)(O)CC', 'Alcohol'],
 ['C#CC(C)(O)CC(C)C', 'Alcohol'],
 ['C#CC(C)(O)CCC', 'Alcohol'],
 ['C#CC(C)(O)CCC=C(C)C', 'Alcohol'],
 ['C#CC(C)(O)CCCC(C)C', 'Alcohol'],
 ['C#CC(C)(O)CCCC(C)CCCC(C)C', 'Alcohol'],
 ['C#CC(C)(O)CCCC(C)CCCC(C)CCCC(C)C', 'Alcohol'],
 ['C#CC(C)(O)c1ccccc1', 'Alcohol'],
 ['C#CC(C)C(C)(O)CCCC(C)C', 'Alcohol'],
 ['C#CC(C)O', 'Alcohol'],
 ['C#CC(O)(CC)CC', 'Alcohol'],
 ['C#CC(O)C1CC=CCC1', 'Alcohol'],
 ['C#CC(O)C1CC=CCC1C', 'Alcohol'],
 ['C#CC(OC(C)=O)C1CC=CCC1', 'Acyclic_Ester'],
 ['C#CC(OC(C)=O)C1CC=CCC1C', 'Acycl

In [7]:
pd.DataFrame(result).to_csv('functional_groups.csv',index=False, header = ['smiles','functional_group'])

In [8]:
for x in result:
    smi, fg = x
    df.loc[(df.smiles == smi), 'functional_group'] = fg

In [9]:
import numpy as np
df['AE'] = np.abs(df.NIST - df.Predicted)

In [10]:
df

Unnamed: 0.1,Unnamed: 0,Predicted,NIST,temperature,Train/Valid/Test,smiles,ML_unc,DB_unc,sample_weight,total_atoms,functional_group,AE
0,137734,87.557960,6.61901,150.0000,Test,C,22.069708,0.076146,1.0,1,Alkane,80.938950
1,137736,90.610910,8.73150,90.6941,Test,C,23.045454,0.436585,1.0,1,Alkane,81.879410
2,137735,89.892750,8.51043,100.0000,Test,C,22.790491,0.254270,1.0,1,Alkane,81.382320
3,137737,10.509477,8.15453,290.0000,Test,C#C,6.288347,0.292922,1.0,2,Alkyne,2.354947
4,137738,10.837637,11.63790,260.0000,Test,C#C,6.702164,0.151327,1.0,2,Alkyne,0.800263
...,...,...,...,...,...,...,...,...,...,...,...,...
153100,124098,14.387741,14.57940,640.0000,Train,c1coc(Cc2ccco2)c1,7.405857,5.580340,1.0,11,Furanics,0.191659
153101,124099,51.890213,52.88470,300.0000,Train,c1coc(Cc2ccco2)c1,5.720401,3.761330,1.0,11,Furanics,0.994487
153102,124078,43.905960,43.99980,420.0000,Train,c1coc(Cc2ccco2)c1,3.145880,2.231770,1.0,11,Furanics,0.093840
153103,124091,28.163960,27.73100,580.0000,Train,c1coc(Cc2ccco2)c1,2.680500,1.724390,1.0,11,Furanics,0.432960


In [14]:
for fg, sub_df in df.groupby('functional_group'):
    #for dataset, subsub_df in sub_df.groupby('Train/Valid/Test'):
    print(fg, len(sub_df.smiles.unique()), len(sub_df), sub_df.AE.mean())

Acyclic_Ester 983 20757 3.0653690221333525
Acyclic_Ether 676 14428 3.04671788970682
Alcohol 1106 24134 3.6913712314266176
Alkane 454 9708 2.24479844894726
Alkene 1083 22975 2.499672108576279
Alkyne 153 3350 2.655587697653731
Aromatics 541 10384 3.7960682203996527
COOH 285 5922 6.013156185449172
Carbonyls 647 13450 3.7767016291933087
Cyclic_Ester 61 1175 4.155512105702128
Cyclic_Ether 208 4486 5.086286242287115
Cycloalkanes 335 6724 5.262001734318857
Furanics 35 706 2.240608812747875
FusedRing-Aromatics 445 7271 4.984113923091734
FusedRing-Cycloalkanes 142 2741 5.1613859951112735
peroxide 30 652 4.572918860490798
phenolic 214 4233 3.7020053000708715
water or carbon monoxide 2 9 4.436921000000001


In [13]:
len(df.functional_group.unique())

18

In [8]:
'.'.join([x[0] for x in result if x[1] == 'Acyclic_Ether' and '=O' in x[0]])

'C=C(C)C1Cc2c(ccc3c2OC2COc4cc(OC)c(OC)cc4C2C3=O)O1.C=C(OC)C(C)C(C)=O.C=CCOc1ccc(O)c2c1C(=O)c1ccccc1C2=O.C=CCOc1cccc2c1C(=O)c1c(OCC=C)cccc1C2=O.C=COc1ccc(C(C)=O)cc1.CCCCCCCCCCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCCOc1ccc(C(=O)O)cc1.CCCCCCOc1ccc(C(=O)O)cc1.CCCCCOc1ccc(C(=O)O)cc1.CCCCOC(C)C(=O)O.CCCCOc1ccc(C(=O)O)cc1.CCCOc1ccc(C(=O)O)cc1.CCOc1c(OCC)c(=O)c1=O.CCOc1cc(C=O)ccc1O.CCOc1ccc(CC(=O)O)cc1.CCOc1cccc(C=O)c1O.COC(OC)(C(=O)c1ccccc1)c1ccccc1.COCC(=O)c1ccccc1.COCC(C)=O.COCC=O.COc1c2c(cc3ccccc13)C(=O)C=CC2=O.COc1cc(C(=O)O)cc(OC)c1O.COc1cc(C(=O)O)cc(OC)c1OC.COc1cc(C(=O)O)ccc1O.COc1cc(C)cc(C)c1C(C)=O.COc1cc(CC(=O)O)cc(OC)c1OC.COc1cc(OC)c(C(=O)O)cc1OC.COc1cc(OC)c(C=O)c(OC)c1.COc1cc(OC)c(OC)cc1C=O.COc1cc(OC)cc(C(=O)O)c1.COc1ccc(C(=

In [14]:
'.'.join([x[0] for x in result if x[1] == 'Acyclic_Ether' and x[0].count('O') >=2 and x[0].count('1') == 0])

'C=C(OC)C(C)C(C)=O.C=CCOC(C)OCC=C.C=COCCCCOC=C.C=COCCOC.C=COCCOC(C)C.C=COCCOC=C.C=COCCOCC.C=COCCOCC(C)C.C=COCCOCCC.C=COCCOCCCC.C=COCCOCCOCCO.C=COCCOCCOCCOC=C.CC(C)CCOCC(O)CO.CC(C)COC(C)OCC(C)C.CC(C)COCCO.CC(C)COCOCC(C)C.CC(C)OC(C)OC(C)C.CC(C)OC(OC(C)C)C(C)C.CC(C)OCCO.CC(C)OCCOCCO.CC(O)COC(C)(C)C.CC(O)COC(C)CO.CC(O)COC(C)COC(C)COC(C)COC(C)C.CC(O)COCC(C)O.CC=COC(C)(C)OC.CCC(C)(OC)OC.CCC(C)C(OC)OC.CCC(C)COCCOCCO.CCC(C)OC(C)OC(C)CC.CCC(C)OCCO.CCC(OC(C)C)OC(C)C.CCC(OC)OC.CCCC(C)CCCOCCCCCCOCCCC(C)CCC.CCCC(CC(OCC)OCC)OCC.CCCC(OC(C)C)OC(C)C.CCCC(OC)OC.CCCC(OCC(C)C)OCC(C)C.CCCC(OCC)OCC.CCCCC(C)(OC)OC.CCCCC(CC)C(OCC)OCC.CCCCC(CC)COCCCCCCO.CCCCC(CC)COCCOCCO.CCCCC(OCC)OCC.CCCCCC(CC)COCCCCCCOCC(CC)CCCCC.CCCCCCC(OCC)OCC.CCCCCCCCCCCCCCCCCCOCCO.CCCCCCCCCCCCCCCCOCCO.CCCCCCCCCCCCCCOCC(O)CO.CCCCCCCCCCCCCOCC(O)CO.CCCCCCCCCCCCOCC(O)CO.CCCCCCCCCCCOCC(O)CO.CCCCCCCCCCOCC(O)CO.CCCCCCCCCCOCCCO.CCCCCCCCCCOCCO.CCCCCCCCCOCC(O)CO.CCCCCCCCOCC(O)CO.CCCCCCCCOCCO.CCCCCCCCOCCOCCO.CCCCCCCCOCCOCCOCCO.CCCCCCCCOCCOCCOCCOCCO

In [None]:
d