## Imports 

In [1]:

import random
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from rdkit import DataStructs, RDLogger
from rdkit.Chem import AllChem, MolFromSmarts, MolFromSmiles

# Disable RDKit log output for every logger matching 'rdApp.*'.
RDLogger.DisableLog('rdApp.*')

In [19]:
# Load the labelled dataset and split it on the "sweet" column:
# rows with sweet == 0 go to negative_dataset, rows with sweet == 1 stay in full_dataset.
full_dataset = pd.read_csv("sweeteness_dataset_v3_1.csv")
negative_dataset = full_dataset[full_dataset["sweet"] == 0]
full_dataset = full_dataset[full_dataset["sweet"] == 1]

# Collect the numeric entries of "database_references" for the sweet molecules.
# Each cell may hold several references separated by " & "; anything that is not
# a string (e.g. NaN) or does not parse as an integer is ignored. These integers
# are later compared against the "cid" column of the predictions file.
pubchem_ids = []
for references in full_dataset["database_references"]:
    if not isinstance(references, str):
        continue
    for id_ in references.split(" & "):
        try:
            pubchem_ids.append(int(id_))
        except ValueError:
            # Non-numeric reference: deliberately skipped.
            pass

# Load the predicted molecules, keeping one row per cid.
df = pd.read_csv("pubchem_mols/predicted_molecules_ensemble.csv")
df = df.drop_duplicates(subset=["cid"])

# Drop every candidate whose cid already appears in the labelled sweet set.
# A boolean mask is used instead of iterrows() + iloc: iterrows() yields index
# *labels*, which no longer equal row *positions* once drop_duplicates has
# removed rows, so feeding them to iloc could select the wrong rows or raise
# an IndexError.
df = df[~df["cid"].isin(pubchem_ids)]

In [20]:
# (rows, columns) of the candidate table once repeated SMILES strings are collapsed.
df.drop_duplicates(subset=["smiles"]).shape

(67724, 3)

In [24]:
def get_similarity(query_mol, other_molecules, threshold=0.75, radius=2, n_bits=2048):
    """Return positions of molecules that are similar, but not identical, to a query.

    Every molecule is converted to a Morgan fingerprint bit vector and compared
    with the query fingerprint using Tanimoto similarity.

    Parameters
    ----------
    query_mol : str
        SMILES string of the reference molecule.
    other_molecules : iterable of str
        SMILES strings of the candidate molecules.
    threshold : float, default 0.75
        Exclusive lower bound on the similarity of a reported hit.
    radius : int, default 2
        Radius passed to the Morgan fingerprint.
    n_bits : int, default 2048
        Length of the fingerprint bit vector.

    Returns
    -------
    list of int
        Zero-based positions in ``other_molecules`` whose similarity ``s`` to
        the query satisfies ``threshold < s < 1``. A similarity of exactly 1
        (identical fingerprints) is excluded on purpose.

    Raises
    ------
    ValueError
        If ``query_mol`` cannot be parsed into a molecule.
    """
    query = MolFromSmiles(query_mol)
    if query is None:
        raise ValueError(f"Could not parse query SMILES: {query_mol!r}")
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query, radius, n_bits)

    ids = []
    for i, smiles in enumerate(other_molecules):
        mol = MolFromSmiles(smiles)
        if mol is None:
            # Unparsable candidate: skip it, but keep counting so the returned
            # positions still line up with the caller's sequence.
            continue
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, n_bits)
        if threshold < DataStructs.TanimotoSimilarity(query_fp, fp) < 1:
            ids.append(i)

    return ids
    
def get_substructure_match(substructure, molecules):
    """Return positions of molecules that contain a given substructure.

    Parameters
    ----------
    substructure : rdkit.Chem.Mol
        Query pattern (for example built with ``MolFromSmiles`` or
        ``MolFromSmarts``).
    molecules : iterable of str
        SMILES strings of the candidate molecules.

    Returns
    -------
    list of int
        Zero-based positions in ``molecules`` whose parsed molecule reports a
        match via ``HasSubstructMatch(substructure)``.

    Raises
    ------
    ValueError
        If ``substructure`` is ``None`` (i.e. the query pattern failed to parse).
    """
    if substructure is None:
        raise ValueError("substructure is None; the query pattern could not be parsed")

    ids = []
    for i, smiles in enumerate(molecules):
        mol = MolFromSmiles(smiles)
        # MolFromSmiles returns None for unparsable SMILES; such entries are
        # skipped while the positions of the remaining ones are preserved.
        if mol is not None and mol.HasSubstructMatch(substructure):
            ids.append(i)

    return ids

## Get probable sweeteners similar to aspartame

In [355]:
# Rank every candidate by its "predictions" value, highest first.
best_compounds = df.sort_values("predictions", ascending=False)

# Positions of the ranked candidates that get_similarity reports for the aspartame query.
query_smiles = "COC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(=O)O)N"
ids = get_similarity(query_smiles, best_compounds["smiles"].values)

# (rows, columns) of the hits after collapsing repeated SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape

(26, 3)

## Get probable aspartame-derivative sweeteners

In [30]:
# Rank every candidate by its "predictions" value, highest first.
best_compounds = df.sort_values("predictions", ascending=False)

# Positions of the ranked candidates that contain the whole aspartame structure.
query_pattern = MolFromSmiles("COC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(=O)O)N")
ids = get_substructure_match(query_pattern, best_compounds["smiles"].values)

# Number of hits with distinct SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape[0]

199

## Get probable sweeteners similar to cyclamate


In [250]:
# Rank every candidate by its "predictions" value, highest first.
best_compounds = df.sort_values("predictions", ascending=False)

# Positions of the ranked candidates that get_similarity reports for the cyclamate query.
query_smiles = "C1CCC(CC1)NS(=O)(=O)O"
ids = get_similarity(query_smiles, best_compounds["smiles"].values)

# (rows, columns) of the hits.
best_compounds.iloc[ids].shape

(0, 3)

## Get probable cyclamate-derived sweeteners

In [27]:
# Rank every candidate by its "predictions" value, highest first.
best_compounds = df.sort_values("predictions", ascending=False)

# Positions of the ranked candidates that contain the cyclamate structure.
query_pattern = MolFromSmiles("C1CCC(CC1)NS(=O)(=O)O")
ids = get_substructure_match(query_pattern, best_compounds["smiles"].values)

# (rows, columns) of the hits after collapsing repeated SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape

(8, 3)

## Acesulfame

In [330]:
# Acesulfame query against the ranked candidates (best_compounds comes from an earlier cell).
query_smiles = "CC1=CC(=O)NS(=O)(=O)O1"
ids = get_similarity(query_smiles, best_compounds["smiles"].values)

# (rows, columns) of the hits.
best_compounds.iloc[ids].shape

(0, 3)

In [28]:
# Ranked candidates containing the acesulfame structure (best_compounds comes from an earlier cell).
query_pattern = MolFromSmiles("CC1=CC(=O)NS(=O)(=O)O1")
ids = get_substructure_match(query_pattern, best_compounds["smiles"].values)

# (rows, columns) of the hits after collapsing repeated SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape

(11, 3)

## Alitame

In [332]:
# Alitame query against the ranked candidates (best_compounds comes from an earlier cell).
query_smiles = "C[C@H](C(=O)NC1C(SC1(C)C)(C)C)NC(=O)[C@H](CC(=O)O)N"
ids = get_similarity(query_smiles, best_compounds["smiles"].values)

# (rows, columns) of the hits.
best_compounds.iloc[ids].shape

(0, 3)

In [359]:
# Ranked candidates containing the alitame structure (best_compounds comes from an earlier cell).
query_pattern = MolFromSmiles("C[C@H](C(=O)NC1C(SC1(C)C)(C)C)NC(=O)[C@H](CC(=O)O)N")
ids = get_substructure_match(query_pattern, best_compounds["smiles"].values)

# (rows, columns) of the hits after collapsing repeated SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape

(7, 3)

## Saccharin

In [336]:
# Saccharin query against the ranked candidates (best_compounds comes from an earlier cell).
query_smiles = "C1=CC=C2C(=C1)C(=O)NS2(=O)=O"
ids = get_similarity(query_smiles, best_compounds["smiles"].values)

# (rows, columns) of the hits.
best_compounds.iloc[ids].shape

(1, 3)

In [360]:
# Ranked candidates containing the saccharin structure (best_compounds comes from an earlier cell).
query_pattern = MolFromSmiles("C1=CC=C2C(=C1)C(=O)NS2(=O)=O")
ids = get_substructure_match(query_pattern, best_compounds["smiles"].values)

# (rows, columns) of the hits after collapsing repeated SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape


(231, 3)

## Dulcin

In [339]:
# Dulcin query against the ranked candidates (best_compounds comes from an earlier cell).
query_smiles = "CCOC1=CC=C(C=C1)NC(=O)N"
ids = get_similarity(query_smiles, best_compounds["smiles"].values)

# (rows, columns) of the hits.
best_compounds.iloc[ids].shape

(0, 3)

In [361]:
# Ranked candidates containing the dulcin structure (best_compounds comes from an earlier cell).
query_pattern = MolFromSmiles("CCOC1=CC=C(C=C1)NC(=O)N")
ids = get_substructure_match(query_pattern, best_compounds["smiles"].values)

# (rows, columns) of the hits after collapsing repeated SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape

(3, 3)

## Guanidines

In [363]:
# SMARTS query for a substituted guanidine-like core: a carbon double-bonded to
# one nitrogen and bonded to two further nitrogens, with every nitrogen carrying
# at least one more explicit neighbour ("*" matches any atom).
# MolFromSmarts is imported in the notebook's import cell.
guanidine_pattern = MolFromSmarts("C(=N*)(N*)N*")
ids = get_substructure_match(guanidine_pattern, best_compounds["smiles"].values)

# (rows, columns) of the hits after collapsing repeated SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"]).shape

(28, 3)

In [364]:
# Show the guanidine hits selected by the previous cell's ids, one row per distinct SMILES.
best_compounds.iloc[ids].drop_duplicates(subset=["smiles"])

Unnamed: 0,cid,smiles,predictions
23605,13748439,COC(=O)[C@H](CC1=CC=CC=C1)NC(=O)[C@H](CC(=O)O)...,0.879518
13395,10597302,CCOC(=O)C(CC1=CC=CC=C1)NC(=NC)NS(=O)(=O)C1=CC=...,0.805407
44916,20716101,CC(C)COC(=O)C1NC(=O)C1CCCN=C(NC(=O)OCC1=CC=CC=...,0.779236
24846,14230963,N#CC1=CC=C(NC(=NCC(=O)O)NC2CCCCCCC2)C=C1,0.770597
73075,90989585,CC(C)COC(=O)C1NC(=O)[C@@H]1CCCN=C(NC(=O)OCC1=C...,0.77035
16345,10962398,COC(=O)[C@H](CCCN=C(NC(=O)OC(C)(C)C)NC(=O)OC(C...,0.760635
24845,14230962,N#CC1=CC=C(NC(=NCC(=O)O)NC2CCCCCC2)C=C1,0.747387
24184,13960823,N#CC1=CC=C(NC(=NCC(=O)O)NC2CCCCC2)C=C1,0.739246
44917,20716114,COC(=O)C1NC(=O)C1(C)CCCN=C(NC(=O)OCC1=CC=CC=C1...,0.659956
13251,10576667,CCOC(=O)C(CC1=CC=CC=C1)NC(=NCC1=CC=CC=C1)NS(=O...,0.654197
