In [10]:
import pandas as pd
import sklearn.preprocessing
import rdkit.Chem
from rdkit.Chem import Descriptors

# Silence RDKit's console messages (all "rdApp.*" log channels) while molecules
# are featurized.
# RDLogger is imported explicitly: a submodule is only reachable as an attribute
# of its package once it has been imported, and the imports above only name
# rdkit.Chem and rdkit.Chem.Descriptors.
import rdkit.RDLogger
rdkit.RDLogger.DisableLog("rdApp.*")

In [2]:
# Relative path of the CSV that holds the raw taste dataset.
dataset_path = "raw_data/bitter-sweet.csv"

# Load the full CSV into a DataFrame; later cells read columns from `raw_data`.
raw_data = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
# One-vs-rest labelling: "Bitter" is encoded as 1 and every other taste as 0.
# label_binarize yields one indicator column per entry of `classes`; only the
# column belonging to "Bitter" is kept.
classes = ("Sweet", "Bitter", "Tasteless", "Non-bitter")
binarized = sklearn.preprocessing.label_binarize(raw_data.Taste.to_numpy(), classes=classes)
bitter = binarized[:, classes.index("Bitter")]

# Keep the identifying columns, attach the label, and discard incomplete rows.
data = (
    raw_data[["Name", "Canonical SMILES"]]
    .assign(Bitter=bitter)
    .dropna()
)

# De-duplicate on each identifying column in turn (first occurrence wins), so a
# row is removed if either its name or its SMILES string was already seen.
for colname in ("Name", "Canonical SMILES"):
    data = data.drop_duplicates(subset=colname)

data

Unnamed: 0,Name,Canonical SMILES,Bitter
0,D-Fructose,OC[C@@]1(O)OC[C@H]([C@H]([C@@H]1O)O)O,0
1,D-Glucose,OC[C@H]1O[C@H](O)[C@@H]([C@H]([C@@H]1O)O)O,0
2,D-Mannose,OC[C@H]1O[C@H](O)[C@H]([C@H]([C@@H]1O)O)O,0
3,L-Rhamnose,O[C@H]1[C@H](C)O[C@@H]([C@@H]([C@@H]1O)O)O,0
4,D-Ribulose,OC[C@H]([C@H](C(=O)CO)O)O,0
...,...,...,...
2423,6-Methyl-2-pyridinemethanol,OCc1cccc(n1)C,1
2424,4-hydroxybenzyl alcohol,OCc1ccc(cc1)O,1
2425,4-Benzoylpyridine,O=C(c1ccncc1)c1ccccc1,1
2426,4-(5-Methyl-2-furyl)-2-butanone,CC(=O)CCc1ccc(o1)C,1


In [8]:
# Parse each canonical SMILES string into an RDKit molecule object.
data["Molecule Objects"] = data["Canonical SMILES"].apply(rdkit.Chem.MolFromSmiles)

# MolFromSmiles returns None instead of raising when a SMILES string cannot be
# parsed, and this notebook disables RDKit's log output in its first cell, so a
# failed parse would otherwise go unnoticed until a descriptor function is
# called on None. Report such rows and drop them here.
unparsed = data["Molecule Objects"].isna()
if unparsed.any():
    print(f"Dropping {int(unparsed.sum())} row(s) whose SMILES could not be parsed")
    data = data[~unparsed].copy()

data.head()

Unnamed: 0,Name,Canonical SMILES,Bitter,Molecule Objects
0,D-Fructose,OC[C@@]1(O)OC[C@H]([C@H]([C@@H]1O)O)O,0,<rdkit.Chem.rdchem.Mol object at 0x12f60b400>
1,D-Glucose,OC[C@H]1O[C@H](O)[C@@H]([C@H]([C@@H]1O)O)O,0,<rdkit.Chem.rdchem.Mol object at 0x12f60b940>
2,D-Mannose,OC[C@H]1O[C@H](O)[C@H]([C@H]([C@@H]1O)O)O,0,<rdkit.Chem.rdchem.Mol object at 0x12f60be20>
3,L-Rhamnose,O[C@H]1[C@H](C)O[C@@H]([C@@H]([C@@H]1O)O)O,0,<rdkit.Chem.rdchem.Mol object at 0x12f60b880>
4,D-Ribulose,OC[C@H]([C@H](C(=O)CO)O)O,0,<rdkit.Chem.rdchem.Mol object at 0x12f60bee0>


In [14]:
# Descriptors.descList is a module-level list of (descriptor name, function)
# pairs, as the displayed first entry shows. Take a shallow copy so that any
# later in-place edit of `features` (e.g. removing descriptors) cannot alter
# RDKit's own list.
features = list(Descriptors.descList)
features[0]

('MaxEStateIndex',
 <function rdkit.Chem.EState.EState.MaxEStateIndex(mol, force=1)>)