In [1]:
import random

import numpy as np
import pandas as pd
import sklearn.preprocessing
import sklearn.model_selection
import rdkit.Chem
from rdkit.Chem import Descriptors

# Drop console spam when molecules are featurized: disable every RDKit log
# channel whose name matches "rdApp.*".
# NOTE(review): `rdkit.RDLogger` is not imported explicitly in this cell; this
# presumably relies on it being loaded as a side effect of the rdkit imports
# above -- confirm.
rdkit.RDLogger.DisableLog("rdApp.*")

In [2]:
# Location of the raw CSV, kept in a variable so it is easy to repoint.
dataset_path = "raw_data/bitter-sweet.csv"

# Read the whole file into memory; later cells derive their frames from `raw_data`.
raw_data = pd.read_csv(filepath_or_buffer=dataset_path)

In [3]:
# Start from the two identifying columns; the label column is appended below.
data = raw_data.loc[:, ["Name", "Canonical SMILES"]].copy()

# One-vs-rest target: 1 where the recorded taste is "Bitter", 0 for every other class.
classes = ("Sweet", "Bitter", "Tasteless", "Non-bitter")
bitter_position = classes.index("Bitter")
binarized = sklearn.preprocessing.label_binarize(
    raw_data.Taste.to_numpy(),
    classes=classes,
)
bitter = binarized[:, bitter_position]
data = data.assign(Bitter=bitter)

# Discard incomplete rows, then keep only the first row seen for each distinct
# value of every non-label column (name first, then SMILES).
data = data.dropna()
for colname in data.columns.drop("Bitter"):
    data = data.dropna().drop_duplicates(subset=colname)

data.head()

Unnamed: 0,Name,Canonical SMILES,Bitter
0,D-Fructose,OC[C@@]1(O)OC[C@H]([C@H]([C@@H]1O)O)O,0
1,D-Glucose,OC[C@H]1O[C@H](O)[C@@H]([C@H]([C@@H]1O)O)O,0
2,D-Mannose,OC[C@H]1O[C@H](O)[C@H]([C@H]([C@@H]1O)O)O,0
3,L-Rhamnose,O[C@H]1[C@H](C)O[C@@H]([C@@H]([C@@H]1O)O)O,0
4,D-Ribulose,OC[C@H]([C@H](C(=O)CO)O)O,0


In [4]:
# Parse each SMILES string into an RDKit molecule object.
data["Molecule Objects"] = data["Canonical SMILES"].apply(rdkit.Chem.MolFromSmiles)

# Drop rows with a missing value in any column.
# NOTE(review): presumably unparsable SMILES come back as None from
# MolFromSmiles and are what gets removed here -- confirm against RDKit docs.
data = data.dropna().copy()

# The classification frame starts with just the target column; descriptor
# columns are added to it afterwards.  (A discarded mid-cell `data.head()`
# was removed: only the last expression of a cell is displayed.)
cls_data = data[["Bitter"]].copy()
cls_data.head()

Unnamed: 0,Bitter
0,0
1,0
2,0
3,0
4,0


In [5]:
# Ignore the fragment ("fr_"-prefixed) features for now.
# Built as a list rather than a bare `filter` object: a filter iterator is
# exhausted after one pass, which would leave `features` silently empty for
# any later use.
features = [
    (name, fun)
    for name, fun in Descriptors.descList
    if not name.startswith("fr_")
]

# Names of descriptors that could not be computed, kept for inspection.
skipped_features = []
for name, fun in features:
    # Best effort: if a descriptor divides by zero for any molecule, the whole
    # column is left out instead of aborting the featurization.
    try:
        cls_data[name] = data["Molecule Objects"].apply(fun)
    except ZeroDivisionError:
        skipped_features.append(name)

# Report what was dropped so the omission is visible rather than silent.
if skipped_features:
    print(
        "Skipped {} descriptor(s) that raised ZeroDivisionError: {}".format(
            len(skipped_features), skipped_features
        )
    )

cls_data.head()

Unnamed: 0,Bitter,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,NumHAcceptors,NumHDonors,NumHeteroatoms,NumRotatableBonds,NumSaturatedCarbocycles,NumSaturatedHeterocycles,NumSaturatedRings,RingCount,MolLogP,MolMR
0,0,9.24338,-2.168009,9.24338,0.32412,0.290178,180.156,168.06,180.063388,72,...,6,5,6,1,0,1,1,1,-3.2198,36.008
1,0,9.119537,-1.568935,9.119537,0.525787,0.290153,180.156,168.06,180.063388,72,...,6,5,6,1,0,1,1,1,-3.2214,35.986
2,0,9.119537,-1.568935,9.119537,0.525787,0.290153,180.156,168.06,180.063388,72,...,6,5,6,1,0,1,1,1,-3.2214,35.986
3,0,9.088287,-1.429444,9.088287,0.663519,0.326216,164.157,152.061,164.068473,66,...,5,4,5,0,0,1,1,1,-2.1938,34.5742
4,0,10.341019,-1.685185,10.341019,0.701481,0.34836,150.13,140.05,150.052823,60,...,5,4,5,4,0,0,0,0,-2.7381,31.1922


In [6]:
def drop_low_unique_columns(dataset, cutoff=2, target_col = "Bitter"):
    """Return `dataset` restricted to columns with enough distinct values.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose columns are screened.
    cutoff : int, default 2
        A column is kept only if it has strictly more than `cutoff` distinct
        values, as counted by `DataFrame.nunique()`.
    target_col : str, default "Bitter"
        Column that is always kept, regardless of how many distinct values it
        has.  If `dataset` has no such column, nothing is force-kept.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the surviving columns in their original order.
    """
    keep = dataset.nunique() > cutoff
    # Honour the `target_col` argument instead of a hard-coded attribute name,
    # and only touch the mask when the column actually exists: assigning a
    # missing label would grow the mask and break the column selection below.
    if target_col in keep.index:
        keep[target_col] = True
    # Positional boolean selection keeps exactly the flagged columns.
    return dataset.loc[:, keep.to_numpy()]

# Remove near-constant descriptor columns, then renumber the rows 0..n-1.
# `reset_index(drop=True)` discards the old index directly instead of turning
# it into a temporary "index" column that then has to be dropped again (which
# fails if the index carries a name).
cls_data = drop_low_unique_columns(cls_data).reset_index(drop=True)

In [7]:
# Also drop stuff with an "infinite" (overflowed) standard deviation:
# compute the per-column standard deviation once, flag the columns where it
# equals +inf, and remove those columns.
column_std = cls_data.std()
has_infinite_std = column_std == np.inf
overflowed_columns = cls_data.columns[has_infinite_std.to_numpy()]
cls_data = cls_data.drop(columns=overflowed_columns)

  sqr = _ensure_numeric((avg - values) ** 2)


In [8]:
# Single seed shared by every source of randomness in this notebook.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

# Hold out 20% of the rows for testing.  The seed is passed explicitly so the
# split does not depend on whatever else has consumed the global NumPy random
# state between the seeding above and this call.
data_train, data_test = sklearn.model_selection.train_test_split(
    cls_data,
    test_size=0.2,
    random_state=SEED,
)

# Separate features from the target for each partition.
data_train_x = data_train.drop(columns=["Bitter"])
data_train_y = data_train.Bitter

data_test_x = data_test.drop(columns=["Bitter"])
data_test_y = data_test.Bitter

# Per-column statistics are computed from the training features only.
mean = data_train_x.mean()
std = data_train_x.std()

In [9]:
# Standardize both partitions with the same `mean` / `std` series, then put
# the target column back in front of the scaled features.
train_features_scaled = data_train_x.sub(mean).div(std)
test_features_scaled = data_test_x.sub(mean).div(std)

data_train_scaled = pd.concat([data_train_y, train_features_scaled], axis=1)
data_test_scaled = pd.concat([data_test_y, test_features_scaled], axis=1)

In [20]:
# Persist the scaling statistics so the same transformation can be re-applied later.
for statistic, pickle_path in (
    (mean, "dataset_means_stds/bittersweet_mean.pkl"),
    (std, "dataset_means_stds/bittersweet_std.pkl"),
):
    statistic.to_pickle(pickle_path)

# Write the scaled train / test partitions out as CSV.
for scaled_frame, csv_path in (
    (data_train_scaled, "scaled_featurized_train/scaled_bittersweet_train.csv"),
    (data_test_scaled, "scaled_featurized_test/scaled_bittersweet_test.csv"),
):
    scaled_frame.to_csv(csv_path)