In [1]:
# Reload imported modules automatically before each cell executes, so edits
# to source files on disk take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from rdkit import Chem
from rdkit.Contrib.SA_Score import sascorer

import pandas as pd

In [3]:
# A handful of small solvent molecules, written as SMILES strings.
smiles = [
    "CCO",            # ethanol
    "CC(C)O",         # isopropanol
    "CC(C)C(=O)O",    # isobutyric acid
    "CC(C)C(=O)OC",   # methyl isobutyrate
    "CC(C)C(=O)OCC",  # ethyl isobutyrate
]

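Synthetic accessibility is estimated here with the SA score, which characterizes how hard a molecule is to synthesize as a number between 1 (easy to make) and 10 (very difficult to make). The development and validation of this method are described in Ertl and Schuffenhauer, "Estimation of synthetic accessibility score of drug-like molecules based on molecular complexity and fragment contributions", Journal of Cheminformatics 1, 8 (2009).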

In [4]:
# Pair each SMILES string with its parsed molecule (parsed lazily, one per
# iteration) and report the molecule's synthetic accessibility score.
for s, mol in zip(smiles, map(Chem.MolFromSmiles, smiles)):
    score = sascorer.calculateScore(mol)
    print(f"solvent: {s}, score: {score}")

solvent: CCO, score: 1.9802570386349831
solvent: CC(C)O, score: 1.7383038156987496
solvent: CC(C)C(=O)O, score: 1.7186169388187054
solvent: CC(C)C(=O)OC, score: 2.019235351229961
solvent: CC(C)C(=O)OCC, score: 1.758331407237998


In [6]:
# Read the dataset from a path relative to this notebook, then preview the
# first five rows.
DATA_CSV = "../data/data.csv"
data = pd.read_csv(DATA_CSV)
data.head(5)

Unnamed: 0,smiles,redox_potential
0,NC=O,6.12
1,CN(C)C=O,4.93
2,CN(C)C(C)=O,4.74
3,CNC(C)=O,5.32
4,CC(N)=O,6.0


In [8]:
def sa_score(smiles_str):
    """Return the synthetic accessibility (SA) score for one SMILES string.

    ``Chem.MolFromSmiles`` returns ``None`` when it cannot parse its input,
    and ``None`` is not a molecule that can be scored. Since the SMILES come
    from a CSV file, missing or unparseable entries are mapped to NaN instead
    of aborting the whole ``apply``; they remain visible afterwards as a lower
    ``count`` in ``describe()`` or via ``isna()``.

    Parameters
    ----------
    smiles_str : str
        SMILES representation of the molecule.

    Returns
    -------
    float
        The value returned by ``sascorer.calculateScore`` for a parseable
        SMILES string, NaN otherwise.
    """
    if not isinstance(smiles_str, str):
        # An empty CSV cell is read by pandas as a float NaN, not a string.
        return float("nan")
    mol = Chem.MolFromSmiles(smiles_str)
    if mol is None:
        return float("nan")
    return sascorer.calculateScore(mol)


# Score every molecule in the dataset and preview the result.
data['synthesizability'] = data['smiles'].apply(sa_score)
data.head()

Unnamed: 0,smiles,redox_potential,synthesizability
0,NC=O,6.12,3.548888
1,CN(C)C=O,4.93,2.663631
2,CN(C)C(C)=O,4.74,1.982013
3,CNC(C)=O,5.32,1.913396
4,CC(N)=O,6.0,1.834708


In [9]:
# Summary statistics (count, mean, spread, quartiles) of the SA scores.
sa_scores = data['synthesizability']
sa_scores.describe()

count    302.000000
mean       2.688686
std        0.745356
min        1.000000
25%        2.208198
50%        2.582615
75%        3.050695
max        5.598512
Name: synthesizability, dtype: float64