<a href="https://colab.research.google.com/github/CompPhysVienna/MLSummerSchoolVienna/blob/main/Day10_July22/polymer_ml_tutorial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Polymer Machine Learning via Polymer Database

Data source: http://polymerdatabase.com/polymer%20physics/Polymer%20Tg.html

Polymer names were cleaned so that they match more reliably when looked up through the `pubchempy` API.

## Setup

### Installation

In [None]:
# Install the third-party packages that the import cell below relies on.
%pip install pubchempy
%pip install deepchem
%pip install selfies

Collecting selfies
  Downloading selfies-2.0.0-py3-none-any.whl (33 kB)
Installing collected packages: selfies
Successfully installed selfies-2.0.0


### Imports

In [None]:
from tqdm import tqdm
from time import sleep
import numpy as np
import pandas as pd
import pubchempy as pcp
import deepchem as dc
import selfies as sf

### Data

In [None]:
# Read the glass-transition table and give the two columns used later
# shorter names; the untouched frame stays available as `data`.
data = pd.read_csv("polymer-database-glass-transition.csv")
df = data.rename(
    columns={
        "pretty_polymer": "name",
        "Tg (deg C)": "Tg",
    }
)
df

Unnamed: 0,Amorphous Polymer,name,Tg
0,Poly(chlorotrifluoroethylene),chlorotrifluoroethylene,87
1,"Poly(vinyl chloride), PVC",vinyl chloride,83
2,"Poly(vinyl fluoride), PVF",vinyl fluoride,52
3,"Poly(vinylidene chloride), PVDC",vinylidene chloride,-17
4,"Poly(vinylidene fluoride), PVDF",vinylidene fluoride,-34
...,...,...,...
217,Poly(methyl vinyl thioether),methyl vinyl thioether,28
218,Poly(vinyl phenyl sulfide),vinyl phenyl sulfide,-20
219,Poly(ethyl vinyl thioether),ethyl vinyl thioether,-7
220,Poly(methyl vinyl thioether),methyl vinyl thioether,-1


In [None]:
# Summary statistics of the numeric columns (in the recorded output: only Tg).
df.describe()

Unnamed: 0,Tg
count,222.0
mean,53.351351
std,82.653138
min,-105.0
25%,-7.75
50%,46.5
75%,110.0
max,327.0


## PubChemPy

### Basic Usage
See https://pubchempy.readthedocs.io/en/latest/guide/gettingstarted.html#getting-started

In [None]:
# Query PubChem with a SMILES string as the identifier ('smiles' namespace)
# and request the IsomericSMILES property of the match.
pcp.get_properties('IsomericSMILES', 'CC', 'smiles')

[{'CID': 6324, 'IsomericSMILES': 'CC'}]

In [None]:
# Same query, but identifying the compound by name ('name' namespace).
# The result is a list of dicts, one per match (a single match in the recorded output).
pcp.get_properties('IsomericSMILES', "Nylon-12", "name")

[{'CID': 13690, 'IsomericSMILES': 'C1CCCCCC(=O)NCCCCC1'}]

### Retrieve SMILES Strings
Note that the `sleep` call pauses between requests, which helps prevent "Server Busy" errors from PubChem. Consider increasing the sleep duration if the error still appears.

In [None]:
# Exercise: look up a SMILES string for every polymer name in `df`.
ps = []  # presumably meant to collect the per-name results; confirm when filling in the loop
for name in tqdm(df.name):
  # TODO: enter your code to retrieve the SMILES strings for `name`
  # (e.g. with pcp.get_properties, as in the "Basic Usage" cells above).
  # Consider how you want to deal with cases of multiple or no entries.
  sleep(0.1) # pause between requests to prevent the "Server Busy" error

# TODO: assign your new output to df2. The featurization cell below reads
# df2["smiles"], so df2 needs a "smiles" column; until df2 is defined that
# cell fails with a NameError.

100%|██████████| 222/222 [00:55<00:00,  3.98it/s]


## DeepChem SMILES Featurization

In [None]:
# One-hot encode the SMILES strings with DeepChem's OneHotFeaturizer.
# https://deepchem.readthedocs.io/en/latest/api_reference/featurizers.html#onehotfeaturizer
featurizer = dc.feat.OneHotFeaturizer()
smiles = df2["smiles"]
encodings = featurizer.featurize(smiles)

# Inspect the encoding of the first molecule: its container type, its shape
# ((100, 35) in the recorded output), and the string recovered from it.
first_encoding = encodings[0]
print("type: ", type(first_encoding))
print("shape: ", first_encoding.shape)
print("untransformed: ", featurizer.untransform(first_encoding))

type:  <class 'numpy.ndarray'>
shape:  (100, 35)
untransformed:  C(=C(F)Cl)(F)F


In [None]:
# Show the one-hot matrix of the first molecule, followed by the sum of all
# its entries (100.0 in the recorded output).
first_encoding = encodings[0]
print(first_encoding)
print(first_encoding.sum())

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
100.0


## SELFIES Featurization

### Basic Usage



In [None]:
# https://github.com/aspuru-guzik-group/selfies#translation-between-selfies-and-smiles-representations
# SMILES -> SELFIES -> SMILES translation
benzene = "c1ccccc1"

# Pre-bind the results so that every name exists even when a translation
# step fails; otherwise a failed encoding would only surface further down
# as a confusing NameError.
benzene_sf = None
benzene_smi = None
try:
    benzene_sf = sf.encoder(benzene)  # [C][=C][C][=C][C][=C][Ring1][=Branch1]
    benzene_smi = sf.decoder(benzene_sf)  # C1=CC=CC=C1
except sf.EncoderError as err:
    # Report the failure instead of silently ignoring it.
    print(f"sf.encoder error for {benzene!r}: {err}")
except sf.DecoderError as err:
    print(f"sf.decoder error for {benzene_sf!r}: {err}")

# Count the SELFIES symbols only when the encoding step produced a string.
len_benzene = sf.len_selfies(benzene_sf) if benzene_sf is not None else None  # 8

### One-hot Encoding Example

In [None]:
# https://github.com/aspuru-guzik-group/selfies#integer-and-one-hot-encoding-selfies
dataset = ["[C][O][C]", "[F][C][F]", "[O][=O]", "[C][C][O][C][C]"]

# Vocabulary: every symbol that occurs in the dataset plus the special
# padding symbol [nop], in sorted order so the indices are reproducible.
alphabet = sorted(sf.get_alphabet_from_selfies(dataset) | {"[nop]"})  # ['[=O]', '[C]', '[F]', '[O]', '[nop]']
symbol_to_idx = dict(zip(alphabet, range(len(alphabet))))

# Every encoding is padded to the length of the longest SELFIES string.
pad_to_len = max(map(sf.len_selfies, dataset))  # 5

dimethyl_ether = dataset[0]  # [C][O][C]

# enc_type="both" yields the integer labels together with the one-hot rows.
label, one_hot = sf.selfies_to_encoding(
    selfies=dimethyl_ether,
    vocab_stoi=symbol_to_idx,
    pad_to_len=pad_to_len,
    enc_type="both",
)
print(label, one_hot)

[1, 3, 1, 4, 4] [[0, 1, 0, 0, 0], [0, 0, 0, 1, 0], [0, 1, 0, 0, 0], [0, 0, 0, 0, 1], [0, 0, 0, 0, 1]]
