# Basic development and testing of the polymer inverse design case study

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell runs, so edits to the project package are
# picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [21]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split

from gpt3forchem.api_wrappers import fine_tune, query_gpt3, extract_prediction
from gpt3forchem.data import get_polymer_data
from gpt3forchem.input import create_single_property_inverse_polymer_prompts

plt.style.use(['nature', 'science'])

We hold out 10% of the data (stratified by adsorption-energy class) as a "test" set, so that we have independent prompts that were not used for fine-tuning.

In [10]:
# Load the polymer dataset and split it 90/10 into fine-tuning and held-out
# rows, stratified on the binned adsorption energy so both parts contain the
# same mix of classes.
df = get_polymer_data()

# Fixed seed: with random_state=None every run produced a different split, so
# the held-out prompts could not be regenerated and could overlap with the
# rows an earlier fine-tuned model had already seen.
SPLIT_SEED = 42
train_df, test_df = train_test_split(
    df,
    train_size=0.9,
    random_state=SPLIT_SEED,
    stratify=df["deltaGmin_cat"],
)

In [11]:
# Display the training split for a visual sanity check.
train_df

Unnamed: 0.1,Unnamed: 0,smiles,string,deltaGmin,A2_normalized,deltaGmin_cat,A2_normalized_cat,num_[W],max_[W],num_[Tr],...,[W],[W].1,[Tr],[Tr].1,[Ta],[Ta].1,[R],[R].1,rel_shannon,length
2546,2546,[W][R][Tr][W][W][R][R][W][R][Ta][R][W][R][R][T...,W-R-B-W-W-R-R-W-R-A-R-W-R-R-A-A-W-R-R-R-W-W-W-...,-13.172710,0.124250,small,medium,0.375000,3,0.125000,...,12.0,0.352941,4.0,0.117647,6.0,0.176471,12.0,0.352941,0.366673,34
2604,2604,[R][Tr][W][W][Ta][Ta][Ta][R][Tr][Ta][Ta][R][Ta...,R-B-W-W-A-A-A-R-B-A-A-R-A-A-B-R-A-W-R-A-R-R-R-...,-19.821176,0.035329,very small,small,0.142857,2,0.000000,...,4.0,0.125000,4.0,0.125000,12.0,0.375000,12.0,0.375000,0.362256,32
2373,2373,[W][Tr][Ta][R][Ta][Ta][W][W][R][Ta][W][Tr][W][...,W-B-A-R-A-A-W-W-R-A-W-B-W-R-W-R-B-R-B-A-R-A-R-...,-14.718235,-0.206857,small,very small,0.400000,2,0.200000,...,12.0,0.272727,10.0,0.227273,12.0,0.272727,10.0,0.227273,0.365245,44
2379,2379,[R][Ta][Tr][Ta][Tr][Tr][W][Tr][Tr][R][W][R][Ta...,R-A-B-A-B-B-W-B-B-R-W-R-A-B-R-R-B-B-R-A-R-B-W-...,-11.994670,0.016217,medium,small,0.000000,0,0.750000,...,4.0,0.133333,12.0,0.400000,4.0,0.133333,10.0,0.333333,0.373406,30
558,558,[W][Tr][Ta][Ta][Tr][Ta][Tr][Tr][R][W][W][Tr][T...,W-B-A-A-B-A-B-B-R-W-W-B-B-B-B-R-A-W-A-B-A-W-A-...,-8.874077,0.108918,large,medium,0.200000,2,0.600000,...,6.0,0.200000,12.0,0.400000,8.0,0.266667,4.0,0.133333,0.385019,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2293,2293,[Ta][R][Tr][R][R][Ta][Ta][W][R][Ta][Tr][Tr][Ta...,A-R-B-R-R-A-A-W-R-A-B-B-A-W-B-W-W-R-B-B-R-W-B-...,-11.988838,-0.150618,medium,very small,0.285714,2,0.428571,...,10.0,0.277778,10.0,0.277778,6.0,0.166667,10.0,0.277778,0.381209,36
621,621,[W][Ta][W][Ta][Ta][R][Tr][W][R][W][Ta][Tr][Tr]...,W-A-W-A-A-R-B-W-R-W-A-B-B-W-R-A-B-W-A-A-W-W-B-...,-8.925864,-0.023971,large,small,0.166667,2,0.333333,...,12.0,0.300000,12.0,0.300000,12.0,0.300000,4.0,0.100000,0.356161,40
2289,2289,[Ta][Tr][W][Tr][W][Tr][R][Tr][Tr][W][R][Ta][Tr...,A-B-W-B-W-B-R-B-B-W-R-A-B-A-W-W-R-A-W-R-R-A-R-...,-11.468817,-0.073456,medium,very small,0.400000,2,0.200000,...,8.0,0.235294,10.0,0.294118,6.0,0.176471,10.0,0.294118,0.387489,34
1140,1140,[Tr][W][Tr][Ta][W][W][Ta][R][W][Ta][Tr][Tr][W]...,B-W-B-A-W-W-A-R-W-A-B-B-W-B-W-B-R-R-B-R-W-W-B-...,-6.879497,0.214136,very large,large,0.400000,2,0.400000,...,10.0,0.312500,12.0,0.375000,4.0,0.125000,6.0,0.187500,0.376571,32


In [12]:
def _adsorption_energy_prompts(frame):
    """Build inverse-design prompts for the ``deltaGmin_cat`` column of ``frame``.

    The column is referred to as "adsorption energy" in the prompt text and
    the class label is passed through with ``encode_value=False``. A fresh
    rename mapping is created on every call, exactly as when the call is
    written out by hand for each split.
    """
    return create_single_property_inverse_polymer_prompts(
        frame,
        "deltaGmin_cat",
        {"deltaGmin_cat": "adsorption energy"},
        encode_value=False,
    )


# Same prompt recipe for both splits; only the input frame differs.
train_prompts = _adsorption_energy_prompts(train_df)
test_prompts = _adsorption_energy_prompts(test_df)

In [13]:
# Display the generated prompt/completion pairs for the training split.
train_prompts

Unnamed: 0,prompt,completion
0,what is a polymer with small adsorption energy...,W-R-B-W-W-R-R-W-R-A-R-W-R-R-A-A-W-R-R-R-W-W-W...
1,what is a polymer with very small adsorption e...,R-B-W-W-A-A-A-R-B-A-A-R-A-A-B-R-A-W-R-A-R-R-R...
2,what is a polymer with small adsorption energy...,W-B-A-R-A-A-W-W-R-A-W-B-W-R-W-R-B-R-B-A-R-A-R...
3,what is a polymer with medium adsorption energ...,R-A-B-A-B-B-W-B-B-R-W-R-A-B-R-R-B-B-R-A-R-B-W...
4,what is a polymer with large adsorption energy...,W-B-A-A-B-A-B-B-R-W-W-B-B-B-B-R-A-W-A-B-A-W-A...
...,...,...
2807,what is a polymer with medium adsorption energ...,A-R-B-R-R-A-A-W-R-A-B-B-A-W-B-W-W-R-B-B-R-W-B...
2808,what is a polymer with large adsorption energy...,W-A-W-A-A-R-B-W-R-W-A-B-B-W-R-A-B-W-A-A-W-W-B...
2809,what is a polymer with medium adsorption energ...,A-B-W-B-W-B-R-B-B-W-R-A-B-A-W-W-R-A-W-R-R-A-R...
2810,what is a polymer with very large adsorption e...,B-W-B-A-W-W-A-R-W-A-B-B-W-B-W-B-R-R-B-R-W-W-B...


In [15]:
# Timestamp prefix so that files from separate runs do not overwrite each other.
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())

# DataFrame.to_json does not create missing parent directories; make sure the
# output folder exists (no-op if it is already there) so the cell also works
# on a fresh checkout.
os.makedirs("run_files", exist_ok=True)

# The suffix names this notebook's dataset. It used to read "mof_h2o", a
# leftover from another case study that mislabelled the polymer prompt files.
train_filename = f"run_files/{filename_base}_train_prompts_polymers.jsonl"
valid_filename = f"run_files/{filename_base}_valid_prompts_polymers.jsonl"

# Write one JSON record per line (JSONL). The held-out prompts are stored as
# the validation file.
train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(valid_filename, orient="records", lines=True)


In [17]:
# Start a fine-tuning run from the train/validation JSONL files written above.
# The return value is not stored; it is only echoed as the cell output (the
# recorded run shows the fine-tuned model name there).
# NOTE(review): re-running this cell presumably submits a new, billable
# fine-tune job -- confirm before using "Run All".
fine_tune(train_filename, valid_filename)   

Fine-tune ft-sdP5bW8IqAPnuGCP7cCyiJPB has the status "pending" and will not be logged
🎉 wandb sync completed successfully


'ada:ft-lsmoepfl-2022-09-14-17-43-30'

In [27]:
# Model name echoed by the recorded fine_tune(...) run. It is pinned here so
# the held-out prompts can be queried without launching a new fine-tune.
FINE_TUNED_MODEL = 'ada:ft-lsmoepfl-2022-09-14-17-43-30'
# Upper bound on the number of tokens generated per completion.
MAX_COMPLETION_TOKENS = 200

completions = query_gpt3(
    FINE_TUNED_MODEL,
    test_prompts,
    max_tokens=MAX_COMPLETION_TOKENS,
)

In [30]:
# Pull the predicted polymer string out of every choice in the response.
# extract_prediction takes the whole response plus a choice index, so only the
# position from enumerate() is needed; the choice object itself is ignored.
predictions = [
    extract_prediction(completions, choice_idx)
    for choice_idx, _ in enumerate(completions["choices"])
]

In [31]:
# Display the extracted predictions (one entry per returned choice).
predictions

['W-W-R-W-R-W-A-W-R-B-W-A-R-B-W-A-R-B-W-A-R-B-W-R-A-B-W-B-R-A-B',
 'B-B-A-B-A-B-A-B-W-R-W-A-B-R-A-B-W-R-A-W-B-R-B',
 'B-R-B-B-R-B-R-B-R-W-B-A-R-B-W-A-R-B-W-A-R-B-W-A-R-B-W-R-A-B-R-W-A-B-R',
 'B-R-R-B-R-B-R-B-W-R-B-R-W-B-R-W-A-B-R-W-B-A-R-W-B-A-R-B-W-A-R',
 'A-R-R-A-R-A-R-A-W-R-A-R-W-A-R-W-A-B-A-W-R-B-W-R-A-B-A-W-R-B-A',
 'B-B-B-W-B-W-B-W-W-B-W-B-A-R-W-B-A-R-W-B-A-R-W-B-A-R-W',
 'B-B-B-B-R-B-R-B-A-R-B-A-R-B-W-A-R-B-W-A-R-B-A-W-R-B-W-R-B-A-R',
 'R-R-R-R-R-B-R-B-A-R-B-A-R-B-A-B-W-R-A-B-W-R-A-B-R-W-R-B-A-W-R',
 'R-R-R-R-R-B-A-R-A-B-A-R-B-A-B-R-A-B-R-W-A-B-W-R-A-R-B-W-R-A-B-W-R',
 'R-R-R-A-R-A-R-A-R-A-R-B-W-A-A-R-W-B-A-R-W-B-A-R-W-B-R',
 'B-B-B-B-A-B-R-B-A-R-W-B-A-R-W-A-B-R-A-B-W-R-B-A-R-W-B',
 'W-R-W-R-W-A-W-B-R-W-A-B-W-R-A-B-W-R-A-W-R-B-A-W-R-B-A-W-R-B-W-R-B-A-W-B-R-A-R',
 'W-B-W-B-R-W-B-A-W-R-B-W-R-B-A-W-R-B-A-W-R-B-A-W-R-B-A-W-R-B-W-A-R-B-W-R-A-B-W-R-B-A-R',
 'B-R-R-B-R-B-R-B-R-A-B-W-A-R-B-W-A-B-R-W-B-R-A-W-B',
 'A-A-A-A-B-W-R-B-W-A-B-W-A-B-W-R-A-B-W-R-A-W-R-B-A-W-R-B-A-W-R-B-W-R-B-A-W-