# Forward predictions of polymer adsorption energies


In [23]:
import time

from sklearn.model_selection import train_test_split

from gpt3forchem.data import get_polymer_data
from gpt3forchem.input import create_single_property_forward_prompts
from gpt3forchem.api_wrappers import fine_tune, query_gpt3, extract_prediction

from pycm import ConfusionMatrix

Let's run one fine-tuning and inference for sanity check and then do it a coule of times for statistics.


## Sanity check


In [26]:
df = get_polymer_data()


In [27]:
df_train, df_test = train_test_split(df, train_size=200, random_state=42)


In [28]:
train_prompts = create_single_property_forward_prompts(
    df_train, "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}
)

test_prompts = create_single_property_forward_prompts(
    df_test, "deltaGmin_cat", {"deltaGmin_cat": "adsorption energy"}
)


In [29]:
train_size  = len(train_prompts)
test_size = len(test_prompts)

filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
train_filename = f"run_files/{filename_base}_train_prompts_polymers_{train_size}.jsonl"
valid_filename = f"run_files/{filename_base}_valid_prompts_polymers_{test_size}.jsonl"

train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(valid_filename, orient="records", lines=True)


In [30]:
modelname = fine_tune(train_filename, valid_filename)

wandb: Currently logged in as: kjappelbaum. Use `wandb login --relogin` to force relogin
wandb: Tracking run with wandb version 0.13.1
wandb: Run data is saved locally in /Users/kevinmaikjablonka/git/kjappelbaum/gpt3forchem/experiments/wandb/run-20220817_204432-ft-9PrAvlxyrCPMzgOBrXZI5Clc
wandb: Run `wandb offline` to turn off syncing.
wandb: Syncing run ft-9PrAvlxyrCPMzgOBrXZI5Clc
wandb: ⭐️ View project at https://wandb.ai/kjappelbaum/GPT-3
wandb: 🚀 View run at https://wandb.ai/kjappelbaum/GPT-3/runs/ft-9PrAvlxyrCPMzgOBrXZI5Clc
wandb: Waiting for W&B process to finish... (success).
wandb:                                                                                
wandb: 
wandb: Run history:
wandb:             elapsed_examples ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
wandb:               elapsed_tokens ▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
wandb:                training_loss █▅▄▅▄▃▄▃▃▄▃▃▃▃▃▅▃▃▂▃▃▃▃▂▃▂▁▁▁▃▂▂▂▁▂▃▁▁▂▁
wandb:   training_sequence_accuracy █▁▁▁▁█▁▁▁▁▁█▁██▁▁▁█▁▁█▁█▁███

🎉 wandb sync completed successfully


In [32]:
test_prompt_subset = test_prompts
completions = query_gpt3(modelname, test_prompt_subset)

In [None]:
predictions = [extract_prediction(completion) for completion in completions]
true = [t.split('@')[0] for t in test_prompt_subset['completion']]

In [None]:
cm = ConfusionMatrix(true, predictions)

In [None]:
print(cm)

Predict   0        2        3        4       3        
Actual
 0       0        0        0        0        4        

 2       0        0        0        0        1        

 3       0        0        0        0        1        

 4       0        0        0        0        4        

3        0        0        0        0        0        





Overall Statistics : 

95% CI                                                            (0.0,0.0)
ACC Macro                                                         0.6
ARI                                                               0.0
AUNP                                                              None
AUNU                                                              None
Bangdiwala B                                                      None
Bennett S                                                         -0.25
CBA                                                               0.0
CSI                                                         