In [70]:
%load_ext autoreload 
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


We can add context in two ways to help improve the model performance:

1. Train on multiple small datasets at the same time (e.g. Xe, Kr, CH4, N2, H2O Henry coefficients) instead of trainin only on one dataset.

2. Train with additional information about the gases and test if this can be used by the model to "extrapolate" to unseen guests

All available outputs are

- "outputs.pbe.bandgap",
- "outputs.Xe-henry_coefficient-mol--kg--Pa",
- "outputs.Kr-henry_coefficient-mol--kg--Pa",
- "outputs.H2O-henry_coefficient-mol--kg--Pa",
- "outputs.H2S-henry_coefficient-mol--kg--Pa",
- "outputs.CO2-henry_coefficient-mol--kg--Pa",
- "outputs.CH4-henry_coefficient-mol--kg--Pa",
- "outputs.O2-henry_coefficient-mol--kg--Pa",


In [71]:
from gpt3forchem.data import get_mof_data, discretize
from sklearn.model_selection import train_test_split
from gpt3forchem.input import (
    create_single_property_forward_prompts,
    create_single_property_forward_prompts_multiple_targets,
)
from gpt3forchem.api_wrappers import extract_prediction, fine_tune, query_gpt3
from collections import Counter
import numpy as np 
from pycm import ConfusionMatrix
import time

In [3]:
df = get_mof_data()


  return HashableDataFrame(pd.read_csv(os.path.join(datadir, "mof.csv")))


In [25]:
df = df.dropna(subset=["outputs.H2O-henry_coefficient-mol--kg--Pa"])


Let's get the logs of the Henry coefficients.


In [30]:
features = [
    "outputs.Xe-henry_coefficient-mol--kg--Pa",
    "outputs.Kr-henry_coefficient-mol--kg--Pa",
    "outputs.H2O-henry_coefficient-mol--kg--Pa",
    "outputs.H2S-henry_coefficient-mol--kg--Pa",
    "outputs.CO2-henry_coefficient-mol--kg--Pa",
    "outputs.CH4-henry_coefficient-mol--kg--Pa",
    "outputs.O2-henry_coefficient-mol--kg--Pa",
]


In [36]:
for feature in features:
    df[feature + '_log'] = np.log10(df[feature] + 1e-40)

Since the datasets are relatively small, we will only work with three bins.

In [69]:
for feature in features:

    discretize(
        df, f"{feature}_log", n_bins=3, labels=["low", "medium", "high"]
    )


In [42]:
Counter(df["outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat"])


Counter({'medium': 47, 'low': 99, 'high': 7})

In [73]:
train_df, test_df = train_test_split(
    df, train_size=0.8, stratify=df["outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat"]
)


# 1. Train on multiple small datasets at the same time


Let's take H2O Henry coefficient as our target. We first train a model just on it and then train a model on it and a bunch of Henry coefficients of other molecules (but we still only test on H2O).


In [74]:
train_prompts = create_single_property_forward_prompts(train_df, "outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat", {'outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat': 'H2O Henry coefficient'}, representation_col="info.mofid.mofid_clean", encode_value=False)

In [75]:
test_prompts = create_single_property_forward_prompts(
    test_df,
    "outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat",
    {"outputs.H2O-henry_coefficient-mol--kg--Pa_log_cat": "H2O Henry coefficient"},
    representation_col="info.mofid.mofid_clean",
    encode_value=False,
)


In [76]:
filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
train_filename = f"run_files/{filename_base}_train_prompts_mof_h2o.jsonl"
valid_filename = f"run_files/{filename_base}_valid_prompts_mof_h2o.jsonl"

train_prompts.to_json(train_filename, orient="records", lines=True)
test_prompts.to_json(valid_filename, orient="records", lines=True)

In [78]:
fine_tune(train_filename, valid_filename)

Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/bin/openai", line 8, in <module>
    sys.exit(main())
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/_openai_scripts.py", line 63, in main
    args.func(args)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/cli.py", line 545, in sync
    resp = openai.wandb_logger.WandbLogger.sync(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 74, in sync
    fine_tune_logged = [
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 75, in <listcomp>
    cls._log_fine_tune(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 125, in _log_fine_tune
    wandb_run = cls._get_wandb_run(run_path)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/pyth

'ada:ft-lsmoepfl-2022-09-13-20-55-28'

In [79]:
completions = query_gpt3('ada:ft-lsmoepfl-2022-09-13-20-55-28', test_prompts)

In [80]:
predictions = [
    extract_prediction(completions, i) for i in range(len(completions['choices']))
]

In [81]:
true = test_prompts['completion'].apply(lambda x: x.split('@')[0].strip())

In [82]:
cm = ConfusionMatrix(actual_vector=true.to_list(), predict_vector=predictions)

In [84]:
print(cm)

Predict      high         low          medium       
Actual
high         0            0            1            

low          0            19           1            

medium       0            9            1            





Overall Statistics : 

95% CI                                                            (0.47673,0.81359)
ACC Macro                                                         0.76344
ARI                                                               0.12517
AUNP                                                              0.54329
AUNU                                                              0.52276
Bangdiwala B                                                      0.61356
Bennett S                                                         0.46774
CBA                                                               0.25952
CSI                                                               None
Chi-Squared                                                       None
Chi-Squ

Now, let's use the same train/test split - but add additional outputs.

In [85]:
train_prompts_aug = create_single_property_forward_prompts_multiple_targets(
    train_df,
    [
        "outputs.Xe-henry_coefficient-mol--kg--Pa_log_cat",
        "outputs.Kr-henry_coefficient-mol--kg--Pa_log_cat",
        "outputs.H2S-henry_coefficient-mol--kg--Pa_log_cat",
        "outputs.CO2-henry_coefficient-mol--kg--Pa_log_cat",
        "outputs.CH4-henry_coefficient-mol--kg--Pa_log_cat",
        "outputs.O2-henry_coefficient-mol--kg--Pa_log_cat",
    ],
    {
        "outputs.Xe-henry_coefficient-mol--kg--Pa_log_cat": "Xe Henry coefficient",
        "outputs.Kr-henry_coefficient-mol--kg--Pa_log_cat": "Kr Henry coefficient",
        "outputs.H2S-henry_coefficient-mol--kg--Pa_log_cat": "H2S Henry coefficient",
        "outputs.CO2-henry_coefficient-mol--kg--Pa_log_cat": "CO2 Henry coefficient",
        "outputs.CH4-henry_coefficient-mol--kg--Pa_log_cat": "CH4 Henry coefficient",
        "outputs.O2-henry_coefficient-mol--kg--Pa_log_cat": "O2 Henry coefficient",
    },
    representation_col="info.mofid.mofid_clean",
    encode_value=False,
)


In [97]:
import pandas as pd

In [98]:
train_prompts_aug_ = []

for i, row in train_prompts_aug.iterrows():
    if not 'nan' in  row['completion']:
        train_prompts_aug_.append(row)

train_prompts_aug = pd.DataFrame(train_prompts_aug_)

In [99]:
train_augment_filename = f"run_files/{filename_base}_train_prompts_mof_aug.jsonl"

train_prompts_aug.to_json(train_augment_filename, orient="records", lines=True)

In [100]:
fine_tune(train_augment_filename, valid_filename)

Fine-tune ft-lOtEpqD3BeNqeZo71xdB83n6 has the status "pending" and will not be logged
🎉 wandb sync completed successfully


'ada:ft-lsmoepfl-2022-09-13-21-17-29'

In [101]:
completions_aug = query_gpt3('ada:ft-lsmoepfl-2022-09-13-21-17-29', test_prompts)

In [102]:
predictions_aug = [
    extract_prediction(completions_aug, i) for i in range(len(completions_aug['choices']))
]

In [103]:
true = test_prompts['completion'].apply(lambda x: x.split('@')[0].strip())

In [104]:
cm_aug = ConfusionMatrix(actual_vector=true.to_list(), predict_vector=predictions_aug)

In [106]:
print(cm_aug)

Predict      high         low          medium       
Actual
high         0            0            1            

low          3            2            15           

medium       0            0            10           





Overall Statistics : 

95% CI                                                            (0.21563,0.55856)
ACC Macro                                                         0.5914
ARI                                                               -0.10601
AUNP                                                              0.56905
AUNU                                                              0.53968
Bangdiwala B                                                      0.34323
Bennett S                                                         0.08065
CBA                                                               0.16154
CSI                                                               -0.17179
Chi-Squared                                                       3.27885


Ok, so just adding more objectives to the training actually makes the performance worse.

# 2. Train w/ additional information about the gases


### Using only the chemical name of the molecules as context


### Numerically encoding the molecular properties
