In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# local package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [72]:
import os
import time

import numpy as np
import pandas as pd
from fastcore.helpers import save_pickle
from sklearn.model_selection import train_test_split

from gpt3forchem.api_wrappers import fine_tune, query_gpt3, extract_inverse_prediction
from gpt3forchem.data import get_bandgap_data

In [115]:
# Load the band-gap table; later cells read its 'smiles', 'selfies' and
# 'GFN2_HOMO_LUMO_GAP' columns.
data = get_bandgap_data()

In [7]:
# Fixed-size, seeded split used for the 500-example experiments.
# NOTE(review): in file order this runs before the cell that merges repeated
# SMILES, so the same molecule could land in both splits — confirm cell order.
train_data, test_data = train_test_split(
    data,
    train_size=500,
    test_size=100,
    random_state=42,
)

In [116]:
# SMILES -> SELFIES lookup taken from the table (for a repeated SMILES the last row wins).
smiles_selfies = {
    smiles_key: selfies_value
    for smiles_key, selfies_value in zip(data['smiles'], data['selfies'])
}

In [117]:
# Collapse repeated SMILES into one row by averaging their numeric columns.
# numeric_only=True keeps string columns such as 'selfies' out of the mean
# (recent pandas raises a TypeError when asked to average them); the SELFIES
# string is then restored from the `smiles_selfies` lookup.
data = data.groupby('smiles').mean(numeric_only=True).reset_index()
data['selfies'] = data['smiles'].map(smiles_selfies)

In [14]:
PROMPT_TEMPLATE_bandgap_inverse = "What is a molecule with a bandgap of {} eV###"
COMPLETION_TEMPLATE_bandgap_inverse= "{}@@@"

# Factor applied to the 'GFN2_HOMO_LUMO_GAP' column before it is quoted in eV
# (27.2114 eV per Hartree, so the column is presumably stored in Hartree — confirm).
HARTREE_TO_EV = 27.2114


def generate_inverse_photoswitch_prompts(data: pd.DataFrame, representation: str = 'smiles') -> pd.DataFrame:
    """Build prompt/completion pairs for the inverse band-gap task.

    Despite the name, the prompts ask for a molecule with a given band gap.

    Args:
        data: Table with a 'GFN2_HOMO_LUMO_GAP' column, a 'smiles' column and
            the column named by ``representation``.
        representation: Name of the column whose value is used as the
            completion text (e.g. 'smiles' or 'selfies').

    Returns:
        A DataFrame with a fresh 0..n-1 index and the columns 'prompt'
        (question quoting the gap rounded to one decimal), 'completion'
        (the representation followed by the '@@@' stop marker) and 'SMILES'
        (the row's 'smiles' value, whatever representation is used).

    Raises:
        KeyError: If one of the required columns is missing from ``data``.
    """
    # Convert and round the whole column at once instead of row by row.
    gaps_ev = np.round(data['GFN2_HOMO_LUMO_GAP'].to_numpy(dtype=float) * HARTREE_TO_EV, 1)

    return pd.DataFrame(
        {
            "prompt": [PROMPT_TEMPLATE_bandgap_inverse.format(gap) for gap in gaps_ev],
            "completion": [
                COMPLETION_TEMPLATE_bandgap_inverse.format(molecule)
                for molecule in data[representation]
            ],
            "SMILES": data["smiles"].tolist(),
        }
    )


In [15]:
# Prompt/completion tables for both splits, using the default 'smiles' representation.
train_prompts = generate_inverse_photoswitch_prompts(train_data)
test_prompts = generate_inverse_photoswitch_prompts(test_data)

In [16]:
# Show the generated training prompts as a sanity check.
train_prompts

Unnamed: 0,prompt,completion,SMILES
0,What is a molecule with a bandgap of 2.1 eV###,[H]c1c([H])c(-n2nc(C(=O)N([H])C([H])([H])C([H]...,[H]c1c([H])c(-n2nc(C(=O)N([H])C([H])([H])C([H]...
1,What is a molecule with a bandgap of 1.9 eV###,[H]c1c([H])c([H])c2c(c1[H])C(=O)C1=[N+]([O-])[...,[H]c1c([H])c([H])c2c(c1[H])C(=O)C1=[N+]([O-])[...
2,What is a molecule with a bandgap of 1.9 eV###,[H]c1nc2c(N3C([H])([H])C([H])([H])C([H])(C#N)C...,[H]c1nc2c(N3C([H])([H])C([H])([H])C([H])(C#N)C...
3,What is a molecule with a bandgap of 3.0 eV###,[H]c1c([H])c(S(=O)(=O)N([H])C([H])([H])C([H])(...,[H]c1c([H])c(S(=O)(=O)N([H])C([H])([H])C([H])(...
4,What is a molecule with a bandgap of 3.5 eV###,[H]OC(=O)C([H])([H])c1c([H])c([H])c(OC([H])([H...,[H]OC(=O)C([H])([H])c1c([H])c([H])c(OC([H])([H...
...,...,...,...
495,What is a molecule with a bandgap of 2.0 eV###,[H]c1nc(-c2c([H])c(F)c(F)c([H])c2F)c(-c2c([H])...,[H]c1nc(-c2c([H])c(F)c(F)c([H])c2F)c(-c2c([H])...
496,What is a molecule with a bandgap of 2.7 eV###,[H]c1c([H])c(C(=O)n2c(C([H])([H])C([H])([H])C(...,[H]c1c([H])c(C(=O)n2c(C([H])([H])C([H])([H])C(...
497,What is a molecule with a bandgap of 1.8 eV###,[H]c1c([H])c([H])c(-c2nc(N([H])c3c([H])c([H])c...,[H]c1c([H])c([H])c(-c2nc(N([H])c3c([H])c([H])c...
498,What is a molecule with a bandgap of 2.0 eV###,[H]c1c([H])c(OC([H])([H])[H])c(C([H])(C([H])([...,[H]c1c([H])c(OC([H])([H])[H])c(C([H])(C([H])([...


In [17]:
def train_inverse_model(train_prompts, representation):
    """Write the training prompts to a JSONL file and start a fine-tuning run.

    Args:
        train_prompts: DataFrame of training examples; every column is written
            to the JSONL file, one record per line.
        representation: Label for the molecular representation; only used in
            the output file name.

    Returns:
        Tuple ``(modelname, train_filename)`` where ``modelname`` is whatever
        ``fine_tune`` returns and ``train_filename`` is the path of the
        written JSONL file.
    """
    train_size = len(train_prompts)

    # Timestamped name so repeated runs never overwrite each other's files.
    filename_base = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())
    train_filename = f"run_files/{filename_base}_train_prompts_bandgap_inverse_{representation}_{train_size}.jsonl"

    # Make sure the output directory exists; to_json does not create it.
    os.makedirs(os.path.dirname(train_filename), exist_ok=True)
    train_prompts.to_json(train_filename, orient="records", lines=True)

    # NOTE(review): the training file is passed as both of the first two
    # arguments; presumably the second one is a validation file — confirm
    # that reusing the training data there is intended.
    modelname = fine_tune(train_filename, train_filename, "ada")
    return modelname, train_filename

In [20]:
# Fine-tune on the 500 SMILES prompts; keep the model name and the training file path.
modelname, train_filename = train_inverse_model(train_prompts, 'smiles')

Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/bin/openai", line 8, in <module>
    sys.exit(main())
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/_openai_scripts.py", line 63, in main
    args.func(args)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/cli.py", line 545, in sync
    resp = openai.wandb_logger.WandbLogger.sync(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 74, in sync
    fine_tune_logged = [
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 75, in <listcomp>
    cls._log_fine_tune(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 125, in _log_fine_tune
    wandb_run = cls._get_wandb_run(run_path)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/pyth

In [23]:
# Inspect one full prompt string, including the '###' separator.
train_prompts['prompt'].iloc[0]

'What is a molecule with a bandgap of 2.1 eV###'

In [27]:
from gpt3forchem.output import test_inverse_bandgap

In [66]:
# Quick smoke test on the first ten test prompts before running the full sweep.
res = test_inverse_bandgap(
    test_prompts.iloc[:10],
    modelname,
    train_prompts['SMILES'].iloc[:10],
    0.75,
    max_tokens=250,
)

2022-11-25 11:48:51.396 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]c1nc([H])c([H])c(N([H])C(=O)N([H])c2c([H])c([H])c([H])c([H])c2[H])c1[H]
2022-11-25 11:48:51.396 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]c1nc([H])c([H])c(N([H])C(=O)N([H])c2c([H])c([H])c([H])c([H])c2[H])c1[H]
2022-11-25 11:48:51.400 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 6 samples
2022-11-25 11:48:51.438 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
  gen_mol_act = model.predict_generator(
2022-11-25 11:48:59.342 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Computed frechet score: (77.47723397212097, 1.8638586028278752e-07)
2022-11-25 11:48:59.398 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:970 - Computed KL div score: 0.3089225170957535


In [70]:
def test_inverse_model(modelname, test_prompts, df_train, max_tokens: int =250, temperatures=None, representation="SMILES"):
    """Evaluate a fine-tuned inverse model at several sampling temperatures.

    Args:
        modelname: Identifier of the fine-tuned model to query.
        test_prompts: Prompts passed on to ``test_inverse_bandgap``.
        df_train: Training prompt table; its 'SMILES' column is passed on as
            ``train_smiles``.
        max_tokens: Forwarded to ``test_inverse_bandgap``.
        temperatures: Iterable of temperatures to test. ``None`` or an empty
            iterable selects the default sweep from 0 to 1.5 in steps of 0.25.
        representation: Forwarded to ``test_inverse_bandgap`` and used in the
            progress messages.

    Returns:
        List with one result per temperature that completed. Temperatures
        whose evaluation raised are reported on stdout and left out, so the
        list can be shorter than the sweep.
    """
    default_temperatures = [0, 0.25, 0.5, 0.75, 1.0, 1.25, 1.5]
    if temperatures is None:
        temperatures = default_temperatures
    else:
        # list() also accepts arrays and generators, where a bare truth test would fail.
        temperatures = list(temperatures) or default_temperatures

    train_smiles = df_train['SMILES'].to_list()
    results = []
    for temperature in temperatures:
        print(f"Testing temperature {temperature} for {representation}")
        try:
            result = test_inverse_bandgap(
                test_prompts, modelname, train_smiles=train_smiles, temperature=temperature, max_tokens=max_tokens, representation=representation
            )
        except Exception as e:
            # Best-effort sweep: keep going, but say which run is missing and why.
            print(f"Temperature {temperature} for {representation} failed with {type(e).__name__}: {e}")
            continue

        results.append(result)

    return results

In [71]:
# Run the default temperature sweep for the model trained on 500 SMILES prompts.
res_500_smiles = test_inverse_model(modelname, test_prompts, train_prompts, representation="SMILES")

Testing temperature 0 for SMILES


2022-11-25 11:58:46.088 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]c1c([H])c([H])c(C([H])([H])N([H])C(=O)N([H])c2c([H])c([H])c(C(=O)N([H])c3c([H])c([H])c(OC([H])([H])[H])c([H])c3[H])c([H])c2[H])c([H])c1[H]
2022-11-25 11:58:46.088 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]c1c([H])c([H])c(C([H])([H])N([H])C(=O)N([H])c2c([H])c([H])c(C(=O)N([H])c3c([H])c([H])c(OC([H])([H])[H])c([H])c3[H])c([H])c2[H])c([H])c1[H]
2022-11-25 11:58:46.110 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 100 samples
2022-11-25 11:58:46.173 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
  gen_mol_act = model.predict_generator(
  arg2 = norm(X.dot(X) - A, 'fro')**2 / norm(A, 'fro')
2022-11-25 11:59:10.584 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Co

Testing temperature 0.25 for SMILES


2022-11-25 11:59:46.096 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]c1c([H])c([H])c(C([H])([H])N([H])C(=O)N([H])c2c([H])c([H])c(C(=O)N([H])c3c([H])c([H])c([H])c([H])c3[H])c([H])c2[H])c([H])c1[H]
2022-11-25 11:59:46.097 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]c1c([H])c([H])c(C([H])([H])N([H])C(=O)N([H])c2c([H])c([H])c(C(=O)N([H])c3c([H])c([H])c([H])c([H])c3[H])c([H])c2[H])c([H])c1[H]
2022-11-25 11:59:46.119 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 76 samples
2022-11-25 11:59:46.149 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 12:00:10.442 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Computed frechet score: (57.22531938419547, 1.0702171674685315e-05)
2022-11-25 12:00:10.811 | DEBUG    | gpt3forchem.output

Testing temperature 0.5 for SMILES


2022-11-25 12:00:45.102 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]c1c([H])c([H])c(C(=O)N([H])c2nc3c([H])c(C(=O)N([H])C([H])([H])C([H])([H])C([H])([H])N4C([H])([H])C([H])([H])C([H])([H])C4([H])[H])c([H])c([H])c3sc2C([H])([H])[H])c([H])c1[H]
2022-11-25 12:00:45.102 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]c1c([H])c([H])c(C(=O)N([H])c2nc3c([H])c(C(=O)N([H])C([H])([H])C([H])([H])C([H])([H])N4C([H])([H])C([H])([H])C([H])([H])C4([H])[H])c([H])c([H])c3sc2C([H])([H])[H])c([H])c1[H]
2022-11-25 12:00:45.133 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 60 samples
2022-11-25 12:00:45.161 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 12:01:09.334 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Computed frechet score: (38.6

Testing temperature 0.75 for SMILES


2022-11-25 12:01:44.624 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]c1nc(N([H])c2nc(C([H])([H])[H])c([H])c([H])c2N([H])[H])c([H])c(C([H])([H])[H])c1[H]
2022-11-25 12:01:44.624 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]c1nc(N([H])c2nc(C([H])([H])[H])c([H])c([H])c2N([H])[H])c([H])c(C([H])([H])[H])c1[H]
2022-11-25 12:01:44.666 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 30 samples
2022-11-25 12:01:44.693 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 12:02:08.755 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Computed frechet score: (45.54272623196655, 0.0001107156620423434)
2022-11-25 12:02:08.945 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:970 - Computed KL div score: 0.3657454532513622


Testing temperature 1.0 for SMILES


2022-11-25 12:02:44.262 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]c1c([H])c(C([H])([H])Br)c(N([H])c2c(N3C([H])([H])C([H])([H])N(C(=O)N([H])C([H])([H])[H])SC([H])([H])C3([H])[H])c([H])c([H])c2C([H])([H])C3([H])[H])c1[H]
2022-11-25 12:02:44.262 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]c1c([H])c(C([H])([H])Br)c(N([H])c2c(N3C([H])([H])C([H])([H])N(C(=O)N([H])C([H])([H])[H])SC([H])([H])C3([H])[H])c([H])c([H])c2C([H])([H])C3([H])[H])c1[H]
2022-11-25 12:02:44.299 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 21 samples
2022-11-25 12:02:44.331 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 12:03:02.060 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Computed frechet score: (45.4895519614289, 0.00011189939018701424)
2022

Testing temperature 1.25 for SMILES


2022-11-25 12:03:37.901 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]/C(=N(C([H])([H])S(=O)(=O)OF)(=O)C([H])([H])S(=O)(=O)C([H])([H])c1nnc([H])c([H])c2nc(N([H])s1C([H])([H])C9([H])([H])C([H])([H])OC([H])([H])C9([H])[H])c3sc([H])c([H])c([H])c([H])c32)c1[H])C2=O
2022-11-25 12:03:37.902 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]/C(=N(C([H])([H])S(=O)(=O)OF)(=O)C([H])([H])S(=O)(=O)C([H])([H])c1nnc([H])c([H])c2nc(N([H])s1C([H])([H])C9([H])([H])C([H])([H])OC([H])([H])C9([H])[H])c3sc([H])c([H])c([H])c([H])c32)c1[H])C2=O
2022-11-25 12:03:37.927 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 5 samples
2022-11-25 12:03:37.985 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 12:03:47.195 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:

Testing temperature 1.5 for SMILES


2022-11-25 12:04:23.150 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H]OC1(C([H])([H])OC(C([H])([H]])n2c2c(-c3c(ocooc(-ce)c(-c4c(-c([H])
2022-11-25 12:04:23.151 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]OC1(C([H])([H])OC(C([H])([H]])n2c2c(-c3c(ocooc(-ce)c(-c4c(-c([H])
2022-11-25 12:04:23.175 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 3 samples
2022-11-25 12:04:23.256 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 12:04:29.991 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Computed frechet score: (83.02738582460621, 6.142326499048503e-08)
  return n/db/n.sum(), bin_edges
2022-11-25 12:04:30.025 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:970 - Computed KL div score: nan


In [73]:
# Persist the SMILES sweep results so they can be reloaded without re-querying the model.
save_pickle( "run_files/res_500_smiles.pkl", res_500_smiles)

In [79]:
# Unique valid generated molecules per sampling temperature.
# `res['valid_smiles']` is used to index `res['predictions']`; presumably it is
# a mask or index list of the valid predictions — confirm.
smiles_500_smiles_sets = {
    res['meta']['temperature']: set(res['predictions'][res['valid_smiles']])
    for res in res_500_smiles
}

# One file per temperature, one molecule per line, no trailing newline.
os.makedirs('for_xtb_opt', exist_ok=True)
for temp, smiles_set in smiles_500_smiles_sets.items():
    with open(f'for_xtb_opt/500_smiles_{temp}.txt', 'w') as f:
        # Sorted because set order is arbitrary; this makes the files reproducible.
        f.write('\n'.join(sorted(smiles_set)))

For the ground truth, we need to switch to running on the server

#### Now use SELFIES

In [80]:
# Same splits as before, but with the SELFIES string as completion text.
train_prompts_selfies = generate_inverse_photoswitch_prompts(train_data, 'selfies')
test_prompts_selfies = generate_inverse_photoswitch_prompts(test_data, 'selfies')

In [84]:
# Fine-tune a second model on the 500 SELFIES prompts.
modelname_500_selfies, train_filename_500_selfies = train_inverse_model(train_prompts_selfies, 'selfies')

Traceback (most recent call last):
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/bin/openai", line 8, in <module>
    sys.exit(main())
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/_openai_scripts.py", line 63, in main
    args.func(args)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/cli.py", line 545, in sync
    resp = openai.wandb_logger.WandbLogger.sync(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 74, in sync
    fine_tune_logged = [
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 75, in <listcomp>
    cls._log_fine_tune(
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/python3.9/site-packages/openai/wandb_logger.py", line 125, in _log_fine_tune
    wandb_run = cls._get_wandb_run(run_path)
  File "/Users/kevinmaikjablonka/miniconda3/envs/gpt3/lib/pyth

In [86]:
# Run the default temperature sweep for the SELFIES model.
# NOTE(review): the variable name is misspelled (`seflies`); renaming it requires
# updating every later use in the notebook.
res_500_seflies = test_inverse_model(modelname_500_selfies, test_prompts_selfies , train_prompts_selfies, representation="selfies")

Testing temperature 0 for selfies


2022-11-25 13:59:53.495 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch2][Branch1][#Branch1][C][=Branch1][C][=O][N][Branch1][C][H][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch2][Ring1][=Branch1][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][H][C][Branch1][C][H][=C][Ring2][Ring1][Branch1][H][C][Branch1][C][H][=C][Ring2][Ring2][Branch1][H]
2022-11-25 13:59:53.532 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]C=C([H])C([H])=C(C(=O)N([H])C=C([H])C([H])=C(C1=C([H])C([H])=C([H])C([H])=C1[H])C([H])=C[H])[H]
2022-11-25 13:59:53.540 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 100 samples
2022-11-25 13:59:53.612 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_

Testing temperature 0.25 for selfies


2022-11-25 14:00:55.903 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch2][=Branch1][#Branch1][C][Branch1][C][H][Branch1][C][H][N][Branch1][C][H][C][=Branch1][C][=O][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch2][Ring1][=Branch1][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][H][C][Branch1][C][H][=C][Ring2][Ring1][#Branch1][H][C][Branch1][C][H][Branch1][C][H][C][Branch1][C][H][Branch1][C][H][C][Branch1][C][H][
2022-11-25 14:00:55.934 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch2][=Branch1][#Branch1][C][Branch1][C][H][Branch1][C][H][N][Branch1][C][H][C][=Branch1][C][=O][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch2][Ring1][=Branch1][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][H][C][Bran

Testing temperature 0.5 for selfies


2022-11-25 14:01:59.332 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H][C][=N][C][Branch2][Ring1][=Branch1][N][Branch1][C][H][C][=N][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][N][Branch1][C][H][H][=C][Ring2][Ring1][#Branch2][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][N][Branch1][C][H][H][C][Branch1][C][H][=C][Ring2][Ring1][#Branch2][H]
2022-11-25 14:01:59.367 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]C=NC(N([H])C=NC([H])=C([H])C([H])=C)N([H])[H]
2022-11-25 14:01:59.391 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 26 samples
2022-11-25 14:01:59.458 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 14:02:21.184 | DEBUG    | gpt3forchem.output:tes

Testing temperature 0.75 for selfies


2022-11-25 14:02:57.405 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H][C][=C][Branch1][C][H][C][Branch2][Ring2][#Branch1][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][F][=C][Ring1][#Branch2][H][C][Branch1][C][H][Branch1][C][H][S][C][Branch1][C][H][Branch1][C][H][H][C][=Branch1][C][=O][N][Branch1][C][H][C][=N][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][N][Branch1][C][H][H][C][Branch1][C][H][=C][Ring2][Ring2][=Branch2][C][Branch1
2022-11-25 14:02:57.449 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H][C][=C][Branch1][C][H][C][Branch2][Ring2][#Branch1][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][F][=C][Ring1][#Branch2][H][C][Branch1][C][H][Branch1][C][H][S][C][Branch1][C][H][Branch1][C][H][H][C][=Branch1][C][=O][N][Branch1][C][H][C][=N][C][=C][Branch1][C][H][C][Branch1][C][H][=C][Branch1][C][H

Testing temperature 1.0 for selfies


2022-11-25 14:03:56.519 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H][C][=N][C][Branch1][C][H][=C][Branch1][C][H][C][Branch2][Ring2][=C][C][=N][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][C][=Ring1][=N][C][Branch1][C][H][Branch1][C][H][C][Branch1][C][H][Branch1][C][H][H][=C][Branch2][Ring1][=Branch2][N][Branch1][C][H][C][=Branch1][C][=O][C][Branch1][C][H][Branch1][C][H][N][Branch1][C][H][C][=Branch1][C][=O][C@][Branch1][C][H][C][Branch1][C][H][Branch1][C][H][N][C][Branch1][C
2022-11-25 14:03:56.547 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H][C][=N][C][Branch1][C][H][=C][Branch1][C][H][C][Branch2][Ring2][=C][C][=N][C][Branch1][C][H][=C][Branch1][C][H][C][Branch1][C][H][=C][Ring1][#Branch2][C][=Ring1][=N][C][Branch1][C][H][Branch1][C][H][C][Branch1][C][H][Branch1][C][H][H][=C][Branch2][Ring1][=Branch2][N][Branch1][C][H][C][=Branch1][C][=O][C][Branch1][C][H][Branch1][

Testing temperature 1.25 for selfies


2022-11-25 14:05:07.030 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:928 - Got predictions, example: [H][C][=N][C][Branch1][C][H][=C][Branch1][Ring1][O][H][=C][Ring1][=Branch1][H][N][C][Branch2][C][Branch2][Branch1][C][Ring1][N][C][Branch1][C][H][Branch1][C][H][C@][C][Branch1][C][H][C][Branch1][C][H][Branch1][C][H][C@][Ring1][S][Branch1][C][H][C][=Ring2][Ring1][S][Ring1][N]
2022-11-25 14:05:07.046 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:939 - Loaded predictions. Example: [H]C=NC([H])=C(O[H])C[H]
2022-11-25 14:05:07.071 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:950 - Calculating Frechet ChemNet distance for 25 samples
2022-11-25 14:05:07.115 | INFO     | gpt3forchem.output:_load_chemnet:217 - Saved ChemNet model to '/var/folders/m9/_txh68y946s4pxy1x2wnd3lh0000gn/T/ChemNet_v0.13_pretrained.h5'
2022-11-25 14:05:29.242 | DEBUG    | gpt3forchem.output:test_inverse_bandgap:959 - Computed frechet score: (50.868700634934555, 3.815933542638705e-05)
2022-11-25 14:

Testing temperature 1.5 for selfies
list index out of range


In [87]:
# Persist the SELFIES sweep results (`res_500_seflies`, spelled as in the cell that
# creates it) rather than the SMILES results.
save_pickle( "run_files/res_500_selfies.pkl", res_500_seflies)

In [89]:
import selfies as sf

In [92]:
# Unique valid generated molecules per sampling temperature for the SELFIES model.
# Reads the SELFIES sweep (`res_500_seflies`, spelled as in the cell that creates
# it) so the exported files are not a copy of the SMILES-model output.
# `res['valid_smiles']` is used to index `res['predictions']`; presumably it is
# a mask or index list of the valid predictions — confirm.
selfies_500_smiles_sets = {
    res['meta']['temperature']: set(res['predictions'][res['valid_smiles']])
    for res in res_500_seflies
}

# One file per temperature, one molecule per line, no trailing newline.
os.makedirs('for_xtb_opt', exist_ok=True)
for temp, smiles_set in selfies_500_smiles_sets.items():
    with open(f'for_xtb_opt/500_selfies_{temp}.txt', 'w') as f:
        # Sorted because set order is arbitrary; this makes the files reproducible.
        f.write('\n'.join(sorted(smiles_set)))

## Larger training set

In [118]:
# Larger experiment: 1000 training rows and 100 test rows, same seed as the smaller split.
train_large, test_large = train_test_split(
    data,
    train_size=1000,
    test_size=100,
    random_state=42,
)

In [120]:
# Prompt tables for the larger split with SMILES completions.
train_prompts_large_smiles = generate_inverse_photoswitch_prompts(train_large, 'smiles')
test_prompts_large_smiles = generate_inverse_photoswitch_prompts(test_large, 'smiles')

# Prompt tables for the larger split with SELFIES completions.
train_prompts_large_selfies = generate_inverse_photoswitch_prompts(train_large, 'selfies')
test_prompts_large_selfies = generate_inverse_photoswitch_prompts(test_large, 'selfies')