In [16]:
from openai import OpenAI
from tdc.multi_pred import DTI
from tdc.benchmark_group import dti_dg_group
import numpy as np

In [2]:
with open('./openai_api_key') as f:
    openai_api_key = f.readline().strip()
client = OpenAI(api_key = openai_api_key)

In [3]:
def get_openai_response(prompt):
    response = client.chat.completions.create(
            model = "gpt-4o",
            messages=[
                {"role": "system", "content": "You are a highly intelligent AI that is designed to can solve computational biology problems."},
                {"role": "user", "content": prompt}
            ]
        )
    return response.choices[0].message.content

In [20]:
data = DTI(name = 'BindingDB_Kd')
split = data.get_split()
train, val, test = (
    split['train'], split['valid'], split['test']
)

group = dti_dg_group(path = 'data/')
benchmark = group.get('BindingDB_Patent')

Found local copy...
Loading...
Done!
Found local copy...


In [22]:
benchmark['test']

Unnamed: 0,Drug_ID,Drug,Target_ID,Target,Y,Year
0,126484387.0,CC(=O)Nc1cc(Nc2nc(nn3ccc(CNCC(C)(C)O)c23)-c2cc...,P37173,MGRGLLRGLWPLHIVLWTRIASTIPPHVQKSVNNDMIVTDNNGAVK...,9.615805,2019
1,126484452.0,OCCN1CCN(Cc2ccn3nc(nc(Nc4ccncc4F)c23)-c2cccc(n...,P37173,MGRGLLRGLWPLHIVLWTRIASTIPPHVQKSVNNDMIVTDNNGAVK...,9.615805,2019
2,126484204.0,O[C@H]1CCN(Cc2ccn3nc(nc(Nc4ccncc4F)c23)-c2cccc...,P37173,MGRGLLRGLWPLHIVLWTRIASTIPPHVQKSVNNDMIVTDNNGAVK...,6.522093,2019
3,126484351.0,CC(=O)Nc1cc(Nc2nc(nn3ccc(CN4CCN(CCO)CC4)c23)-c...,P37173,MGRGLLRGLWPLHIVLWTRIASTIPPHVQKSVNNDMIVTDNNGAVK...,7.170120,2019
4,126484298.0,COc1nc(cs1)-c1nc(Nc2ccncc2F)c2cccn2n1,P37173,MGRGLLRGLWPLHIVLWTRIASTIPPHVQKSVNNDMIVTDNNGAVK...,9.615805,2019
...,...,...,...,...,...,...
49023,122670107.0,COc1cc(cnc1C(=O)C1CC1C(=O)NS(=O)(=O)c1ccccc1F)...,Q16873,MKDEVALLAAVTLLGVLLQAYFSLQVISARRAFRVSPPLTTGPPEF...,4.304065,2021
49024,122679444.0,COc1cc(ncc1C(=O)C1CC1C(O)=O)N(CC1CC1)c1ccc(C)c...,Q16873,MKDEVALLAAVTLLGVLLQAYFSLQVISARRAFRVSPPLTTGPPEF...,4.025352,2021
49025,122670055.0,COc1cc(ccc1C(=O)C1CC1C(O)=O)N(CC1CC1)c1ccc(C)c...,Q16873,MKDEVALLAAVTLLGVLLQAYFSLQVISARRAFRVSPPLTTGPPEF...,4.127134,2021
49026,122670085.0,COc1cc(ccc1C(=O)C1CC1C(O)=O)N(CC1CC1)c1ccc(Cl)...,Q16873,MKDEVALLAAVTLLGVLLQAYFSLQVISARRAFRVSPPLTTGPPEF...,4.110874,2021


In [7]:
num_samples = 10
mol = val['Drug'].to_dict()
prot = val['Target'].to_dict()
kd = val['Y'].to_dict()
x = ''
for i in range(len(val)):
    x = x + mol[i] + ' ' + prot[i] + ' \n'
    if i >= num_samples - 1:
        break

In [8]:
base_prompt = '''Provided below are a number of combinations of drug molecules and target proteins for which we want you to predict the dissociation constant Kd.
Each row has the SMILES string of a drug compound, which is followed by a space and then the amino acid sequence of the protein, after which there is a space and a newline.
For each row, you must predict the dissociation constant Kd.
Answers should be in units of nanomolar (nM).
You must make your best estimate of a numerical value for each row.
Make sure each row gets an answer.
Please preface each answer with three less than signs and finish each answer with three greater than signs with only a numerical value inside.
This is an example of a well-formatted answer: >>> 5.37e3 <<<).
'''
print(base_prompt)
prompt = base_prompt + x
res = get_openai_response(prompt)
print(res)

Provided below are a number of combinations of drug molecules and target proteins for which we want you to predict the dissociation constant Kd.
Each row has the SMILES string of a drug compound, which is followed by a space and then the amino acid sequence of the protein, after which there is a space and a newline.
For each row, you must predict the dissociation constant Kd.
Answers should be in units of nanomolar (nM).
You must make your best estimate of a numerical value for each row.
Make sure each row gets an answer.
Please preface each answer with three less than signs and finish each answer with three greater than signs with only a numerical value inside.
This is an example of a well-formatted answer: >>> 5.37e3 <<<).

It's important to note that predicting dissociation constants (Kd) accurately requires complex computational models and access to extensive biochemical data. Without the necessary algorithms and databases, any predictions made here will be approximations and not e

In [9]:
pred = []
start = 0
for i in range(res.count('>>>')):
    begin = res.find('>>>', start) + 4
    end = res.find('<<<', begin) - 1
    pred.append(float(res[begin: end]))
    start = end + 4

In [28]:
np.corrcoef(pred, list(kd.values())[:9])

array([[ 1.        , -0.07369292],
       [-0.07369292,  1.        ]])

In [25]:
kd

{0: 5000.0,
 1: 12.0,
 2: 30.0,
 3: 1.0,
 4: 10000.0,
 5: 6.4,
 6: 7100.0,
 7: 41000.0,
 8: 10000.0,
 9: 16.0,
 10: 10000.0,
 11: 2300.0,
 12: 570.0,
 13: 1.0,
 14: 0.6759999999999999,
 15: 10000.0,
 16: 22.0,
 17: 3162.0,
 18: 95.5,
 19: 10000.0,
 20: 10000.0,
 21: 10000.0,
 22: 10000.0,
 23: 410.0,
 24: 30200.0,
 25: 10000.0,
 26: 300.0,
 27: 10000.0,
 28: 10000.0,
 29: 10000.0,
 30: 1.0,
 31: 2500000.0,
 32: 10.0,
 33: 10000.0,
 34: 7500.0,
 35: 10000.0,
 36: 10000.0,
 37: 4400.0,
 38: 10000.0,
 39: 72.0,
 40: 10000.0,
 41: 10000.0,
 42: 3.0,
 43: 73.0,
 44: 4800.0,
 45: 8600.0,
 46: 800.0,
 47: 3800.0,
 48: 155000.0,
 49: 10000.0,
 50: 10000.0,
 51: 10000.0,
 52: 10000.0,
 53: 980.0,
 54: 10000.0,
 55: 10000.0,
 56: 39.0,
 57: 8.91,
 58: 1000.0,
 59: 225.0,
 60: 3.3,
 61: 6.0,
 62: 140.0,
 63: 10000.0,
 64: 990.0,
 65: 1.0,
 66: 4400.0,
 67: 10000.0,
 68: 70000.0,
 69: 10000.0,
 70: 70.0,
 71: 10000.0,
 72: 10000.0,
 73: 4.4,
 74: 100000.0,
 75: 10000.0,
 76: 53703.0,
 77: 10000.0,