This notebook contains example chats between a user and our bio/chem database LLM agent.

Make sure you have `langchain`, `chembl_webresource_client`, `openai`, and `pandas` installed with pip before running the cells.

In [1]:
import os
from getpass import getpass

import pandas as pd

In [2]:
from chembl_webresource_client.new_client import new_client

# Names of the public (non-underscore-prefixed) attributes exposed by `new_client`.
available_resources = list(
    filter(lambda name: not name.startswith('_'), dir(new_client))
)

In [3]:
from chembl_webresource_client.new_client import new_client


# Look up every ChEMBL target whose synonyms contain the protein name, keeping
# only the ChEMBL ID and the preferred name of each hit, then print the hits.
protein_name = 'mpro'
target = new_client.target
res = (
    target
    .filter(target_synonym__icontains=protein_name)
    .only(['target_chembl_id', 'pref_name'])
)
for i in res:
    print(i)

{'pref_name': 'Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL3736'}
{'pref_name': 'SARS coronavirus 3C-like proteinase', 'target_chembl_id': 'CHEMBL3927'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL5118'}
{'pref_name': 'Angiotensin-converting enzyme', 'target_chembl_id': 'CHEMBL2096989'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL4523582'}
{'pref_name': 'Spike glycoprotein/Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL4888460'}


In [4]:
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI
from langchain.agents import load_tools, initialize_agent
from langchain.agents import AgentType

# SECURITY: an API key used to be hardcoded on this line. Treat that key as
# leaked: revoke it in the OpenAI dashboard and never commit credentials to a
# notebook. The key is now read from the environment, falling back to a hidden
# interactive prompt when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
# Optional organization ID; None when the variable is unset, in which case no
# organization is passed explicitly.
OPENAI_ORGANIZATION = os.environ.get("OPENAI_ORGANIZATION")
# llm = ChatOpenAI(temperature=0.0, openai_api_key=OPENAI_API_KEY, openai_organization=OPENAI_ORGANIZATION)
# temperature=0.0 keeps completions as deterministic as the API allows.
llm = OpenAI(temperature=0.0, openai_api_key=OPENAI_API_KEY, openai_organization=OPENAI_ORGANIZATION)



In [92]:
# Dedicated LLM instance backing the "llm-math" tool. The organization ID is
# read from the environment instead of being hardcoded in the notebook; it is
# None (not passed explicitly) when OPENAI_ORGANIZATION is unset.
math_llm = OpenAI(
    temperature=0.0,
    openai_api_key=OPENAI_API_KEY,
    openai_organization=os.environ.get("OPENAI_ORGANIZATION"),
)
# Built-in tools: "human" lets the agent ask the person at the keyboard,
# "llm-math" uses `math_llm` for calculations.
tools = load_tools(
    ["human", "llm-math"],
    llm=math_llm,
)

# Zero-shot ReAct agent; verbose=True prints the thought/action/observation trace.
agent_chain = initialize_agent(
    tools,
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

In [11]:
# Smoke test of the "human" tool: the agent has no way to know the answer, so
# it is expected to ask the person at the keyboard and relay the reply.
agent_chain.run("What's my friend Eric's surname?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mI don't know Eric's surname, so I need to ask someone who might know.
Action: Human
Action Input: "Do you know Eric's surname?"[0m

Do you know Eric's surname?

Observation: [36;1m[1;3mZhang[0m
Thought:[32;1m[1;3mI have the answer to Eric's surname.
Final Answer: Eric's surname is Zhang.[0m

[1m> Finished chain.[0m


"Eric's surname is Zhang."

In [5]:
from langchain.tools import tool

@tool("search_target_id", return_direct=False)
def search_target_id_tool(target_name: str) -> str:
    """Searches ChEMBL for the target ID of a given target name. Returns a dictionary of the target ID and the target name.
    The input to this tool should be a string of the target name. If the dictionary has length greather than one, prints the possible target IDs and ask the user to select one of the targets."""
    # NOTE: the docstring above is used by the @tool decorator as the tool
    # description shown to the LLM, so it is part of the prompt and is kept
    # byte-for-byte. Implementation notes therefore live in comments.
    #
    # Behaviour by number of matches:
    #   0  -> the literal string "No targets found."
    #   1  -> the text form of the single {'pref_name', 'target_chembl_id'} dict
    #   2+ -> the candidates are printed and whatever the user types is returned

    # Reuse the shared query helper and materialise the lazy result once, so
    # counting and iterating below work on a plain list.
    matches = list(search_target_id(target_name))

    if not matches:
        return "No targets found."

    if len(matches) == 1:
        # The signature promises a string; the agent only ever sees the text
        # form of the observation, so convert explicitly.
        return str(matches[0])

    print(f"\nFound {len(matches)} possible targets:")
    for match in matches:
        print(match)
    user_prompt = "\nPlease select one of the targets:"
    # Strip stray whitespace so a pasted ID is usable as-is by later tools.
    return input(user_prompt).strip()

def search_target_id(target_name: str):
    """Query ChEMBL for targets whose synonyms contain ``target_name``.

    This is the plain, non-interactive counterpart of ``search_target_id_tool``:
    it only builds and returns the query; it never prints candidates or
    prompts the user.

    Args:
        target_name: Text to look for in the target synonyms
            (``target_synonym__icontains`` filter).

    Returns:
        The client's query result restricted to the ``target_chembl_id`` and
        ``pref_name`` fields. Iterating it yields one dict per matching
        target; it is empty when nothing matches.
    """
    return (
        new_client.target
        .filter(target_synonym__icontains=target_name)
        .only(['target_chembl_id', 'pref_name'])
    )


In [83]:
# Call the plain helper directly (no agent, no prompt) to inspect the raw matches.
# NOTE(review): the saved output below lists different targets than the earlier
# 'mpro' lookup in this notebook — presumably stale output from an older run;
# re-run the cell to confirm.
search_target_id('mpro')

[{'pref_name': 'Maltase-glucoamylase', 'target_chembl_id': 'CHEMBL2074'}, {'pref_name': 'Sulfonylurea receptor 2', 'target_chembl_id': 'CHEMBL1971'}, {'pref_name': 'Phosphodiesterase 5A', 'target_chembl_id': 'CHEMBL1827'}, {'pref_name': 'Voltage-gated T-type calcium channel alpha-1H subunit', 'target_chembl_id': 'CHEMBL1859'}, '...(remaining elements truncated)...']

In [76]:
# Check how the synonym query behaves for a term with no matching target
# (the saved output for 'MDCK' is an empty result).
target = new_client.target
res = (
    target
    .filter(target_synonym__icontains='MDCK')
    .only(['target_chembl_id', 'pref_name'])
)
res

[]

In [98]:
# Custom @tool functions are not loaded by name through load_tools; they are
# appended to the already-loaded `tools` list when the agent is rebuilt.
agent_chain = initialize_agent(
    [*tools, search_target_id_tool],
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

In [99]:
# Ask the agent to resolve a target name. When several targets match, the
# search_target_id tool prints them and waits for the user to pick one.
agent_chain.run("What is the target ID of mpro?")




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to search for the target ID of mpro.
Action: search_target_id
Action Input: "mpro"[0m
Found 6 possible targets:
{'pref_name': 'Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL3736'}
{'pref_name': 'SARS coronavirus 3C-like proteinase', 'target_chembl_id': 'CHEMBL3927'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL5118'}
{'pref_name': 'Angiotensin-converting enzyme', 'target_chembl_id': 'CHEMBL2096989'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL4523582'}
{'pref_name': 'Spike glycoprotein/Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL4888460'}

Observation: [38;5;200m[1;3mCHEMBL4888460[0m
[32;1m[1;3m[0m

[1m> Finished chain.[0m


'CHEMBL4888460'

In [27]:
@tool("search_activities", return_direct=True)
def search_activities_for_target_id(target_id):
    """Searches ChEMBL for the activities of a given target ID. Returns a list of the activities."""
    # The docstring doubles as the tool description given to the LLM, so it is
    # left exactly as written.
    # All activity records for the target are loaded into a DataFrame, which
    # (because of return_direct=True) is handed back as the chain's result.
    matching_activities = new_client.activity.filter(target_chembl_id=target_id)
    return pd.DataFrame.from_dict(matching_activities)

@tool("search_top_n_activities", return_direct=True)
def search_top_n_activities_for_target_id(target_id, top_n=10):
    """After finding the target ChEMBL ID, searches ChEMBL for the top N activities of the given target ID. Returns a list of the activities."""
    # The docstring doubles as the tool description given to the LLM, so it is
    # left exactly as written. Implementation notes:
    #   * target_id: ChEMBL target ID used in the `target_chembl_id` filter.
    #   * top_n: number of rows to keep; coerced with int() because the agent
    #     may supply it as text.
    #   * returns a DataFrame of the rows with the largest `pchembl_value`,
    #     or the (empty) frame unchanged when there is nothing to rank.
    activity = new_client.activity
    target_activities = activity.filter(target_chembl_id=target_id)
    df = pd.DataFrame.from_dict(target_activities)

    # An unknown target or an empty result produces a frame without the
    # ranking column; return it instead of failing with a KeyError.
    if 'pchembl_value' not in df.columns:
        return df

    # Values that cannot be parsed become NaN (and are dropped below) rather
    # than aborting the whole lookup.
    pchembl = pd.to_numeric(df['pchembl_value'], errors='coerce').astype(float)
    ranked = df.assign(pchembl_value=pchembl).dropna(subset=['pchembl_value'])
    return ranked.nlargest(int(top_n), 'pchembl_value')

In [21]:

# Prototype of the "top N activities" logic for one hard-coded target:
# fetch its activities, make pchembl_value numeric, drop rows without one,
# and display the ten rows with the largest pchembl_value.
activity = new_client.activity
res = activity.filter(target_chembl_id='CHEMBL3938')
df = (
    pd.DataFrame.from_dict(res)
    .assign(pchembl_value=lambda frame: frame['pchembl_value'].astype(float))
    .dropna(subset=['pchembl_value'])
)
df.nlargest(10, 'pchembl_value')

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
164,,7582740,[],CHEMBL1908472,Binding constant for STK16 kinase domain,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,1.7
149,,7576440,[],CHEMBL1908472,Binding constant for STK16 kinase domain,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,6.6
139,,7572051,[],CHEMBL1908472,Binding constant for STK16 kinase domain,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,18.0
142,,7573099,[],CHEMBL1908472,Binding constant for STK16 kinase domain,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,18.0
104,,3448791,[],CHEMBL1244987,Binding affinity to STK16,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,19.0
176,,7588031,[],CHEMBL1908472,Binding constant for STK16 kinase domain,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,19.0
24,,2477867,[],CHEMBL986217,Binding affinity to human MPSK1,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,24.0
152,,7577546,[],CHEMBL1908472,Binding constant for STK16 kinase domain,B,,,BAO_0000034,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Kd,nM,UO_0000065,,29.0
701,,19009176,[],CHEMBL4345186,Displacement of Alexafluor labelled kinase tra...,B,,,BAO_0000192,BAO_0000019,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,Ki,nM,UO_0000065,,78.0
896,,24412450,[],CHEMBL5046583,Inhibition of human STK16 in presence of [gamm...,B,,,BAO_0000190,BAO_0000357,...,Homo sapiens,Serine/threonine-protein kinase 16,9606,,,IC50,nM,UO_0000065,,82.0


In [33]:
# Start again from the "human" tool only, then add the two custom ChEMBL tools
# (target lookup and activity search) when building the zero-shot agent.
tools = load_tools(["human"])

agent_chain = initialize_agent(
    [*tools, search_target_id_tool, search_activities_for_target_id],
    llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

agent_chain.run("What are the activities of mpro in CHEMBL?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find the target ID of mpro first.
Action: search_target_id
Action Input: mpro[0m
Found 6 possible targets:
{'pref_name': 'Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL3736'}
{'pref_name': 'SARS coronavirus 3C-like proteinase', 'target_chembl_id': 'CHEMBL3927'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL5118'}
{'pref_name': 'Angiotensin-converting enzyme', 'target_chembl_id': 'CHEMBL2096989'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL4523582'}
{'pref_name': 'Spike glycoprotein/Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL4888460'}

Observation: [33;1m[1;3mCHEMBL5118[0m
Thought:[32;1m[1;3m I now have the target ID, so I can search for the activities.
Action: search_activities
Action Input: CHEMBL5118[0m
Observation: [38;5;200m[1;3m    activity_comment  activity_id activity_properties assay_chembl_id  \
0               None

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
0,,1988091,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,870.0
1,,1988092,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,200.0
2,,1988093,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,300.0
3,,1988094,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,15.0
4,,1988095,[],CHEMBL898907,Inhibition of SARS-CoV 3C-like protease by FRE...,B,,,BAO_0000190,BAO_0000019,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,10.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373,,18548176,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.3
374,,18548177,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,5.5
375,,18548178,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,4.1
376,,18548179,[],CHEMBL4198706,Inhibition of SARS coronavirus 3CL protease us...,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,Replicase polyprotein 1ab,227859,,,IC50,uM,UO_0000065,,3.2


`AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION` is needed for multi-input tools such as `search_top_n_activities`, which takes both `target_id` and `top_n`.

In [35]:
# Same toolset as before plus the two-argument top-N tool; the structured-chat
# agent type is used so the agent can pass a JSON object of arguments.
tools = load_tools(["human"])

agent_chain = initialize_agent(
    [
        *tools,
        search_target_id_tool,
        search_activities_for_target_id,
        search_top_n_activities_for_target_id,
    ],
    llm,
    agent=AgentType.STRUCTURED_CHAT_ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

agent_chain.run("What are the top-25 activities of mpro in CHEMBL? Make sure to search the target ID first.")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "search_target_id",
  "action_input": "mpro"
}
```

[0m
Found 6 possible targets:
{'pref_name': 'Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL3736'}
{'pref_name': 'SARS coronavirus 3C-like proteinase', 'target_chembl_id': 'CHEMBL3927'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL5118'}
{'pref_name': 'Angiotensin-converting enzyme', 'target_chembl_id': 'CHEMBL2096989'}
{'pref_name': 'Replicase polyprotein 1ab', 'target_chembl_id': 'CHEMBL4523582'}
{'pref_name': 'Spike glycoprotein/Angiotensin-converting enzyme 2', 'target_chembl_id': 'CHEMBL4888460'}

Observation: [33;1m[1;3mCHEMBL3927[0m
Thought:[32;1m[1;3m I need to search for activities
Action:
```
{
  "action": "search_top_n_activities",
  "action_input": {
    "target_id": "CHEMBL3927",
    "top_n": 25
  }
}
```

[0m
Observation: [36;1m[1;3m    activity_comment  activity_id activity_properties as

Unnamed: 0,activity_comment,activity_id,activity_properties,assay_chembl_id,assay_description,assay_type,assay_variant_accession,assay_variant_mutation,bao_endpoint,bao_format,...,target_organism,target_pref_name,target_tax_id,text_value,toid,type,units,uo_units,upper_value,value
245,,1831024,[],CHEMBL918059,Inhibition of SARS coronavirus 3C-like protease,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,nM,UO_0000065,,50.0
175,,1749294,[],CHEMBL867941,Inhibition of SARS coronavirus 3CL protease,B,,,BAO_0000192,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,Ki,uM,UO_0000065,,0.053
173,,1749287,[],CHEMBL867941,Inhibition of SARS coronavirus 3CL protease,B,,,BAO_0000192,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,Ki,uM,UO_0000065,,0.058
225,,1830967,[],CHEMBL918059,Inhibition of SARS coronavirus 3C-like protease,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,nM,UO_0000065,,60.0
226,,1830968,[],CHEMBL918059,Inhibition of SARS coronavirus 3C-like protease,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,nM,UO_0000065,,63.0
244,,1831023,[],CHEMBL918059,Inhibition of SARS coronavirus 3C-like protease,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,nM,UO_0000065,,65.0
243,,1831022,[],CHEMBL918059,Inhibition of SARS coronavirus 3C-like protease,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,nM,UO_0000065,,95.0
106,,1720293,[],CHEMBL868157,Inhibition of SARS CoV 3C-like protease,B,,,BAO_0000192,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,Ki,uM,UO_0000065,,0.12
114,,1720301,[],CHEMBL868157,Inhibition of SARS CoV 3C-like protease,B,,,BAO_0000192,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,Log Ki,,UO_0000065,,-6.92
246,,1831025,[],CHEMBL918059,Inhibition of SARS coronavirus 3C-like protease,B,,,BAO_0000190,BAO_0000357,...,SARS coronavirus,SARS coronavirus 3C-like proteinase,227859,,,IC50,nM,UO_0000065,,170.0


Example prompt for chemical activity search

In [None]:
# Hand-written illustration (not produced by running an agent) of the first
# step of the desired interaction: the user's request, the agent's thought,
# and the tool call it should make.
human_prompt = 'Show me the top 10 most active compounds for mpro'
ai_thought = 'I need to find the ChEMBL target ID for mpro'
action = 'search_target_id'
action_input = 'mpro'
# Candidate targets the lookup is expected to surface (ChEMBL ID -> preferred
# name). This is a bare expression: it is not assigned to any name.
{'CHEMBL3736': 'Angiotensin-converting enzyme 2',
 'CHEMBL3927': 'SARS coronavirus 3C-like proteinase',
 'CHEMBL5118': 'Replicase polyprotein 1ab',
 'CHEMBL2096989': 'Angiotensin-converting enzyme',
 'CHEMBL4523582': 'Replicase polyprotein 1ab',
 'CHEMBL4888460': 'Spike glycoprotein/Angiotensin-converting enzyme 2',
 }
# Placeholder observation signalling that more than one target matched.
observation = 'multiple_target_ids'

In [None]:
# Hand-written illustration of the second step: several targets matched, so
# the agent asks the human to choose and receives a ChEMBL ID in reply.
thought = 'There is more than one possible ChEMBL target ID for Mpro, I should ask the user to clarify.'
action = 'Human'
# Adjacent string literals inside the parentheses are concatenated into one
# message with one candidate per line. (A single-quoted literal cannot span
# several source lines, which made the previous form a SyntaxError.)
action_input = (
    'There are multiple possible ChEMBL target IDs - here are the names of the proteins with their associated target IDs. which one do you want?\n'
    'CHEMBL3736: Angiotensin-converting enzyme 2\n'
    'CHEMBL3927: SARS coronavirus 3C-like proteinase\n'
    'CHEMBL5118: Replicase polyprotein 1ab\n'
    'CHEMBL2096989: Angiotensin-converting enzyme\n'
    'CHEMBL4523582: Replicase polyprotein 1ab\n'
    'CHEMBL4888460: Spike glycoprotein/Angiotensin-converting enzyme 2'
)
# The user's selection.
observation = 'CHEMBL3927'

In [None]:
# Hand-written illustration of the third step: with the target chosen, the
# agent looks up the activities recorded for that target ID.
thought = 'Now I need to find the activities associated with CHEMBL3927.'
action = 'search_activities_from_target_id'
action_input = 'CHEMBL3927'

# Placeholder standing in for the activity data the tool would return.
observation = 'some_json.'

In [None]:
# Hand-written illustration of the final step: reduce the activities to the
# ten most active compounds and present them to the user.
thought = 'Now I need to find the top 10 most active compounds from this list.'
action = 'find_top_n'
action_input = 'some_json, 10'

# Placeholder standing in for the reduced result.
observation = 'final_json.'
thought = 'I now have the final answer, I need to display this to the user.'

> finished chain.