In [362]:
from langchain.vectorstores import Chroma
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from langchain.llms import Ollama
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain


In [391]:
VECTOR_DB_PATH = "../../data/vectorDB/disease_context_chromaDB_using_all_distilroberta_v1_sentence_transformer_model_with_chunk_size_6000"
SENTENCE_EMBEDDING_MODEL = "all-distilroberta-v1"


VECTOR_DB_PATH = "../../data/vectorDB/disease_context_chromaDB"
VECTOR_DB_PATH = "../../data/vectorDB/disease_context_chromaDB_using_all_MiniLM_L6_v2_sentence_transformer_model_with_chunk_size_3000"
SENTENCE_EMBEDDING_MODEL = "all-MiniLM-L6-v2"


In [392]:
embedding_function = SentenceTransformerEmbeddings(model_name=SENTENCE_EMBEDDING_MODEL)

vectorstore = Chroma(persist_directory=VECTOR_DB_PATH, 
                     embedding_function=embedding_function)


In [395]:
# Configuring RAG

template = """Use the following pieces of context to answer the question at the end and also to return the provenance. 
If you don't know the answer, just say that you don't know, don't try to make up an answer.   
{context}
Question: {question}
Helpful Answer:
"""

QA_CHAIN_PROMPT = PromptTemplate(
    input_variables=["context", "question"],
    template=template,
)

# template = """Use the following pieces of context to answer the question at the end and also to return the provenance. 
# If you don't know the answer, just say that you don't know, don't try to make up an answer.   
# {context}
# Summaries: {summaries}
# Question: {question}
# Helpful Answer:"""

# QA_CHAIN_PROMPT = PromptTemplate(
#     input_variables=["context", "summaries", "question"],
#     template=template,
# )



llm = Ollama(base_url="http://localhost:11434",
             model="llama2:13b",
             temperature=0,
             verbose=True,
             callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))


# llm = Ollama(base_url="http://localhost:11434",
#              model="llama2:13b",
#              temperature=0.01
#              )

# qa_chain = RetrievalQA.from_chain_type(
#     llm,
#     retriever=vectorstore.as_retriever(search_kwargs={"k": 1}),
#     chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
#     return_source_documents=True
# )

qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(search_kwargs={"k": 1}),
    chain_type="stuff",
    return_source_documents=True
)

# qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
#     llm,
#     retriever=vectorstore.as_retriever(search_type="mmr", search_kwargs={"fetch_k": 40, "lambda_mult":0.5, "k":2}),
#     chain_type = "stuff",
#     chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
#     return_source_documents=True
# )




In [403]:
%%time


# question = "What is the disease ontology identifier for glycogen storage disease Ic ​and what are the genes associated with this disease?​"

prompt = "What are the variants associated with COVID-19?"

question = """
query : {}
Answer the above query in the following format:
{{
answer : [answer 1, answer 2, answer 3 etc]
}}
If you don't know the answer, report it as:
{{
answer : Don't know
}}
""".format(prompt)


# question = "What are the genes associated with congenital myopathy 22A"

# question = """Which gene has stronger association with the disease 'liver benign neoplasm', is it PNPLA3 or HLA-B?
# First find the gwas pvalue for the association between 'liver benign neoplasm' and gene PNPLA3. Then find the gwas pvalue for the association between 'liver benign neoplasm' and gene HLA-B.
# Then compare those values. Note that smaller the pvalue is stronger the association will be.
# """

# result = qa_chain({"query": question})
result = qa_chain(question)


 Based on the information provided, here are the variants associated with COVID-19:

{
answer : [
rs2271616,
rs117479047,
rs142658912,
rs1173773,
rs13078854,
rs5913398,
rs35044562,
rs9411378,
rs12564811,
rs2069837,
rs72809129,
rs17158686,
rs657152
]
}

Note that these are the variants associated with COVID-19 as per the information provided in the GWAS Catalog. However, it is important to note that this list may not be exhaustive and more research may be required to identify all the variants associated with COVID-19. Additionally, the p-values and other attributes provided for each variant can help further understand the strength of the association between each variant and COVID-19.CPU times: user 1.38 s, sys: 569 ms, total: 1.95 s
Wall time: 1min 51s


In [389]:
# Configuring RAG




llm = Ollama(base_url="http://localhost:11434",
             model="llama2:13b",
             temperature=0,
             verbose=True,
             callback_manager=CallbackManager([StreamingStdOutCallbackHandler()]))



# qa_chain = RetrievalQA.from_chain_type(
#     llm,
#     chain_type="stuff",
#     retriever=vectorstore.as_retriever(search_kwargs={"k": 1}),
#     return_source_documents=True
# )

qa_chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(search_kwargs={"k": 1}),
    return_source_documents=True
)





In [390]:
question = "What compounds treat multiple sclerosis? State the provenance and phase of the treatment"

llm_response = qa_chain(question)


ValueError: Document prompt requires documents to have metadata variables: ['source']. Received document with missing metadata: ['source'].

In [387]:
llm_response

{'query': 'What compounds treat multiple sclerosis? State the provenance and phase of the treatment',
 'result': ' Based on the given context, the following compounds treat multiple sclerosis:\n\n1. Minocycline (phase 3, source: ChEMBL)\n2. Histamine (phase 3, source: ChEMBL)\n3. Evobrutinib (phase 3, source: ChEMBL)\n4. Acetazolamide (phase 2, source: ChEMBL)\n5. Prednisolone (phase 4, source: ChEMBL and DrugCentral)\n6. Methylprednisolone (phase 4, source: ChEMBL and DrugCentral)\n7. Isopropyl Alcohol (phase 4, source: ChEMBL)\n8. Topiramate (phase 2, source: ChEMBL)\n9. Creatine (phase 1, source: ChEMBL)\n10. Taurine (phase 1, source: ChEMBL)\n11. Icosapent (phase 2, source: ChEMBL)\n12. Laquinimod (phase 3, source: ChEMBL)\n13. Dronabinol (phase 1, source: ChEMBL)\n14. Dantrolene (phase 4, source: ChEMBL)\n15. Triamcinolone Acetonide (phase 4, source: ChEMBL and DrugCentral)\n16. Leflunomide (phase 4, source: ChEMBL)\n17. Prednisone (phase 4, source: ChEMBL and DrugCentral)\n18. Am

In [321]:
result["result"]

" Based on the given context, there are several compounds that have been found to treat hereditary spastic paraplegia (HSP), including:\n\n1. Baclofen\n2. Diazepam\n3. Ibuprofen\n4. 4-Aminopyridine\n\nThese compounds have been associated with HSP through various sources, including ChEMBL, and have been found to have a relationship with the disease based on their attributes, such as 'phase' and 'sources'.\n\nThe provenance of this information is:\n\n* ChEMBL: a large-scale chemical database\n* NCBI PubMed: a database of biomedical literature\n\nNote that while these compounds have been found to be associated with HSP, further research and clinical trials are needed to confirm their efficacy and safety as treatments for the disease."

In [359]:
result["source_documents"]

[Document(page_content="Following is the contextual information about the Disease hepatitis A (Disease Ontology identifier of this Disease is DOID:12549).\nhepatitis A ASSOCIATES Gene GPT. Attributes of this relationship are:\n(1) 'sources' is ['DISEASES'] (2) 'diseases_confidences' is [3.249] (3) 'diseases_scores' is ['6.499'] \nhepatitis A ISA Disease viral infectious disease. Attributes of this relationship are:\n(1) 'source' is Disease Ontology \nCompound TYPHOID VACCINE TREATS hepatitis A. Attributes of this relationship are:\n(1) 'phase' is 3 (2) 'sources' is ['ChEMBL'] \nCompound HONEY TREATS hepatitis A. Attributes of this relationship are:\n(1) 'phase' is 2 (2) 'sources' is ['ChEMBL'] \nOrganism Hepatovirus A CAUSES hepatitis A. Attributes of this relationship are:\n(1) 'source' is PathoPhenoDB \nCompound Silibinin TREATS hepatitis A. Attributes of this relationship are:\n(1) 'phase' is 2 (2) 'sources' is ['ChEMBL'] \nCompound beta-Lactose TREATS hepatitis A. Attributes of thi

In [250]:
result["source_documents"]

[Document(page_content="scurvy LOCALIZES Anatomy alveolar ridge. Attributes of this relationship are:\n(1) 'enrichment' is 26.40382262996942 (2) 'odds' is 27.46189376443418 (3) 'source' is NCBI PubMed (4) 'cooccur' is 3 (5) 'fisher' is 0.00021635000764512021 \nscurvy LOCALIZES Anatomy pubic symphysis. Attributes of this relationship are:\n(1) 'enrichment' is 32.1117619711762 (2) 'odds' is 33.087360951285 (3) 'source' is NCBI PubMed (4) 'cooccur' is 2 (5) 'fisher' is 0.0018347761682033837 \nscurvy LOCALIZES Anatomy autonomic ganglion. Attributes of this relationship are:\n(1) 'enrichment' is 28.215849673202616 (2) 'odds' is 29.04057239057239 (3) 'source' is NCBI PubMed (4) 'cooccur' is 2 (5) 'fisher' is 0.0023644138485134163 \nscurvy LOCALIZES Anatomy aqueous humor of eyeball. Attributes of this relationship are:\n(1) 'enrichment' is 10.475037913254475 (2) 'odds' is 10.828593002768688 (3) 'source' is NCBI PubMed (4) 'cooccur' is 3 (5) 'fisher' is 0.003072773400700315 \nscurvy LOCALIZES 

In [108]:
# %%time

# # Split docs into batches of 5000
# batches = [docs[i:i + 200] for i in range(0, len(docs), 200)]
# batches = batches[0:2]

# vectorstore = Chroma(embedding_function=SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2"), 
#                      persist_directory="disease_context")

# # Add each batch to the Chroma instance
# for batch in batches:
#     vectorstore.add_documents(documents=batch)

In [107]:

# docs