In [17]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import pickle


In [82]:
# --- Notebook configuration -------------------------------------------------

# Chunking parameters handed to the text splitter further down.
# NOTE(review): the directory name in VECTOR_DB_PATH below ends in
# "chunk_size_3000" while CHUNK_SIZE here is 650 -- confirm which chunk size
# the persisted vector DB was actually built with.
CHUNK_SIZE = 650
CHUNK_OVERLAP = 200

# Pickled disease-context strings used as the raw corpus.
DATA_PATH = "../../data/disease_context_from_dev.pickle"

# Active embedding model and the Chroma directory that goes with it.
SENTENCE_EMBEDDING_MODEL = "all-MiniLM-L6-v2"
VECTOR_DB_PATH = "../../data/vectorDB/disease_context_chromaDB_using_all_MiniLM_L6_v2_sentence_transformer_model_with_chunk_size_3000"

# Alternative configuration (disabled):
# SENTENCE_EMBEDDING_MODEL = "all-distilroberta-v1"
# VECTOR_DB_PATH = "../../data/vectorDB/disease_context_chromaDB_using_all_distilroberta_v1_sentence_transformer_model_with_chunk_size_6000"


In [83]:
# Load the pickled collection of node-context strings.
# NOTE: pickle.load can execute arbitrary code -- only use it on trusted files.
with open(DATA_PATH, "rb") as f:
    data = pickle.load(f)

# One metadata dict per context string: take the text before the first "(",
# drop the fixed introductory sentence, and append the knowledge-graph suffix.
metadata_list = [
    {
        "node information": (
            context.split("(")[0].split("Following is the contextual information about the ")[-1]
            + "from SPOKE knowledge graph"
        )
    }
    for context in data
]
    

In [85]:
# Break every context string into overlapping chunks; each chunk is paired
# with the metadata dict built for the string it came from.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)
docs = text_splitter.create_documents(
    data,
    metadatas=metadata_list,
)


In [96]:
# Total number of chunks produced by the splitter.
len(docs)

676082

In [97]:
# Inspect the first chunk, including the metadata attached to it.
docs[0]

Document(page_content="Following is the contextual information about the Disease familial partial lipodystrophy type 5 (Disease Ontology identifier of this Disease is DOID:0070203).\nfamilial partial lipodystrophy type 5 ISA Disease autosomal recessive disease. Attributes of this relationship are:\n(1) 'source' is Disease Ontology \nfamilial partial lipodystrophy type 5 ASSOCIATES Gene CIDEC. Attributes of this relationship are:\n(1) 'sources' is ['OMIM'] (2) 'o_inheritance' is AR \nfamilial partial lipodystrophy type 5 ISA Disease familial partial lipodystrophy. Attributes of this relationship are:\n(1) 'source' is Disease Ontology \nEND OF NODE CONTEXT", metadata={'node information': 'Disease familial partial lipodystrophy type 5 from SPOKE knowledge graph'})

In [23]:
# Print the raw text of the third chunk to check where the splitter cuts.
print(docs[2].page_content)

Following is the contextual information about the Disease pulmonary embolism (Disease Ontology identifier of this Disease is DOID:9477).
pulmonary embolism ISA Disease pulmonary artery disease. Attributes of this relationship are:
(1) 'source' is Disease Ontology 
pulmonary embolism LOCALIZES Anatomy femoral vein. Attributes of this relationship are:
(1) 'enrichment' is 15.774525676395141 (2) 'odds' is 17.015737887874973 (3) 'source' is NCBI PubMed (4) 'cooccur' is 118 (5) 'fisher' is 2.7782517089336134e-97 
pulmonary embolism LOCALIZES Anatomy cardiac atrium. Attributes of this relationship are:
(1) 'enrichment' is 4.297896521062576 (2) 'odds' is 4.436699532351707 (3) 'source' is NCBI PubMed (4) 'cooccur' is 142 (5) 'fisher' is 1.958374164482187e-45 
pulmonary embolism LOCALIZES Anatomy leg. Attributes of this relationship are:
(1) 'enrichment' is 3.5631019649253624 (2) 'odds' is 3.6374820149255886 (3) 'source' is NCBI PubMed (4) 'cooccur' is 87 (5) 'fisher' is 4.614655195444314e-23 


In [24]:

# Sentence-transformer model used to embed queries.
embedding_function = SentenceTransformerEmbeddings(
    model_name=SENTENCE_EMBEDDING_MODEL,
)

# Open the Chroma store persisted at VECTOR_DB_PATH. The chunks in `docs` are
# not added to the store here; presumably it was populated beforehand --
# TODO confirm.
vectorstore = Chroma(
    embedding_function=embedding_function,
    persist_directory=VECTOR_DB_PATH,
)




  from .autonotebook import tqdm as notebook_tqdm


In [95]:
# Define the question in this cell so it runs on a fresh kernel. Previously
# `question` was only assigned in later cells, so Restart & Run All raised
# NameError here. The value matches the question that produced the
# Joubert-syndrome context shown in this cell's output.
question = "Variant rs775393475 is associated with Joubert syndrome 31"

# Retrieve the 10 best-matching chunks; each hit is a (document, score) pair.
search_result = vectorstore.similarity_search_with_score(question, k=10)

# Concatenate the text of every hit, one newline after each chunk.
retrieved_context = "".join(item[0].page_content + "\n" for item in search_result)

# Prompt template: the doubled braces are literal braces in the output, the
# two `{}` slots receive the retrieved context and the question in that order.
prompt = """
Use the following pieces of context to answer the question at the end. 
Context: {}
Question : {}
Answer the above Question in the following format:
{{
answer : [answer 1, answer 2, answer 3 etc]
}}
If you don't know the answer, report it as:
{{
answer : Don't know
}}
""".format(retrieved_context, question)

print(prompt)


Use the following pieces of context to answer the question at the end. 
Context: Variant rs1050700461 ASSOCIATES Joubert syndrome 15. Attributes of this relationship are:
(1) 'alt_allele' is A (2) 'confidence_score' is 1.0 (3) 'collection_method' is clinical testing (4) 'ClinicalSignificance' is Uncertain significance (5) 'source' is ClinVar (6) 'ref_allele' is G 
END OF NODE CONTEXT
Variant rs1029781765 ASSOCIATES Joubert syndrome. Attributes of this relationship are:
(1) 'alt_allele' is C (2) 'confidence_score' is 1.0 (3) 'collection_method' is clinical testing (4) 'ClinicalSignificance' is Uncertain significance (5) 'source' is ClinVar (6) 'ref_allele' is T 
END OF NODE CONTEXT
Variant rs1025041382 ASSOCIATES Joubert syndrome 8. Attributes of this relationship are:
(1) 'alt_allele' is T (2) 'confidence_score' is 1.0 (3) 'collection_method' is clinical testing (4) 'ClinicalSignificance' is Uncertain significance (5) 'source' is ClinVar (6) 'ref_allele' is G 
END OF NODE CONTEXT
Vari

In [81]:
# Probe the store with a specific variant/disease statement and look at the
# text of the single best hit.
question = "Variant rs775393475 is associated with Joubert syndrome 31"

search_result = vectorstore.similarity_search_with_score(question, k=5)
top_document = search_result[0][0]  # each hit is (document, score)
print(top_document.page_content)


Variant rs1050700461 ASSOCIATES Joubert syndrome 15. Attributes of this relationship are:
(1) 'alt_allele' is A (2) 'confidence_score' is 1.0 (3) 'collection_method' is clinical testing (4) 'ClinicalSignificance' is Uncertain significance (5) 'source' is ClinVar (6) 'ref_allele' is G 
END OF NODE CONTEXT


In [63]:
# Ask a gene-association question and fetch only the single best hit (k=1).
# NOTE(review): the visible part of the returned chunk lists RESEMBLES
# relationships rather than gene associations -- retrieval may be missing the
# relevant chunk for this question.
question = "What are the genes associated with Cryptococcal meningitis?"
search_result = vectorstore.similarity_search_with_score(question, k=1)
# Bare expression so the notebook displays the (document, score) list.
search_result


[(Document(page_content="Cryptococcal meningitis RESEMBLES Disease meningoencephalitis. Attributes of this relationship are:\n(1) 'enrichment' is 24.255475317462597 (2) 'odds' is 25.285566953867328 (3) 'source' is NCBI PubMed (4) 'cooccur' is 58 (5) 'fisher' is 1.1931771397007566e-58 \nCryptococcal meningitis RESEMBLES Disease cranial nerve palsy. Attributes of this relationship are:\n(1) 'enrichment' is 5.766808071694847 (2) 'odds' is 5.795352549889135 (3) 'source' is NCBI PubMed (4) 'cooccur' is 6 (5) 'fisher' is 0.0007212322910249944 \nCryptococcal meningitis RESEMBLES Disease idiopathic CD4-positive T-lymphocytopenia. Attributes of this relationship are:\n(1) 'enrichment' is 101.36716020843693 (2) 'odds' is 107.09432313814357 (3) 'source' is NCBI PubMed (4) 'cooccur' is 12 (5) 'fisher' is 1.0536231500651493e-20 \nCryptococcal meningitis RESEMBLES Disease neuroaspergillosis. Attributes of this relationship are:\n(1) 'enrichment' is 34.33624455103599 (2) 'odds' is 34.958373159901164 

In [47]:
import pandas as pd

In [48]:
# Put the raw context strings into a single-column frame for text searches.
df = pd.DataFrame({"context": data})


In [54]:
# Show up to 50 contexts whose text mentions the INCREASEDIN relationship.
df.loc[df["context"].str.contains("INCREASEDIN")].head(50)

Unnamed: 0,context
2369,Following is the contextual information about ...


In [98]:
# Print the first unsplit context string for comparison with the chunked docs.
print(data[0])

Following is the contextual information about the Disease familial partial lipodystrophy type 5 (Disease Ontology identifier of this Disease is DOID:0070203).
familial partial lipodystrophy type 5 ISA Disease autosomal recessive disease. Attributes of this relationship are:
(1) 'source' is Disease Ontology 
familial partial lipodystrophy type 5 ASSOCIATES Gene CIDEC. Attributes of this relationship are:
(1) 'sources' is ['OMIM'] (2) 'o_inheritance' is AR 
familial partial lipodystrophy type 5 ISA Disease familial partial lipodystrophy. Attributes of this relationship are:
(1) 'source' is Disease Ontology 
END OF NODE CONTEXT


#### 