In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.vectorstores import Chroma
import pickle



In [28]:
# --- Embedding model ---------------------------------------------------------
# Model name handed to SentenceTransformerEmbeddings when the vector store is built.
SENTENCE_EMBEDDING_MODEL = "all-distilroberta-v1"

# --- Chunking ----------------------------------------------------------------
# Arguments for RecursiveCharacterTextSplitter (chunk_size / chunk_overlap).
# NOTE(review): a 6000-unit chunk may be longer than the embedding model's
# maximum input length, in which case the tail of each chunk would not
# contribute to its embedding -- confirm against the model card.
CHUNK_OVERLAP = 200
CHUNK_SIZE = 6000

# --- Paths (relative to the notebook's working directory) --------------------
# Directory used as the Chroma persist_directory.
VECTOR_DB_NAME = "../../data/vectorDB/disease_nodes_chromaDB_using_all_distilroberta_v1"
# Pickle file read by the data-loading cells.
DATA_PATH = "../../data/disease_with_relation_to_genes.pickle"


In [29]:
# Read the pickled collection of texts from DATA_PATH.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# point DATA_PATH at a file from a trusted origin.
with open(DATA_PATH, mode="rb") as pickle_file:
    data = pickle.load(pickle_file)

# Build one metadata dict per entry, in the same order as `data`. The "source"
# value is the entry text itself followed by a fixed provenance suffix.
metadata_list = [
    {"source": entry + " from SPOKE knowledge graph"}
    for entry in data
]
len(metadata_list)

6307

In [11]:
# NOTE(review): this cell re-reads the same pickle and rebinds both `data` and
# `metadata_list`, which another cell of this notebook also assigns (there the
# metadata key is "source"). Whichever of the two cells ran last decides the
# metadata attached to the documents downstream -- confirm which variant is
# intended before a Restart & Run All.
# SECURITY: pickle.load can execute arbitrary code while deserializing; only
# point DATA_PATH at a file from a trusted origin.
with open(DATA_PATH, mode="rb") as pickle_file:
    data = pickle.load(pickle_file)

# For each entry keep the text before the first "(", strip everything up to and
# including the fixed lead-in sentence, and append the provenance suffix.
# NOTE(review): no space is inserted before "from"; this presumably relies on
# the text before "(" ending in whitespace -- verify against the data.
context_prefix = "Following is the contextual information about the "
metadata_list = [
    {
        "node information": entry.split("(")[0].split(context_prefix)[-1]
        + "from SPOKE knowledge graph"
    }
    for entry in data
]
    

In [30]:
# Configure the recursive splitter from the notebook-level chunking constants.
splitter_kwargs = {"chunk_size": CHUNK_SIZE, "chunk_overlap": CHUNK_OVERLAP}
text_splitter = RecursiveCharacterTextSplitter(**splitter_kwargs)

# Turn every text in `data` into one or more Document chunks; each chunk carries
# the metadata dict found at the same position in `metadata_list`.
docs = text_splitter.create_documents(texts=data, metadatas=metadata_list)

# Display how many chunks were produced.
len(docs)

6307

In [31]:
# Spot-check one split document (index 2): its repr shows page_content and metadata.
docs[2]

Document(page_content='spondylometaphyseal dysplasia Sedaghatian type', metadata={'source': 'spondylometaphyseal dysplasia Sedaghatian type from SPOKE knowledge graph'})

In [None]:
# Sentence-transformer wrapper that Chroma calls to embed documents and queries.
embedding_function = SentenceTransformerEmbeddings(model_name=SENTENCE_EMBEDDING_MODEL)

# Chroma store rooted at VECTOR_DB_NAME (used as its persist directory).
vectorstore = Chroma(
    persist_directory=VECTOR_DB_NAME,
    embedding_function=embedding_function,
)


In [55]:
%%time

vectorstore.add_documents(documents=docs)


CPU times: user 3min 31s, sys: 23.2 s, total: 3min 55s
Wall time: 34.4 s


['2bbab99a-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabb48-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabba2-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabbde-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabc1a-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabc56-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabc88-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabcc4-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabcf6-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabd50-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabd96-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabdc8-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabe18-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabe54-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabe90-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabec2-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabef4-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabf30-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabf62-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabf94-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbabfd0-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbac002-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbac03e-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbac070-5d5a-11ee-99e2-52778bb2e2f0',
 '2bbac0a2-5d5a-

In [72]:
# Natural-language query to run against the vector store.
question = "What compound treats isolated growth hormone deficiency?"

# Retrieve the single best match; each result is a (document, score) pair.
search_result = vectorstore.similarity_search_with_score(question, k=1)

# Unpack the top hit and print its text; the score is not used further here.
top_document, top_score = search_result[0]
print(top_document.page_content)


(1) 'enrichment' is 9.083154301870298 (2) 'odds' is 9.18847748257019 (3) 'source' is NCBI PubMed (4) 'cooccur' is 16 (5) 'fisher' is 7.283265876973802e-11 
isolated growth hormone deficiency LOCALIZES Anatomy arcuate nucleus of hypothalamus. Attributes of this relationship are:
(1) 'enrichment' is 170.12906403940886 (2) 'odds' is 181.32142857142858 (3) 'source' is NCBI PubMed (4) 'cooccur' is 2 (5) 'fisher' is 6.64478261176143e-05 
isolated growth hormone deficiency LOCALIZES Anatomy optic disc. Attributes of this relationship are:
(1) 'enrichment' is 5.855379607337833 (2) 'odds' is 6.235083357706932 (3) 'source' is NCBI PubMed (4) 'cooccur' is 5 (5) 'fisher' is 0.0016911262712014613 
isolated growth hormone deficiency LOCALIZES Anatomy neurohypophysis. Attributes of this relationship are:
(1) 'enrichment' is 83.62276029055691 (2) 'odds' is 92.92672413793103 (3) 'source' is NCBI PubMed (4) 'cooccur' is 6 (5) 'fisher' is 1.3789332984995605e-10 
isolated growth hormone deficiency LOCALIZ