# Create and Populate a Vector Store
* We will use ChromaDB


In [1]:
import asyncio
import chromadb
from chromadb.config import Settings
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoTokenizer, AutoModel
from langchain_text_splitters import CharacterTextSplitter
import torch

In [2]:
settings = Settings(
    allow_reset=True
)

chroma_client = chromadb.PersistentClient(path="./data/chroma_bd",settings=settings)
chroma_client.reset()

# Create an empty collection in ChromaDB
collection_name = "my_vector_store"
collection = chroma_client.create_collection(name=collection_name)


In [17]:

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

# Example usage
documents = [
    "This is a document about pineapple",
    "This is a document about oranges"
]
chunked_documents = []
for doc in documents:
    chunked_document = text_splitter.split_text(doc)
    chunked_documents.extend(chunked_document)
chunked_documents

['This is a document about pineapple', 'This is a document about oranges']

In [18]:
import hashlib

def generate_id(content):
    return hashlib.md5(content.encode()).hexdigest()

# Generate unique IDs by hashing document content
unique_ids = [generate_id(doc) for doc in chunked_documents]


# Add documents to the collection with unique IDs
collection.add(
    documents=documents,
    ids=unique_ids
)


Insert of existing embedding ID: e191a15443f6e455cf0333b6182a7f31
Insert of existing embedding ID: 483e24dbc1c862221ea9efbb69637092
Add of existing embedding ID: e191a15443f6e455cf0333b6182a7f31
Add of existing embedding ID: 483e24dbc1c862221ea9efbb69637092


In [6]:
results = collection.query(
    query_texts=["This is a query document about hawaii"], # Chroma will embed this for you
    n_results=2 # how many results to return
)
print(results)

{'ids': [['e191a15443f6e455cf0333b6182a7f31', '483e24dbc1c862221ea9efbb69637092']], 'distances': [[1.0403737134245823, 1.2430689493879792]], 'metadatas': [[None, None]], 'embeddings': None, 'documents': [['This is a document about pineapple', 'This is a document about oranges']], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [30]:
from langchain_community.document_loaders import PyMuPDFLoader
file_path="./data/Example_Paper.pdf"
#loader = PyMuPDFLoader(file_path)
loader = PyMuPDFLoader(file_path,extract_images=True)
ret_data = loader.load()
dict(ret_data[0])

{'page_content': 'RESEARCH ARTICLE\nStress-responsive and metabolic gene\nregulation are altered in low S-\nadenosylmethionine\nWei Ding1, Daniel P. HigginsID2, Dilip K. Yadav1, Adwait A. Godbole1, Read Pukkila-\nWorleyID3, Amy K. WalkerID1*\n1 Program in Molecular Medicine, UMASS Medical School, Worcester, MA, United States of America,\n2 Department of Computer Sciences, Georgia Institute of Technology, Atlanta, GA, United States of America,\n3 Program in Innate Immunity, Division of Infectious Diseases and Immunology, UMASS Medical School,\nWorcester, MA, United States of America\n* amy.walker@umassmed.edu\nAbstract\nS-adenosylmethionine (SAM) is a donor which provides the methyl groups for histone or\nnucleic acid modification and phosphatidylcholine production. SAM is hypothesized to link\nmetabolism and chromatin modification, however, its role in acute gene regulation is poorly\nunderstood. We recently found that Caenorhabditis elegans with reduced SAM had deficien-\ncies in H3K4

In [29]:
dict(ret_data[1])

{'page_content': 'alcohol or diseases such as cystic fibrosis. Thus, low SAM availability may leave organisms\nless able to respond to additional stress, which could exacerbate tissue injury or disease\nprogression.\nIntroduction\nCellular functions are profoundly affected by metabolic state. For example, transcriptional reg-\nulation can be linked to metabolism through the modification of chromatin by methylation\n[1]. Using the methyl groups produced by the 1-carbon cycle (1CC) and donated by S-adeno-\nsylmethionine (SAM), histone methyltransferases (HMTs) can change the regulatory state of\nchromatin, promoting or limiting gene activity [2]. HMT activity can be controlled by recruit-\nment of HMT containing complexes to specific genomic locations [2]. However, SAM avail-\nability may also affect histone methylation patterns [3]. SAM is produced by the 1-carbon\ncycle (1CC) and levels can be affected by folate, methionine or choline levels or by other factors\nsuch as alcohol consump