In [1]:
## build a sample vector store
from langchain_community.document_loaders import TextLoader
from langchain_chroma import Chroma
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the source text file into a list of LangChain Document objects.
# The encoding is pinned explicitly: when no `encoding` is passed, TextLoader
# decodes with the platform's default codec, so the same file can decode
# differently (or fail to decode) from one machine to another.
# NOTE(review): assumes the file is saved as UTF-8 — confirm.
loader = TextLoader("What_is_DataIngestion.txt", encoding="utf-8")
documents = loader.load()
documents  # last expression: display the loaded documents as the cell output

[Document(metadata={'source': 'What_is_DataIngestion.txt'}, page_content='Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.\nSources: Data can come from a wide variety of sources, including databases, APIs, log files, IoT devices, applications (SaaS), and file storage systems.\nFormats: The process handles data in various formats, including structured (like databases), semi-structured (like JSON or XML), and unstructured data (like images, audio, or text files).\nDestination: The data is typically moved into a single repository for consolidation, allowing organizations

In [3]:
## Split
# Break the loaded documents into chunks using chunk_size=500 and
# chunk_overlap=0, i.e. consecutive chunks share no text.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=0,
)
splits = text_splitter.split_documents(documents)

In [4]:
# Embedding function shared by every vector-store cell in this notebook.
# NOTE(review): the saved output echoes this line the way a Python warning
# does — presumably a deprecation notice for this import path; confirm and
# migrate to the recommended package if so.
embeddings = OllamaEmbeddings(model="nomic-embed-text")

# Embed the chunks and build a Chroma store from them (no persist_directory
# is passed in this cell).
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
)
vectordb  # last expression: show the store object as the cell output

  embeddings=OllamaEmbeddings(model="nomic-embed-text")


<langchain_chroma.vectorstores.Chroma at 0x17a6990f6d0>

In [5]:
## Querying the Vector Store
# Run a similarity search for the question and show the text of the first hit.
query = "What is Data Ingestion?"
docs = vectordb.similarity_search(query=query)
docs[0].page_content

'Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.'

In [6]:
## Save to the disk
# Give every chunk a stable, position-based id. Without explicit ids each run
# of this cell stores the chunks under freshly generated ids, so re-running it
# (or Restart & Run All) keeps appending duplicate copies to ./chroma_db and
# later searches return the same passage several times. With fixed ids a
# re-run overwrites the existing entries instead of adding new ones.
# NOTE: if the source text later yields fewer chunks, entries with higher
# indices from an earlier run stay in the directory — delete ./chroma_db to
# rebuild from scratch.
chunk_ids = [f"chunk-{position}" for position in range(len(splits))]

# This rebinds `vectordb`, so cells that use `vectordb` after this one query
# the store created here.
vectordb = Chroma.from_documents(
    documents=splits,
    embedding=embeddings,
    ids=chunk_ids,
    persist_directory="./chroma_db",
)

In [7]:
## Call the vector store
# Open a Chroma store on the ./chroma_db directory, passing the same embedding
# function so the query text is embedded the same way as the stored chunks.
db2 = Chroma(
    embedding_function=embeddings,
    persist_directory="./chroma_db",
)
docs = db2.similarity_search(query="What is Data Ingestion?")
docs[0].page_content

'Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.'

In [8]:
### Retriever option
# Wrap the vector store in a retriever and show the text of the first document
# it returns for `query`.
retriever = vectordb.as_retriever()
# Keep the original, misspelled name bound as well so any cell that still
# refers to `retriver` continues to work.
retriver = retriever
retriever.invoke(query)[0].page_content

'Data ingestion is the process of collecting and importing raw data from multiple and diverse sources into a centralized destination (such as a data lake, data warehouse, or database) where it can be stored, processed, and analyzed. It is the foundational first step in any modern data pipeline. \nKey Aspects\nPurpose: The main goal is to efficiently gather data and make it available and accessible for business intelligence, analytics, and machine learning/AI initiatives.'