In [1]:
#Load Libraries
from langchain_community.document_loaders import TextLoader
from langchain_ollama import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from langchain.vectorstores import Chroma

In [2]:
# Step 1: Read the source text file into a list of LangChain Document objects.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale encoding (which differs between Windows and Linux/macOS).
loader = TextLoader("sample.txt", encoding="utf-8")
text_documents = loader.load()
text_documents

[Document(metadata={'source': 'sample.txt'}, page_content='LangChain is the framework for building context-aware reasoning applications')]

In [3]:
# Peek at the raw text of the first loaded document.
first_document = text_documents[0]
first_document.page_content

'LangChain is the framework for building context-aware reasoning applications'

In [5]:
# Step 2: Divide the text documents into chunks.
# Sizes are tunable constants rather than inline magic numbers. The chunk size
# is kept modest so that each chunk stays within the embedding model's input
# limit; an oversized chunk (e.g. 100000 characters) would effectively disable
# splitting and may be truncated by the embedding backend.
# NOTE(review): tune CHUNK_SIZE / CHUNK_OVERLAP for the embedding model in use.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 0

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
chunked_docs = text_splitter.split_documents(text_documents)
chunked_docs[0]

Document(metadata={'source': 'sample.txt'}, page_content='LangChain is the framework for building context-aware reasoning applications')

In [6]:
# Step 3: Convert the chunked data into vectors and store them in a vector store.

# Embedding model served through Ollama.
# NOTE(review): "llama2" looks like a general chat model; a dedicated embedding
# model may retrieve better - confirm before relying on the results.
embeddings = OllamaEmbeddings(model="llama2")

# Alternative embedding backends (uncomment exactly one to swap):
# embeddings = OpenAIEmbeddings()    # OpenAI-hosted embeddings
# embeddings = GPT4AllEmbeddings()   # NOTE(review): GPT4AllEmbeddings is not imported in this notebook's import cell


In [10]:
# Embed the chunks and index the resulting vectors in a FAISS store.
vector_store = FAISS.from_documents(documents=chunked_docs, embedding=embeddings)

# Alternative vector stores (uncomment exactly one to swap):
# vector_store = InMemoryVectorStore.from_documents(chunked_docs, embedding=embeddings)  # keep vectors in memory
# vector_store = Chroma.from_documents(chunked_docs, embedding=embeddings)               # keep vectors in Chroma DB


In [11]:
# Question to look up in the indexed text.
query = "What is LangChain?"

# Expose the vector store through the retriever interface and run the query.
retriever = vector_store.as_retriever()
retrieved_documents = retriever.invoke(query)

# Display the text of the first (top-ranked) retrieved document.
top_match = retrieved_documents[0]
top_match.page_content

'LangChain is the framework for building context-aware reasoning applications'

In [12]:
# Query the vector store directly, asking for up to five of the closest chunks.
matched_docs = vector_store.similarity_search(query, k=5)
matched_docs

[Document(metadata={'source': 'sample.txt'}, page_content='LangChain is the framework for building context-aware reasoning applications')]

In [None]:
#####################################################################################################