In [2]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

In [8]:
# Load the raw text file into a list of Document objects.
loader = TextLoader('test.txt')
raw_docs = loader.load()

# Split the text into chunks for embedding.
# chunk_overlap must stay well below chunk_size: with overlap == chunk_size a
# chunk can consist entirely of text repeated from the previous chunk.
# CharacterTextSplitter only splits on its separator (default "\n\n"), so
# paragraphs longer than chunk_size are kept whole and only trigger a
# "Created a chunk of size ..." warning.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
docs = text_splitter.split_documents(raw_docs)

Created a chunk of size 406, which is longer than the specified 100
Created a chunk of size 552, which is longer than the specified 100


In [9]:
# Inspect the chunks produced by the splitter.
docs

[Document(page_content='Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data.[2][3][4] These models learn the underlying patterns and structures of their training data and use them to produce new data[5][6] based on the input, which often comes in the form of natural language prompts.[7][8]', metadata={'source': 'test.txt'}),
 Document(page_content='Improvements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini, and LLaMA; text-to-image artificial intelligence image generation systems such as Stable Diffusion, Midjourney, and DALL-E; and text-to-video AI generators such as Sora.[9][10][11][12] Companies such as OpenAI, Anthropic, Microsoft, Google, and Baidu as well as numerous smaller firms have

In [10]:
# Embedding function backed by the "gemma:2b" model served through Ollama.
# NOTE(review): gemma:2b looks like a general-purpose chat model rather than a
# dedicated embedding model - confirm it is the intended choice for retrieval.
embedding = OllamaEmbeddings(model="gemma:2b")

# Embed every chunk and build an in-memory FAISS index from the vectors.
db = FAISS.from_documents(documents=docs, embedding=embedding)
db

<langchain_community.vectorstores.faiss.FAISS at 0x253ff567340>

In [11]:
# Question to run against the FAISS index.
query = "What is generative ai?"

# Fetch the chunks closest to the query.
# NOTE: this rebinds `docs` from the split chunks to the search hits; later
# cells that read `docs` see the search results, not the original chunks.
docs = db.similarity_search(query)
docs

[Document(page_content='Generative AI has uses across a wide range of industries, including software development, healthcare, finance, entertainment, customer service,[15] sales and marketing,[16] art, writing,[17] fashion,[18] and product design.[19]', metadata={'source': 'test.txt'}),
 Document(page_content='Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data.[2][3][4] These models learn the underlying patterns and structures of their training data and use them to produce new data[5][6] based on the input, which often comes in the form of natural language prompts.[7][8]', metadata={'source': 'test.txt'}),
 Document(page_content='Improvements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini, a

In [12]:
# Text of the top-ranked search hit.
docs[0].page_content

'Generative AI has uses across a wide range of industries, including software development, healthcare, finance, entertainment, customer service,[15] sales and marketing,[16] art, writing,[17] fashion,[18] and product design.[19]'

## As a Retriever

We can convert the vector store into a Retriever. This lets us use it easily in other LangChain methods, which largely work with retrievers.

In [13]:
# Wrap the FAISS store in a retriever, then run the same query through it.
retriever = db.as_retriever()
retriever.invoke(query)

[Document(page_content='Generative AI has uses across a wide range of industries, including software development, healthcare, finance, entertainment, customer service,[15] sales and marketing,[16] art, writing,[17] fashion,[18] and product design.[19]', metadata={'source': 'test.txt'}),
 Document(page_content='Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data.[2][3][4] These models learn the underlying patterns and structures of their training data and use them to produce new data[5][6] based on the input, which often comes in the form of natural language prompts.[7][8]', metadata={'source': 'test.txt'}),
 Document(page_content='Improvements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such as ChatGPT, Copilot, Gemini, a

## Similarity search with scores

This is a FAISS-specific method that returns not only the documents but also the distance score between the query and each of them. Because the score is a distance, a lower value means a closer match.

In [15]:
# Search again, this time getting (Document, score) pairs back.
docs_and_score=db.similarity_search_with_score(query)
docs_and_score  # Scores are L2 (Euclidean) distances - the default for LangChain's FAISS wrapper, not Manhattan; lower means more similar

[(Document(page_content='Generative AI has uses across a wide range of industries, including software development, healthcare, finance, entertainment, customer service,[15] sales and marketing,[16] art, writing,[17] fashion,[18] and product design.[19]', metadata={'source': 'test.txt'}),
  1687.6439),
 (Document(page_content='Generative artificial intelligence (generative AI, GenAI,[1] or GAI) is a subset of artificial intelligence that uses generative models to produce text, images, videos, or other forms of data.[2][3][4] These models learn the underlying patterns and structures of their training data and use them to produce new data[5][6] based on the input, which often comes in the form of natural language prompts.[7][8]', metadata={'source': 'test.txt'}),
  1917.3319),
 (Document(page_content='Improvements in transformer-based deep neural networks, particularly large language models (LLMs), enabled an AI boom of generative AI systems in the early 2020s. These include chatbots such

In [None]:
# Persist the FAISS index to the local "faiss_index" folder.
db.save_local(folder_path="faiss_index")



In [None]:
# Load the vector database back from the "faiss_index" folder, using the same
# embedding object so queries are embedded the same way as the stored chunks.
# NOTE(review): allow_dangerous_deserialization=True opts in to pickle-based
# loading (per the LangChain FAISS docs); only use it on index files you
# created yourself, such as the one written by db.save_local in this notebook.
# NOTE: despite the name, `new_df` holds the value returned by FAISS.load_local,
# not a DataFrame.
new_df=FAISS.load_local("faiss_index",embedding,allow_dangerous_deserialization=True)

In [None]:
# Run the same query against the reloaded index.
output=new_df.similarity_search(query)

## Chroma DB 

Chroma is an AI-native, open-source vector database focused on developer productivity. It is licensed under Apache 2.0.

In [17]:
from langchain_chroma import Chroma

In [None]:
# Reload and re-split the source text: `docs` was rebound to search results in
# the FAISS section, so the chunks have to be rebuilt before indexing them here.
loader = TextLoader('test.txt')
raw_docs = loader.load()

# chunk_overlap must stay well below chunk_size: with overlap == chunk_size a
# chunk can consist entirely of text repeated from the previous chunk.
text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=20)
docs = text_splitter.split_documents(raw_docs)

# Embedding function backed by the "gemma:2b" model served through Ollama.
embedding = OllamaEmbeddings(model="gemma:2b")

# Embed every chunk and store the vectors in a Chroma collection.
vectordb = Chroma.from_documents(docs, embedding)

In [None]:
# Display the Chroma vector store object.
vectordb

In [None]:
# Query the Chroma store with the `query` string defined in the FAISS section.
# NOTE: this rebinds `docs` to the Chroma search results.

docs=vectordb.similarity_search(query)


