In [1]:
import os
from dotenv import load_dotenv

In [2]:
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()

# Re-export HF_TOKEN only when it is actually set. Assigning None to
# os.environ raises TypeError, which crashed this cell whenever no token
# was configured.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    os.environ["HF_TOKEN"] = hf_token

In [3]:
from langchain_huggingface import HuggingFaceEmbeddings

# Embedding model shared by the cells below.
EMBEDDING_MODEL_NAME = "all-MiniLM-L6-v2"
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [5]:
# Embed a single string and inspect the length of the resulting vector
# (the recorded output for this model is 384).
vec = embeddings.embed_query("hello AI")
len(vec)

384

In [12]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances

In [7]:
# Toy corpus of three questions that the query below is compared against.
documents = [
    "what is the capital of USA?",
    "Who is the president of USA?",
    "Who is the prime minister of India?"
]

In [8]:
# Query sentence whose embedding is compared with each document embedding.
my_query = "Narendra modi is the prime minister of india."

In [9]:
# Embed every document in one call; the result holds one vector per input string.
document_embeddings = embeddings.embed_documents(documents)

In [10]:
# Embed the query with the same model so it can be compared with the documents.
query_embedding = embeddings.embed_query(my_query)

In [11]:
# Cosine similarity between the query and each document (higher = more similar).
# The query is wrapped in a list because the function expects 2-D inputs.
cosine_similarity([query_embedding], document_embeddings)

array([[0.07118418, 0.33584699, 0.76168934]])

In [13]:
# Euclidean (L2) distance between the query and each document (lower = more similar).
euclidean_distances([query_embedding], document_embeddings)

array([[1.36294968, 1.1525216 , 0.69037768]])

| Metric            | Score Range                      | Behavior                                                   |
| ----------------- | -------------------------------- | ---------------------------------------------------------- |
| Cosine Similarity | \[-1, 1] (higher = more similar) | Compares the angle (direction) only; ignores magnitude     |
| L2 Distance       | \[0, ∞) (lower = more similar)   | Sensitive to both **magnitude and direction**              |


In [14]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_community.vectorstores.utils import DistanceStrategy

In [15]:
# Derive the index dimensionality from the embedding model instead of
# hard-coding 384, so the index stays valid if the model is swapped.
embedding_dim = len(embeddings.embed_query("hello world"))

# Flat index that compares vectors by L2 (Euclidean) distance.
index = faiss.IndexFlatL2(embedding_dim)
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x0000020CEC33B510> >

In [16]:
# Start from an empty store: nothing has been added yet, so both the docstore
# and the index-position -> docstore-id mapping are empty.
empty_docstore = InMemoryDocstore()
vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=empty_docstore,
    index_to_docstore_id={},
)
vector_store

<langchain_community.vectorstores.faiss.FAISS at 0x20cefc2aa80>

In [17]:
# Embed and index three short texts; the call returns the ids assigned to them.
sample_texts = ["AI is future", "AI is powerful", "Dogs are cute"]
vector_store.add_texts(sample_texts)

['c311b7b3-de2d-4535-b965-62efb6f83c19',
 'a9224e28-ae56-4f03-8458-83c6780b9c7c',
 'c0ef2ec4-db69-4ade-b57b-dd0f674bd514']

In [18]:
# Show the mapping from FAISS index position to docstore id.
vector_store.index_to_docstore_id

{0: 'c311b7b3-de2d-4535-b965-62efb6f83c19',
 1: 'a9224e28-ae56-4f03-8458-83c6780b9c7c',
 2: 'c0ef2ec4-db69-4ade-b57b-dd0f674bd514'}

In [19]:
# Retrieve only the single closest text (k=1) for the query.
results = vector_store.similarity_search("Tell me about AI", k=1)

In [20]:
# Display the retrieved Document list.
results

[Document(id='a9224e28-ae56-4f03-8458-83c6780b9c7c', metadata={}, page_content='AI is powerful')]

| Feature               | `Flat`                | `IVF` (Inverted File Index)        | `HNSW` (Graph-based Index)          |
| --------------------- | --------------------- | ---------------------------------- | ----------------------------------- |
| Type of Search     | Exact                 | Approximate (cluster-based)        | Approximate (graph-based traversal) |
| Speed               | Slow (linear scan)    | Fast (search only in top clusters) | Very Fast (graph walk)              |


| Dataset Size              | Recommended Index                 |
| ------------------------- | --------------------------------- |
| Up to 100K (1 lakh)       | `IndexFlatL2` or `IndexFlatIP`    |
| Up to 1M                  | `IndexIVFFlat` or `IndexHNSWFlat` |
| > 1M                      | `IndexIVFPQ` or `IndexHNSWFlat`   |


In [21]:
from langchain_core.documents import Document

# (page_content, source) pairs for the sample corpus, in insertion order.
_sample_rows = [
    ("I had chocolate chip pancakes and scrambled eggs for breakfast this morning.", "tweet"),
    ("The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.", "news"),
    ("Building an exciting new project with LangChain - come check it out!", "tweet"),
    ("Robbers broke into the city bank and stole $1 million in cash.", "news"),
    ("Wow! That was an amazing movie. I can't wait to see it again.", "tweet"),
    ("Is the new iPhone worth the price? Read this review to find out.", "website"),
    ("The top 10 soccer players in the world right now.", "website"),
    ("LangGraph is the best framework for building stateful, agentic applications!", "tweet"),
    ("The stock market is down 500 points today due to fears of a recession.", "news"),
    ("I have a bad feeling I am going to get deleted :(", "tweet"),
]

# NOTE: this rebinds `documents`, which an earlier cell used for a list of
# plain strings; from here on it is a list of Document objects.
documents = [
    Document(page_content=text, metadata={"source": source})
    for text, source in _sample_rows
]

# Keep the individual document_N names available alongside the list.
(
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
) = documents

In [22]:
# Display the Document list built in the previous cell.
documents

[Document(metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.'),
 Document(metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(metadata={'source': 'website'}, page_content='Is the new iPhone worth the price? Read this review to find out.'),
 Document(metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.'),
 Document(metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic application

In [23]:
# Derive the index dimensionality from the embedding model instead of
# hard-coding 384, so the index stays valid if the model is swapped.
embedding_dim = len(embeddings.embed_query("hello world"))

# Flat index that ranks by inner product (larger score = closer match).
index = faiss.IndexFlatIP(embedding_dim)

vector_store = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # The wrapper assumes Euclidean distance unless told otherwise. Declare the
    # inner-product strategy so score thresholds and relevance scores are
    # interpreted in the right direction for this index.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [24]:
# Embed and index the sample documents; the call returns the ids assigned to them.
vector_store.add_documents(documents=documents)

['6d57db54-d4e3-45d7-bdc9-cf09db0bac13',
 '5621e37a-50c0-469a-a549-2509bab9d2dd',
 '5ef888fb-8247-4d1e-a8c1-fcd5b1822c5c',
 'f66dbac7-b986-4e23-b30d-2449588aa0f8',
 '02132a46-b446-4ede-b2b2-36b1cdf9e054',
 '864a62cb-6bd3-4ac5-a2cc-4f58833d53ff',
 '29138511-a9ee-45ce-964c-96ef97976eda',
 'b81212ef-838f-48dd-8b53-7c99fd9d08a7',
 'c8b70a9c-bceb-4e67-8079-12ec7aafe609',
 'ed50cd7d-a22e-42a0-88b1-6d2aa34e7d29']

In [25]:
# Two closest documents for the query, with no metadata filtering.
langchain_query = "LangChain provides abstractions to make working with LLMs easy"
vector_store.similarity_search(langchain_query, k=2)

[Document(id='5ef888fb-8247-4d1e-a8c1-fcd5b1822c5c', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='b81212ef-838f-48dd-8b53-7c99fd9d08a7', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [26]:
# Same query, restricted to documents whose metadata "source" equals "tweet".
# No k is passed, so the library's default result count applies.
tweet_filter = {"source": {"$eq": "tweet"}}
vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    filter=tweet_filter,
)

[Document(id='5ef888fb-8247-4d1e-a8c1-fcd5b1822c5c', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='b81212ef-838f-48dd-8b53-7c99fd9d08a7', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='ed50cd7d-a22e-42a0-88b1-6d2aa34e7d29', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(id='6d57db54-d4e3-45d7-bdc9-cf09db0bac13', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')]

In [28]:
# Same query, restricted to documents whose metadata "source" is "news"
# (plain-value form of the filter). The matches are kept for inspection below.
news_filter = {"source": "news"}
result = vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    filter=news_filter,
)

In [29]:
# Display the news-only matches.
result

[Document(id='f66dbac7-b986-4e23-b30d-2449588aa0f8', metadata={'source': 'news'}, page_content='Robbers broke into the city bank and stole $1 million in cash.'),
 Document(id='5621e37a-50c0-469a-a549-2509bab9d2dd', metadata={'source': 'news'}, page_content='The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.'),
 Document(id='c8b70a9c-bceb-4e67-8079-12ec7aafe609', metadata={'source': 'news'}, page_content='The stock market is down 500 points today due to fears of a recession.')]

In [30]:
# Metadata dict of the top match.
result[0].metadata

{'source': 'news'}

In [31]:
# Raw text of the top match.
result[0].page_content

'Robbers broke into the city bank and stole $1 million in cash.'

In [32]:
# Wrap the store as a retriever configured to return the top 3 matches per query.
retriever = vector_store.as_retriever(search_kwargs={"k": 3})

In [33]:
# Run the query through the retriever; with k=3 it returns three documents.
retriever.invoke("LangChain provides abstractions to make working with LLMs easy")

[Document(id='5ef888fb-8247-4d1e-a8c1-fcd5b1822c5c', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='b81212ef-838f-48dd-8b53-7c99fd9d08a7', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='ed50cd7d-a22e-42a0-88b1-6d2aa34e7d29', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [35]:
# Where a vector store can live:
# - in memory (local machine)
# - on disk (local machine)
# - cloud (yet to be discussed)

In [37]:
# Persist the store to the ../faiss_index folder so it can be reloaded later.
vector_store.save_local("../faiss_index")

In [38]:
# SECURITY: load_local unpickles part of the saved store, and unpickling can
# execute arbitrary code. allow_dangerous_deserialization=True is acceptable
# here only because this notebook wrote the folder itself; never load an index
# that comes from an untrusted source.
new_vector_store = FAISS.load_local(
    "../faiss_index",
    embeddings,
    allow_dangerous_deserialization=True,
    # The saved index is an inner-product index (IndexFlatIP). The distance
    # strategy is a constructor argument rather than part of the saved index,
    # so it is supplied again on load.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [39]:
# Sanity check: the reloaded store answers queries like the original one.
new_vector_store.similarity_search("langchain")

[Document(id='5ef888fb-8247-4d1e-a8c1-fcd5b1822c5c', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='b81212ef-838f-48dd-8b53-7c99fd9d08a7', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='02132a46-b446-4ede-b2b2-36b1cdf9e054', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(id='29138511-a9ee-45ce-964c-96ef97976eda', metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.')]