In [1]:
from dotenv import load_dotenv

In [2]:
load_dotenv()

True

In [3]:
import os
import warnings

# Re-export the token loaded from .env under HF_TOKEN, the variable name read by
# the Hugging Face Hub client.
# os.environ only accepts strings: assigning the None returned by os.getenv() for
# a missing variable raises "TypeError: str expected, not NoneType". Check first
# and emit a readable warning instead of crashing the notebook.
hf_token = os.getenv("HUGGINGFACEHUB_API_TOKEN")
if hf_token is None:
    warnings.warn(
        "HUGGINGFACEHUB_API_TOKEN is not set (check your .env file); "
        "HF_TOKEN was left unchanged."
    )
else:
    os.environ['HF_TOKEN'] = hf_token

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings
embeddings=HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
embeddings.embed_query("hello AI")

[-0.033388201147317886,
 0.034539803862571716,
 0.059474579989910126,
 0.05928615480661392,
 -0.06353538483381271,
 -0.06819586455821991,
 0.08823323249816895,
 0.0344407856464386,
 -0.03278515860438347,
 -0.015814995393157005,
 0.020981667563319206,
 -0.018340280279517174,
 -0.03983214125037193,
 -0.08047076314687729,
 -0.014469259418547153,
 0.03326486423611641,
 0.01425926387310028,
 -0.03404999524354935,
 -0.142915740609169,
 -0.023083265870809555,
 -0.02138010412454605,
 0.0026335634756833315,
 -0.04729275032877922,
 -0.010752726346254349,
 -0.06866799294948578,
 0.03112499974668026,
 0.07594592869281769,
 0.0011283049825578928,
 0.011631982401013374,
 -0.036039214581251144,
 0.04483765363693237,
 0.01839079149067402,
 0.12672799825668335,
 -0.001359735382720828,
 0.008206715807318687,
 0.06909963488578796,
 -0.0807635709643364,
 -0.05841311439871788,
 0.05375451594591141,
 0.026227623224258423,
 -0.00682859867811203,
 -0.05635840445756912,
 0.0032929950393736362,
 -0.072501808404

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

In [8]:
documents=["what is a capital of USA?",
           "Who is a president of USA?",
           "Who is a prime minister of India?"]

In [9]:
my_query="Narendra modi is prime minister of india?"

In [10]:
document_embedding=embeddings.embed_documents(documents)

In [12]:
len(document_embedding) # each string got its own vector representation

3

In [13]:
query_embedding=embeddings.embed_query(my_query) # almost the same as embed_documents but here input must be string, not list

In [14]:
len(query_embedding)

384

In [15]:
cosine_similarity([query_embedding],document_embedding) # you are comparing the cosine similarity between your query and other 3 strings. Higher means the angle between two vectors is smaller

array([[0.11756665, 0.3432456 , 0.8141324 ]])

In [16]:
from sklearn.metrics.pairwise import euclidean_distances

In [17]:
euclidean_distances([query_embedding], document_embedding) # it represents distances between two vectors. Here, sentence three is closer to the query (smaller euclidean distance)

array([[1.32848288, 1.14608416, 0.60970094]])

# Similarity Metrics Comparison

| Metric | Formula | Range | Behavior | When to Use | Limitations | Notes |
|--------|---------|--------|----------|-------------|-------------|-------|
| **Cosine Similarity** | `cos(θ) = (x·y)/(‖x‖ ‖y‖)` | [-1, 1] | Focuses on angle only | High-dim sparse data, text analysis, when magnitude irrelevant | Ignores magnitude completely, undefined for zero vectors | Convert to distance: `1 - cosine_sim` |
| **L2 Distance (Euclidean)** | `√(∑(xᵢ - yᵢ)²)` | [0, ∞) | Focuses on **magnitude + direction** | Low-dim dense data, spatial problems, similar-scale features | Scale sensitive, curse of dimensionality, requires normalization | Convert to similarity: `1/(1 + distance)` |
| **Manhattan Distance (L1)** | `∑\|xᵢ - yᵢ\|` | [0, ∞) | Less sensitive to outliers | Robust to outliers, grid-like spaces, mixed data types | Still scale sensitive, less intuitive geometrically | More robust than L2 |
| **Jaccard Similarity** | `\|A ∩ B\| / \|A ∪ B\|` | [0, 1] | Set overlap proportion | Binary/categorical data, recommendation systems | Only for binary/set data, ignores frequency | Distance: `1 - jaccard` |
| **Hamming Distance** | `∑(xᵢ ≠ yᵢ)` | [0, n] | Count of differing positions | Binary strings, categorical data, error detection | Fixed-length vectors only, treats all differences equally | Normalized version: divide by n |
| **Pearson Correlation** | `cov(x,y)/(σₓσᵧ)` | [-1, 1] | Linear relationship strength | Time series, when linear correlation matters | Only captures linear relationships, sensitive to outliers | Distance: `1 - \|correlation\|` |

## Key Decision Framework

### By Data Type:
- **Continuous, similar scales** → Euclidean Distance
- **Continuous, different scales** → Cosine Similarity (after normalization)
- **High-dimensional sparse** → Cosine Similarity
- **Binary/categorical** → Jaccard or Hamming
- **Time series** → Pearson Correlation

### By Problem Context:
- **Magnitude matters** → Euclidean/Manhattan
- **Direction/pattern matters** → Cosine
- **Outlier robustness needed** → Manhattan
- **Set similarity** → Jaccard

### Preprocessing Requirements:
- **L1, L2**: Standardize features with different scales
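- **Cosine**: Invariant to a vector's overall magnitude, but not to per-feature scale differences — standardize features first when they are on different scales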
- **Jaccard, Hamming**: No preprocessing needed

FAISS (Facebook AI Similarity Search) is a library for efficient similarity search over dense vectors.

In [18]:
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

In [19]:
index=faiss.IndexFlatL2(384) # the index is where the vectors are stored; 384 is the embedding size (number of dimensions per vector)
# An index must always be created before building the vector store. Here we use a Euclidean (L2) distance index.
# What an index does:
# Stores vectors: your embeddings live inside the index
# Enables search: provides nearest-neighbour lookup over the stored vectors
# Optimizes retrieval: some index types search faster than a brute-force scan (the Flat index used here does not - it is brute force)

# L2 index
# Flat: brute-force exact search (no approximation)
# L2: uses Euclidean distance
# 384: must match your embedding size exactly


# Other index types:

# IndexFlatIP: inner product (equals cosine similarity only when the vectors are L2-normalized)
# IndexIVFFlat: faster approximate search
# IndexHNSWFlat: graph-based fast approximate search

# The index type determines both accuracy and speed of your similarity search.


In [20]:
index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001F241649B00> >

In [21]:
# Build an empty LangChain FAISS vector store on top of the faiss index created earlier.
vector_store = FAISS(
    index=index,                    # faiss index that will hold the vectors
    embedding_function=embeddings,  # Hugging Face embedding model used to vectorize texts
    docstore=InMemoryDocstore(),    # keeps the stored documents in memory
    index_to_docstore_id={},        # maps index position -> docstore id; starts empty
)


In [22]:
vector_store.add_texts(["AI is future","AI is powerful","Dogs are cute"]) # they are automatically vectorized and added to the table

['d2df5d9f-062f-4473-969b-ca4d7c20947c',
 '52f88ace-7148-4b7d-8f40-703ae2c2bebc',
 '2596047a-396c-44e2-bf62-8f26ba85e26b']

In [23]:
vector_store.index_to_docstore_id

{0: 'd2df5d9f-062f-4473-969b-ca4d7c20947c',
 1: '52f88ace-7148-4b7d-8f40-703ae2c2bebc',
 2: '2596047a-396c-44e2-bf62-8f26ba85e26b'}

In [28]:
results = vector_store.similarity_search("Tell me about AI", k=2) # returns the k=2 stored texts closest to the query under the index's L2 distance

In [29]:
results


[Document(id='52f88ace-7148-4b7d-8f40-703ae2c2bebc', metadata={}, page_content='AI is powerful'),
 Document(id='d2df5d9f-062f-4473-969b-ca4d7c20947c', metadata={}, page_content='AI is future')]

| Feature               | `Flat`                | `IVF` (Inverted File Index)        | `HNSW` (Graph-based Index)          |
| --------------------- | --------------------- | ---------------------------------- | ----------------------------------- |
| Type of Search     | Exact                 | Approximate (cluster-based)        | Approximate (graph-based traversal) |
| Speed               | Slow (linear scan)    | Fast (search only in top clusters) | Very Fast (graph walk)              |


| Dataset Size              | Recommended Index                 |
| ------------------------- | --------------------------------- |
| Up to 100K (1 lakh)       | `IndexFlatL2` or `IndexFlatIP`    |
| Up to 1M                  | `IndexIVFFlat` or `IndexHNSWFlat` |
| > 1M                      | `IndexIVFPQ` or `IndexHNSWFlat`   |


In [30]:
# from uuid import uuid4
from langchain_core.documents import Document

document_1 = Document(
    page_content="I had chocolate chip pancakes and scrambled eggs for breakfast this morning.",
    metadata={"source": "tweet"},
)

document_2 = Document(
    page_content="The weather forecast for tomorrow is cloudy and overcast, with a high of 62 degrees.",
    metadata={"source": "news"},
)

document_3 = Document(
    page_content="Building an exciting new project with LangChain - come check it out!",
    metadata={"source": "tweet"},
)

document_4 = Document(
    page_content="Robbers broke into the city bank and stole $1 million in cash.",
    metadata={"source": "news"},
)

document_5 = Document(
    page_content="Wow! That was an amazing movie. I can't wait to see it again.",
    metadata={"source": "tweet"},
)

document_6 = Document(
    page_content="Is the new iPhone worth the price? Read this review to find out.",
    metadata={"source": "website"},
)

document_7 = Document(
    page_content="The top 10 soccer players in the world right now.",
    metadata={"source": "website"},
)

document_8 = Document(
    page_content="LangGraph is the best framework for building stateful, agentic applications!",
    metadata={"source": "tweet"},
)

document_9 = Document(
    page_content="The stock market is down 500 points today due to fears of a recession.",
    metadata={"source": "news"},
)

document_10 = Document(
    page_content="I have a bad feeling I am going to get deleted :(",
    metadata={"source": "tweet"},
)

documents = [
    document_1,
    document_2,
    document_3,
    document_4,
    document_5,
    document_6,
    document_7,
    document_8,
    document_9,
    document_10,
]

In [35]:
from langchain_community.vectorstores.utils import DistanceStrategy

# IndexFlatIP ranks by inner product. That equals cosine similarity only for
# unit-length vectors; the cosine/Euclidean outputs earlier in this notebook are
# consistent with this embedding model returning unit-length vectors.
index=faiss.IndexFlatIP(384)
vector_store=FAISS( # this object embeds texts automatically and adds the vectors to the index
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # The wrapper defaults to a Euclidean distance strategy. Declare the metric the
    # index really uses so relevance scores and score_threshold filtering treat
    # "higher is better" correctly. Plain similarity_search ordering is unaffected.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [36]:
vector_store.add_documents(documents=documents) # stored all 10 documents in database. if you run it twice, you will get same texts with two indexes (20 rows)

['2594229c-fdd6-40b8-8bfc-d6d5d3d17282',
 '5d090ccc-f21f-4666-bf6a-eb1849ece93b',
 '4d444fcc-5fa8-4110-af27-8e4e6702890f',
 '6c6c624d-05a5-40df-bbc9-5ddbc39b6bad',
 '1d4f3fc6-2e65-4c63-9208-c4454fa0246d',
 '097e6ad3-85a6-46af-9e7e-0af577804560',
 '3b05a64e-0876-453b-af0f-4e2524179b6b',
 '48f57a32-490c-445d-99fa-7fa5b2b863a3',
 'd7e504db-d52f-4c22-8077-84ffb60ed3c7',
 '7dbbf273-864c-4d64-b2dd-8931d37718c9']

In [37]:
vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    k=2 #hyperparameter -> number of documents I want to retrieve
    
)

[Document(id='4d444fcc-5fa8-4110-af27-8e4e6702890f', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='48f57a32-490c-445d-99fa-7fa5b2b863a3', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!')]

In [38]:
vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    # k is not passed, so the default number of results is returned (4 documents in the output below)
    filter={"source":{"$eq": "tweet"}} # operator-style metadata filter: only documents whose source equals "tweet" are returned
    
)

[Document(id='4d444fcc-5fa8-4110-af27-8e4e6702890f', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='48f57a32-490c-445d-99fa-7fa5b2b863a3', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='7dbbf273-864c-4d64-b2dd-8931d37718c9', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :('),
 Document(id='2594229c-fdd6-40b8-8bfc-d6d5d3d17282', metadata={'source': 'tweet'}, page_content='I had chocolate chip pancakes and scrambled eggs for breakfast this morning.')]

In [39]:
result=vector_store.similarity_search(
    "LangChain provides abstractions to make working with LLMs easy",
    # k is not passed, so the default number of results applies
    filter={"source":"news"} # plain-value metadata filter: only documents whose source is "news"
    
)

In [40]:
result[0].metadata

{'source': 'news'}

In [41]:
result[0].page_content

'Robbers broke into the city bank and stole $1 million in cash.'

In [42]:
retriever=vector_store.as_retriever(search_kwargs={"k": 3}) # you can transform vector store as a retriever to use it inside RAG pipeline

In [43]:
retriever.invoke("LangChain provides abstractions to make working with LLMs easy")

[Document(id='4d444fcc-5fa8-4110-af27-8e4e6702890f', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='48f57a32-490c-445d-99fa-7fa5b2b863a3', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='7dbbf273-864c-4d64-b2dd-8931d37718c9', metadata={'source': 'tweet'}, page_content='I have a bad feeling I am going to get deleted :(')]

In [None]:
# inmemory(server)
# ondisk(server)
# cloud(yet to discuss)

In [44]:
vector_store.save_local("today's class faiss index")

In [45]:
# SECURITY: allow_dangerous_deserialization=True opts in to loading pickled data,
# and unpickling can execute arbitrary code. Only load index folders you created
# yourself (this one was written by save_local in the previous cell).
# The same embeddings object is passed so that queries are embedded the same way
# as the stored documents.
new_vector_store=FAISS.load_local(
  "today's class faiss index",embeddings ,allow_dangerous_deserialization=True
)

In [46]:
new_vector_store.similarity_search("langchain")

[Document(id='4d444fcc-5fa8-4110-af27-8e4e6702890f', metadata={'source': 'tweet'}, page_content='Building an exciting new project with LangChain - come check it out!'),
 Document(id='48f57a32-490c-445d-99fa-7fa5b2b863a3', metadata={'source': 'tweet'}, page_content='LangGraph is the best framework for building stateful, agentic applications!'),
 Document(id='1d4f3fc6-2e65-4c63-9208-c4454fa0246d', metadata={'source': 'tweet'}, page_content="Wow! That was an amazing movie. I can't wait to see it again."),
 Document(id='3b05a64e-0876-453b-af0f-4e2524179b6b', metadata={'source': 'website'}, page_content='The top 10 soccer players in the world right now.')]

In [47]:
from langchain_community.document_loaders import PyPDFLoader

In [48]:
import os

# Location of the Llama 2 paper PDF.
# Set the LLAMA2_PDF_PATH environment variable (for example in the .env file) to
# run this notebook on another machine; when it is not set, the original author's
# local path is used, so existing behaviour is unchanged.
FILE_PATH=os.environ.get(
    "LLAMA2_PDF_PATH",
    r"C:\Users\Artur Dragunov\Documents\GIT\agentic-ai-bootcamp\2-Langchain Basics\llama2-bf0a30209b224e26e31087559688ce81.pdf",
)

In [49]:
loader=PyPDFLoader(FILE_PATH)

In [50]:
len(loader.load())

77

In [51]:
pages=loader.load() # it loads all pages

In [52]:
pages = []
# alazy_load() is consumed through an async iterator: pages arrive one at a time
# and are appended in order - this is asynchronous iteration, not parallel loading.
# NOTE(review): this rebuilds `pages` from scratch, replacing the list already
# produced by loader.load() in the previous cell, so the PDF is read again.
async for page in loader.alazy_load():
    pages.append(page)

In [53]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [54]:
# Splitter that cuts the loaded pages into overlapping chunks before embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,#hyperparameter: target maximum size of each chunk
    chunk_overlap=50 #hyperparameter: how much consecutive chunks share, so context is not cut off at chunk boundaries
)

In [55]:
split_docs = splitter.split_documents(pages)

In [56]:
len(split_docs)

615

In [57]:
from langchain_community.vectorstores.utils import DistanceStrategy

# Fresh inner-product index and store for the PDF chunks (384 = embedding size).
# Inner product equals cosine similarity only for unit-length embeddings.
index=faiss.IndexFlatIP(384)
vector_store=FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
    # The wrapper defaults to a Euclidean distance strategy. Declare the metric the
    # index really uses so relevance scores and score_threshold filtering treat
    # "higher is better" correctly. Plain similarity_search ordering is unaffected.
    distance_strategy=DistanceStrategy.MAX_INNER_PRODUCT,
)

In [58]:
vector_store.add_documents(documents=split_docs) # added 615 chunks with cosine similarity index

['405e9643-d257-4954-aeb2-678a444452eb',
 '1a45eccd-8a21-4c4e-b2ab-1596c2f76eb4',
 '07bfd6c2-1b59-4af0-8dd7-4cce71dc5558',
 '22bef17f-0007-4edc-8515-e304b1a7de39',
 'aa750be9-f4f7-4321-bd70-0b876f3fd391',
 'bc67d71b-1cf6-4804-ac43-22866682d8d7',
 '2b3d1980-2a47-4cf4-9525-78c375fc511d',
 '6d918377-b5f3-47f7-b2eb-4fb8c5203643',
 'de9862dc-ec6d-42e8-b932-bc8311e9ddd7',
 '71f266f0-641a-424a-9865-936c9494188e',
 '9fce274e-ce17-4eaf-8788-d433d1675434',
 '00415341-fd9b-444b-83e5-1c564658e4fe',
 'd589712e-5837-4f84-a215-53bb4e847646',
 '6d3c5f23-9d8d-473b-ae0c-0d6447b24646',
 '90684fd3-a2f9-4ce9-858b-6c21650ee510',
 '247e01af-748e-4005-8361-9e3672897343',
 'e3b88972-78f8-4e37-bd34-a776a2836cc2',
 'fab1b8ed-4a01-476a-aeaa-a36c6c81cb36',
 'a88b8cf9-ed12-404c-b83b-e99d9f7f669f',
 '4f87430e-6836-47e6-82a7-64c3dd0ea9d4',
 '0999af95-e28a-4969-bc3a-0bdd4349670b',
 '3a5732e6-cd1d-48bc-9224-85624d777ed8',
 '58ca8dec-f035-4bbd-a256-2c86822a3ab8',
 '4efdd902-60b2-4d8b-a870-7a1733e6afee',
 '9a0b94ed-4a72-

In [59]:
retriever=vector_store.as_retriever(
    search_kwargs={"k": 10} #hyperparameter
)

In [60]:
retriever.invoke("what is llama model?")

[Document(id='9a0b94ed-4a72-478d-8e8b-7a2547366363', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with hyperref', 'creationdate': '2023-07-20T00:30:36+00:00', 'author': '', 'keywords': '', 'moddate': '2023-07-20T00:30:36+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.141592653-2.6-1.40.25 (TeX Live 2023) kpathsea version 6.3.5', 'subject': '', 'title': '', 'trapped': '/False', 'source': 'C:\\Users\\Artur Dragunov\\Documents\\GIT\\agentic-ai-bootcamp\\2-Langchain Basics\\llama2-bf0a30209b224e26e31087559688ce81.pdf', 'total_pages': 77, 'page': 3, 'page_label': '4'}, page_content='work (Section 6), and conclusions (Section 7).\n‡https://ai.meta.com/resources/models-and-libraries/llama/\n§We are delaying the release of the 34B model due to a lack of time to sufficiently red team.\n¶https://ai.meta.com/llama\n‖https://github.com/facebookresearch/llama\n4'),
 Document(id='7d6c6445-daab-40ea-968f-d41e5ad0c7d6', metadata={'producer': 'pdfTeX-1.40.25', 'creator': 'LaTeX with

In [62]:
from langchain_openai import ChatOpenAI

model = ChatOpenAI(model = "gpt-4.1-mini-2025-04-14")

In [None]:
from langchain import hub
prompt = hub.pull("rlm/rag-prompt") # RAG prompt template

In [66]:
import pprint
pprint.pprint(prompt.messages)

[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]


[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})]

In [67]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
# RunnablePassthrough() passes input data through unchanged
# it's essentially a "do nothing" operation that forwards whatever it receives.

In [None]:
# context(retriever),prompt(hub),model(openai),parser(langchain)

In [68]:
def format_docs(docs):
    """Merge retrieved documents into one context string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` of every document, in input order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)
    

In [69]:
# RAG pipeline, assembled step by step:
#   1. fan the incoming question out into the two prompt variables:
#      "context"  - retriever results flattened to text by format_docs
#      "question" - the input itself, forwarded unchanged by RunnablePassthrough
#   2. fill the RAG prompt template
#   3. call the chat model
#   4. parse the model's reply into a plain string
prompt_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
rag_chain = prompt_inputs | prompt | model | StrOutputParser()

In [70]:
rag_chain.invoke("what is llama model?")

'The Llama model is a series of large language models developed by Meta, designed for natural language generation tasks. Llama 2, the latest version, includes tuned assistant-like chat models and pretrained models for research and commercial use, trained with techniques like supervised fine-tuning and reinforcement learning from human feedback to improve helpfulness and safety. It supports models up to 70 billion parameters and competes with other open and closed-source models in performance.'