### Clear memory

In [1]:
# Start from a clean slate: `%reset -f` clears the interactive namespace
# without prompting, which is why `gc` is imported only afterwards.
%reset -f
import gc
# Run a full garbage-collection pass; the integer it returns (number of
# unreachable objects found) is displayed as this cell's output.
gc.collect()

0

### Import

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import pymupdf, json, requests, re, sys
from sentence_transformers import SentenceTransformer
from pathlib import Path
from tqdm.auto import tqdm
import langchain
from langchain_huggingface import HuggingFaceEmbeddings
from typing import List, Tuple, Dict, Any, Optional
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sns.set_style('whitegrid')
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [4]:
# Make the parent of the current working directory importable so the local
# `src` package can be resolved. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate '..' entry each time.
if '..' not in sys.path:
    sys.path.append('..')

# NOTE(review): the wildcard import hides which helpers this notebook actually
# relies on; switch to explicit names once the required ones are confirmed.
from src.data_utils import *

# Data Loading
Load the chunks prepared in the first notebook.

In [5]:
# Read the chunk table (a JSON list of records) produced by the first notebook.
chunks_path = '../data/processed/chunks.json'
chunks = pd.read_json(chunks_path, orient='records')

# Show the first three rows as a quick check of the load.
chunks.head(n=3)

Unnamed: 0,chunk_id,text,page_num,char_count,start_char,end_char
0,0,User Guide AWS Toolkit for Microsoft Azure Dev...,1,134,0,134
1,1,AWS Toolkit for Microsoft Azure DevOps User Gu...,2,422,0,422
2,2,"s likely to cause confusion among customers, o...",2,260,322,822


In [6]:
# Report how many rows (chunks) were loaded.
print('Number of chunks:', len(chunks))

Number of chunks: 569


# Embeddings Creation
Embedding creation works differently with LangChain. Instead of manually creating an embedding for each chunk, we define a wrapper object that LangChain calls whenever embeddings are needed. Here we use the same sentence-transformer model as before, `all-MiniLM-L6-v2`, loaded through the `langchain_huggingface` package.

In [None]:
# Settings for the LangChain embedding wrapper, collected in one place:
# which sentence-transformers checkpoint to load, plus the keyword dicts that
# are passed through as `model_kwargs` and `encode_kwargs`.
embedding_config = {
    'model_name': 'sentence-transformers/all-MiniLM-L6-v2',
    'model_kwargs': {'device': 'cpu'},
    'encode_kwargs': {'normalize_embeddings': True},
}

# The wrapper object LangChain will use whenever it needs embeddings.
embedding_function = HuggingFaceEmbeddings(**embedding_config)

✅ LangChain embedding function created


In [8]:
# Smoke-test the wrapper on a single query string.
test_text = "AWS provides cloud computing services"
test_embedding = embedding_function.embed_query(test_text)

# Report the vector length and a short preview of its values.
print(f'📊 Embedding dimension: {len(test_embedding)}')
print(f'📊 First 5 values: {test_embedding[:5]}')

📊 Embedding dimension: 384
📊 First 5 values: [-0.04203960299491882, 0.04495583474636078, 0.016322309151291847, -0.03731227666139603, 0.03714952990412712]


### Create Embeddings for All Chunks (Optional Visualization)

**Note:** We won't actually create and store embeddings manually here. LangChain will do it automatically in the next section when we create the vector store.

But let's verify that our embedding function works on a few sample chunks:

In [9]:
# Sanity-check the wrapper on the first three chunks before it is handed to
# the vector store.
sample_texts = chunks['text'].head(3).tolist()

# Derive the count from the data so the message stays correct even if fewer
# than three chunks are available.
print(f'Testing embedding function on {len(sample_texts)} sample chunks...\n')

# embed_documents is the batch entry point LangChain vector stores call for
# chunk texts, so exercise that path in one call instead of looping over
# embed_query (which is meant for search queries).
sample_embeddings = embedding_function.embed_documents(sample_texts)
assert len(sample_embeddings) == len(sample_texts), 'expected one embedding per chunk'

for i, (text, embedding) in enumerate(zip(sample_texts, sample_embeddings), 1):
    print(f'Chunk {i}: {len(text)} chars → {len(embedding)}-dim embedding ✓')

print('\n✅ Embedding function works correctly!')
print(f'📦 Ready to embed all {len(chunks)} chunks automatically in next section')

Testing embedding function on 3 sample chunks...

Chunk 1: 134 chars → 384-dim embedding ✓
Chunk 2: 421 chars → 384-dim embedding ✓
Chunk 3: 260 chars → 384-dim embedding ✓

✅ Embedding function works correctly!
📦 Ready to embed all 569 chunks automatically in next section


---

## Key Difference from Notebook 2

**Notebook 2:**
```python
# We did this:
embeddings = model.encode(chunks['text'].tolist())  # Created all embeddings
chunks['embedding'] = list(embeddings)  # Stored them
```

**Notebook 3:**
```python
# We just create the function:
embedding_function = HuggingFaceEmbeddings(...)

# LangChain will call embedding_function.embed_documents(texts)
# automatically on the chunks when we create the vector store
# (embed_query is used for the search query at retrieval time)!
```

**Why this way?** 
- Less code to write
- LangChain handles batching and optimization
- Easier to swap embedding models later

**Next:** We'll create the vector store, and LangChain will automatically embed all chunks!