### Clear memory

In [1]:
# Clear the interactive namespace; -f skips the confirmation prompt.
%reset -f
# Imported after the reset so that `gc` is available in the cleared namespace.
import gc
# Run a garbage-collection pass; its return value is shown as the cell output.
gc.collect()

0

### Import

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import pymupdf, json, requests, re, sys
from sentence_transformers import SentenceTransformer
from pathlib import Path
from tqdm.auto import tqdm
from typing import List, Tuple, Dict, Any, Optional
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Do not truncate rows either; displaying a large frame in full produces long output.
pd.set_option('display.max_rows', None)
# Use seaborn's 'whitegrid' style for plots.
sns.set_style('whitegrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=UserWarning)

In [4]:
# Add '..' (resolved relative to the current working directory) to the module
# search path so that the `src` package can be imported below.
sys.path.append('..')
# NOTE(review): wildcard import — the names it brings into the notebook are not
# visible from here; consider importing the required helpers explicitly.
from src.data_utils import *

# Data Loading
Load the chunks prepared in the first notebook.

In [5]:
# Read the chunk records from the processed JSON file into a DataFrame.
df = pd.read_json(
    path_or_buf='../data/processed/chunks.json',
    orient='records',
)

# Preview the first three rows.
df.head(3)

Unnamed: 0,chunk_id,text,page_num,char_count,start_char,end_char
0,0,User Guide AWS Toolkit for Microsoft Azure Dev...,1,134,0,134
1,1,AWS Toolkit for Microsoft Azure DevOps User Gu...,2,582,0,582
2,2,AWS Toolkit for Microsoft Azure DevOps User Gu...,3,983,0,983


# Embeddings Creation

In [6]:
# Load the 'all-MiniLM-L6-v2' SentenceTransformer model used to embed the chunks.
print("Loading embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')
# Report the embedding dimensionality exposed by the loaded model.
print(f"✅ Model loaded: {model.get_sentence_embedding_dimension()} dimensions")

Loading embedding model...
✅ Model loaded: 384 dimensions


In [7]:
# Create embeddings for all chunks
print(f"Creating embeddings for {len(df)} chunks...")

# Encode every chunk text in batches of 32, showing a progress bar while it runs.
embeddings = model.encode(
    df['text'].tolist(),
    show_progress_bar=True,
    batch_size=32
)

# Fail fast with a clear message if the encoder did not return exactly one
# embedding per chunk row, rather than relying on the comment below.
assert len(embeddings) == len(df), (
    f"Expected {len(df)} embeddings, got {len(embeddings)}"
)

print(f"✅ Created embeddings with shape: {embeddings.shape}")
# Expected output: (number_of_chunks, 384)

Creating embeddings for 300 chunks...


Batches:   0%|          | 0/10 [00:00<?, ?it/s]

✅ Created embeddings with shape: (300, 384)


In [8]:
# Inspect one embedding: print the first 10 components of the vector produced
# for the first chunk text.
print("Sample embedding (first 10 dimensions):")
print(embeddings[0][:10])
# Summary statistics below are taken over every value of every embedding
# (no axis is given), formatted to three decimal places.
print(f"\nEmbedding statistics:")
print(f"Min value: {embeddings.min():.3f}")
print(f"Max value: {embeddings.max():.3f}")
print(f"Mean value: {embeddings.mean():.3f}")

Sample embedding (first 10 dimensions):
[-0.01388389  0.03978214 -0.03466231  0.01785599  0.07080441 -0.0241555
 -0.02924745 -0.09164825  0.0424548   0.14665551]

Embedding statistics:
Min value: -0.203
Max value: 0.218
Mean value: -0.001


In [9]:
# Add embeddings to dataframe (optional, for inspection).
# list(embeddings) yields one vector per entry along the first axis, so each
# row of df receives its own embedding. This modifies df in place.
df['embedding'] = list(embeddings)

# Report the frame's dimensions after adding the column, then preview three rows.
print(f"DataFrame shape: {df.shape}")
df.head(3)

DataFrame shape: (300, 7)


Unnamed: 0,chunk_id,text,page_num,char_count,start_char,end_char,embedding
0,0,User Guide AWS Toolkit for Microsoft Azure Dev...,1,134,0,134,"[-0.013883891, 0.039782137, -0.03466231, 0.017..."
1,1,AWS Toolkit for Microsoft Azure DevOps User Gu...,2,582,0,582,"[-0.056738224, 0.02857147, -0.036580697, -0.01..."
2,2,AWS Toolkit for Microsoft Azure DevOps User Gu...,3,983,0,983,"[0.011543587, 0.018591559, -0.085423455, 0.008..."
