# Notebook 02 Embedding and Qdrant Storage

# 1 Setup & Imports

This notebook connects processed data (chunks.jsonl)
with an open-source embeddings model and Qdrant vector DB.

In [1]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm  # Progress bar for embedding generation
from typing import List, Dict

# SentenceTransformers provides pre-trained embedding models.
from sentence_transformers import SentenceTransformer

# Qdrant client library — connects to your local or remote Qdrant instance.
from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

# Paths 
DATA_DIR = "./data"
CHUNKS_FILE = os.path.join(DATA_DIR, "chunks.jsonl")


# Sanity check 
if not os.path.exists(CHUNKS_FILE):
    raise FileNotFoundError(f"Missing {CHUNKS_FILE}. Run Notebook 01 first!")

print(f"Using chunks file: {CHUNKS_FILE}")

Using chunks file: ./data/chunks.jsonl


# 2 Load Chunks from JSONL
We'll load all chunk records from the saved file.
Each chunk has: source, chunk_id, text, and type.

In [2]:
def load_chunks(file_path: str) -> List[Dict]:
    """Load a JSONL file into a Python list of dictionaries."""
    chunks = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            chunks.append(json.loads(line))
    return chunks

# --- Load and preview ---
all_chunks = load_chunks(CHUNKS_FILE)
print(f"Loaded {len(all_chunks)} text chunks.")
print("Example chunk:")
print(json.dumps(all_chunks[0], indent=2)[:100])

Loaded 1073 text chunks.
Example chunk:
{
  "id": "test_chunk0",
  "source": "./data/test.pdf",
  "chunk": "The Journey A Digital and Societ


# 3 Generate Embeddings

We'll use an open-source model from Hugging Face
(SentenceTransformers) to create vector embeddings.
These embeddings represent semantic meaning numerically.

In [3]:
##--- Generate BGE-M3 Embeddings ---
##--- Modified from using all-MiniLM-L6-v2 to BGE-M3 ---

from FlagEmbedding import BGEM3FlagModel
from tqdm import tqdm
import numpy as np

model_name = "BAAI/bge-m3"
print(f"Loading embedding model: {model_name}")

embedder = BGEM3FlagModel(model_name, use_fp16=False)

for chunk in tqdm(all_chunks, desc="Encoding text chunks with BGE-M3"):
    outputs = embedder.encode(
        chunk["chunk"],
        max_length=8192,
        return_dense=True,
        return_sparse=False,
        return_colbert_vecs=False
    )

    # FIX: dense_vecs IS ALREADY the full embedding vector
    embedding = outputs["dense_vecs"]  # shape (1024,)

    # Normalize (recommended for cosine search)
    norm = np.linalg.norm(embedding)
    if norm > 0:
        embedding = embedding / norm

    chunk["embedding"] = embedding.tolist()

print("Finished generating corrected BGE-M3 embeddings.")

Loading embedding model: BAAI/bge-m3


Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Encoding text chunks with BGE-M3:   0%|          | 0/1073 [00:00<?, ?it/s]You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Encoding text chunks with BGE-M3: 100%|██████████| 1073/1073 [07:13<00:00,  2.47it/s]

Finished generating corrected BGE-M3 embeddings.





# 4 Connect or Initialize Qdrant

This cell connects to a local Qdrant instance.
You can run Qdrant locally using Docker:
  docker run -p 6333:6333 qdrant/qdrant



In [4]:
# Create a Qdrant client connected to your local instance
client = QdrantClient("http://localhost:6333")

# Define collection name for our embeddings and metadata
COLLECTION_NAME = "enterprise_docs"

# BGE-M3 produces 1024-dimensional embeddings
VECTOR_DIM = 1024

# Recreate the collection for BGE-M3
client.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=rest.VectorParams(
        size=VECTOR_DIM,            # Correct dim for BGE-M3
        distance=rest.Distance.COSINE
    )
)

print(f"Collection '{COLLECTION_NAME}' recreated with {VECTOR_DIM}-dim vectors for BGE-M3.")

  client.recreate_collection(


Collection 'enterprise_docs' recreated with 1024-dim vectors for BGE-M3.


# 5 Upload Embeddings and Metadata

We'll push each chunk’s vector and metadata to Qdrant.
This enables semantic search later on.

In [5]:
# Prepare points for Qdrant (each point = one vector + metadata)
points = []
for idx, chunk in enumerate(all_chunks):
    points.append(
        rest.PointStruct(
            id=idx,  # unique ID for this chunk
            vector=chunk["embedding"],  # the embedding vector
            payload={  # additional metadata
                "source": chunk["source"],
                "chunk_id": chunk["id"],
                "text": chunk["chunk"],
                "type": chunk.get("type", "unknown")
            }
        )
    )

# --- Upload to Qdrant ---
client.upsert(collection_name=COLLECTION_NAME, points=points)

print(f"Uploaded {len(points)} chunks to collection '{COLLECTION_NAME}'.")

Uploaded 1073 chunks to collection 'enterprise_docs'.


# 6 Test Semantic Search

Let's try asking a question, embed it, and search similar content
from our stored chunks in Qdrant.

In [None]:
# --- Example user query ---
query = "What does 6G offer?"

# Generate embedding for the query using BGE-M3
q_out = embedder.encode(
    query,
    max_length=8192,
    return_dense=True,
    return_sparse=False,
    return_colbert_vecs=False
)

# Extract dense embedding
query_vector = q_out["dense_vecs"]

# Normalize (recommended for cosine searches)
import numpy as np
norm = np.linalg.norm(query_vector)
if norm > 0:
    query_vector = query_vector / norm

query_vector = query_vector.tolist()

# --- Perform vector search ---
search_results = client.query_points(
    collection_name=COLLECTION_NAME,
    query=query_vector,
    limit=5
)

# --- Display results ---
print(f"\nQuery: {query}\n")
for i, hit in enumerate(search_results.points):
    print(f"Result {i+1} — Score: {hit.score:.4f}")
    print(f"Source: {hit.payload.get('source')}")
    print(f"Text snippet: {hit.payload.get('text')[:300]}...")
    print("-" * 80)


Query: What does 6G offer?

Result 1 — Score: 0.6629
Source: ./data/test.pdf
Text snippet: . The 6 G vision is to create a seamless reality where the physical and digital worlds, so far separated, are converged. This will enable seamless movement in a cyberphysical continuum of a connected physical world of senses, actions, and experiences, and its programmable digital representation. Wit...
--------------------------------------------------------------------------------
Result 2 — Score: 0.6524
Source: ./data/test.pdf
Text snippet: . II and overviewing the use cases (UC) that are expected to drive a digital and societal revolution in Sec. III. This is followed by introducing the paradigm shifts that formulate an evolved network architecture in Sec. IV. In Sec. V, we highlight the main 6 G technologies needed to realize the vis...
--------------------------------------------------------------------------------
Result 3 — Score: 0.6511
Source: ./data/test.pdf
Text snippet: . In additio

: 