# Large Dataset Example: Building a RAG Knowledge Base

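This notebook demonstrates how to ingest a large dataset (**DBpedia-14**) into VectorDB to build the knowledge base for a Retrieval-Augmented Generation (RAG) system. It embeds the documents with `all-MiniLM-L6-v2`, upserts them into a collection in batches, and then runs a sample semantic search against the result.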

In [None]:
%pip install -e ../sdk
%pip install datasets sentence-transformers pandas

In [None]:
import uuid
import pandas as pd
from tqdm import tqdm
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from vectordb.client import Client

# Setup cell: connect to the VectorDB server, recreate the demo collection from
# scratch, and load the embedding model used by the rest of the notebook.
COLLECTION_NAME = "rag_test"
EMBEDDING_MODEL = "all-MiniLM-L6-v2"
# all-MiniLM-L6-v2 emits 384-dimensional embeddings; the value handed to
# create_collection has to agree with the model or inserts will not fit.
EMBEDDING_DIM = 384

# 1. Connect
client = Client("http://localhost:8000")

# 2. Reset Collection
# Deleting is best-effort: on a first run the collection does not exist yet.
# Catch Exception (not a bare except) so Ctrl-C / SystemExit still propagate,
# and report what was ignored instead of hiding it.
try:
    client.delete_collection(COLLECTION_NAME)
except Exception as exc:
    print(f"No existing '{COLLECTION_NAME}' collection removed ({exc!r}); continuing.")

collection = client.create_collection(COLLECTION_NAME, EMBEDDING_DIM, "cosine")

# 3. Load Model
model = SentenceTransformer(EMBEDDING_MODEL)

In [3]:
# Number of training rows to pull; only this leading slice is loaded.
ROWS = 50_000

# Build the split expression first, then request just that slice of DBpedia-14.
split_spec = f"train[:{ROWS}]"
dataset = load_dataset("dbpedia_14", split=split_spec)

# Report how many documents actually came back.
doc_count = len(dataset)  # type: ignore
print(f"Loaded {doc_count} documents.")

Loaded 50000 documents.


In [4]:
# Embed the dataset in chunks and upsert each chunk into the collection.
BATCH_SIZE = 500  # Documents sliced and embedded per loop iteration
UPSERT_BATCH_SIZE = 128  # Forwarded to collection.upsert as batch_size
SNIPPET_CHARS = 500  # Max characters of content kept in the "text" metadata

print("Starting upsert...")

# Loop through the dataset in chunks
for start in tqdm(range(0, len(dataset), BATCH_SIZE)):
    # 1. Slice the batch (columns are read by name below)
    batch = dataset[start : start + BATCH_SIZE]
    titles = batch['title']
    contents = batch['content']

    # 2. Prepare Metadata (what we want to retrieve later)
    # Title and a truncated content snippet are stored as separate fields.
    metadatas = [
        {"title": t, "text": c[:SNIPPET_CHARS]}
        for t, c in zip(titles, contents)
    ]

    # 3. Generate Embeddings (computed from the content only, not the title)
    vectors = model.encode(contents)

    # 4. Generate IDs
    # Derive each ID from the row's position in the dataset so it is stable:
    # re-running this cell overwrites the same records instead of inserting a
    # second copy of every document under fresh random IDs.
    ids = [
        str(uuid.uuid5(uuid.NAMESPACE_URL, f"dbpedia_14/train/{start + offset}"))
        for offset in range(len(vectors))
    ]

    # 5. Upload
    collection.upsert(
        ids=ids,
        vectors=vectors,
        metadatas=metadatas,
        batch_size=UPSERT_BATCH_SIZE
    )

print("Finished!")

Starting upsert...


100%|██████████| 100/100 [02:51<00:00,  1.72s/it]

Finished!





In [5]:
query = "What companies produce computer hardware?"

# Encode the question with the same model that embedded the documents.
query_vec = model.encode(query)

# Request k=5 results from the collection for that vector.
results = collection.search(query=query_vec, k=5)

# Flatten each result into one table row: its score plus the stored
# title/text metadata, then render the rows as a DataFrame.
rows = []
for hit in results:
    score = hit.score
    meta = hit.vector.metadata
    rows.append({"Score": score, "Title": meta["title"], "Snippet": meta["text"]})

df = pd.DataFrame(rows)
display(df)

Unnamed: 0,Score,Title,Snippet
0,0.5438,VIA Technologies,VIA Technologies Inc. (Chinese: 威盛電子; pinyin:...
1,0.5426,Beijing Yicheng Bioelectronics Technology Company,Beijing Yicheng Bioelectronics Technology Com...
2,0.5196,Computer Usage Company,Computer Usage Company (1955–1986) sometimes ...
3,0.5042,PC SYSTEMS,The PC SYSTEMS (Microprocessor Systems) is a ...
4,0.4823,CA Technologies,CA Technologies Inc. formerly CA Inc. and Com...
