# Quick Start: Vector DB

This notebook shows a simple example of the vector DB, which can store objects with multiple fields marked as vector fields (`EmbedField`).

The example is based on "code files" which can be queried by their code or by their summary.

In [13]:
# Load environment variables from the .env file. override=True lets values from
# .env replace variables that are already set in the process environment.
from dotenv import load_dotenv
load_dotenv(override=True)

True

In [14]:
from backend.database.embedding import MockEmbedding, OpenAIEmbedding
from backend.database.vector_db import VectorDB, EmbedData, Field, EmbedField

# Inspect the embedder registry: a mapping from embedder name to embedder class.
EmbedData.embedder_registry

{'MockEmbedding': backend.database.embedding.MockEmbedding,
 'OpenAIEmbedding': backend.database.embedding.OpenAIEmbedding}

In [15]:
# Example schema: a "code file" record with two vector fields, `code` and `summary`.
class CodeFiles(EmbedData):
    # Plain (non-embedded) field.
    path: str
    # Vector field embedded with MockEmbedding.
    code: str = EmbedField(MockEmbedding)
    # Vector field embedded with OpenAIEmbedding.
    summary: str = EmbedField(OpenAIEmbedding)
    # Optional free-form metadata; defaults to an empty dict.
    meta: dict = {}

# Build one sample record and display the model's JSON schema; the vector fields
# carry the extra `embed` / `embedding_*` keys in that schema.
a = CodeFiles(path='abc/def', code='print("hello")', summary='print hello world', meta={'a': 1, 'b': 2})
CodeFiles.model_json_schema()

{'properties': {'path': {'title': 'Path', 'type': 'string'},
  'code': {'embed': True,
   'embedding_dim': 2,
   'embedding_distance': 'Cosine',
   'embedding_model': 'MockEmbedding',
   'title': 'Code',
   'type': 'string'},
  'summary': {'embed': True,
   'embedding_dim': 1536,
   'embedding_distance': 'Cosine',
   'embedding_model': 'OpenAIEmbedding',
   'title': 'Summary',
   'type': 'string'},
  'meta': {'default': {}, 'title': 'Meta', 'type': 'object'}},
 'required': ['path', 'code', 'summary'],
 'title': 'CodeFiles',
 'type': 'object'}

In [16]:
# Check the embedder registry again now that the CodeFiles subclass has been defined.
EmbedData.embedder_registry

{'MockEmbedding': backend.database.embedding.MockEmbedding,
 'OpenAIEmbedding': backend.database.embedding.OpenAIEmbedding}

In [17]:
# NOTE: this actually calls the embedding API, so make sure the API key is configured.
sample_inputs = [a, CodeFiles(path='', code='', summary='hi')]
CodeFiles.embed(sample_inputs)

[{'code': [1.0, 1.0],
  'summary': [-0.024677587673068047,
   0.004810095764696598,
   -0.02494877018034458,
   -0.023267438635230064,
   -0.010596447624266148,
   -0.003966040909290314,
   -0.033138472586870193,
   -0.010589667595922947,
   -0.01776243932545185,
   -0.02092171274125576,
   0.02100306749343872,
   0.01838615909218788,
   -0.026372475549578667,
   -0.0012279471848160028,
   0.005525338929146528,
   0.00995916873216629,
   0.010664243251085281,
   -0.026291120797395706,
   0.009735443629324436,
   0.018182771280407906,
   -0.012989630922675133,
   0.00453213369473815,
   0.01612178608775139,
   -0.015687894076108932,
   -0.02376912720501423,
   -0.008379532024264336,
   0.017111601307988167,
   -0.024365726858377457,
   0.0196878332644701,
   -0.02901650406420231,
   0.01088118925690651,
   -0.006884640082716942,
   -0.013254033401608467,
   -0.013016749173402786,
   -0.003196561010554433,
   -0.004749079700559378,
   -0.006142278667539358,
   -0.020162401720881462,
   0

In [18]:
# Create the vector DB handle for "code_test" with CodeFiles as its record type.
db = VectorDB("code_test", CodeFiles)

db.reset() # start from a clean state in case the database already contains data

In [19]:
# Inspect the underlying collection through the wrapped client; the output lists
# one named vector config per EmbedField (`code` and `summary`).
db.client.get_collection("code_test")

CollectionInfo(status=<CollectionStatus.GREEN: 'green'>, optimizer_status=<OptimizersStatusOneOf.OK: 'ok'>, vectors_count=0, indexed_vectors_count=0, points_count=0, segments_count=1, config=CollectionConfig(params=CollectionParams(vectors={'code': VectorParams(size=2, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None), 'summary': VectorParams(size=1536, distance=<Distance.COSINE: 'Cosine'>, hnsw_config=None, quantization_config=None, on_disk=None)}, shard_number=None, sharding_method=None, replication_factor=None, write_consistency_factor=None, read_fan_out_factor=None, on_disk_payload=None, sparse_vectors=None), hnsw_config=HnswConfig(m=16, ef_construct=100, full_scan_threshold=10000, max_indexing_threads=0, on_disk=None, payload_m=None), optimizer_config=OptimizersConfig(deleted_threshold=0.2, vacuum_min_vector_number=1000, default_segment_number=0, max_segment_size=None, memmap_threshold=None, indexing_threshold=20000, flush_interval_sec

In [20]:
# Two sample records: one with metadata, one with an empty metadata dict.
files = [
    CodeFiles(
        path='abc/def',
        code='print("hello")',
        summary='print hello world',
        meta={'a': 1, 'b': 2},
    ),
    CodeFiles(
        path='xyz/123',
        code='[]',
        summary='empty list',
        meta={},
    ),
]

# Add both records to the database.
db.add(files)

In [21]:
# Query by the `summary` vector field; the result is a list of (score, object)
# pairs, shown here with the higher score first.
db.query(summary='a data structure')

[(0.77734324650812,
  CodeFiles(path='xyz/123', code='[]', summary='empty list', meta={})),
 (0.7530047196982539,
  CodeFiles(path='abc/def', code='print("hello")', summary='print hello world', meta={'a': 1, 'b': 2}))]

In [22]:
# Walk over the stored records; with with_id=True each item is an (id, object) tuple.
# NOTE(review): batch=1 presumably sets how many records are fetched per request — confirm.
for p in db.iterate(batch=1, with_id=True):
    print(p)

('62434d11-1ff2-4f72-987d-473c318a2260', CodeFiles(path='xyz/123', code='[]', summary='empty list', meta={}))
('dad5bafc-5c23-4bf7-a59e-391692df4634', CodeFiles(path='abc/def', code='print("hello")', summary='print hello world', meta={'a': 1, 'b': 2}))


In [23]:
# Number of records currently stored in the database.
len(db)

2

In [24]:
# Iterating the database directly yields the stored objects (without ids).
[record for record in db]

[CodeFiles(path='xyz/123', code='[]', summary='empty list', meta={}),
 CodeFiles(path='abc/def', code='print("hello")', summary='print hello world', meta={'a': 1, 'b': 2})]