# LanceDB vector database

In [1]:
import lancedb

vector_db = lancedb.connect(uri = "vector_database")
vector_db

LanceDBConnection(uri='c:\\Users\\alexa\\Documents\\ML_and_AI\\09_lancedb_vector_database\\vector_database')

In [2]:
vector_db.uri

'c:\\Users\\alexa\\Documents\\ML_and_AI\\09_lancedb_vector_database\\vector_database'

## Create a table

In [33]:
import json

with open("animals_text_embeddings.json") as file:
    data = json.loads(file.read())

data

[{'text': 'A small brown dog running.', 'vector': [0.12, 0.85, 0.33]},
 {'text': 'A cat resting quietly on a sofa.', 'vector': [0.4, 0.91, 0.1]},
 {'text': 'A large gray elephant drinking water.',
  'vector': [0.88, 0.22, 0.55]},
 {'text': 'A fast cheetah sprinting across the savannah.',
  'vector': [0.95, 0.12, 0.72]},
 {'text': 'A colorful parrot perched on a branch.',
  'vector': [0.25, 0.66, 0.81]},
 {'text': 'A frog sitting on a lily pad.', 'vector': [0.14, 0.44, 0.27]}]

In [5]:
vector_db.create_table("animals", exist_ok=True, data=data)

LanceTable(name='animals', version=1, _conn=LanceDBConnection(uri='c:\\Users\\alexa\\Documents\\ML_and_AI\\09_lancedb_vector_database\\vector_database'))

In [6]:
vector_db.table_names()

['animals']

In [7]:
vector_db["animals"]

LanceTable(name='animals', version=1, _conn=LanceDBConnection(uri='c:\\Users\\alexa\\Documents\\ML_and_AI\\09_lancedb_vector_database\\vector_database'))

In [8]:
vector_db["animals"].head()

pyarrow.Table
text: string
vector: fixed_size_list<item: float>[3]
  child 0, item: float
----
text: [["A small brown dog running.","A cat resting quietly on a sofa.","A large gray elephant drinking water.","A fast cheetah sprinting across the savannah.","A colorful parrot perched on a branch."]]
vector: [[[0.12,0.85,0.33],[0.4,0.91,0.1],[0.88,0.22,0.55],[0.95,0.12,0.72],[0.25,0.66,0.81]]]

In [9]:
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"


In [10]:
more_data = [
    {"text": "A panda eating bamboo peacefully.", "vector": [0.51, 0.37, 0.82]},
    {"text": "A lion roaring loudly on a rock.", "vector": [0.93, 0.18, 0.41]},
]

vector_db["animals"].add(more_data)

AddResult(version=2)

In [11]:
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"


## Create an empty table and drop it

In [14]:
from lancedb.pydantic import LanceModel

class JokeSchema(LanceModel):
    joke: str
    rating: int


vector_db.create_table(name ="jokes_throw_away_table", schema= JokeSchema)

LanceTable(name='jokes_throw_away_table', version=1, _conn=LanceDBConnection(uri='c:\\Users\\alexa\\Documents\\ML_and_AI\\09_lancedb_vector_database\\vector_database'))

In [15]:
# Drop throw_away_table
vector_db.table_names()

['animals', 'jokes', 'jokes_throw_away_table']

In [16]:
vector_db.drop_table("jokes_throw_away_table")

In [17]:
vector_db.table_names()

['animals', 'jokes']

## vector search in lancedb
search with ANN - Approx Nearest Neighbor


- search with vector directly
- search with natural text and it automatically calculates its embedding

In [18]:
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"


In [20]:
query_vector = [.8,.2,.6]

vector_db["animals"].search(query_vector).limit(5).to_pandas()


Unnamed: 0,text,vector,_distance
0,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]",0.0093
1,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]",0.0433
2,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]",0.0534
3,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]",0.1614
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]",0.5582


In [24]:
try:
    vector_db["animals"].search("pandas").limit(3).to_pandas()
except RuntimeError as err:
    print(err)

lance error: Invalid user input: Cannot perform full text search unless an INVERTED index has been created on at least one column, C:\Users\runneradmin\.cargo\registry\src\index.crates.io-1949cf8c6b5b557f\lance-index-0.39.0\src\scalar\inverted\query.rs:703:25


## Embeddings model

In [None]:
from lancedb.embeddings import get_registry
import numpy as np

embedding_model = get_registry().get("gemini-text").create(name = "gemini-embedding-001")
test_embedding = np.array(embedding_model.compute_query_embeddings("hej p√• dig"))

test_embedding.shape # based on this we know that the model embeds vectors with dim 3072

(1, 3072)

In [28]:
vector_db.drop_table("jokes")

In [29]:
from lancedb.pydantic import LanceModel, Vector


class JokeModel(LanceModel):
    joke: str = embedding_model.SourceField() # joke becomes our source column
    vector: Vector(3072) = embedding_model.VectorField() # target vector column

vector_db.create_table("jokes", schema = JokeModel, exist_ok=True)
vector_db["jokes"]

LanceTable(name='jokes', version=1, _conn=LanceDBConnection(uri='c:\\Users\\alexa\\Documents\\ML_and_AI\\09_lancedb_vector_database\\vector_database'))

In [32]:
import pandas as pd
with open("jokes.json") as file:
    jokes_data = json.loads(file.read())

jokes_data

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 166: character maps to <undefined>