# LanceDB vector database

In [1]:
import lancedb

# Connect to the LanceDB database at the relative path "vector_database".
vector_db = lancedb.connect(uri="vector_database")

# Display the connection object.
vector_db

LanceDBConnection(uri='/Users/hdonnersten/Library/CloudStorage/OneDrive-sti.se/Github/ai_engineering_hampus_donnersten_de24/09_lancedb_vector_database/vector_database')

In [2]:
# Show the URI stored on the connection.
vector_db.uri

'/Users/hdonnersten/Library/CloudStorage/OneDrive-sti.se/Github/ai_engineering_hampus_donnersten_de24/09_lancedb_vector_database/vector_database'

# Create a table

In [3]:
import json

# Load the pre-computed text/vector records. json.load parses straight from
# the file handle, and the explicit encoding keeps the read independent of
# the platform's default locale encoding.
with open("animals_text_embeddings.json", encoding="utf-8") as file:
    data = json.load(file)

data

[{'text': 'A small brown dog running.', 'vector': [0.12, 0.85, 0.33]},
 {'text': 'A cat resting quietly on a sofa.', 'vector': [0.4, 0.91, 0.1]},
 {'text': 'A large gray elephant drinking water.',
  'vector': [0.88, 0.22, 0.55]},
 {'text': 'A fast cheetah sprinting across the savannah.',
  'vector': [0.95, 0.12, 0.72]},
 {'text': 'A colorful parrot perched on a branch.',
  'vector': [0.25, 0.66, 0.81]},
 {'text': 'A frog sitting on a lily pad.', 'vector': [0.14, 0.44, 0.27]}]

In [4]:
# mode="overwrite" rebuilds the table from `data` on every run. With
# exist_ok=True an already existing table is opened as-is and `data` is
# ignored, so the .add() call further down kept appending duplicate rows
# each time the notebook was re-run.
vector_db.create_table("animals", data=data, mode="overwrite")

LanceTable(name='animals', version=1, _conn=LanceDBConnection(uri='/Users/hdonnersten/Library/CloudStorage/OneDrive-sti.se/Github/ai_engineering_hampus_donnersten_de24/09_lancedb_vector_database/vector_database'))

In [5]:
# List the names of the tables in the database.
vector_db.table_names()

['animals']

In [None]:
# Indexing the connection by table name (operator-overloaded __getitem__)
# returns the table object.
vector_db["animals"]

LanceTable(name='animals', version=1, _conn=LanceDBConnection(uri='/Users/hdonnersten/Library/CloudStorage/OneDrive-sti.se/Github/ai_engineering_hampus_donnersten_de24/09_lancedb_vector_database/vector_database'))

In [7]:
# Preview the first rows of the table; the result is a pyarrow.Table.
vector_db["animals"].head()

pyarrow.Table
text: string
vector: fixed_size_list<item: float>[3]
  child 0, item: float
----
text: [["A small brown dog running.","A cat resting quietly on a sofa.","A large gray elephant drinking water.","A fast cheetah sprinting across the savannah.","A colorful parrot perched on a branch."]]
vector: [[[0.12,0.85,0.33],[0.4,0.91,0.1],[0.88,0.22,0.55],[0.95,0.12,0.72],[0.25,0.66,0.81]]]

In [8]:
# Convert the table to a pandas DataFrame for display.
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"


In [9]:
# Two extra records with the same text/vector layout as the existing rows.
more_data = [
    {"text": text, "vector": vector}
    for text, vector in (
        ("A panda eating bamboo peacefully.", [0.51, 0.37, 0.82]),
        ("A lion roaring loudly on a rock.", [0.93, 0.18, 0.41]),
    )
]

# Append them to the "animals" table.
vector_db["animals"].add(more_data)

AddResult(version=2)

In [10]:
# Show the table again, now including the two appended rows.
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"


## Create an empty table and drop it

In [13]:
from lancedb.pydantic import LanceModel

class JokeSchema(LanceModel):
    """Table schema with a joke text column and an integer rating column."""
    joke: str  # text of the joke
    rating: int  # integer rating of the joke

# Create an empty table from the schema alone. exist_ok=True lets this cell be
# re-run even if a previous run stopped before the table was dropped; without
# it create_table raises when the table already exists.
vector_db.create_table(name="jokes_drop_table", schema=JokeSchema, exist_ok=True)

LanceTable(name='jokes_drop_table', version=1, _conn=LanceDBConnection(uri='/Users/hdonnersten/Library/CloudStorage/OneDrive-sti.se/Github/ai_engineering_hampus_donnersten_de24/09_lancedb_vector_database/vector_database'))

Drop the `jokes_drop_table` table

In [15]:
# List the tables before dropping.
vector_db.table_names()

['animals', 'jokes', 'jokes_drop_table']

In [16]:
# Remove the table that was created only to demonstrate dropping.
vector_db.drop_table("jokes_drop_table")

In [17]:
# List the tables again to confirm "jokes_drop_table" is gone.
vector_db.table_names()

['animals', 'jokes']

## Vector search in lancedb

- Search with ANN (approximate nearest neighbour)
- Search with natural-language text and it will automatically calculate its embedding

In [18]:
# Show the rows that the vector search below runs against.
vector_db["animals"].to_pandas()

Unnamed: 0,text,vector
0,A small brown dog running.,"[0.12, 0.85, 0.33]"
1,A cat resting quietly on a sofa.,"[0.4, 0.91, 0.1]"
2,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]"
3,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]"
4,A colorful parrot perched on a branch.,"[0.25, 0.66, 0.81]"
5,A frog sitting on a lily pad.,"[0.14, 0.44, 0.27]"
6,A panda eating bamboo peacefully.,"[0.51, 0.37, 0.82]"
7,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]"


In [None]:
# Query with an explicit 3-dimensional vector and keep the three closest rows.
query_vector = [0.8, 0.2, 0.6]

(
    vector_db["animals"]
    .search(query_vector)
    .limit(3)
    .to_pandas()
)


Unnamed: 0,text,vector,_distance
0,A large gray elephant drinking water.,"[0.88, 0.22, 0.55]",0.0093
1,A fast cheetah sprinting across the savannah.,"[0.95, 0.12, 0.72]",0.0433
2,A lion roaring loudly on a rock.,"[0.93, 0.18, 0.41]",0.0534


In [None]:
# Searching "animals" with plain text is expected to fail here (presumably
# because the table has no embedding function to turn the text into a vector
# -- confirm against the printed message). Show the error instead of
# swallowing it: the original bare `except:` had no body at all, which is a
# syntax error.
try:
    vector_db["animals"].search("pandas eat bamboo").limit(3).to_pandas()
except Exception as error:
    print(f"{type(error).__name__}: {error}")

# Embedding models

In [26]:
from lancedb.embeddings import get_registry
import numpy as np

# Look up the "gemini-text" embedding function in LanceDB's registry and
# instantiate it for the "gemini-embedding-001" model.
embedding_model = get_registry().get("gemini-text").create(name="gemini-embedding-001")

# Embed a single test sentence once and inspect the array shape. A second
# call whose result is discarded would only add a redundant (presumably
# remote) embedding request.
test_embedding = np.array(embedding_model.compute_query_embeddings("hej på dig"))
test_embedding.shape


(1, 3072)

In [27]:
# Start from a clean slate: drop any "jokes" table left over from an earlier
# run. ignore_missing=True keeps this from raising on a fresh database where
# the table does not exist yet.
vector_db.drop_table("jokes", ignore_missing=True)

In [28]:
from lancedb.pydantic import Vector, LanceModel

class JokeModel(LanceModel):
    """Schema for the "jokes" table: the joke text plus its embedding vector."""
    joke: str = embedding_model.SourceField() # Source column: the text handed to the embedding model
    vector: Vector(3072) = embedding_model.VectorField() # Target vector column; 3072 matches the test embedding's width

# Create the "jokes" table from the JokeModel schema; exist_ok=True tolerates
# a table that is already there.
vector_db.create_table(
    name="jokes",
    schema=JokeModel,
    exist_ok=True,
)

# Display the table handle.
vector_db["jokes"]




LanceTable(name='jokes', version=1, _conn=LanceDBConnection(uri='/Users/hdonnersten/Library/CloudStorage/OneDrive-sti.se/Github/ai_engineering_hampus_donnersten_de24/09_lancedb_vector_database/vector_database'))

In [32]:
import pandas as pd
# Read the jokes file as UTF-8 (the jokes contain non-ASCII punctuation such
# as curly quotes and dashes) and parse it directly from the file handle.
with open("jokes.json", encoding="utf-8") as file:
    jokes_data = json.load(file)

# The table schema's source column is "joke", so rename the file's "jokes"
# column to match.
df_jokes = pd.DataFrame(jokes_data).rename(columns={"jokes": "joke"})
df_jokes

Unnamed: 0,joke
0,Parallel lines have so much in common—it’s sad...
1,"ETL stands for “Extract, Transform, Leave for ..."
2,What do you call a snake that runs your script...
3,"Gold walks into a bar. The bartender says, “Au..."
4,C# devs don’t argue; they just throw exceptions.
5,I asked the data lake if it had my file. It sa...
6,My math teacher said I’m average… how mean!
7,Python is so friendly even whitespace gets a say.
8,The C# compiler walked into a bar. The bartend...
9,Never trust an atom—they make up everything.


In [33]:
# Append the jokes DataFrame to the "jokes" table.
vector_db["jokes"].add(df_jokes)

AddResult(version=2)

In [34]:
# Pull the stored embedding of the first joke out of the "vector" column.
vector_db["jokes"].to_pandas()["vector"].iloc[0]

array([-0.02400176,  0.01247358, -0.02414474, ...,  0.01160614,
        0.00044886,  0.01217596], shape=(3072,), dtype=float32)

In [35]:
# Search the jokes with a natural-language query and keep the five closest hits.
(
    vector_db["jokes"]
    .search("Tell me a few data jokes")
    .limit(5)
    .to_pandas()
)

Unnamed: 0,joke,vector,_distance
0,Why do data engineers hate nature? Too many un...,"[-0.027916763, 0.0047387416, -0.018934403, -0....",0.569677
1,"Gold walks into a bar. The bartender says, “Au...","[-0.024867292, 0.013314825, -0.016261652, -0.0...",0.663104
2,"Data engineer motto: If it works, don’t touch ...","[-0.020296954, 0.020327171, -0.009069326, -0.0...",0.690496
3,I told a chemistry joke… there was no reaction.,"[-0.022922393, 0.017959604, -0.029222224, -0.0...",0.698602
4,The C# compiler walked into a bar. The bartend...,"[-0.01868987, 0.018796643, -0.009748903, -0.07...",0.708148


In [36]:
# Same search with the query "see sharp"; the results below are dominated by C# jokes.
(
    vector_db["jokes"]
    .search("see sharp")
    .limit(5)
    .to_pandas()
)

Unnamed: 0,joke,vector,_distance
0,Why is C# like a musical? It has so many classes.,"[-0.012926053, 0.0034431089, -0.017093537, -0....",0.74563
1,C# devs don’t argue; they just throw exceptions.,"[-0.0068662865, -0.005415149, 0.0044965413, -0...",0.809269
2,The C# compiler walked into a bar. The bartend...,"[-0.01868987, 0.018796643, -0.009748903, -0.07...",0.813187
3,Why did the C# developer go broke? He kept usi...,"[-0.02175711, 0.0073709465, -0.015515881, -0.0...",0.816001
4,I tried to explain async/await to my friend… n...,"[-0.016174542, 0.012602222, -0.015254588, -0.0...",0.824324
