# Embeddings

1. experiment with embeddings
2. a vector table _with CassIO_
3. load some data
4. a first query

In [None]:
import os
from dotenv import find_dotenv, load_dotenv
# Locate the '.env' file and load its entries into the process environment.
dotenv_file = find_dotenv('.env')
load_dotenv(dotenv_file)

# Read the key eagerly: a missing variable raises KeyError here, before any
# embedding call is attempted.
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
# Embedding client shared by every embed_query call in this notebook.
# Constructed with no arguments; presumably it picks up the OpenAI key from
# the environment loaded above -- confirm against the LangChain docs.
myEmbedding = OpenAIEmbeddings()

In [None]:
# Embed a first sentence; the result is used below as a vector of numbers.
cat1 = myEmbedding.embed_query('The cat is on the table')

In [None]:
sum(x * x for x in cat1)  # squared Euclidean norm of cat1, i.e. cat1 . cat1

In [None]:
# Embed a sentence that paraphrases the cat1 sentence with different words.
cat2 = myEmbedding.embed_query('A kitten lies upon a desk')

In [None]:
# Embed a sentence on a different topic, to compare against the two cat sentences.
catZ = myEmbedding.embed_query('Yesterday, when I was mad')

In [None]:
def dotp(v1, v2):
    """Return the dot product of two numeric sequences.

    Components are paired positionally with ``zip``, so when the inputs
    differ in length the trailing components of the longer one are ignored.
    Two empty inputs give ``0``.
    """
    return sum(a * b for a, b in zip(v1, v2))

In [None]:
# Dot product of the two cat-sentence embeddings (paraphrases of each other).
dotp(cat1, cat2)

In [None]:
# Dot product of a cat sentence with the different-topic sentence, for
# comparison with dotp(cat1, cat2).
dotp(cat1, catZ)

## CassIO

In [None]:
from cassio.vector import VectorTable

In [None]:
import os
from dotenv import find_dotenv, load_dotenv
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

# Load the '.env' file so the Astra DB settings are available in os.environ.
dotenv_file = find_dotenv('.env')
load_dotenv(dotenv_file)

# Required connection settings; a missing variable raises KeyError here.
ASTRA_DB_SECURE_BUNDLE_PATH = os.environ['ASTRA_DB_SECURE_BUNDLE_PATH']
ASTRA_DB_APPLICATION_TOKEN = os.environ['ASTRA_DB_APPLICATION_TOKEN']
ASTRA_DB_KEYSPACE = os.environ['ASTRA_DB_KEYSPACE']

# The secure-connect bundle path goes in the cloud config; the auth provider
# pairs the literal string 'token' with the application token.
cloud_config = {'secure_connect_bundle': ASTRA_DB_SECURE_BUNDLE_PATH}
auth_provider = PlainTextAuthProvider('token', ASTRA_DB_APPLICATION_TOKEN)

cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
# Session handed to the vector table created in a later cell.
session = cluster.connect()

In [None]:
# Vector table handle bound to the session and keyspace configured above.
mytable = VectorTable(
    session=session,
    keyspace=ASTRA_DB_KEYSPACE,
    table='test_vector_table',
    # Presumably must equal the length of the vectors returned by
    # myEmbedding.embed_query -- verify with len(cat1) if the model changes.
    embedding_dimension=1536,
    # Ids are supplied by the caller (see document_id in the put calls).
    auto_id=False,
)

In [None]:
# Store the cat2 sentence together with its already-computed embedding,
# under the explicit id 'cat2', with empty metadata and no TTL.
mytable.put(
    document='A kitten lies upon a desk',
    embedding_vector=cat2,
    document_id='cat2',
    metadata={},
    ttl_seconds=None,
)

In [None]:
# Store the cat1 sentence together with its already-computed embedding,
# under the explicit id 'cat1', with empty metadata and no TTL.
mytable.put(
    document='The cat is on the table',
    embedding_vector=cat1,
    document_id='cat1',
    metadata={},
    ttl_seconds=None,
)

In [None]:
def add_sentence(tab, sen, emb, idx):
    """Embed a sentence and store it in a vector table.

    Args:
        tab: Table-like object exposing ``put(...)`` with the keyword
            arguments used below.
        sen: The sentence to embed and store as the document text.
        emb: Embedding object exposing ``embed_query(text)``.
        idx: Identifier under which the row is stored.

    Returns:
        None. The result of ``tab.put`` is discarded.
    """
    # Compute the embedding first, so a failing embed call writes nothing.
    embedding = emb.embed_query(sen)
    record = {
        'document': sen,
        'embedding_vector': embedding,
        'document_id': idx,
        'metadata': {},
        'ttl_seconds': None,
    }
    tab.put(**record)

In [None]:
# Embed and store a third sentence under the id 'vecZ'.
# NOTE: add_sentence calls embed_query again, although the same sentence was
# already embedded as catZ earlier in the notebook.
add_sentence(mytable, 'Yesterday, when I was mad', myEmbedding, 'vecZ')

In [None]:
# mytable.delete('vecZ')

In [None]:
# Embed the question with the same embedding object used for the stored rows.
query_string = 'where is the cat?'
q_emb = myEmbedding.embed_query(query_string)
# Search the table with the query vector. The positional 3 is presumably the
# maximum number of matches to return -- TODO confirm against the cassIO API.
matches = mytable.search(
    q_emb,
    3,
    metric='cos',
    metric_threshold=None,
)

In [None]:
# First returned hit; presumably the closest one if results come back ordered
# by the metric -- confirm. Fails if the search returned no matches.
best_match = matches[0]

In [None]:
# Show what kind of object a single search hit is.
type(best_match)

In [None]:
# Text stored for the first hit (the 'document' field of the match).
best_match['document']

In [None]:
# 'distance' field of every hit, in returned order.
# NOTE(review): with metric='cos' this may be a similarity rather than a
# distance -- confirm how cassIO defines it.
[m['distance'] for m in matches]