In [1]:
import os

In [2]:
ASTRA_DB_BUNDLE_PATH =os.getenv("ASTRA_DB_BUNDLE_PATH")
ASTRA_DB_TOKEN = os.getenv("TOKEN")
ASTRA_DB_CLIENT_ID = os.getenv("CLIENT_ID")
ASTRA_DB_CLIENT_SECRET = os.getenv("SECRET")
ASTRA_DB_KEYSPACE = os.getenv("KEYSPACE")
OPENAI_KEY= os.getenv("OPENAI_KEY")

In [5]:
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider

from datasets import load_dataset, Dataset

In [4]:
clound_config = {
    "secure_connect_bundle":  ASTRA_DB_BUNDLE_PATH
}
auth_provider = PlainTextAuthProvider(ASTRA_DB_CLIENT_ID, ASTRA_DB_CLIENT_SECRET)
cluster = Cluster(cloud=clound_config, auth_provider=auth_provider)
astra_session = cluster.connect()

llm = OpenAI(openai_api_key=OPENAI_KEY)
myEmbedding = OpenAIEmbeddings(openai_api_key=OPENAI_KEY)

In [6]:
listingCassandraStore = Cassandra(
    embedding=myEmbedding,
    session=astra_session,
    keyspace=ASTRA_DB_KEYSPACE,
    table_name="listings"
)

In [18]:
import sqlite3
import pandas as pd
def get_listings():
    """Returns a list of all listings."""
    # Connect to the database.
    db = sqlite3.connect("db/listings.db")
    cursor = db.cursor()

    # Get all listings from the database.
    cursor.execute("SELECT * FROM listings")

    # Create a list of all listings.
    listings = []
    for row in cursor.fetchall():
        listings.append(row)

    # Close the connection to the database.
    db.close()

    # Return the list of listings.
    return listings

items = get_listings()

df = pd.DataFrame(items, columns=["id", "title","price", "link"])
df["price"] = df["price"].apply(lambda p: int(p.replace('$', '').replace(',', '')))
df = df[["title", "price", "link"]]

docs = []
for id, row in df.iterrows():
    doc = Document(page_content=row["title"] + ": $"+str(row["price"]))
    docs.append(doc)

    # Getting embeddings from openAi is really slow. So until this gets optimized, it's limited to just 300 entries
    if id > 3000:
        break

inserted_ids = listingCassandraStore.add_documents(docs)
print(f"\nInserted {len(inserted_ids)} documents.")


Inserted 3002 documents.


In [8]:
vectorIndex = VectorStoreIndexWrapper(vectorstore=listingCassandraStore)

query = "What is the cheapest car that's on the market and has a year after 2011?"
answer = vectorIndex.query(question=query, llm=llm).strip()

print(answer)

print("Docs by relevance")
for doc, score in listingCassandraStore.similarity_search_with_score(query, k=4):
    print("Score:\t",score,"\n",doc)

The 2011 Subaru Impreza is the cheapest car that's on the market and has a year after 2011. It is priced at $9000.
Docs by relevance
Score:	 0.9075767619050981 
 page_content='2011 Subaru Impreza: $9000'
Score:	 0.9075196627005928 
 page_content='2011 Subaru Impreza: $9000'
Score:	 0.9075196627005928 
 page_content='2011 Subaru Impreza: $9000'
Score:	 0.9075196627005928 
 page_content='2011 Subaru Impreza: $9000'
