In [1]:
import os
from astrapy.db import AstraDB

# Initialization
db = AstraDB(
    token=os.environ["ASTRA_DB_APPLICATION_TOKEN"],
    api_endpoint=os.environ["ASTRA_DB_API_ENDPOINT"],
)

print(f"Connected to Astra DB: {db.get_collections()}")

Connected to Astra DB: {'status': {'collections': ['abc', 'abc2', 'movie_reviews', 'vdf_2024_9_11_2']}}


In [2]:
db.get_collections()["status"]["collections"]

['abc', 'abc2', 'movie_reviews', 'vdf_2024_9_11_2']

In [3]:
# db.collection("movie_reviews").count_documents()
db.collection("vdf_2024_9_11_2").count_documents()

{'status': {'count': 0}}

In [16]:
coll = db.collection("movie_reviews_2")

In [17]:
coll.find()

{'data': {'documents': [], 'nextPageState': None}}

In [18]:
from numpy import nan
from vdf_io.util import get_qdrant_id_from_id


coll.upsert_many(
    documents=[
        {
            "_id": get_qdrant_id_from_id("1"),
            "$vector": [0.1 for x in range(1536)],
            "id": "65b7686dbe973a95c33ed028",
            "title": "La Sapienza",
            "reviewid": "2265990",
            "creationdate": "2015-06-05",
            "criticname": "Boyd van Hoeij",
            "originalscore": "",
            "reviewstate": "fresh",
            "reviewtext": """The Sapience juxtaposes 
insights on how people are emotionally connected 
with ruminations on the buildings and spaces through
which they move, in which they live and, in 
Alexandre's case, which they also create.""",
            "__v": 0.0,
            "movie": None,
            "rating": nan,
        },
    ]
)

[1]

In [13]:
table = coll.find()["data"]["documents"]

In [15]:
# convert list of dicts to pd.DataFrame
import pandas as pd

df = pd.DataFrame(table)
df.head()

Unnamed: 0,_id,title,reviewid,creationdate,criticname,originalscore,reviewstate,reviewtext,$vector,__v,movie,rating
0,65b7686dbe973a95c33ed014,Toorbos,2760593,2021-01-29,Neil Young,,fresh,Built around a luminous and intriguing central...,"[-0.005884130485355854, 0.0010980357183143497,...",0.0,,
1,65b7686bbe973a95c33ecff1,Small Town Wisconsin,2733251,2020-10-12,Jared Mobarak,B,fresh,Small Town Wisconsin is always proving itself ...,"[0.02259608916938305, 0.00613283459097147, 0.0...",0.0,,
2,65b7686dbe973a95c33ed030,La Sapienza,2252452,2015-03-19,A.O. Scott,,fresh,The movie is an unapologetically rarefied unde...,"[0.0065908576361835, -0.02235131524503231, -0....",0.0,,
3,65b7686abe973a95c33ecfdf,Dangerous Men,2295338,2015-11-19,Peter Keough,0.5/4,rotten,"Conceivably, it could serve as a primer for st...","[-0.010224452242255211, -0.009304582141339779,...",0.0,,
4,65b7686bbe973a95c33ecfee,Small Town Wisconsin,102697854,2022-06-08,Brian Orndorf,B+,fresh,Naczek isn&apos;t interested in making a soap ...,"[-0.0116984061896801, 0.011588690802454948, 0....",0.0,,


In [5]:
resp = coll.find()

In [6]:
len(resp["data"]["documents"])

20

In [29]:
resp["data"]["documents"][0].keys()

dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])

In [20]:
# write to a new parquet file
import pyarrow as pa

table = pa.Table.from_pandas(coll.find().to_pandas())

for r in coll.paginated_find():
    print(type(r), r.keys())
    # append data into a parquet file

<class 'dict'> dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])
<class 'dict'> dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])
<class 'dict'> dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])
<class 'dict'> dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])
<class 'dict'> dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])
<class 'dict'> dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])
<class 'dict'> dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'revi

In [37]:
collection2 = db.create_collection(
    "abc2",
    dimension=14,
    metric="cosine",
)

In [47]:
from random import randint, random
from tqdm import tqdm


def get_random_vector():
    return [random() for _ in range(14)]


def get_random_name():
    return "".join([chr(randint(97, 122)) for _ in range(10)])


def get_random_price():
    return round(random() * 100, 2)


for i in tqdm(range(10000)):
    # create 100 documents with random vectors
    identifiers = collection2.insert_many(
        [
            {
                "name": get_random_name(),
                "price": get_random_price(),
                "vector": get_random_vector(),
            }
            for _ in range(20)
        ]
    )

  5%|▌         | 547/10000 [02:14<38:42,  4.07it/s]


KeyboardInterrupt: 

In [48]:
collection2.count_documents()

{'status': {'moreData': True, 'count': 1000}}

In [49]:
i = 0
for r in tqdm(collection2.paginated_find()):
    i += 1
print(i)

6600it [01:21, 80.83it/s]


KeyboardInterrupt: 

In [60]:
next_page_state = None
i = 0
id_set = set()
tot_docs = 0
while True:
    a = collection2.find(sort=None, options={"pageState": next_page_state})
    break
    id_set.update([x["_id"] for x in a["data"]["documents"]])
    tot_docs += len(a["data"]["documents"])
    if a["data"]["nextPageState"] is None:
        break
    next_page_state = a["data"]["nextPageState"]
    i += 1
    print(i, len(id_set), tot_docs)
# len(a["data"]["documents"])
# len(id_set)

In [70]:
a["data"]["documents"][0]

{'_id': '31b0d1f9-3235-446a-b0d1-f93235f46af6',
 'name': 'aczhlnqcny',
 'price': 62.82,
 'vector': [0.74880013328378,
  0.7402908353206266,
  0.286711539847475,
  0.859773929335073,
  0.7123989591928594,
  0.832631713336484,
  0.939970077167308,
  0.8432646772932154,
  0.31699426022325816,
  0.9966364468489568,
  0.7306610647771097,
  0.5086700116384444,
  0.9957476158468267,
  0.49095880968033223]}

In [69]:
len(collection2.find_one()["data"]["document"]["vector"])

14

In [71]:
mr = db.collection("movie_reviews")
mr.find_one()["data"]["document"].keys()

dict_keys(['_id', 'title', 'reviewid', 'creationdate', 'criticname', 'originalscore', 'reviewstate', 'reviewtext', '$vector', '__v'])