In [1]:
import os
from dotenv import load_dotenv
from astrapy import DataAPIClient
from astrapy.constants import VectorMetric
from astrapy.ids import UUID
from astrapy.exceptions import InsertManyException

load_dotenv()
# Initialize the client and get a "Database" object
client = DataAPIClient(os.getenv("ASTRA_DB_APPLICATION_TOKEN"))
database = client.get_database(os.getenv("ASTRA_DB_API_ENDPOINT"))
print(f"* Database: {database.info().name}\n")

* Database: amc_aime_similarity



In [39]:
collection = database.create_collection(
    "amc_similarity_full_size", 
    dimension=384,
    metric=VectorMetric.COSINE,
    check_exists=False,
)


print(f"* Collection: {collection.full_name}\n")

* Collection: default_keyspace.amc_similarity_full_size



In [2]:
import pandas as pd
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")
df = pd.read_csv('AMC12_data.csv')
solutions = df['Solution']
embeddings = model.encode(solutions)
print(embeddings.shape)



  from tqdm.autonotebook import tqdm, trange


(699, 384)


In [40]:
to_insert = []

df['$vector'] = None
for i in range(len(df)):
    if pd.isna(df['Solution'][i]):
        continue
    to_insert.append(
        {
            "Contest": str(df['Contest'][i]), 
            "Year": int(df['Year'][i]),
            "Number": int(df['Number'][i]),
            "Link": str(df['Link'][i]), 
            "Problem": str(df['Problem'][i]), 
            "Solution": str(df['Solution'][i])[:1000],
            "$vector": embeddings[i],
        }
    )

    collection.insert_one(to_insert[-1])

print("length:", str(len(to_insert)))
# try:
#     insertion_result = collection.insert_many(to_insert)
# except InsertManyException:
#     print("* Documents found on DB already. Let's move on.\n")

    

length: 685


In [12]:
query_vector = model.encode('n this solution we refer to moving to the left as decreasing the year or date number and moving to the right as' +  
                            'increasing the year or date number. Every non-leap year we move to the right results in moving one day to' + 
                            'the right because $365\equiv 1\pmod 7$. Every leap year we move to the right results in moving $2$' +  
                            'days to the right since $366\equiv 2\pmod 7$. A leap year is usually every four years, so 200 years' +  
                            'would have $\frac{200}{4}$ = $50$ leap years, but the problem says that 1900 does not count as a leap year.')

results = existing_collection.find(
    sort={"$vector": query_vector},
    limit=10,
)
print("Vector search results:")
for i, document in enumerate(results):
    print(str(i) + ": " + str(document))

Vector search results:
0: {'_id': 'ffdfe1e7-cd5b-424b-9fe1-e7cd5b224b99', 'Contest': 'AMC12A', 'Year': 2012, 'Number': 9, 'Link': 'https://artofproblemsolving.com/wiki/index.php/2012_AMC_12A_Problems/Problem_9', 'Problem': ' A year is a leap year if and only if the year number is divisible by 400 (such as 2000) or is divisible by 4 but not 100 (such as 2012).  The 200th anniversary of the birth of novelist Charles Dickens was celebrated on February 7, 2012, a Tuesday.  On what day of the week was Dickens born?', 'Solution': ' In this solution we refer to moving to the left as decreasing the year or date number and moving to the right as  increasing the year or date number. Every non-leap year we move to the right results in moving one day to the right because $365\\equiv 1\\pmod 7$ .  Every leap year we move to the right results in moving $2$ days to the right since $366\\equiv 2\\pmod 7$ . A leap year is usually every four years, so 200 years would have $\\frac{200}{4}$ = $50$ leap ye

In [34]:
import requests
import json

existing_collection = database.get_collection('amc_similarity_with_link')
print(existing_collection.estimated_document_count())
namespace = 'default-keyspace'
collection_name = 'amc_similarity_with_link'
base_url = os.getenv('ASTRA_DB_API_ENDPOINT') + '/api/json/v1/' + namespace + '/collections/' + collection_name


headers = {
    'X-Cassandra Token': os.getenv('ASTRA_DB_APPLICATION_TOKEN'),
    'Content-Type': 'application/json',
    'Accept': 'application/json'
}
info_url = base_url + '/info'
print(info_url)

response = requests.get(info_url, headers=headers)

if response.status_code == 200:
    collection_info = response.json()
    print("collection information:", json.dumps(collection_info, indent=2))
    if 'length' in collection_info:
        print('Collection Length:', collection_info['length'])
    else:
        print('length is not available')
else:
    print(f'Failed to get collection information. Status code: {response.status_code}, Error: {response.text}')


100
https://07a3dcb1-38b6-4c2d-a8df-bf068a0e2a8e-us-east-2.apps.astra.datastax.com/api/json/v1/default-keyspace/collections/amc_similarity_with_link/info
Failed to get collection information. Status code: 400, Error: <html>
<head><title>400 Bad Request</title></head>
<body>
<center><h1>400 Bad Request</h1></center>
<hr><center>nginx/1.21.4</center>
</body>
</html>

