In [1]:
import singlestoredb
import json
import numpy as np
import sys
import os
import logging

from dotenv import load_dotenv
load_dotenv()

True

In [2]:
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.StreamHandler(sys.stdout)
    ]
)

In [3]:
# 1. Set Environment Variables
HOST = os.getenv('SINGLESTORE_HOST')
PORT = int(os.getenv('SINGLESTORE_PORT', 3306))
USER = os.getenv('SINGLESTORE_USER')
PASSWORD = os.getenv('SINGLESTORE_PASSWORD')
DATABASE = os.getenv('SINGLESTORE_DATABASE')

In [4]:
print(HOST, PORT, USER, PASSWORD, DATABASE)

svc-3482219c-a389-4079-b18b-d50662524e8a-shared-dml.aws-virginia-6.svc.singlestore.com 3333 sb 1qus8i7myhm3NUQS0TX0dFoge7lvA1ip db_aurelia_7d548


In [5]:
# Validate that all parameters are available
if not all([HOST, PORT, USER, PASSWORD, DATABASE]):
    logging.error("One or more environment variables for connection parameters are missing.")
    sys.exit(1)

In [6]:
# 3. Establish Connection to SingleStore
try:
    connection = singlestoredb.connect(
        host=HOST,
        port=PORT,
        user=USER,
        password=PASSWORD,
        database=DATABASE
    )
    cursor = connection.cursor()
    logging.info("Successfully connected to SingleStore.")
except singlestoredb.Error as e:
    logging.error(f"SingleStore Error: {e}")
    sys.exit(1)
except Exception as e:
    logging.error(f"Unexpected Error: {e}")
    sys.exit(1)


2025-01-12 07:37:38,181 - INFO - Successfully connected to SingleStore.


In [7]:
# 4. Create Table with Vector and Metadata Columns
delete_table_query = """
DROP TABLE vectors_table;
"""

create_table_query = """
CREATE TABLE vectors_table (
    id INT AUTO_INCREMENT PRIMARY KEY,
    vector BLOB NOT NULL,
    file_name VARCHAR(255) NOT NULL,
    summary TEXT NOT NULL,
    content TEXT NOT NULL
);
"""

try:
    cursor.execute(delete_table_query)
    cursor.execute(create_table_query)
    connection.commit()
    logging.info("Table 'vectors_table' is ready.")
except singlestoredb.Error as e:
    logging.error(f"SingleStore Error while creating table: {e}")
    cursor.close()
    connection.close()
    sys.exit(1)
except Exception as e:
    logging.error(f"Unexpected Error while creating table: {e}")
    cursor.close()
    connection.close()
    sys.exit(1)

2025-01-12 07:37:38,526 - INFO - Table 'vectors_table' is ready.


In [8]:
# 5. Prepare Multiple Vectors and Metadata for Bulk Insert
vector_dimension = 3072 

# Example list of vectors and metadata
import json

with open('result.json', 'r') as f:
    vectors = json.load(f)

data_to_insert = vectors

# Prepare and format data for insertion
formatted_data = []
for data in data_to_insert:
    vector = data[0]
    file_name = data[1]
    summary = data[2]
    content = data[3]
    
    if len(vector) != vector_dimension:
        logging.error(f"Vector dimensionality mismatch: expected {vector_dimension}, got {len(vector)}")
        cursor.close()
        connection.close()
        sys.exit(1)
    
    # Ensure the vector is a JSON array
    vector_json = json.dumps(vector)
    formatted_data.append((vector_json, file_name, summary, content))

# Bulk Insert Data into the Table
insert_query = """
INSERT INTO vectors_table (vector, file_name, summary, content)
VALUES (JSON_ARRAY_PACK(%s), %s, %s, %s);
"""

try:
    cursor.executemany(insert_query, formatted_data)
    connection.commit()
    logging.info(f"Inserted {cursor.rowcount} rows successfully into 'vectors_table'.")
except singlestoredb.Error as e:
    logging.error(f"SingleStore Error while inserting data: {e}")
    cursor.close()
    connection.close()
    sys.exit(1)
except Exception as e:
    logging.error(f"Unexpected Error while inserting data: {e}")
    cursor.close()
    connection.close()
    sys.exit(1)

2025-01-12 07:37:39,290 - INFO - Inserted 5 rows successfully into 'vectors_table'.


In [9]:
def search_vectors(query_vector, top_t=5):
    if isinstance(query_vector, np.ndarray):
        query_vector = query_vector.tolist()
    elif not isinstance(query_vector, list):
        logging.error("Query vector must be a list or numpy array.")
        return []
    if len(query_vector) != vector_dimension:
        logging.error(f"Vector dimensionality mismatch: expected {vector_dimension}, got {len(query_vector)}")
        return []

    # Convert query vector to JSON array
    query_vector_json = json.dumps(query_vector)

    # SQL query to compute cosine similarity and retrieve top_t vectors
    # SingleStore's HNSW index optimizes this search
    search_query = """
    SELECT 
        id, 
        dot_product(vector, JSON_ARRAY_PACK(%s)) AS similarity,
        file_name,
        summary,
        content
    FROM vectors_table
    ORDER BY similarity DESC
    LIMIT %s;
    """
    try:
        cursor.execute(search_query, (query_vector_json, top_t))
        results = cursor.fetchall()
        # Verify if summary is being retrieved
        for row in results:
            if row[3] is None or row[3] == "":
                logging.warning(f"Summary missing for ID: {row[0]}")
        return results
    except singlestoredb.Error as e:
        logging.error(f"SingleStore Error while searching vectors: {e}")
        return []
    except Exception as e:
        logging.error(f"Unexpected Error while searching vectors: {e}")
        return []

# 7. Mock a Search
def mock_search():
    study_notes = input("Do you want to study the notes for all PDF files? (y/n): ").strip().lower()
    if study_notes == 'y':
        topic = input("Enter the topic you are interested in (e.g., fluid dynamics): ").strip()
        logging.info(f"Searching notes on topic: {topic}")
        # In a real scenario, you would convert the topic into a query_vector
        # For the mock, we'll just generate a random query vector
        query_vector = np.random.rand(vector_dimension).tolist()
    else:
        logging.info("Performing search without studying specific notes.")
        query_vector = np.random.rand(vector_dimension).tolist()

    top_t = 3
    logging.info("Performing a mock search with a query vector...")
    logging.info(f"Query vector: {query_vector}")
    results = search_vectors(query_vector, top_t)
    if not results:
        logging.info("No results found.")
        return
    logging.info(f"Top {top_t} similar vectors:")
    for row in results:
        vector_id, similarity, file_name, summary, content = row
        if summary:
            logging.info(f"ID: {vector_id}, Similarity: {similarity:.4f}, File: {file_name}, Summary: {summary}")
        else:
            logging.info(f"ID: {vector_id}, Similarity: {similarity:.4f}, File: {file_name}, Summary: [No Summary Available]")

mock_search() 

# 8. Clean Up
cursor.close()
connection.close()
logging.info("Connection to SingleStore closed.")


2025-01-12 07:37:50,173 - INFO - Searching notes on topic: fluid dynamics
2025-01-12 07:37:50,175 - INFO - Performing a mock search with a query vector...
2025-01-12 07:37:50,182 - INFO - Query vector: [0.9733300396380183, 0.8446984526298373, 0.24371510712614486, 0.5868033330514462, 0.8168856316740332, 0.21896857945906523, 0.08726810119219386, 0.4487935239543418, 0.07008293792957476, 0.1619977244049654, 0.5865919530805558, 0.7550199140009237, 0.2343462707436088, 0.5751399444904192, 0.011648956359064155, 0.8278011621636654, 0.9541888322054001, 0.18766045119858077, 0.8342835737703439, 0.18588538012116784, 0.2522374271856386, 0.3992957548141203, 0.007904255828046547, 0.880597953303014, 0.45138241141224134, 0.22364152313517882, 0.9634349830877251, 0.17757663977058957, 0.1578844230729345, 0.05069692398176229, 0.6320695766751415, 0.8216972799979175, 0.43248536580172126, 0.2630981950096092, 0.6057728009453746, 0.8865158141790467, 0.36324825732275257, 0.621580253552774, 0.664659820890593, 0.48