# Vector Search with AutoEmbedding Demo
Data Prep and Upload

In [1]:
import os
import json
from pymongo import MongoClient
from dotenv import load_dotenv
load_dotenv()
VCORE_URL = os.environ["VCORE_URL"]
AOAI_ENDPOINT = os.environ["AOAI_ENDPOINT"]
AOAI_KEY = os.environ["AOAI_KEY"]
db_name = 'yelp_dataset'

client = MongoClient(VCORE_URL)
db = client[db_name]
collection = db['business']


# Check if the data already exists in the database
if db_name in client.list_database_names():
    print(f"Database '{db_name}' exists. Skipping data upload.")
else:
    print(f"Preparing data upload to '{db_name}' database.")

    batch_size = 1000
    json_file_path = './datasets without embeddings\yelp_academic_dataset.business.json'
    
    with open(json_file_path, 'r', encoding="utf-8") as file:
        data = json.load(file)

    for i in range(0, len(data), batch_size):
        batch = data[i:i + batch_size]
        collection.insert_many(batch)
        print(f"Inserted batch {i // batch_size + 1}")

Database 'yelp_dataset' exists. Skipping data upload.


## I. Auto-embedding generation
Generate embeddings for records that do not have embeddings yet

In [2]:
import time
from pymongo import UpdateOne
batch_size = 15
total_updated = 0
iteration = 0

while True:
    iteration += 1

    # Find records to update
    records_to_update = collection.find({
        '$and': [
            {'embeddings': {'$exists': False}},
            {'description': {'$exists': True}}
        ]
    }).limit(batch_size)
    records_to_update = list(records_to_update)
    print(records_to_update)

    if not records_to_update:
        print(f"All rows have been updated. Total updated rows: {total_updated}")
        break

    total_updated += len(records_to_update)

    print(f"Iteration: {iteration}, has handled {total_updated} rows")

    # Prepare bulk operations
    bulk_ops = [
        UpdateOne({'_id': record['_id']},
                {'$generateEmbeddings': {'description': 'embeddings'}})
        for record in records_to_update
    ]

    if bulk_ops:
        result = db.business.bulk_write(bulk_ops)
        print(f"Bulk write result: {result.bulk_api_result}")

    time.sleep(0.5)  # Sleep for 500 ms



[{'_id': ObjectId('671b4443efc3fbf09a7d44be'), 'business_id': 'e31Rd2X0KYAS54ciEtsZyg', 'name': 'Green Taxi', 'address': '3340 McCaw Ave, Ste 114', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93105', 'latitude': 34.4384349379, 'longitude': -119.7362903171, 'stars': 2, 'review_count': 12, 'is_open': 1, 'attributes': None, 'categories': 'Hotels & Travel, Transportation, Taxis', 'hours': None, 'location': {'type': 'Point', 'coordinates': [-119.7362903171, 34.4384349379]}, 'description': "Green Taxi, located at 3340 McCaw Ave, Ste 114 in Santa Barbara, CA, is a taxi service that falls under the categories of Hotels & Travel, Transportation, and Taxis. With a modest rating of 2 stars based on 12 reviews, Green Taxi offers essential transportation services to the Santa Barbara area. Despite its lower rating, it remains operational and serves the community's travel needs. The business is situated conveniently with coordinates at a latitude of 34.4384349379 and a longitude of -119.

In [3]:
# query data
result = collection.find_one({
    "city": "Santa Barbara",
    "postal_code": "93105"
})

# Print the result
if result:
    business_embed = result.pop("embeddings")
    print(result)

{'_id': ObjectId('671b4443efc3fbf09a7d46ce'), 'business_id': 'gfjjUlMC05kry6qjYyKrcA', 'name': 'West Coast Vascular', 'address': '2323 Oak Park Ln, Ste 201', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93105', 'latitude': 34.4287211, 'longitude': -119.7250939, 'stars': 2, 'review_count': 5, 'is_open': 1, 'attributes': {'BusinessAcceptsCreditCards': 'False'}, 'categories': 'Health & Medical, Doctors, Vascular Medicine, Medical Centers, Surgeons', 'hours': {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}, 'location': {'type': 'Point', 'coordinates': [-119.7250939, 34.4287211]}, 'description': 'West Coast Vascular, located at 2323 Oak Park Ln, Ste 201, Santa Barbara, CA 93105, is a specialized medical center focusing on vascular medicine and surgery. Despite its convenient location and comprehensive services in the health and medical sectors, including doctors and surgeon

### II. Query Data
a. GeoSpacial Query
find nearby stores based on geographic proximity

In [4]:
# create index
command = {
    "createIndexes": "business",
    "indexes": [
        {
            "key": {
                "location": "2dsphere"
            },

            "name": "location_index"
        }
    ]

}
db.command(command)


{'raw': {'defaultShard': {'numIndexesBefore': 5,
   'numIndexesAfter': 5,
   'createdCollectionAutomatically': False,
   'note': 'all indexes already exist',
   'ok': 1}},
 'ok': 1}

In [5]:
# Define the user's location
user_location = {
    "type": "Point",
    "coordinates": [-119.7250939, 34.4287211]  # User's location (longitude, latitude)
}

# Perform the aggregation with $geoNear
pipeline = [
    {
        "$geoNear": {
            "key": "location",
            "near": user_location,
            "distanceField": "distance",  # Field to store the calculated distance
            "maxDistance": 5,  # Maximum distance in meters (5 km)
            "spherical": True,  # Use spherical geometry
            "$match": {
                "isOpen": True,  # Filter for open restaurants
                "stars": {"$gte": 2}  # Filter by rating >= 2
            },
            "$limit": 5  # Limit to top 5 results
        }
    }
]

# Execute the aggregation
results = collection.aggregate(pipeline)

# Print the results
for doc in results:
    doc.pop("embeddings")
    print(doc)

{'_id': ObjectId('671b4443efc3fbf09a7d46ce'), 'business_id': 'gfjjUlMC05kry6qjYyKrcA', 'name': 'West Coast Vascular', 'address': '2323 Oak Park Ln, Ste 201', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93105', 'latitude': 34.4287211, 'longitude': -119.7250939, 'stars': 2, 'review_count': 5, 'is_open': 1, 'attributes': {'BusinessAcceptsCreditCards': 'False'}, 'categories': 'Health & Medical, Doctors, Vascular Medicine, Medical Centers, Surgeons', 'hours': {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}, 'location': {'type': 'Point', 'coordinates': [-119.7250939, 34.4287211]}, 'description': 'West Coast Vascular, located at 2323 Oak Park Ln, Ste 201, Santa Barbara, CA 93105, is a specialized medical center focusing on vascular medicine and surgery. Despite its convenient location and comprehensive services in the health and medical sectors, including doctors and surgeon

b. Vector Search with DiskANN

In [6]:
# First pipeline: Vector search

# create index
command = {
    "createIndexes": "business",
    "indexes": [
        {
            "key": {
                "embeddings": "cosmosSearch"
            },

            "name": "diskann_index",
            "cosmosSearchOptions": {
                "kind": "vector-diskann",
                "maxDegree": 32,
                "lBuild": 64,
                "similarity": "L2",
                "dimensions": 1536
            }
        }
    ]

}
db.command(command)


{'raw': {'defaultShard': {'numIndexesBefore': 5,
   'numIndexesAfter': 5,
   'createdCollectionAutomatically': False,
   'note': 'all indexes already exist',
   'ok': 1}},
 'ok': 1}

In [7]:
# search query
vector_search_pipeline = [
    {
        '$search': {
            'cosmosSearch': {
                'path': 'embeddings',
                'vector': business_embed,
                'lSearch': 64,
                'k': 5  # Limit to top 5 closest vectors
            }
        }
    }
]
# Execute the aggregation
results = collection.aggregate(vector_search_pipeline)
# Print the results
for doc in results:
    doc.pop("embeddings")
    print(doc)


{'_id': ObjectId('671b4443efc3fbf09a7d46ce'), 'business_id': 'gfjjUlMC05kry6qjYyKrcA', 'name': 'West Coast Vascular', 'address': '2323 Oak Park Ln, Ste 201', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93105', 'latitude': 34.4287211, 'longitude': -119.7250939, 'stars': 2, 'review_count': 5, 'is_open': 1, 'attributes': {'BusinessAcceptsCreditCards': 'False'}, 'categories': 'Health & Medical, Doctors, Vascular Medicine, Medical Centers, Surgeons', 'hours': {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}, 'location': {'type': 'Point', 'coordinates': [-119.7250939, 34.4287211]}, 'description': 'West Coast Vascular, located at 2323 Oak Park Ln, Ste 201, Santa Barbara, CA 93105, is a specialized medical center focusing on vascular medicine and surgery. Despite its convenient location and comprehensive services in the health and medical sectors, including doctors and surgeon

c. Vector Search with filter 

In [8]:
# create index on filter related fields
command = { "createIndexes": "business", "indexes": [ { "key": { "is_open": 1 }, "name": "is_open" } ] }
db.command(command)

pipeline = [

                {
                    "$search": {
                        "cosmosSearch": {
                            "path": "embeddings",
                            "vector": business_embed,  # Replace with your actual vector
                            "k": 5,  # Limit to top 5 closest vectors
                            "filter": {"$and": [{ "is_open": { "$eq": 1 } }
                                                ]}
                        }
                    }
                },
                {
                    "$limit": 5  # Limit to top 5 results
                }
            ]

# Execute the aggregation
results = collection.aggregate(pipeline)

# Print the results
for doc in results:
    doc.pop("embeddings")
    print(doc)

{'_id': ObjectId('671b4443efc3fbf09a7d46ce'), 'business_id': 'gfjjUlMC05kry6qjYyKrcA', 'name': 'West Coast Vascular', 'address': '2323 Oak Park Ln, Ste 201', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93105', 'latitude': 34.4287211, 'longitude': -119.7250939, 'stars': 2, 'review_count': 5, 'is_open': 1, 'attributes': {'BusinessAcceptsCreditCards': 'False'}, 'categories': 'Health & Medical, Doctors, Vascular Medicine, Medical Centers, Surgeons', 'hours': {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}, 'location': {'type': 'Point', 'coordinates': [-119.7250939, 34.4287211]}, 'description': 'West Coast Vascular, located at 2323 Oak Park Ln, Ste 201, Santa Barbara, CA 93105, is a specialized medical center focusing on vascular medicine and surgery. Despite its convenient location and comprehensive services in the health and medical sectors, including doctors and surgeon

d. Combine Search Query with Geo Location

In [9]:
# Create an index on the location field
command = { "createIndexes": "business", "indexes": [ { "key": { "location": 1 }, "name": "location" } ] }
db.command(command)
# Search for the top 5 closest vectors to the query "Locksmith" within a 1 mile radius of longitude 119 W and latitude 34 N
pipeline = [

                {
                    "$search": {
                        "cosmosSearch": {
                            "path": "embeddings",
                            "query": "Locksmith",  # Replace with your query
                            "k": 5,  # Limit to top 5 closest vectors
                            "filter": {"$and": [
                                { "is_open": { "$eq": 1 } },
                                # all documents within a 1 mile radius of longitude 119 W and latitude 34 N.
                                #  The query converts the distance to radians by dividing by the approximate equatorial radius of the earth, 3963.2 miles
                                {"location": {"$geoWithin": {"$centerSphere":[[-119.7192861804, 34.4102485028], 1/3963.2 ]}}}
                                                ]}
                        }
                    }
                },
                {
                    "$limit": 5  # Limit to top 5 results
                }
            ]

# Execute the aggregation
results = collection.aggregate(pipeline)
restults = [results]
# Print the results
for doc in results:
    doc.pop("embeddings")
    print(doc)

{'_id': ObjectId('671b443defc3fbf09a7d0b96'), 'business_id': 'xI-srUR-M8jYQEm-3cVOdA', 'name': 'Master Locksmith - Santa Barbara', 'address': '', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93101', 'latitude': 34.420334, 'longitude': -119.7107494, 'stars': 4.5, 'review_count': 16, 'is_open': 1, 'attributes': {'ByAppointmentOnly': 'False', 'BusinessAcceptsCreditCards': 'True'}, 'categories': 'Garage Door Services, Keys & Locksmiths, Home Services', 'hours': {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}, 'location': {'type': 'Point', 'coordinates': [-119.7107494, 34.420334]}, 'description': '### Business Summary\n\nMaster Locksmith - Santa Barbara, located in the heart of Santa Barbara, CA, is a highly-rated locksmith service specializing in garage door services, key duplication, and various home services. With a commendable rating of 4.5 stars based on 16 reviews, th

In [10]:
# Alternative: Search Query with GeoNear Operator
pipeline = [
    {
        "$geoNear": {
            "key": "location",
            "near": user_location,
            "distanceField": "distance",  # Field to store the calculated distance
            "maxDistance": 5,  # Maximum distance in meters (5 km)
            "spherical": True,  # Use spherical geometry
            "$match": {
                "isOpen": True,  # Filter for open restaurants
                "stars": {"$gte": 2}  # Filter by rating >= 2
            },
            # "$limit": 5  # Limit to top 5 results
        }
    },
    {
        "$unionWith": {
            "coll": "business",  # Specify the collection to union with
            "pipeline": [
                {
                    "$search": {
                        "cosmosSearch": {
                            "path": "embeddings",
                            "query": "Locksmith",  # Replace with your actual vector
                            "k": 5  # Limit to top 5 closest vectors
                        }
                    }
                },
                {
                    "$limit": 5  # Limit to top 5 results
                }
            ]
        }
    }
]

# Execute the aggregation
results = collection.aggregate(pipeline)

# Print the results
for doc in results:
    print(doc)


{'_id': ObjectId('671b4443efc3fbf09a7d46ce'), 'business_id': 'gfjjUlMC05kry6qjYyKrcA', 'name': 'West Coast Vascular', 'address': '2323 Oak Park Ln, Ste 201', 'city': 'Santa Barbara', 'state': 'CA', 'postal_code': '93105', 'latitude': 34.4287211, 'longitude': -119.7250939, 'stars': 2, 'review_count': 5, 'is_open': 1, 'attributes': {'BusinessAcceptsCreditCards': 'False'}, 'categories': 'Health & Medical, Doctors, Vascular Medicine, Medical Centers, Surgeons', 'hours': {'Monday': '0:0-0:0', 'Tuesday': '0:0-0:0', 'Wednesday': '0:0-0:0', 'Thursday': '0:0-0:0', 'Friday': '0:0-0:0', 'Saturday': '0:0-0:0', 'Sunday': '0:0-0:0'}, 'location': {'type': 'Point', 'coordinates': [-119.7250939, 34.4287211]}, 'description': 'West Coast Vascular, located at 2323 Oak Park Ln, Ste 201, Santa Barbara, CA 93105, is a specialized medical center focusing on vascular medicine and surgery. Despite its convenient location and comprehensive services in the health and medical sectors, including doctors and surgeon