In [None]:
# import pandas as pd
from datetime import datetime, timedelta
from pymongo import MongoClient, UpdateOne
from azure.core.exceptions import AzureError
from azure.core.credentials import AzureKeyCredential
import json
from openai import AzureOpenAI
# from dotenv import dotenv_values
import os
# config = dotenv_values()

In [None]:
from dotenv import load_dotenv

# load_dotenv("variables.env", override=True)

MONGO_CONNECTION_STRING= os.getenv("MONGO_CONNECTION_STRING_DISKANN")
AOAI_KEY = os.getenv("AOAI_KEY")
AOAI_ENDPOINT =  os.getenv("AOAI_ENDPOINT")
API_VERSION =  os.getenv("API_VERSION")

In [5]:
mongo_client = MongoClient(MONGO_CONNECTION_STRING)

db = mongo_client['contoso_bookings']

# Create collection if it doesn't exist
COLLECTION_NAME = "listings"

collection = db[COLLECTION_NAME]

if COLLECTION_NAME not in db.list_collection_names():
    db.create_collection(COLLECTION_NAME)
    print("Created collection '{}'.\n".format(COLLECTION_NAME))
else:
    print("Using collection: '{}'.\n".format(COLLECTION_NAME))

Using collection: 'listings'.



In [42]:
db.command({
  'createIndexes': 'listings',
  'indexes': [
    {
      'name': 'listingIndex',
      'key': {
        "embeddings": "cosmosSearch"
      },
      'cosmosSearchOptions': {
        'kind': 'vector-diskann',
        'numLists': 1,
        'similarity': 'COS',
        'dimensions': 1536
      }
    }
  ]
})

{'raw': {'defaultShard': {'numIndexesBefore': 3,
   'numIndexesAfter': 4,
   'createdCollectionAutomatically': False,
   'ok': 1}},
 'ok': 1}

In [10]:
# Load JSON data from file
with open("data/datasets without embeddings/small_for_testing.json", 'r') as file:
    data = json.load(file)

print(data[0])

{'id': '360', 'listing_url': 'https://www.airbnb.com/rooms/360', 'source': 'city scrape', 'name': 'Sit in the Peaceful Garden of the Chickadee Cottage in LoHi', 'description': "Enjoy the famous Colorado weather and unplug in indoor & outdoor living. <br />Our charming cottage has a serene ambiance throughout every area. <br />Spend a sunny afternoon out on the hammock or enjoy the garden parlor sofa relax, read, or play a game, sink into the shared hot tub, practice yoga on the deck.<br />We are located next to downtown and in the neighborhood of lower highlands, <br />Short walks to superb coffee shops, restaurants, microbrews, distilleries, dispensaries & downtown. We are 420 outdoors only. LGBT Friendly, allergy-free, fragrance-free & pet-free. Ozone sterilized.<br /><br />Chickadee Cottage is the largest of our guest cottages.<br /><br />LOCATION: <br />The cottage is located in the center of Lower Highlands (LOHI) next to the Navajo Street Arts District along with the Bug Theater.

In [71]:
import time
batch_size = 100
total_updated = 0
iteration = 0

while True:
    iteration += 1

    # Find records to update
    # NOTE: This does not seem to handle projections, is it possible to do this another way without stuffing the desired data into one field?
    # records_to_update = collection.aggregate([
    #     {
    #     '$match': {
    #             '$and': [
    #                 {'embeddings': {'$exists': False}},
    #                 {'description': {'$exists': True}}
    #             ]
    #         }
    #     },
    #     {
    #         '$project': {
    #             'data_to_embed': {
    #                 '$concat': [
    #                     {'$ifNull': ['$name', '']},
    #                     ' ',
    #                     {'$ifNull': ['$description', '']},
    #                     ' ',
    #                     {'$ifNull': ['$neighborhood_overview', '']}
    #                 ]
    #             }
    #         }
    #     },
    #     {
    #         "$limit": batch_size 
    #     }
    # ])

    records_to_update = collection.find({
    '$and': [
        {'embeddings': {'$exists': False}},
        {'description': {'$exists': True}}
    ]
    }).limit(batch_size)

    records_to_update = list(records_to_update)
    print(records_to_update)

    if not records_to_update:
        print(f"All rows have been updated. Total updated rows: {total_updated}")
        break

    total_updated += len(records_to_update)

    print(f"Iteration: {iteration}, has handled {total_updated} rows")

    # Prepare bulk operations
    bulk_ops = [
        UpdateOne({'_id': record['_id']},
                {'$generateEmbeddings': {'data_to_embed': 'embeddings'}})
        for record in records_to_update
    ]

    if bulk_ops:
        result = collection.bulk_write(bulk_ops)
        print(f"Bulk write result: {result.bulk_api_result}")

    time.sleep(0.5)  # Sleep for 500 ms


[{'_id': ObjectId('6722ee7991bc8b29de105e77'), 'id': '30011181', 'listing_url': 'https://www.airbnb.com/rooms/30011181', 'source': 'city scrape', 'name': 'New Central Modern RINO Studio', 'description': 'Location, location, location! Modern, new, downtown building with Japanese restaurant, yoga studio and juice bar in the building as well as private gardens. High-end finishes and furnishings. Steps to best and trendiest restaurants, coffee shops and bars in the RINO Arts District. 5-star Ramble Hotel a two minute walk from front door.  One queen bed and one very comfortable pull out couch available for up to four guests. Coffee, WiFi and Netflix included. Non-smoking building. No parties.', 'neighborhood_overview': 'Check out a comprehensive guide on the best neighborhood to be in Denver:<br /><br />https://www.5280.com/2017/04/5280-neighborhood-guide-rino/', 'latitude': '39.75699', 'longitude': '-104.98504', 'price': '$120.00', 'amenities': '["Hangers", "Essentials", "Dishwasher", "Mi

In [None]:
# Define the user's location
user_location = {
    "type": "Point",
    "coordinates": [-105.0020980834961, 39.766414642333984]  # User's location (longitude, latitude)
}

In [8]:
command = {
    "createIndexes": "listings",
    "indexes": [
        {
            "key": {
                "location": "2dsphere"
            },

            "name": "location_index"
        }
    ]
}
db.command(command)


{'raw': {'defaultShard': {'numIndexesBefore': 4,
   'numIndexesAfter': 4,
   'createdCollectionAutomatically': False,
   'note': 'all indexes already exist',
   'ok': 1}},
 'ok': 1}

In [None]:

def search_listings(query, limit=5):
   # Create an index on the location field
   # TODO: Keyword search 
    command = { "createIndexes": "listings", "indexes": [ { "key": { "location": 1 }, "name": "location" } ] }
    db.command(command)

    # Search for the top 5 closest vectors to the query within a 30 mile radius of user's location
    pipeline = [
                {
                    "$search": {
                        "cosmosSearch": {
                            "path": "embeddings",
                            "query": query,  # Replace with your query
                            "k": 5,  # Limit to top 5 closest vectors
                            "filter": {"$and": [
                                # { "is_open": { "$eq": 1 } },
                                #  The query converts the distance to radians by dividing by the approximate equatorial radius of the earth, 3963.2 miles
                                {"location": {"$geoWithin": 
                                                {"$centerSphere":[user_location["coordinates"], 30/3963.2 ]}}}
                                ]
                            }
                        }
                    }
                },
                {

                    "$limit": 5  # Limit to top 5 results
                },
                {
                    '$project': { 'similarityScore': { '$meta': 'searchScore' }, 'document' : '$$ROOT' }, 

                }
            ]
    # Execute the aggregation
    results = collection.aggregate(pipeline)
    

    # Print the results
    for doc in results:

        print(f"Similarity Score: {doc['similarityScore']}")  
        print(f"Location: {doc['document'].get('location', 'N/A')}")  
        print(f"Description: {doc['document'].get('description', 'N/A')}")  
        print(f"Nighborhood Overview: {doc['document'].get('neighborhood_overview', 'N/A')}")
        print(f"Price per day: {doc['document'].get('price', 'N/A')}") 
        print(f"Listing Url: {doc['document'].get('listing_url', 'N/A')}\n") 



In [72]:
query = "quiet home with hot tub"

search_listings(query)

Similarity Score: 0.8678984563290169
Location: {'type': 'Point', 'coordinates': [-104.93183, 39.75742]}
Description: A remodeled space in a great Denver neighborhood with a separate and secure keyless entrance.  Just 2 blocks from recently opened café, brew house, wine bar and pizza joint.  If you prefer to stay in, there’s a kitchen, living room, workout space and bath.  Don’t forget you will also have access to the hot tub.  Enjoy a comfortable queen bed and take advantage of the provided coffee to get your day started.  If you have a larger party please inquire about our 2 bedroom suite option.
Nighborhood Overview: 
Price per day: $117.00
Listing Url: https://www.airbnb.com/rooms/37567133

Similarity Score: 0.8668637116984977
Location: {'type': 'Point', 'coordinates': [-104.95573, 39.75546]}
Description: 
Nighborhood Overview: .
Price per day: $160.00
Listing Url: https://www.airbnb.com/rooms/4080404

Similarity Score: 0.8648680013575798
Location: {'type': 'Point', 'coordinates': [