In [2]:
# import pandas as pd
from datetime import datetime, timedelta
from pymongo import MongoClient, UpdateOne
import json
import os
from dotenv import load_dotenv

In [None]:
# uncomment to use env file
# load_dotenv("variables.env", override=True)
load_dotenv(override=True)

MONGO_CONNECTION_STRING= os.getenv("MONGO_CONNECTION_STRING_DISKANN")
COLLECTION_NAME=os.getenv("COLLECTION_NAME") if os.getenv("COLLECTION_NAME") else "listings"
DATABASE_NAME= os.getenv("DATABASE_NAME") if os.getenv("DATABASE_NAME") else "contoso_bookings"
AOAI_KEY = os.getenv("AOAI_KEY")
AOAI_ENDPOINT =  os.getenv("AOAI_ENDPOINT")
API_VERSION =  os.getenv("API_VERSION")
AOAI_CHAT_MODEL_DEPLOYMENT = os.getenv("AOAI_COMPLETION_DEPLOYMENT")

print(AOAI_KEY)

a9c67fe3987f44e3a5438e551c33d9db


In [None]:
mongo_client = MongoClient(MONGO_CONNECTION_STRING)

db = mongo_client[DATABASE_NAME]
collection = db[COLLECTION_NAME]

if COLLECTION_NAME not in db.list_collection_names():
    db.create_collection(COLLECTION_NAME)
    print("Created collection '{}'.\n".format(COLLECTION_NAME))
else:
    print("Using collection: '{}'.\n".format(COLLECTION_NAME))

Using collection: 'listings'.



In [137]:
db.command({
  'createIndexes': 'listings',
  'indexes': [
    {
      'name': 'listingIndex',
      'key': {
        "embeddings": "cosmosSearch"
      },
      'cosmosSearchOptions': {
        'kind': 'vector-diskann',
        'numLists': 1,
        'similarity': 'COS',
        'dimensions': 1536
      }
    }
  ]
})

{'raw': {'defaultShard': {'numIndexesBefore': 4,
   'numIndexesAfter': 4,
   'createdCollectionAutomatically': False,
   'note': 'all indexes already exist',
   'ok': 1}},
 'ok': 1}

In [138]:
# Load JSON data from file
with open("data/datasets without embeddings/small_for_testing.json", 'r') as file:
    data = json.load(file)

print(data[0])

{'id': '360', 'listing_url': 'https://www.airbnb.com/rooms/360', 'source': 'city scrape', 'name': 'Sit in the Peaceful Garden of the Chickadee Cottage in LoHi', 'description': "Enjoy the famous Colorado weather and unplug in indoor & outdoor living. <br />Our charming cottage has a serene ambiance throughout every area. <br />Spend a sunny afternoon out on the hammock or enjoy the garden parlor sofa relax, read, or play a game, sink into the shared hot tub, practice yoga on the deck.<br />We are located next to downtown and in the neighborhood of lower highlands, <br />Short walks to superb coffee shops, restaurants, microbrews, distilleries, dispensaries & downtown. We are 420 outdoors only. LGBT Friendly, allergy-free, fragrance-free & pet-free. Ozone sterilized.<br /><br />Chickadee Cottage is the largest of our guest cottages.<br /><br />LOCATION: <br />The cottage is located in the center of Lower Highlands (LOHI) next to the Navajo Street Arts District along with the Bug Theater.

In [None]:
import time
batch_size = 15
total_updated = 0
iteration = 0

while True:
    iteration += 1

    # Find records to update
    # NOTE: This does not seem to handle projections, is it possible to do this another way without stuffing the desired data into one field?
    # records_to_update = collection.aggregate([
    #     {
    #     '$match': {
    #             '$and': [
    #                 {'embeddings': {'$exists': False}},
    #                 {'description': {'$exists': True}}
    #             ]
    #         }
    #     },
    #     {
    #         '$project': {
    #             'data_to_embed': {
    #                 '$concat': [
    #                     {'$ifNull': ['$name', '']},
    #                     ' ',
    #                     {'$ifNull': ['$description', '']},
    #                     ' ',
    #                     {'$ifNull': ['$neighborhood_overview', '']}
    #                 ]
    #             }
    #         }
    #     },
    #     {
    #         "$limit": batch_size 
    #     }
    # ])

    records_to_update = collection.find({
    '$and': [
        {'embeddings': {'$exists': False}},
        {'description': {'$exists': True}}
    ]
    }).limit(batch_size)

    records_to_update = list(records_to_update)
    print(records_to_update)

    if not records_to_update:
        print(f"All rows have been updated. Total updated rows: {total_updated}")
        break

    total_updated += len(records_to_update)

    print(f"Iteration: {iteration}, has handled {total_updated} rows")

    # Prepare bulk operations
    bulk_ops = [
        UpdateOne({'_id': record['_id']},
                {'$generateEmbeddings': {'data_to_embed': 'embeddings'}})
        for record in records_to_update
    ]

    if bulk_ops:
        result = collection.bulk_write(bulk_ops)
        print(f"Bulk write result: {result.bulk_api_result}")

    time.sleep(0.5)  # Sleep for 500 ms


[{'_id': ObjectId('67252c67739892c59d79506b'), 'id': '35764557', 'listing_url': 'https://www.airbnb.com/rooms/35764557', 'source': 'city scrape', 'name': 'Sunny Private Apartment Near RiNo', 'description': 'This newer private 1br apartment sits above the garage behind host residence, is centrally located in a residential neighborhood northeast of downtown, and 5-10 minute driving distance from LoDo, RiNo, breweries, downtown, & other shopping, dining, & entertainment; City Park, Zoo, & museums. Enjoy your own space with all the amenities, free off-street parking, & shared back yard & patio with plenty of seating for relaxing & dining. (For groups over 4, we have a room on a separate listing.)', 'neighborhood_overview': 'The Cole neighborhood is a diverse residential neighborhood next to the popular RiNo district, which is more industrial but changing quickly. You may notice construction in and around the neighborhoods as Denver is growing very rapidly and we have several big projects u

In [150]:
command = {
    "createIndexes": "listings",
    "indexes": [
        {
            "key": {
                "location": "2dsphere"
            },

            "name": "location_index"
        }
    ]
}
db.command(command)


{'raw': {'defaultShard': {'numIndexesBefore': 5,
   'numIndexesAfter': 5,
   'createdCollectionAutomatically': False,
   'note': 'all indexes already exist',
   'ok': 1}},
 'ok': 1}

In [151]:
collection.create_index('amenities')

'amenities_1'

In [36]:
# Define the user's location
user_location = {
    "type": "Point",
    "coordinates": [-105.0020980834961, 39.766414642333984]  # User's location (longitude, latitude)
}

def search_listings(query, limit=5):
   # Create an index on the location field
   # TODO: Keyword search 
    command = { "createIndexes": "listings", "indexes": [ { "key": { "location": 1 }, "name": "location" } ] }
    db.command(command)

    # Search for the top 5 closest vectors to the query within a 30 mile radius of user's location
    pipeline = [
                {
                    "$search": {
                        "cosmosSearch": {
                            "path": "embeddings",
                            "query": query,  # Replace with your query
                            "k": 5,  # Limit to top 5 closest vectors
                            "filter": {"$and": [
                                { "amenities": { "$in": ["Dishwasher", "Gym"] }},
                                #  The query converts the distance to radians by dividing by the approximate equatorial radius of the earth, 3963.2 miles
                                {"location": {"$geoWithin": 
                                                {"$centerSphere":[user_location["coordinates"], 30/3963.2 ]}}}
                                ]
                            }
                        }
                    }
                },
                {

                    "$limit": 5  # Limit to top 5 results
                },
                {
                    '$project': { 'similarityScore': { '$meta': 'searchScore' }, 'document' : '$$ROOT' }, 

                }
            ]
    # Execute the aggregation
    results = collection.aggregate(pipeline)
    
    results_dict = []

    # Print the results
    for doc in results:

        print(f"Similarity Score: {doc['similarityScore']}")  
        print(f"id: {doc['_id']}")
        print(f"Name: {doc['document'].get('name', 'N/A')}")  
        print(f"Location: {doc['document'].get('location', 'N/A')}")  
        print(f"Description: {doc['document'].get('description', 'N/A')}")  
        print(f"Neighborhood Overview: {doc['document'].get('neighborhood_overview', 'N/A')}")
        print(f"Price per day: {doc['document'].get('price', 'N/A')}") 
        print(f"Amenities: {doc['document'].get('amenities', 'N/A')}")
        print(f"Listing Url: {doc['document'].get('listing_url', 'N/A')}\n") 
    
        result = {
            "similarityScore": doc['similarityScore'],
            "id": doc['_id'],
            "name": doc['document'].get('name', 'N/A'),
            "location": doc['document'].get('location', 'N/A'),
            "description": doc['document'].get('description', 'N/A'),
            "neighborhood_overview": doc['document'].get('neighborhood_overview', 'N/A'),
            "price": doc['document'].get('price', 'N/A'),
            "amenities": doc['document'].get('amenities', 'N/A'),
            "listing_url": doc['document'].get('listing_url', 'N/A')
        }
        results_dict.append(result)

    return results_dict



In [None]:
query = "quiet home with hot tub"

search_listings(query)

In [3]:
import openai
openai.api_type = os.getenv("OPENAI_API_TYPE", "azure")
openai.base_url = os.getenv("AZURE_OPENAI_ENDPOINT",AOAI_ENDPOINT )
openai.api_version = API_VERSION
openai.api_key = os.getenv("OPENAI_API_KEY", AOAI_KEY)

In [None]:
# Create RAG Function
from langchain.prompts import ChatPromptTemplate
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
from langchain_community.vectorstores.azure_cosmos_db import AzureCosmosDBVectorSearch


openai_embeddings_model = os.getenv("AZURE_OPENAI_EMBEDDINGS_MODEL_NAME", "text-embedding-ada-002")
openai_embeddings_deployment = os.getenv("AZURE_OPENAI_EMBEDDINGS_DEPLOYMENT_NAME", "embeddings")

azure_openai_embeddings: AzureOpenAIEmbeddings = AzureOpenAIEmbeddings(
    model=openai_embeddings_model,
    azure_deployment=openai_embeddings_deployment,
    api_key=AOAI_KEY,
    azure_endpoint=AOAI_ENDPOINT,
)

openai_chat_model = os.getenv("AZURE_OPENAI_CHAT_MODEL_NAME", "gpt-3.5-turbo")
openai_chat_deployment = os.getenv("AOAI_CHAT_DEPLOYMENT_NAME", AOAI_CHAT_MODEL_DEPLOYMENT)

azure_openai_chat: AzureChatOpenAI = AzureChatOpenAI(
    model=openai_chat_model,
    azure_deployment=openai_chat_deployment,
    api_key=AOAI_KEY,
    azure_endpoint=AOAI_ENDPOINT,
    api_version=API_VERSION,

)

# Test the chat flow
chat_response = azure_openai_chat.invoke("Tell me a joke")
print(chat_response.content)

Why don't scientists trust atoms? Because they make up everything!


In [5]:
from typing import List
from bson import ObjectId
from langchain_core.callbacks import CallbackManagerForRetrieverRun
from langchain_core.documents import Document
from langchain_core.retrievers import BaseRetriever

## Custom Retriever

class CustomRetriever(BaseRetriever):
    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        search_results = search_listings(query)
    
        documents = [] # List of Document objects
        for result in search_results:
            document = Document(
                id={result['id']},
                page_content=result['name'],
                metadata=result
            )
            documents.append(document)
        return documents


In [None]:
query = "quiet house with hot tub"
retriever = CustomRetriever()
# retriever.invoke(query)

Similarity Score: 0.8543511964710722
id: 67252c67739892c59d794e66
Name: Sunny Bungalow with Private Hot Tub Near Downtown
Location: {'type': 'Point', 'coordinates': [-104.94234, 39.7423]}
Description: This Bungalow-style single-family home is perfect for multi-family parties or roommate situations.  It features 2 full bedrooms plus a loft-style "guest" bedroom, 2 separate kitchens, hardwood floors, theater, covered front porch, natural light, fenced backyard, a hot tub, and a vibrant, hip neighborhood next to City Park and minutes from downtown!  The garden-level bedroom can be closed off into its own studio apartment complete with its own kitchen and bathroom for multifamily parties.
Neighborhood Overview: The South City Park / Congress Park Neighborhood is home to quiet residential streets bordered by lively entertainment.  Within walking distance from the house are bars, restaurants, breweries, diners, gyms, yoga studios, salons, coffee shops, retail stores, ice cream parlors, liquo

[Document(id="{ObjectId('67252c67739892c59d794e66')}", metadata={'similarityScore': 0.8543511964710722, 'id': ObjectId('67252c67739892c59d794e66'), 'name': 'Sunny Bungalow with Private Hot Tub Near Downtown', 'location': {'type': 'Point', 'coordinates': [-104.94234, 39.7423]}, 'description': 'This Bungalow-style single-family home is perfect for multi-family parties or roommate situations.  It features 2 full bedrooms plus a loft-style "guest" bedroom, 2 separate kitchens, hardwood floors, theater, covered front porch, natural light, fenced backyard, a hot tub, and a vibrant, hip neighborhood next to City Park and minutes from downtown!  The garden-level bedroom can be closed off into its own studio apartment complete with its own kitchen and bathroom for multifamily parties.', 'neighborhood_overview': "The South City Park / Congress Park Neighborhood is home to quiet residential streets bordered by lively entertainment.  Within walking distance from the house are bars, restaurants, br

In [None]:
from langchain.prompts import ChatPromptTemplate

REPHRASE_PROMPT = """\
Given the following conversation and a follow up question, rephrase the follow up \
question to be a standalone question.

Chat History:
{chat_history}
Follow Up Input: {question}
Standalone Question:"""

CONTEXT_PROMPT = """\
You are a chatbot, tasked with answering any question about \
rental listings from the context. You can also answer questions about the particular areas, and provide suggestions for things to do.\
You may ask a follow up question about things the user likes to do while on vacation or if there's a particular point of interest.

Generate a response of 100 words or less for the \
given question based solely on the provided search results. \
You must only use information from the provided search results. Use an unbiased and \
fun tone. Do not repeat text. Your response must be solely based on the provided context.

If there is nothing in the context is relevant to the question at hand, just say \
"I'm not sure." Don't try to make up an answer.

Anything between the following `context` html blocks is retrieved from a knowledge \
bank, not part of the conversation with the user. 

<context>
    {context} 
<context/>

REMEMBER: If there is no relevant information within the context, just say "I'm \
not sure." Don't try to make up an answer. Anything between the preceding 'context' \
html blocks is retrieved from a knowledge bank, not part of the conversation with the \
user.\

User Question: {input}

Chatbot Response:"""

rephrase_prompt_template = ChatPromptTemplate.from_template(REPHRASE_PROMPT)
context_prompt_template = ChatPromptTemplate.from_template(CONTEXT_PROMPT)

In [48]:
# Use a custom retriever
document_retriever = retriever

# Rephrase Chain
rephrase_chain = rephrase_prompt_template | azure_openai_chat
# Context Chain
context_chain = context_prompt_template | azure_openai_chat

In [47]:
messages = [{"content": "Do you have any houses in quiet neighborhoods?", "role": "user"}]

rephrased_question = rephrase_chain.invoke({"chat_history": messages[:-1], "question": messages[-1]})
print(rephrased_question.content)

What type of neighborhood are you looking for when it comes to houses?


In [None]:
# Get the context from the database
context = document_retriever.invoke(str(rephrased_question.content))


Similarity Score: 0.8197907805442868
id: 67252c67739892c59d794fb7
Name: Cute House in $million neighborhood
Location: {'type': 'Point', 'coordinates': [-104.95458, 39.68801]}
Description: Fully furnished, pet friendly fully fenced backyard, quiet professionals neighbourhood of million dollar homes.  Newly renovated. Big screen smart tv for cinema viewing. New fridge heating hot water roof and siding. 10 minutes to Cherry Creek upmarket shopping restaurants and bars, Washington Park activities, DU, Gaylord and Pearl st restaurants, bars, farmers mkt, Wholefoods, Starbucks, fast food. 15minutes to downtown, Single garage, 2 off street parking spaces. Tons of on street parking.
Neighborhood Overview: Upscale multi-million dollar homes. Quiet, professionals, close to great restaurants, bars and an awesome park with lots to do.
Price per day: $110.00
Amenities: ['Hangers', 'Essentials', 'Dishwasher', 'Books and reading material', 'Dedicated workspace', 'Toaster', 'Microwave', 'Oven', 'Bakin

In [None]:
# Generate a response based on the context
response = context_chain.invoke({"context": context, "input": rephrased_question.content})
print(response.content)

Here are a few rental listings located in quiet neighborhoods:
- 17th & Larimer Private, Quiet Condo - $113.00/night
- Beautifully Renovated 1 bedroom in Quiet Community - $55.00/night
- Cute House in $million neighborhood - $110.00/night
- Cozy 1BD-parking, yard, laundry - $115.00/night


In [52]:
messages.append({"content": response.content, "role": "assistant"})

In [None]:
# Test with another question to see if the chat history is maintained
messages.append({"content": "Which rental listings are quiet?", "role": "user"})

rephrased_question = rephrase_chain.invoke({"chat_history": messages[:-1], "question": messages[-1]})
context = document_retriever.invoke(str(rephrased_question.content))

response = context_chain.invoke({"context": context, "input": rephrased_question.content})

Similarity Score: 0.8145037889480649
id: 67252c67739892c59d794d43
Name: 17th & Larimer Private, Quiet Condo
Location: {'type': 'Point', 'coordinates': [-104.99687, 39.74981]}
Description: This condo is a QUIET building, still close to everything in downtown Denver! Walk to 16th Street Mall, Coors Field, Theater District, Pepsi Center, and more than 100 restaurants - then retire to your own personal space for rest & relaxation...<br /><br />30-night minimum stay, and we are required to sign a simple lease agreement to be kept on file with HOA.
Neighborhood Overview: 
Price per day: $113.00
Amenities: ['Hangers', 'Essentials', 'Children’s books and toys for ages 5-10 years old and 10+ years old', 'Books and reading material', 'Dedicated workspace', 'Bluetooth sound system', 'Dishwasher', 'Microwave', 'Toaster', 'Barbecue utensils', 'Shampoo', 'Hair dryer', 'Free parking garage on premises – 1 space', 'Oven', 'Baking sheet', 'Shared sauna', 'Dishes and silverware', 'Host greets you', 'Cit

In [57]:
print("Rephrased Question: ", rephrased_question.content)
print("LLM Response: ", response.content)

Rephrased Question:  Can you provide me with a list of rental listings that are located in quiet neighborhoods?
LLM Response:  Here are some rental listings located in quiet neighborhoods: 17th & Larimer Private, Quiet Condo ($113.00), Beautifully Renovated 1 bedroom in Quiet Community ($55.00), Cute House in $million neighborhood ($110.00), and Cozy 1BD-parking, yard, laundry ($115.00).
