This is a starter notebook for the project, you'll have to import the libraries you'll need, you can find a list of the ones available in this workspace in the requirements.txt file in this workspace. 

## Setting up environment

In [1]:
import os

os.environ["OPENAI_API_KEY"] = "voc-183321194312667737058046757cd49b1d8b6.91865541"
os.environ["OPENAI_API_BASE"] = "https://openai.vocareum.com/v1"

from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

In [None]:
pip install chromadb

In [None]:
pip install sentence-transformers==2.2.2

In [2]:
# Libraries
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from sentence_transformers import SentenceTransformer
from langchain.prompts import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate

model_name = "gpt-3.5-turbo"
temperature = 0.0
llm = OpenAI(model_name=model_name, temperature=temperature, max_tokens = 2000)



## Synthetic Data Generation
#### Generating Real Estate Listings with an LLM
use a Large Language Model (LLM) to generate at least 10 diverse and realistic real estate listings containing facts about the real estate.

In [3]:
# Define the prompt for generating listings
prompt_template = """
Generate 15 diverse real estate listings in a structured format:
Index: [number]
Neighborhood: [location in Singapore]
Price: [SGD]
Bedrooms and bathrooms: [Number]
House Size: [Size in sqft]
Features: [comma-separated]
Neighborhood Description: [max 2 sentences]
Ensure the listings are realistic and varied in style and price.
"""

In [4]:
# Function to generate listings
def generate_real_estate_listings(prompt):
    try:
        # Call the OpenAI LLM to generate the listings
        response = llm(prompt)
        return response  # LangChain's OpenAI LLM directly returns the generated text
    except Exception as e:
        return f"An error occurred: {e}"

# Generate the listings
listings = generate_real_estate_listings(prompt_template)

In [5]:
# Print and save the generated listings
print("Generated Real Estate Listings:")
print(listings)

Generated Real Estate Listings:
1
Neighborhood: Orchard
Price: $5,000,000
Bedrooms and bathrooms: 4 bedrooms, 3 bathrooms
House Size: 2,500 sqft
Features: Swimming pool, garden, high-end finishes
Neighborhood Description: Orchard is a prestigious area known for its luxury shopping and dining options.

2
Neighborhood: Tiong Bahru
Price: $1,200,000
Bedrooms and bathrooms: 2 bedrooms, 2 bathrooms
House Size: 1,000 sqft
Features: Renovated kitchen, balcony, city views
Neighborhood Description: Tiong Bahru is a trendy neighborhood with a mix of heritage buildings and modern cafes.

3
Neighborhood: Sentosa Cove
Price: $10,000,000
Bedrooms and bathrooms: 5 bedrooms, 5 bathrooms
House Size: 5,000 sqft
Features: Private yacht berth, waterfront views, rooftop terrace
Neighborhood Description: Sentosa Cove is a luxurious waterfront enclave with exclusive amenities.

4
Neighborhood: Bukit Timah
Price: $3,500,000
Bedrooms and bathrooms: 3 bedrooms, 4 bathrooms
House Size: 3,200 sqft
Features: Gourm

In [6]:
# Save the listings to a file for submission
with open("listings.txt", "w") as file:
    file.write(listings)

## Semantic Search

#### Creating a Vector Database and Storing Listings
create a vector database and successfully store real estate listing embeddings within it. The database should effectively store and organize the embeddings generated from the LLM-created listings.

In [7]:
model_name = "gpt-3.5-turbo"
temperature = 0.0
llm = OpenAI(model_name=model_name, temperature=temperature, max_tokens = 900)

In [8]:
import numpy as np
from sentence_transformers import SentenceTransformer
import chromadb
import os
from typing import List, Union

# Define the model to use for generating embeddings
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'

In [9]:
# Define the model to use for generating embeddings
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'

In [10]:
# Function to read and correctly parse listings from a text file
def read_listings(file_path: str) -> List[str]:
    """
    Reads listings from a text file, ensuring each listing is stored as a whole rather than line-by-line.
    
    Parameters:
    file_path (str): Path to the text file containing listings.
    
    Returns:
    List[str]: A list of properly formatted listings.
    """
    with open(file_path, "r", encoding="utf-8") as fh:
        content = fh.read().strip()  # Read the whole file and strip leading/trailing whitespace
    listings = content.split("\n\n")  # Split listings by an empty line assuming each listing is separated
    return [listing.replace("\n", " ") for listing in listings]  # Ensure each listing is a single string

In [11]:
# Function to generate embeddings for listings
def generate_embeddings(input_data: Union[str, List[str]]) -> np.ndarray:
    """
    Generates embeddings for the given listings using Sentence Transformers.

    Parameters:
    input_data (Union[str, List[str]]): Single listing or list of listings.

    Returns:
    np.ndarray: Array of embeddings.
    """
    model = SentenceTransformer(MODEL_NAME)
    embeddings = model.encode(input_data)
    return embeddings

# Read and process listings
listings = read_listings("listings.txt")

# Generate embeddings
embeddings = generate_embeddings(listings)

In [12]:
# Ensure embeddings and listings match in length
assert len(embeddings) == len(listings), "Mismatch between embeddings and listings length!"

# Save embeddings for later use
np.save("listings_embeddings.npy", embeddings)

# Initialize ChromaDB client
client = chromadb.PersistentClient(path="./chroma_real_estate")

😠

In [13]:
# Check if collection exists and delete before re-adding to prevent duplicates
collection_name = "real_estate_listings"
existing_collections = client.list_collections()

if collection_name in [col.name for col in existing_collections]:
    client.delete_collection(name=collection_name)

In [14]:
# Create or get the collection
collection = client.get_or_create_collection(name=collection_name)

# Prepare data for insertion
ids = [f"listing_{i}" for i in range(len(listings))]
metadata = [{"listing": listing} for listing in listings]  # Ensure full listing is stored, not just one line
# Prepare data for insertion
ids = [f"listing_{i}" for i in range(len(listings))]
metadata = [{"listing": listing} for listing in listings]  # Store the entire listing text

# Add data to ChromaDB
collection.add(
    ids=ids,
    embeddings=embeddings.tolist(),  # Convert numpy array to list
    metadatas=metadata
)

# Retrieve and print a sample stored listing to confirm correct formatting
retrieved_data = collection.get()
print("\n✅ Retrieved data sample:")
for item in retrieved_data['metadatas'][:1]:
    print(item)


✅ Retrieved data sample:
{'listing': '1 Neighborhood: Orchard Price: $5,000,000 Bedrooms and bathrooms: 4 bedrooms, 3 bathrooms House Size: 2,500 sqft Features: Swimming pool, garden, high-end finishes Neighborhood Description: Orchard is a prestigious area known for its luxury shopping and dining options.'}


In [15]:
# Confirm successful insertion
print(f"✅ Inserted {len(ids)} listings into '{collection_name}' collection.")

✅ Inserted 15 listings into 'real_estate_listings' collection.


#### Semantic Search of Listings Based on Buyer Preferences

The application must include a functionality where listings are semantically searched based on given buyer preferences. The search should return listings that closely match the input preferences.

In [16]:
# Define the model to use for generating embeddings
model = SentenceTransformer(MODEL_NAME)

# Initialize ChromaDB client
client = chromadb.PersistentClient(path="./chroma_real_estate")
collection_name = "real_estate_listings"
collection = client.get_or_create_collection(name=collection_name)

In [29]:
import re
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

def extract_keywords(text):
    """Extracts key nouns and adjectives from a text while removing common stopwords."""
    text = text.lower()
    words = re.findall(r'\b[a-zA-Z]+\b', text)  # Extract words only (no punctuation)
    important_words = [word for word in words if word not in ENGLISH_STOP_WORDS]  # Remove stopwords
    return set(important_words)  # Convert to set for matching

def infer_budget(query):
    """Infer price budget from the query if any price-related terms exist."""
    price_mapping = {
        "cheap": 500000,
        "affordable": 800000,
        "budget-friendly": 1000000,
        "mid-range": 2000000,
        "luxury": 5000000,
        "high-end": 10000000
    }
    for term, price in price_mapping.items():
        if term in query.lower():
            return price
    return None  # No budget constraint mentioned

In [31]:
def extract_query_components(query):
    """Extracts structured components like bedrooms, house type, features, and budget."""
    components = {
        "bedrooms": re.search(r'(\d+)-bedroom', query),
        "house_type": re.search(r'(house|apartment|condo|villa|bungalow)', query),
        "features": re.findall(r'(modern kitchen|spacious|swimming pool|gym|garden|balcony|public transportation)', query),
        "budget": re.search(r'(\d+[mk]?)', query)  # Looks for price indicators like '2M' or '500K'
    }
    
    # Format extracted components
    structured_query = {
        "bedrooms": components["bedrooms"].group() if components["bedrooms"] else None,
        "house_type": components["house_type"].group() if components["house_type"] else None,
        "features": components["features"] if components["features"] else [],
        "budget": components["budget"].group() if components["budget"] else None
    }
    
    return structured_query

print("\n\U0001F3E1 Testing HomeMatch with Auto-Generated Queries:")
print("------------------------------------------------------")

test_queries = [
    "I want a spacious 3-bedroom house with a modern kitchen for 2 million.",
    "Looking for a luxury apartment near town with easy access to public transportation, budget 5M.",
    "Seeking a budget-friendly home with 2 bedrooms in a quiet suburban neighborhood, around 800K.",
    "Need a home with a swimming pool, a gym, and a garden for 3.5 million."
]

for idx, query in enumerate(test_queries, 1):
    print(f"\n📝 **Test Query {idx}:** {query}")
    
    # Extract structured components
    query_components = extract_query_components(query)
    print(f"🔹 Extracted Query Components: {query_components}")
    
    # Generate embedding for query
    query_embedding = model.encode([query]).tolist()

    # Perform semantic search in ChromaDB
    query_results = collection.query(
        query_embeddings=query_embedding,
        n_results=5
    )

    # Extract query keywords
    query_keywords = extract_keywords(query)

    print("\n🔍 **Matching Listings:**")
    
    # Ensure valid results exist
    if not query_results["metadatas"]:
        print("❌ No matching listings found.")
        continue

    for i, (listing, score) in enumerate(zip(query_results["metadatas"][0], query_results["distances"][0])):
        similarity_score = 1 / (1 + score)  # Normalize similarity score
        
        # Ensure listing is correctly formatted
        if isinstance(listing, dict) and "listing" in listing:
            listing_text = listing["listing"].lower()
            listing_price = int(re.search(r'\$(\d+[,\d]*)', listing_text).group(1).replace(',', '')) if re.search(r'\$(\d+[,\d]*)', listing_text) else None
        else:
            print("❌ Error: Unexpected data structure in query results.")
            continue

        # Extract listing keywords
        listing_keywords = extract_keywords(listing_text)
        
        # Find matched keywords
        matched_keywords = query_keywords.intersection(listing_keywords)
        
        # Apply price adjustment if budget is present
        if query_components["budget"] and listing_price:
            budget_value = int(re.sub(r'[^0-9]', '', query_components["budget"])) * (1_000_000 if 'm' in query_components["budget"].lower() else 1_000)
            price_penalty = abs(listing_price - budget_value) / budget_value
            similarity_score *= (1 - min(price_penalty, 0.3))  # Cap penalty at 30%
        
        print(f"\n🔹 Match {i+1} (Adjusted Similarity Score: {similarity_score:.2f}):")
        print(listing["listing"])

        # Show keyword matches
        if matched_keywords:
            print(f"✅ Matched Keywords: {', '.join(matched_keywords)}")
        else:
            print("⚠️ No strong keyword matches found.")

    print("\n" + "-" * 50)  # Separator for readability


🏡 Testing HomeMatch with Auto-Generated Queries:
------------------------------------------------------

📝 **Test Query 1:** I want a spacious 3-bedroom house with a modern kitchen for 2 million.
🔹 Extracted Query Components: {'bedrooms': '3-bedroom', 'house_type': 'house', 'features': ['spacious', 'modern kitchen'], 'budget': '3'}

🔍 **Matching Listings:**

🔹 Match 1 (Adjusted Similarity Score: 0.02):
2 Neighborhood: Tiong Bahru Price: $1,200,000 Bedrooms and bathrooms: 2 bedrooms, 2 bathrooms House Size: 1,000 sqft Features: Renovated kitchen, balcony, city views Neighborhood Description: Tiong Bahru is a trendy neighborhood with a mix of heritage buildings and modern cafes.
✅ Matched Keywords: modern, house, kitchen

🔹 Match 2 (Adjusted Similarity Score: 0.02):
3 Neighborhood: Sentosa Cove Price: $10,000,000 Bedrooms and bathrooms: 5 bedrooms, 5 bathrooms House Size: 5,000 sqft Features: Private yacht berth, waterfront views, rooftop terrace Neighborhood Description: Sentosa Cove i

#### Test with personal input

In [33]:
# Step 1: Collect buyer preferences
questions = [   
    "Please describe your ideal house. You may share details on your preferred neighbourhood, number of rooms, amenities, or more."
]

print("\nPlease answer the following question to specify your preferenes:")
answers = [input(q + "\n") for q in questions]


Please answer the following question to specify your preferenes:
Please describe your ideal house. You may share details on your preferred neighbourhood, number of rooms, amenities, or more.
I want a 2 person apartment with transport nearby. I want it affordable.


In [34]:
# Step 2: Convert preferences into an embedding
preferences_text = " ".join(answers)  # Combine answers into a single text
preferences_embedding = model.encode(preferences_text)

# Step 3: Query the ChromaDB database with user preferences
query_results = collection.query(
    query_embeddings=[preferences_embedding.tolist()],
    n_results=5  # Retrieve top 5 most similar listings
)

In [35]:
print("\n🏡 Here are the top matching properties for you:")
for i, (listing, score) in enumerate(zip(query_results["metadatas"], query_results["distances"][0])):
    similarity_score = 1 / (1 + score)  # Convert distance to similarity score
    print(f"\n🔹 Match {i+1} (Similarity Score: {similarity_score:.2f}):")
    
    # Ensure correct metadata access
    if isinstance(listing, list) and len(listing) > 0:
        print(listing[0]["listing"])  # Access first dictionary in the list
    else:
        print(listing["listing"])  # If it's already a dictionary


🏡 Here are the top matching properties for you:

🔹 Match 1 (Similarity Score: 0.03):
2 Neighborhood: Tiong Bahru Price: $1,200,000 Bedrooms and bathrooms: 2 bedrooms, 2 bathrooms House Size: 1,000 sqft Features: Renovated kitchen, balcony, city views Neighborhood Description: Tiong Bahru is a trendy neighborhood with a mix of heritage buildings and modern cafes.


## Augmented Response Generation

#### Logic for Searching and Augmenting Listing Descriptions

The project must demonstrate a logical flow where buyer preferences are used to search and then augment the description of real estate listings. The augmentation should personalize the listing without changing factual information.

In [36]:
# Initialize ChromaDB client and connect to the collection
client = chromadb.PersistentClient(path="./chroma_real_estate")
collection = client.get_collection(name="real_estate_listings")

In [37]:
# Chain of Thought Examples for Personalizing Listings
example1 = {
    "buyer_preferences": "Looking for a modern 3-bedroom home with a large backyard for children and a pet-friendly neighborhood.",
    "listing": "Spacious 3-bedroom house with an open-plan kitchen and a cozy living room.",
    "personalized_listing": """This modern 3-bedroom home is perfect for a growing family. It features a spacious open-plan kitchen where you can prepare meals while keeping an eye on the kids. The backyard is large, ideal for outdoor activities, BBQ nights, and a safe space for pets to roam. The neighborhood is pet-friendly and includes nearby parks and walking trails for a comfortable, active lifestyle."""
}

example2 = {
    "buyer_preferences": "Luxury apartment with a city view, close to public transport, ideal for a professional working in downtown.",
    "listing": "High-rise condo with a balcony and modern amenities.",
    "personalized_listing": """Experience the ultimate urban lifestyle in this luxurious high-rise condo, designed for professionals who seek convenience and elegance. The unit boasts a stunning city view from its private balcony, allowing you to unwind after a busy workday. Located just minutes away from major public transport hubs, this residence ensures a hassle-free commute to downtown offices, restaurants, and entertainment centers."""
}

In [38]:
# Define Prompt Template
example_prompt = PromptTemplate(
    input_variables=["buyer_preferences", "listing", "personalized_listing"],
    template="Buyer Preferences: {buyer_preferences}\nOriginal Listing: {listing}\nPersonalized Listing: {personalized_listing}\n"
)

# Create Few-Shot Prompt Template
few_shot_prompt = FewShotPromptTemplate(
    examples=[example1, example2],
    example_prompt=example_prompt,
    suffix="Buyer Preferences: {buyer_preferences}\nOriginal Listing: {listing}\nPersonalized Listing:",
    input_variables=["buyer_preferences", "listing"]
)

In [39]:
# Function to detect and correct factual inaccuracies
def correct_factual_inaccuracies(original_listing: str, personalized_listing: str) -> str:
    correction_prompt = f"""
    Original Listing: {original_listing}
    Personalized Listing: {personalized_listing}
    
    Ensure the personalized listing is factually accurate based on the original listing. Correct any inaccuracies while keeping it engaging.
    
    Corrected Personalized Listing:
    """
    
    return llm(correction_prompt).strip()

#### Use of LLM for Generating Personalized Descriptions

The submission must utilize an LLM to generate personalized descriptions for the real estate listings based on buyer preferences. The descriptions should be unique, appealing, and tailored to the preferences provided.

In [40]:
# Function to personalize a listing with factual accuracy
def personalize_listing(buyer_preferences: str, retrieved_listings: list):
    """
    Personalizes real estate listings based on buyer's preferences using Chain of Thought reasoning.
    Ensures factual accuracy by detecting and correcting inaccuracies.
    
    Parameters:
    - buyer_preferences (str): Buyer's specific preferences.
    - retrieved_listings (list): List of top matching listings.
    
    Returns:
    - List of personalized listings.
    """
    personalized_listings = []
    
    for listing in retrieved_listings:
        prompt_text = few_shot_prompt.format(
            buyer_preferences=buyer_preferences,
            listing=listing["listing"]
        )
        personalized_description = llm(prompt_text).strip()
        
        # Loop until factual accuracy is achieved
        while True:
            corrected_description = correct_factual_inaccuracies(listing["listing"], personalized_description)
            if corrected_description == personalized_description:
                break  # Stop if no changes were made
            personalized_description = corrected_description
        
        personalized_listings.append({
            "original_listing": listing["listing"],
            "personalized_listing": personalized_description
        })
    
    return personalized_listings

In [42]:
# Example usage
print("\nPlease answer the following question to specify your preferences:")
questions = ["Please describe your ideal house. You may share details on your preferred neighborhood, number of rooms, amenities, or more."]
buyer_query = input(questions[0] + "\n")

retrieved_listings = collection.query(
    query_embeddings=[SentenceTransformer('paraphrase-MiniLM-L6-v2').encode(buyer_query).tolist()],
    n_results=3
)["metadatas"][0]


Please answer the following question to specify your preferences:
Please describe your ideal house. You may share details on your preferred neighborhood, number of rooms, amenities, or more.
I want a house for 4, modern style. My budget is 1M.


In [44]:
# Generate personalized listings
personalized_results = personalize_listing(buyer_query, retrieved_listings)

# Print results
print("\n=== Personalized Real Estate Listings ===\n")
for idx, result in enumerate(personalized_results, 1):
    print(f"{idx}. **Original Listing:** {result['original_listing']}\n")
    print(f"   **Personalized Listing:** {result['personalized_listing']}\n")



=== Personalized Real Estate Listings ===

1. **Original Listing:** 4 Neighborhood: Bukit Timah Price: $3,500,000 Bedrooms and bathrooms: 3 bedrooms, 4 bathrooms House Size: 3,200 sqft Features: Gourmet kitchen, home office, landscaped garden Neighborhood Description: Bukit Timah is a sought-after residential area known for its lush greenery and prestigious schools.

   **Personalized Listing:** I have found a luxurious 3-bedroom house in Bukit Timah priced at $3,500,000. This spacious home boasts a gourmet kitchen, a home office, and a beautifully landscaped garden perfect for outdoor relaxation. Located in the sought-after residential area of Bukit Timah, known for its lush greenery and prestigious schools, this property offers a perfect blend of luxury and comfort. The house is 3,200 sqft in size. Contact me for more details and to schedule a viewing of this stunning property.

2. **Original Listing:** 6 Neighborhood: Holland Village Price: $2,500,000 Bedrooms and bathrooms: 3 bedr

In [45]:
# Function to personalize a listing
def personalize_listing(buyer_preferences: str, retrieved_listings: list):
    """
    Personalizes real estate listings based on buyer's preferences using Chain of Thought reasoning.
    
    Parameters:
    - buyer_preferences (str): Buyer's specific preferences.
    - retrieved_listings (list): List of top matching listings.
    
    Returns:
    - List of personalized listings.
    """
    personalized_listings = []
    
    for listing in retrieved_listings:
        prompt_text = few_shot_prompt.format(
            buyer_preferences=buyer_preferences,
            listing=listing["listing"]
        )
        personalized_description = llm(prompt_text).strip()
        
        personalized_listings.append({
            "original_listing": listing["listing"],
            "personalized_listing": personalized_description
        })
    
    return personalized_listings

In [46]:
# Example usage
buyer_query = "A cozy 2-bedroom apartment with a home office space, close to cafes and parks."
retrieved_listings = collection.query(
    query_embeddings=[SentenceTransformer('paraphrase-MiniLM-L6-v2').encode(buyer_query).tolist()],
    n_results=3
)["metadatas"][0]

# Generate personalized listings
personalized_results = personalize_listing(buyer_query, retrieved_listings)

In [47]:
# Print results
print("\n=== Personalized Real Estate Listings ===\n")
for idx, result in enumerate(personalized_results, 1):
    print(f"{idx}. **Original Listing:** {result['original_listing']}\n")
    print(f"   **Personalized Listing:** {result['personalized_listing']}\n")


=== Personalized Real Estate Listings ===

1. **Original Listing:** 2 Neighborhood: Tiong Bahru Price: $1,200,000 Bedrooms and bathrooms: 2 bedrooms, 2 bathrooms House Size: 1,000 sqft Features: Renovated kitchen, balcony, city views Neighborhood Description: Tiong Bahru is a trendy neighborhood with a mix of heritage buildings and modern cafes.

   **Personalized Listing:** This cozy 2-bedroom apartment is perfect for those looking for a comfortable living space with a home office area. The renovated kitchen and balcony offer a peaceful retreat, while the city views add a touch of urban charm. Located in the trendy neighborhood of Tiong Bahru, you'll have easy access to cafes, parks, and a vibrant community atmosphere. Don't miss out on this opportunity to live in a stylish and convenient location.

2. **Original Listing:** 4 Neighborhood: Bukit Timah Price: $3,500,000 Bedrooms and bathrooms: 3 bedrooms, 4 bathrooms House Size: 3,200 sqft Features: Gourmet kitchen, home office, lands