# Project Config

In [1]:
# config.py
import os
from dotenv import load_dotenv

load_dotenv()

# API Keys
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_API_BASE = "https://openai.vocareum.com/v1"

# LLM Configuration
LLM_MODEL = "gpt-3.5-turbo"
EMBEDDING_MODEL = "text-embedding-ada-002"

# Vector DB Configuration
VECTOR_DB_PATH = "./vector_db"

# Application Settings
NUM_LISTINGS_TO_GENERATE = 15
NUM_LISTINGS_TO_RETURN = 3

In [2]:
OPENAI_API_KEY

'voc-8957811371266772085699678d66e1475c97.62448043'

# Testing OpenAI API key

In [3]:
from openai import OpenAI
client = OpenAI(
    base_url = "https://openai.vocareum.com/v1",
    api_key = OPENAI_API_KEY,
)


response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Give me a list of 10 neighborhoods in São Paulo"}
    ]
)
print(response.choices[0].message.content)


Certainly! Here is a list of 10 neighborhoods in São Paulo:

1. **Moema** - Known for its upscale residential areas and proximity to Ibirapuera Park.
2. **Pinheiros** - A vibrant area with a mix of cultural attractions, restaurants, and nightlife.
3. **Vila Madalena** - Famous for its bohemian atmosphere, street art, and lively bar scene.
4. **Jardins** - An affluent neighborhood with luxury shopping streets and fine dining.
5. **Itaim Bibi** - A commercial and residential district with a blend of modern skyscrapers and entertainment venues.
6. **Campo Belo** - A primarily residential area with good connectivity to other parts of the city.
7. **Santana** - Located in the northern part of the city, known for its parks and educational institutions.
8. **Morumbi** - Home to the Morumbi Stadium and a number of upscale residences.
9. **Liberdade** - The heart of São Paulo's Japanese community, known for its Asian markets and cultural festivals.
10. **Bela Vista** - Includes the iconic Aveni

# Testing OpenAI API key

In [4]:
OPENAI_API_KEY

'voc-8957811371266772085699678d66e1475c97.62448043'

In [5]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
llm = ChatOpenAI(
    model=LLM_MODEL, 
    openai_api_key=OPENAI_API_KEY,
    openai_api_base="https://openai.vocareum.com/v1"
)

prompt = 'Which Disney movie made the most money in theathers?'

response = llm([HumanMessage(content=prompt)])
listing_text = response.content

  llm = ChatOpenAI(
  response = llm([HumanMessage(content=prompt)])


In [6]:
listing_text

'As of now, the Disney movie that has made the most money in theaters is "Avengers: Endgame" which was released in 2019. It grossed over $2.798 billion worldwide, making it the highest-grossing film of all time.'

# Data generation

In [7]:
# data_generation.py
import json
import random
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage

NUM_LISTINGS_TO_GENERATE=10

def generate_listings(num_listings=NUM_LISTINGS_TO_GENERATE):
    """Generate synthetic real estate listings using LLM"""
    
    llm = ChatOpenAI(
        model=LLM_MODEL, 
        openai_api_key=OPENAI_API_KEY,
        openai_api_base="https://openai.vocareum.com/v1"
    )
    
    '''
    neighborhoods = ["Green Oaks", "Riverside Heights", "Downtown Central", 
                     "Sunset Hills", "Maple Grove", "Harbor View", 
                     "Oakwood Estates", "Pine Valley", "Lakeshore", "Hillcrest"]
    '''
    neighborhoods = ["Jardins", "Pinheiros", "Vila Madalena", "Moema", 
                     "Itaim Bibi", "Morumbi", "Liberdade", "Bela Vista", 
                     "Brooklin", "Paraiso"
                    ]
    
    listings = []
    
    for i in range(num_listings):
        # Create a prompt for the LLM to generate a diverse listing
        neighborhood = random.choice(neighborhoods)
        bedrooms = random.randint(1, 5)
        bathrooms = random.randint(1, 4)
        price = random.randint(200, 1500) * 1000
        size = random.randint(800, 4000)
        
        prompt = f"""
        Generate a detailed real estate listing with the following specifications:
        - Neighborhood: {neighborhood}
        - Price: ${price:,}
        - Bedrooms: {bedrooms}
        - Bathrooms: {bathrooms}
        - House Size: {size} sqft
        
        Include a property description highlighting unique features and a separate neighborhood description.
        Format the output exactly as follows:
        
        Neighborhood: [neighborhood name]
        Price: [price]
        Bedrooms: [number]
        Bathrooms: [number]
        House Size: [size] sqft

        Description: [detailed property description]

        Neighborhood Description: [neighborhood description]
        """
        
        response = llm([HumanMessage(content=prompt)])
        listing_text = response.content
        
        # Parse the generated listing into structured format
        listing_data = {}
        sections = listing_text.split("\n\n")
        
        # Parse basic info
        basic_info = sections[0].strip().split("\n")
        for line in basic_info:
            if ":" in line:
                key, value = line.split(":", 1)
                listing_data[key.strip()] = value.strip()
        
        # Parse description and neighborhood
        for section in sections[1:]:
            if section.startswith("Description:"):
                listing_data["Description"] = section.replace("Description:", "", 1).strip()
            elif section.startswith("Neighborhood Description:"):
                listing_data["Neighborhood Description"] = section.replace("Neighborhood Description:", "", 1).strip()
        
        listings.append(listing_data)
    
    # Save listings to file
    with open("listings.json", "w") as f:
        json.dump(listings, f, indent=2)
    
    return listings

## Generating listings

In [8]:
listings = generate_listings(10)

In [9]:
listings[0]

{'Neighborhood': 'Itaim Bibi',
 'Price': '$708,000',
 'Bedrooms': '1',
 'Bathrooms': '3',
 'House Size': '3821 sqft',
 'Description': "This stunning property in Itaim Bibi offers luxurious living with a spacious 3821 sqft house featuring 1 bedroom and 3 bathrooms. The modern design and high-end finishes make this home truly impressive. The open concept living area is perfect for entertaining guests, while the gourmet kitchen is a chef's dream. The master suite boasts a spa-like bathroom with a soaking tub and walk-in shower. Enjoy the outdoor patio and garden for relaxing and unwinding.",
 'Neighborhood Description': 'Itaim Bibi is a vibrant and upscale neighborhood in São Paulo, known for its trendy restaurants, upscale boutiques, and lively nightlife. The area is also home to several parks and green spaces, providing a peaceful escape from the bustling city life. With its convenient location and luxurious amenities, Itaim Bibi is the perfect place to call home for those seeking a hig

# Vector Store

## Creating the Vector table model and embedding

In [10]:
import json
import os
import lancedb
import pyarrow as pa
from langchain_openai import OpenAIEmbeddings

# Set API key as environment variable
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# Connect to LanceDB
db = lancedb.connect("~/real_estate_db")

# Create embeddings object
embeddings = OpenAIEmbeddings(
    openai_api_base="https://openai.vocareum.com/v1",
    model="text-embedding-ada-002"
)

# Load listings
with open("listings.json", "r") as f:
    listings = json.load(f)

# Prepare data with embeddings
formatted_listings = []
for listing in listings:
    full_text = f"""
    Neighborhood: {listing.get('Neighborhood', '')}
    Price: {listing.get('Price', '')}
    Bedrooms: {listing.get('Bedrooms', '')}
    Bathrooms: {listing.get('Bathrooms', '')}
    House Size: {listing.get('House Size', '')}
    
    {listing.get('Description', '')}
    
    {listing.get('Neighborhood Description', '')}
    """
    
    # Generate embedding
    vector = embeddings.embed_query(full_text)
    
    formatted_listing = {
        "neighborhood": listing.get('Neighborhood', ''),
        "price": listing.get('Price', ''),
        "bedrooms": listing.get('Bedrooms', ''),
        "bathrooms": listing.get('Bathrooms', ''),
        "house_size": listing.get('House Size', ''),
        "description": listing.get('Description', ''),
        "neighborhood_description": listing.get('Neighborhood Description', ''),
        "full_text": full_text,
        "vector": vector
    }
    
    formatted_listings.append(formatted_listing)

# Create proper PyArrow schema
table_schema = pa.schema([
    pa.field("neighborhood", pa.string()),
    pa.field("price", pa.string()),
    pa.field("bedrooms", pa.string()),
    pa.field("bathrooms", pa.string()),
    pa.field("house_size", pa.string()),
    pa.field("description", pa.string()),
    pa.field("neighborhood_description", pa.string()),
    pa.field("full_text", pa.string()),
    pa.field("vector", pa.list_(pa.float32(), 1536))
])

# Create table with proper schema
try:
    table = db.create_table("real_estate_listings", schema=table_schema, mode="overwrite")
    table.add(formatted_listings)
    print(f"Successfully added {len(formatted_listings)} listings with embeddings to LanceDB")
except Exception as e:
    print(f"Error: {e}")


Successfully added 10 listings with embeddings to LanceDB


## Testing vector search

In [11]:
table.head().to_pandas()

Unnamed: 0,neighborhood,price,bedrooms,bathrooms,house_size,description,neighborhood_description,full_text,vector
0,Itaim Bibi,"$708,000",1,3,3821 sqft,This stunning property in Itaim Bibi offers lu...,Itaim Bibi is a vibrant and upscale neighborho...,\n Neighborhood: Itaim Bibi\n Price: $70...,"[0.0020712118, 0.028116658, -0.02076859, -0.00..."
1,Itaim Bibi,"$496,000",5,1,2341 sqft,This stunning 5-bedroom house in Itaim Bibi of...,Itaim Bibi is a trendy and upscale neighborhoo...,\n Neighborhood: Itaim Bibi\n Price: $49...,"[-0.0038030809, 0.034622982, -0.022891559, -0...."
2,Bela Vista,"$1,230,000",1,4,1620 sqft,This stunning property in Bela Vista offers a ...,Bela Vista is a prestigious neighborhood known...,"\n Neighborhood: Bela Vista\n Price: $1,...","[0.01703549, 0.030518677, 0.007169426, -0.0113..."
3,Pinheiros,"$207,000",4,4,3441 sqft,"This stunning 4-bedroom, 4-bathroom home in Pi...",Pinheiros is a vibrant and trendy neighborhood...,\n Neighborhood: Pinheiros\n Price: $207...,"[0.013517212, 0.027740596, -0.025968501, -0.00..."
4,Brooklin,"$1,075,000",1,2,1222 sqft,This stunning property in Brooklin offers a un...,Brooklin is a highly sought-after neighborhood...,"\n Neighborhood: Brooklin\n Price: $1,07...","[0.013028152, 0.020615552, -0.015535797, -0.01..."


In [12]:
print(db.table_names())

['real_estate_listings']


In [13]:
brooklin_properties = table.search().where("neighborhood = 'Brooklin'").to_pandas()
brooklin_properties

Unnamed: 0,neighborhood,price,bedrooms,bathrooms,house_size,description,neighborhood_description,full_text,vector
0,Brooklin,"$1,075,000",1,2,1222 sqft,This stunning property in Brooklin offers a un...,Brooklin is a highly sought-after neighborhood...,"\n Neighborhood: Brooklin\n Price: $1,07...","[0.013028152, 0.020615552, -0.015535797, -0.01..."
1,Brooklin,"$1,081,000",4,4,3927 sqft,"This stunning 4-bedroom, 4-bathroom home in Br...",Brooklin is a charming and family-friendly nei...,"\n Neighborhood: Brooklin\n Price: $1,08...","[0.017218994, 0.016692536, -0.017989418, -0.00..."


In [18]:
# Connect to your database
db = lancedb.connect("~/real_estate_db")
table = db.open_table("real_estate_listings")

# Perform a vector search with a natural language query
query = "spacious family home with modern design"
query_vector = embeddings.embed_query(query)

In [20]:
results = table.search(
    query_vector,  # Pass the vector directly instead of text
    vector_column_name='vector'
).limit(3).to_pandas()

In [21]:
# Display results
for i, result in results.iterrows():
    print(f"\nMatch #{i+1} - Similarity Score: {result['_distance']:.4f}")
    print(f"Neighborhood: {result['neighborhood']}")
    print(f"Price: {result['price']}")
    print(f"Description: {result['description'][:100]}...")


Match #1 - Similarity Score: 0.3902
Neighborhood: Brooklin
Price: $1,081,000
Description: This stunning 4-bedroom, 4-bathroom home in Brooklin offers a spacious 3927 sqft of living space. Th...

Match #2 - Similarity Score: 0.3944
Neighborhood: Morumbi
Price: $393,000
Description: This beautiful property in the sought-after neighborhood of Morumbi features 3 bedrooms, 4 bathrooms...

Match #3 - Similarity Score: 0.3967
Neighborhood: Itaim Bibi
Price: $496,000
Description: This stunning 5-bedroom house in Itaim Bibi offers a spacious and luxurious living space perfect for...


# Preference Parser

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from config import OPENAI_API_KEY, LLM_MODEL

class PreferenceParser:
    def __init__(self):
        """Initialize the preference parser"""
        self.llm = ChatOpenAI(model=LLM_MODEL, openai_api_key=OPENAI_API_KEY)
    
    def collect_preferences(self, questions=None, answers=None):
        """Collect buyer preferences either from predefined Q&A or interactively"""
        if questions and answers:
            return self._process_predefined_preferences(questions, answers)
        else:
            return self._collect_interactive_preferences()
    
    def _process_predefined_preferences(self, questions, answers):
        """Process predefined questions and answers"""
        # Combine Q&A for context
        qa_pairs = "\n".join([f"Q: {q}\nA: {a}" for q, a in zip(questions, answers)])
        
        # Use LLM to extract key preferences
        prompt = f"""
        Based on the following buyer's responses to questions about their home preferences:
        
        {qa_pairs}
        
        Create a detailed summary of their preferences that can be used to search for matching properties.
        Focus on extracting specific details about:
        - Property size and layout
        - Important features and amenities
        - Neighborhood characteristics
        - Location requirements
        - Price range (if mentioned)
        
        Format your response as a comprehensive paragraph describing their ideal home.
        """
        
        response = self.llm([HumanMessage(content=prompt)])
        return response.content
    
    def _collect_interactive_preferences(self):
        """Collect preferences interactively from user input"""
        print("Please tell us about your ideal home:")
        
        questions = [
            "How big do you want your house to be?",
            "What are 3 most important things for you in choosing this property?",
            "Which amenities would you like?",
            "Which transportation options are important to you?",
            "How urban do you want your neighborhood to be?"
        ]
        
        answers = []
        for question in questions:
            print(f"\n{question}")
            answer = input("> ")
            answers.append(answer)
        
        return self._process_predefined_preferences(questions, answers)

if __name__ == "__main__":
    # Test preference parser when run directly
    parser = PreferenceParser()
    
    # Test with predefined Q&A
    questions = [
        "How big do you want your house to be?",
        "What are 3 most important things for you in choosing this property?",
        "Which amenities would you like?",
        "Which transportation options are important to you?",
        "How urban do you want your neighborhood to be?"
    ]
    
    answers = [
        "A comfortable three-bedroom house with a spacious kitchen and a cozy living room.",
        "A quiet neighborhood, good local schools, and convenient shopping options.",
        "A backyard for gardening, a two-car garage, and a modern, energy-efficient heating system.",
        "Easy access to a reliable bus line, proximity to a major highway, and bike-friendly roads.",
        "A balance between suburban tranquility and access to urban amenities like restaurants and theaters."
    ]
    
    preferences = parser.collect_preferences(questions, answers)
    print("\nProcessed Preferences:")
    print(preferences)


# Listing Personalizer

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage
from config import OPENAI_API_KEY, LLM_MODEL

class ListingPersonalizer:
    def __init__(self):
        """Initialize the listing personalizer"""
        self.llm = ChatOpenAI(model=LLM_MODEL, openai_api_key=OPENAI_API_KEY)
    
    def personalize_listing(self, listing, preferences):
        """Personalize a listing based on buyer preferences"""
        # Extract listing details
        neighborhood = listing.get('Neighborhood', '')
        price = listing.get('Price', '')
        bedrooms = listing.get('Bedrooms', '')
        bathrooms = listing.get('Bathrooms', '')
        house_size = listing.get('House Size', '')
        description = listing.get('Description', '')
        neighborhood_desc = listing.get('Neighborhood Description', '')
        
        # Create a prompt for personalization
        prompt = f"""
        You are a real estate agent helping a potential buyer find their perfect home.
        
        BUYER PREFERENCES:
        {preferences}
        
        ORIGINAL LISTING:
        Neighborhood: {neighborhood}
        Price: {price}
        Bedrooms: {bedrooms}
        Bathrooms: {bathrooms}
        House Size: {house_size}
        
        Original Description: {description}
        
        Original Neighborhood Description: {neighborhood_desc}
        
        TASK:
        Rewrite the property description to highlight aspects that align with the buyer's preferences.
        Do NOT change any factual information about the property.
        Do NOT invent new features that aren't mentioned in the original description.
        DO emphasize existing features that match what the buyer is looking for.
        Make the description personal and engaging, addressing the buyer's specific needs.
        
        Format your response as:
        
        PERSONALIZED DESCRIPTION:
        [Your personalized property description]
        
        PERSONALIZED NEIGHBORHOOD DESCRIPTION:
        [Your personalized neighborhood description]
        """
        
        response = self.llm([HumanMessage(content=prompt)])
        personalized_text = response.content
        
        # Extract personalized descriptions
        personalized_property_desc = ""
        personalized_neighborhood_desc = ""
        
        if "PERSONALIZED DESCRIPTION:" in personalized_text:
            parts = personalized_text.split("PERSONALIZED NEIGHBORHOOD DESCRIPTION:")
            personalized_property_desc = parts[0].replace("PERSONALIZED DESCRIPTION:", "").strip()
            
            if len(parts) > 1:
                personalized_neighborhood_desc = parts[1].strip()
        
        # Create personalized listing
        personalized_listing = listing.copy()
        personalized_listing["Original Description"] = description
        personalized_listing["Original Neighborhood Description"] = neighborhood_desc
        personalized_listing["Description"] = personalized_property_desc
        personalized_listing["Neighborhood Description"] = personalized_neighborhood_desc
        
        return personalized_listing

if __name__ == "__main__":
    # Test listing personalizer when run directly
    personalizer = ListingPersonalizer()
    
    # Sample listing
    listing = {
        "Neighborhood": "Green Oaks",
        "Price": "$800,000",
        "Bedrooms": "3",
        "Bathrooms": "2",
        "House Size": "2,000 sqft",
        "Description": "Welcome to this eco-friendly oasis nestled in the heart of Green Oaks. This charming 3-bedroom, 2-bathroom home boasts energy-efficient features such as solar panels and a well-insulated structure. Natural light floods the living spaces, highlighting the beautiful hardwood floors and eco-conscious finishes. The open-concept kitchen and dining area lead to a spacious backyard with a vegetable garden, perfect for the eco-conscious family. Embrace sustainable living without compromising on style in this Green Oaks gem.",
        "Neighborhood Description": "Green Oaks is a close-knit, environmentally-conscious community with access to organic grocery stores, community gardens, and bike paths. Take a stroll through the nearby Green Oaks Park or grab a cup of coffee at the cozy Green Bean Cafe. With easy access to public transportation and bike lanes, commuting is a breeze."
    }
    
    # Sample preferences
    preferences = "The buyer is looking for a family-friendly home with 3+ bedrooms in a quiet neighborhood. They prioritize good schools, a backyard for their children to play, and energy-efficient features. They enjoy gardening and need space for this hobby. The family has one car but would appreciate access to public transportation. They value community and want to be close to parks and family-oriented activities."
    
    personalized = personalizer.personalize_listing(listing, preferences)
    
    print("ORIGINAL DESCRIPTION:")
    print(listing["Description"])
    print("\nPERSONALIZED DESCRIPTION:")
    print(personalized["Description"])
    print("\nORIGINAL NEIGHBORHOOD DESCRIPTION:")
    print(listing["Neighborhood Description"])
    print("\nPERSONALIZED NEIGHBORHOOD DESCRIPTION:")
    print(personalized["Neighborhood Description"])

# Main app

In [None]:
import json
from data_generation import generate_listings
from vector_store import VectorStore
from preference_parser import PreferenceParser
from listing_personalizer import ListingPersonalizer
from config import NUM_LISTINGS_TO_RETURN

class HomeMatch:
    def __init__(self):
        """Initialize the HomeMatch application"""
        self.vector_store = VectorStore()
        self.preference_parser = PreferenceParser()
        self.listing_personalizer = ListingPersonalizer()
    
    def setup(self, regenerate_listings=False):
        """Set up the application by generating listings and loading them into the vector store"""
        if regenerate_listings:
            print("Generating new real estate listings...")
            self.listings = generate_listings()
            print(f"Generated {len(self.listings)} listings")
        else:
            try:
                with open("listings.json", "r") as f:
                    self.listings = json.load(f)
                print(f"Loaded {len(self.listings)} existing listings from file")
            except FileNotFoundError:
                print("No existing listings found. Generating new listings...")
                self.listings = generate_listings()
                print(f"Generated {len(self.listings)} listings")
        
        # Load listings into vector store
        print("Loading listings into vector database...")
        count = self.vector_store.load_listings()
        print(f"Loaded {count} listings into vector database")
    
    def process_buyer_preferences(self, questions=None, answers=None):
        """Process buyer preferences from questions and answers or collect interactively"""
        print("Processing buyer preferences...")
        self.preferences = self.preference_parser.collect_preferences(questions, answers)
        print("Buyer preferences processed")
        return self.preferences
    
    def find_matching_listings(self, n_results=NUM_LISTINGS_TO_RETURN):
        """Find listings that match the buyer's preferences"""
        print(f"Searching for up to {n_results} matching listings...")
        matches = self.vector_store.search_listings(self.preferences, n_results)
        print(f"Found {len(matches)} matching listings")
        return matches
    
    def personalize_listings(self, matches):
        """Personalize the matched listings based on buyer preferences"""
        print("Personalizing listings...")
        personalized_listings = []
        
        for match in matches:
            print(f"Personalizing listing in {match.get('Neighborhood', 'Unknown')}...")
            personalized = self.listing_personalizer.personalize_listing(match, self.preferences)
            personalized_listings.append(personalized)
        
        print(f"Personalized {len(personalized_listings)} listings")
        return personalized_listings
    
    def display_personalized_listings(self, personalized_listings):
        """Display the personalized listings"""
        print("\n" + "="*80)
        print("PERSONALIZED LISTINGS FOR YOU")
        print("="*80)
        
        for i, listing in enumerate(personalized_listings, 1):
            print(f"\nLISTING {i}: {listing.get('Neighborhood', 'Unknown')}")
            print("-"*80)
            print(f"Price: {listing.get('Price', 'N/A')}")
            print(f"Bedrooms: {listing.get('Bedrooms', 'N/A')}")
            print(f"Bathrooms: {listing.get('Bathrooms', 'N/A')}")
            print(f"House Size: {listing.get('House Size', 'N/A')}")
            print("\nDESCRIPTION:")
            print(listing.get('Description', 'No description available'))
            print("\nNEIGHBORHOOD:")
            print(listing.get('Neighborhood Description', 'No neighborhood description available'))
            print("-"*80)