In [1]:
from dotenv import load_dotenv
load_dotenv()
from tqdm.auto import tqdm
from uuid import uuid4
from langchain_openai import OpenAI, OpenAIEmbeddings, ChatOpenAI
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from langchain.text_splitter import CharacterTextSplitter
from langchain_chroma import Chroma
from langchain.document_loaders.csv_loader import CSVLoader
from langchain.chains.question_answering import load_qa_chain
from pydantic import BaseModel, Field, PositiveInt, model_validator

## 1. Data Generation
In this section of the jupyter notebook, we will generate the data for the project. There are multiple possible fields that can be used in a realestate listing, but for this project, I have tried to limit the fields to the most common ones. 

In [2]:
model_name = "gpt-3.5-turbo-instruct"
temperature = 1
llm = OpenAI(model_name=model_name, temperature=temperature, max_tokens=3000)

In [4]:
# Defining the output template for easier parsing
class RealestateResponseFormatter(BaseModel):
    house_size: PositiveInt = Field(description="The size of a house in square feet.", gt=500, lt=5000, multiple_of=50)
    neighborhood: str =Field(description="A fictional neighborhood name at which the house is located.")
    cost: PositiveInt = Field(description="The cost of the house in USD.", gt=100000, multiple_of=1500)
    bedrooms: PositiveInt = Field(description="The number of bedrooms in the house.", gt=0, lt=10)
    bathrooms: PositiveInt = Field(description="The number of bathrooms in the house.", gt=0, lt=10)
    garage: bool = Field(description="Whether the house has a garage or not.")
    pool: bool = Field(description="Whether the house has a pool or not.")
    yard: bool = Field(description="Whether the house has a yard or not.")
    basement: bool = Field(description="Whether the house has a basement or not.")
    nearby_features: str = Field(description="A list of nearby features, such as schools, parks, shopping centers, etc.")
    house_description: str = Field(description="A description of the house. This should be a few sentences long, and describe the house using the other fields defined.")
    neighborhood_description: str = Field(description="A description of the neighborhood. This should be a few sentences long, and describe the neighborhood, including any amenities, security, schools, parks, etc.")

# Defining the pydantic parser
parser = PydanticOutputParser(pydantic_object=RealestateResponseFormatter)
all_listings =[] # List to store all the listings

In [5]:
# defining the prompt template to generate a realestate listing
listing_generation_prompt = PromptTemplate(
    template="Generate Real-estate Listings in the provided format. Make the listing as unique as possible.  \n{format_instructions}\n",
    partial_variables={
        "format_instructions": parser.get_format_instructions()
    }
)

# defining the prompt template to generate a realestate listing
listing_generation_prompt = listing_generation_prompt.format()
print(listing_generation_prompt)

Generate Real-estate Listings in the provided format. Make the listing as unique as possible.  
The output should be formatted as a JSON instance that conforms to the JSON schema below.

As an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}
the object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.

Here is the output schema:
```
{"properties": {"house_size": {"description": "The size of a house in square feet.", "exclusiveMaximum": 5000, "exclusiveMinimum": 0, "multipleOf": 50, "title": "House Size", "type": "integer"}, "neighborhood": {"description": "A fictional neighborhood name at which the house is located.", "title": "Neighborhood", "type": "string"}, "cost": {"description": "The cost of the house in USD.", "exclusiveMinimum": 0, "multipleOf": 1500, "title": "Cost", "

In [7]:
# Iteratively making a call to the model to generate listings.
print(len(all_listings))
for i in tqdm(range(50)):
    try:
        if len(all_listings)<20:
            listing_output = llm(listing_generation_prompt)
            parsed_listing = parser.parse(listing_output)
            parsed_listing = parsed_listing.model_dump()
            all_listings.append(parsed_listing)
    except Exception as e:
        continue

1


  0%|          | 0/50 [00:00<?, ?it/s]

In [8]:
# Saving the generated listings into a csv file
import pandas as pd 
df = pd.DataFrame(all_listings)
print(df.shape)
df.to_csv("house_listings.csv", index=False)

(20, 12)


In [9]:
def get_final_text(house_size, neighborhood, cost, bedrooms, bathrooms, garage, pool, yard, basement, house_description, neighbourhood_description):
    pool_text = "The house has a pool." if pool else "The house does not have a pool."
    garage_text = "The house has a garage." if garage else "The house does not have a garage."
    yard_text = "The house has a yard." if yard else "The house does not have a yard."
    basement_text = "The house has a basement." if basement else "The house does not have a basement."
    final_text = f"""The house occupies a size of {house_size} square feet, and is located in the {neighborhood} neighborhood.\n{neighbourhood_description}.\n
    The house has {bedrooms} bedrooms, {bathrooms} bathrooms, and costs {cost} USD.{house_description}.
    """ 
    final_text += pool_text + garage_text + yard_text + basement_text + house_description + neighbourhood_description
    return final_text

final_text = df.apply(lambda x: get_final_text(x.house_size, x.neighborhood, x.cost, x.bedrooms, x.bathrooms, x.garage, x.pool, x.yard, x.basement, x.house_description, x.neighborhood_description), axis=1)
final_text.to_csv("final_house.csv", index=False, header=False)

### Observations
* When the data is generated all at once, it is observed that the content of the fields was repeated. The temperature changes have made a difference, but the content of the fields is still repeated after a certain point.
* The maximum tokens were also another factor to consider iterative generation. For each call, the maximum tokens were set to 2000. 


## 2. [Semantic Search] Defining and storing the Vector Database 
1. To create the embedding vectors, I am using OpenAIembeddings.
2. A ChromaDB is being used to store the vectors.


### Creating a Vector Database and Storing Listings

In [10]:
#defining the embeddings and vector store
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

vector_store = Chroma(
    collection_name = 'house_listings',
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db",  # Where to save data locally, remove if not necessary
)

In [11]:
# Loading the data from the csv file using LangChain's CSVLoader
loader = CSVLoader(file_path= 'final_house.csv')
house_data = loader.load()
splitter= CharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
split_docs = splitter.split_documents(house_data)

Created a chunk of size 1710, which is longer than the specified 1000
Created a chunk of size 1515, which is longer than the specified 1000
Created a chunk of size 1635, which is longer than the specified 1000
Created a chunk of size 1681, which is longer than the specified 1000
Created a chunk of size 1827, which is longer than the specified 1000
Created a chunk of size 1585, which is longer than the specified 1000
Created a chunk of size 1623, which is longer than the specified 1000
Created a chunk of size 1669, which is longer than the specified 1000
Created a chunk of size 1716, which is longer than the specified 1000
Created a chunk of size 1598, which is longer than the specified 1000
Created a chunk of size 1667, which is longer than the specified 1000
Created a chunk of size 1544, which is longer than the specified 1000
Created a chunk of size 1650, which is longer than the specified 1000
Created a chunk of size 1600, which is longer than the specified 1000
Created a chunk of s

In [17]:
vector_store.add_documents(documents=split_docs)

['584631ed-6d59-47b5-8972-3a65d8ede1df',
 '58d070ae-7318-4c5a-86c7-a788dff25267',
 'f223e27e-08f8-45b9-ae4c-75327b21b15c',
 '6d83bec2-20b8-4e59-b627-f6bf8b07c3fe',
 '15aa7d5f-9bd3-4e5a-bae3-1e2bf2567d3f',
 '65a35a10-fea8-452f-ab26-20ebeba4ac9e',
 '99ead569-e615-4032-af19-396f225d9808',
 'e1e83702-656d-4360-bc89-1d2b347b5bb1',
 'c41ebdd3-a21b-43da-a423-37408472db83',
 'e4df1f09-0d04-49ca-99ea-9611513e5dc8',
 '6bdf9fcf-14fb-4f58-a2a1-6b0477ea4bd8',
 'd460e1c9-476a-415d-8742-5467755e10bf',
 '05bc609f-e814-4891-a0ff-2bafef1a0f4c',
 'a5951780-1b75-4317-aa60-e83ce0207efa',
 'd506323f-702b-47fd-a7d4-8265c10e11e0',
 '3c777565-daeb-4471-94e4-48ad43e62919',
 'af1e10e0-ce76-4f24-a11b-f537a8b33646',
 '85bd9bdb-b3e4-4c7c-895d-473f948bf3dd',
 'f1212935-ffc8-4f5c-814f-b4f7d7d5be7f',
 'b225ef0f-4389-4f5d-9284-86c47ab7a415',
 '50fc457d-ddbc-4d4b-8cd2-9dd5faae41a3',
 'b99bc413-aca1-4e16-814b-086c3b9912bb',
 'c0529b40-00f4-4fa3-80a7-7e1aa3d224eb',
 'da543dc6-798c-4d7b-a4ed-0a69bf37fb0f',
 'd6535cc9-8b17-

### Semantic Search of listings with a simple query of preferences

In [13]:
sample_query = """
I am looking for a house with good ventilation, a pool and a yard. 
I would like to have a house in a neighborhood with good schools and parks. 
I am looking for a house with a cost of around 100000 USD."""

# Querying the vector store with the sample query
results = vector_store.similarity_search(query=sample_query, k=2)
print(results[0].page_content)

The house has 4 bedrooms, 2 bathrooms, and costs 450000 USD.Spacious two-story home in a friendly neighborhood. Features a beautiful backyard with a pool and plenty of room for outdoor activities. The house has four bedrooms, including a master suite, and two bathrooms. The garage is a spacious two-car garage. The kitchen has been recently renovated with modern appliances and an open layout. The basement is unfinished, providing room for potential expansion..
    The house has a pool.The house has a garage.The house has a yard.The house does not have a basement.Spacious two-story home in a friendly neighborhood. Features a beautiful backyard with a pool and plenty of room for outdoor activities. The house has four bedrooms, including a master suite, and two bathrooms. The garage is a spacious two-car garage. The kitchen has been recently renovated with modern appliances and an open layout. The basement is unfinished, providing room for potential expansion.Pleasantville is a lovely neig

## 3. Augmented Response Generation
In this section, the goal is to generate an augmented response for the user query. The response will be generated by using the semantic search to find the most similar listing to the user query.

Points to note:
1. The user query is currently hardcoded based on a dictionary of preferences. The additional preferences can be added or removed from the dictionary.

In [14]:
response_llm = ChatOpenAI(model_name='gpt-3.5-turbo', temperature=0.7, max_tokens=3000)

In [19]:
def get_user_preference_query(preferences):
    query = f"""I am looking for a house with the following requirements:\n\t- A house with a size of {preferences['size']} square feet.\n\t- A house with {preferences['bedrooms']} bedrooms.\n\t- A house with {preferences['bathrooms']} bathrooms.\n\t- A house with a budget of {preferences['budget']} USD.\n\t- A house in a locality that is {', '.join(preferences['locality_requirements'])}."""
    
    good_to_have_things = ['garage', 'pool', 'yard', 'basement']
    for thing in good_to_have_things:
        if thing in preferences:
            if preferences[thing]:
                query += f"\t- A house with a {thing}.\n"
            elif not preferences[thing]:
                query += f"\t- A house without a {thing}.\n"
        else:
            query += f"\t- A house with or without a {thing}.\n"
    return query + '\nBased on these requirements, provide me with a few options.'

### Test-1: Use of LLM for Generating Personalized Descriptions

In [15]:
user_prefrences= {
    'pool': True,
    'garage': True,
    'size': 2000,
    'budget': 300000,
    'locality_requirements': [
        'in a secure environment',
        'near good schools',
        'clean',
        'mountain view'
    ],
    'bedrooms': 3,
    'bathrooms': 2
}

user_query = get_user_preference_query(user_prefrences)

#prompt template to augment the user query with listings available
house_rec_prompt = PromptTemplate(
    template="{query}\nContext:\n{context}",
    input_variables=["query", "context"]
)

In [23]:
#searching listing descriptions
similar_houses = vector_store.similarity_search(query=user_query, k=3)

#Using LLM to generate house recommendations
chain = load_qa_chain(response_llm, prompt=house_rec_prompt, chain_type='stuff')
response = chain.run(query=user_query, input_documents=similar_houses)
print(response)

I'm sorry, but based on your requirements, I wasn't able to find a house that meets all of your criteria within your budget of 150,000 USD. The options provided above are slightly above your budget, but they do meet most of your requirements. If you are open to increasing your budget or adjusting your requirements, I can provide you with more options. Let me know how you would like to proceed.


### Test-2 : Use of LLM for Generating Personalized Descriptions

In [None]:
user_prefrences= {
    'garage': True,
    'size': 2000,
    'budget': 450000,
    'locality_requirements': [
        'in a secure environment',
        'near good schools',
        'clean',
        'mountain view'
    ],
    'bedrooms': 4,
    'bathrooms': 3
}

# Searching for listing options based on the user preferences
user_query = get_user_preference_query(user_prefrences)
similar_houses = vector_store.similarity_search(query=user_query, k=3)

response = chain.run(query=user_query, input_documents=similar_houses)
print(response)

I have found two options that meet your requirements:

1. Eastwood Heights:
- Size: 3500 square feet
- Bedrooms: 4
- Bathrooms: 3
- Price: 450000 USD
- Garage: Yes
- Pool: No
- Yard: Yes
- Basement: No
- Location: Eastwood Heights, a safe and picturesque neighborhood with top-rated schools and convenient amenities.

2. Willow Heights:
- Size: 3000 square feet
- Bedrooms: 4
- Bathrooms: 3
- Price: 450000 USD
- Garage: Yes
- Pool: No
- Yard: Yes
- Basement: Yes
- Location: Willow Heights, a peaceful and family-friendly neighborhood with great schools and amenities.

Both of these options offer spacious living areas, modern designs, and outdoor spaces perfect for families. You can consider visiting these houses to see if they meet all your requirements.
