In [5]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Preprocessing: build one descriptive text string per listing, embed it with
# a sentence-transformer model, and save the table (with embeddings) to parquet.

# Load data
df = pd.read_csv("data.csv")

# Fill missing values in the text columns.
# "locality" must be included: it is concatenated into search_text below, and
# in pandas string concatenation a single NaN operand makes the whole result
# NaN, so the listing would end up embedded as the literal text "nan".
text_cols = ['property_description', 'property_overview', 'building_type', 'locality', 'city']
for col in text_cols:
    df[col] = df[col].fillna("")

# Combine the structured listing fields into a single search string.
# (The free-text property_description / property_overview columns are not part
# of this string.) Numeric columns go through astype(str), so a missing number
# is rendered as "nan" rather than blanking the whole row.
df["search_text"] = (
    "Area: " + df["area"].astype(str) + " sqft | " +
    "Type: " + df["building_type"] + " | " +
    "Price: $" + df["price"].astype(str) + " | " +
    "Beds: " + df["num_bed_rooms"].astype(str) + " | Baths: " + df["num_bath_rooms"].astype(str) + " | " +
    "Location: " + df["locality"] + ", " + df["city"] + " | " +
    "Amenities: Relaxation (" + df["relaxation_amenity_count"].astype(str) + "), " +
    "Security (" + df["security_amenity_count"].astype(str) + "), " +
    "Cleaning (" + df["maintenance_or_cleaning_amenity_count"].astype(str) + ")"
)

# Generate embeddings
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # Lightweight model
# Encode all texts in one call so the model can batch them internally instead
# of running one forward pass per row. The result is a 2-D array with one row
# per listing; list() splits it back into one 1-D vector per DataFrame row.
search_texts = df["search_text"].astype(str).tolist()
df["embedding"] = list(embedder.encode(search_texts))

# Save
df.to_parquet("real_estate_embeddings.parquet")
print("Preprocessing done! Embeddings saved.")

  from tqdm.autonotebook import tqdm, trange
  df = pd.read_csv("data.csv")


Preprocessing done! Embeddings saved.
