<a href="https://colab.research.google.com/github/Domauser/data_labelling/blob/main/Doma_Home_Classification_1_0.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# ==========================================================
# Doma Property Image Tagging
# Author: Daniel Accetta
# Created: Aug 3, 2024
# Last Revised: Sept 14, 2025
# Goal: Apply AI tags to home images to power Doma Search
# -----------------------
# Phase 1. Imports
# -----------------------
!pip install -U -q google-generativeai

import os #creating files and directories
import re #regular expression
import random #for the initial model
import logging
import tqdm #allows fast, extensible progress for loops and iterations
import numpy as np #mathematical operations
import pandas as pd #manipulations in dataframes
from io import StringIO #needed for reading and writing files
from datetime import datetime
import google.generativeai as genai
from google.colab import userdata, drive, files
# Import the Image class from PIL
from PIL import Image
import cv2
import glob
from ast import literal_eval
import json

# -----------------------
# Phase 2. Configuration
# -----------------------
# CSV of property listings on the mounted Google Drive; read by load_data() via main().
dataset_path = "/content/drive/MyDrive/Company Ideas/Real Estate Mobile App/Product/Sample Data/properties.csv"
# Text file holding the tagging prompt; read by load_prompt() via main().
prompt_path = "/content/drive/MyDrive/Colab Notebooks/Doma Project/prompt_final.txt"
# Root logger: INFO and above, each record prefixed with a timestamp and its level.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

def safe_json_parse(json_string: str) -> dict:
    """
    Safely attempts to parse a JSON object out of a string.

    The string is first cleaned (surrounding whitespace and a Markdown code
    fence are removed, then the text is narrowed to the span between the first
    "{" and the last "}"). It is then parsed with json.loads; if that fails,
    ast.literal_eval is tried so Python-style dict literals (single quotes,
    trailing commas) are still accepted.

    Args:
        json_string: The string to parse.

    Returns:
        A dictionary if parsing is successful, otherwise an empty dictionary
        (also when the input is not a string or the parsed value is not a
        dict). Failures are logged, never raised.
    """
    if not isinstance(json_string, str):
        logging.error(f"Error parsing JSON: expected str, got {type(json_string).__name__}")
        return {}

    text = json_string.strip()

    # If the string is enclosed in a code block, drop the fence markers.
    if text.startswith("```"):
        text = text[3:]
        if text.startswith("json"):
            text = text[4:]
        if text.endswith("```"):
            text = text[:-3]

    # Narrow to the outermost {...} span so prose around the object is ignored.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]

    try:
        # Strict JSON first: literal_eval alone rejects true/false/null.
        parsed = json.loads(text)
    except json.JSONDecodeError as json_err:
        try:
            # Fallback for less strict, Python-literal style output.
            parsed = literal_eval(text)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            logging.error(f"Error parsing JSON: {json_err}")
            return {}

    return parsed if isinstance(parsed, dict) else {}

# -----------------------
# Phase 3. Setup API + Drive
# -----------------------
def setup_environment(api_key: str = None):
    """Mount Google Drive, then register the Gemini API key.

    Args:
        api_key: Key to use. When empty or omitted, the "GOOGLE_API_KEY"
            entry of Colab's userdata store is used instead.

    Raises:
        ValueError: If no key is available from either source.
    """
    drive.mount("/content/drive", force_remount=True)

    # Explicit argument wins; the Colab secret is only consulted as a fallback.
    resolved_key = api_key or userdata.get("GOOGLE_API_KEY")
    if not resolved_key:
        raise ValueError("Google API Key not found in Colab Secrets.")

    # Publish the key both to the environment and to the genai client.
    os.environ["GOOGLE_API_KEY"] = resolved_key
    genai.configure(api_key=resolved_key)

def main(api_key: str = None):
    """Run the whole tagging pipeline: setup, load, preprocess, tag, save.

    Args:
        api_key: Optional API key forwarded to setup_environment().
    """
    setup_environment(api_key)

    raw_listings = load_data(dataset_path, nrows=1000)  # Change based on file size
    tagging_prompt = load_prompt(prompt_path)

    # Image URLs are split out of the raw listings before cleaning and tagging.
    image_table = extract_images(raw_listings)
    tagged_table = apply_ai_tags(clean_dataset(raw_listings), tagging_prompt)

    save_outputs(tagged_table, image_table)
    logging.info("Pipeline completed successfully ✅")

# -----------------------
# Phase 4. Data Loading
# -----------------------
def load_data(path: str, nrows: int = 1000) -> pd.DataFrame:
    """Read at most `nrows` rows of the CSV at `path` into a DataFrame.

    Any failure is logged and then re-raised unchanged.
    """
    try:
        df = pd.read_csv(path, nrows=nrows)
    except Exception as e:
        logging.error(f"Error loading dataset: {e}")
        raise
    else:
        logging.info(f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns.")
        return df

def load_prompt(path: str) -> str:
    """Load text prompt for AI tagging.

    Args:
        path: Location of the prompt text file.

    Returns:
        The full file contents as a single string.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

# -----------------------
# Phase 5. Preprocessing
# -----------------------
def extract_images(df: pd.DataFrame) -> pd.DataFrame:
    """Extract multiple image URLs per listing.

    Args:
        df: Listings with an "id" column and an "imageURLs" column holding
            comma-separated URLs.

    Returns:
        A DataFrame with "id" plus columns "image 1" .. "image 11". Column
        "image i" holds the URL at position i + 1 (0-based) of the split
        list; positions beyond the end of a listing's list come back missing.
    """
    # Split once; the split does not depend on the loop variable.
    url_lists = df["imageURLs"].str.split(",")

    df_listing_images = pd.DataFrame({"id": df["id"]})
    # NOTE(review): positions 2..12 are used here and clean_dataset() takes
    # position 1 as the main image, so position 0 is never read - confirm
    # that skipping the first URL is intended.
    for i in range(1, 12):
        df_listing_images[f"image {i}"] = url_lists.str.get(i + 1)
    return df_listing_images

def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Select and rename relevant property fields.

    Args:
        df: Raw listings containing the source columns referenced below.

    Returns:
        A new DataFrame with the renamed listing columns, a random
        "recommendation" value per row, and one empty-string placeholder
        column per AI field that is not already present.
    """
    clean = pd.DataFrame({
        "id": df["id"],
        # Position 1 (0-based) of the comma-separated URL list is the main image.
        "propertyImage": df["imageURLs"].str.split(",").str.get(1),
        "propertyType": df["propertyType"],
        "bedrooms": df["numBedroom"],
        "bathrooms": df["numBathroom"],
        "floors": df["numFloor"],
        "rooms": df["numRoom"],
        "parking": df["parking"],
        "floorSizeValue": df["floorSizeValue"],
        "floorSizeUnit": df["floorSizeUnit"],
        "lotSize": df["lotSizeValue"],
        "lotSizeUnit": df["lotSizeUnit"],
        "yearBuilt": df["yearBuilt"],
        "address": df["address"],
        "city": df["city"],
        "state": df["province"],
        "country": df["country"],
        "county": df["county"],
        "zipCode": df["postalCode"],
        "latitude": df["latitude"],
        "longitude": df["longitude"],
        "listingName": df["listingName"],
        "price": df["mostRecentPriceAmount"],
        "status": df["mostRecentStatus"],
        "statusDate": df["mostRecentStatusDate"],
        "mlsNumber": df["mlsNumber"],
        "neighborhoods": df["neighborhoods"],
        "features": df["features"],
        "description": df["descriptions"],
        # Random integer in 1..99 (the upper bound of randint is exclusive).
        "recommendation": np.random.randint(1, 100, size=len(df))
    })

    # Define AI placeholder fields
    ai_fields = [
        "name", "style", "pitch", "insight", "total_costs", "living_space",
        "size", "layout", "condition", "natural_light", "storage_space",
        "updates", "energy_efficiency", "outdoor_space", "parking", "noise_level",
        "neighborhood", "school_district", "commute", "walkability", "zoning",
        "HOA", "resale_trends", "enviormental_risks", "community_features", "lifestyle_fit"
    ]
    for field in ai_fields:
        # "parking" is both a source column and an AI field; do not blank out
        # listing data that was just copied in above.
        if field not in clean.columns:
            clean[field] = ""

    return clean

# -----------------------
# Phase 6. AI Tagging
# -----------------------
def apply_ai_tags(df: pd.DataFrame, prompt: str) -> pd.DataFrame:
    """Ask the Gemini model to tag every listing and merge the tags into it.

    For each row, the prompt followed by the row's dict representation is
    sent to the model. The outermost {...} span of the reply is parsed with
    safe_json_parse() and each AI field is written onto the row (empty string
    when the field is missing or the reply could not be used).

    Args:
        df: Cleaned listings, one row per property; must contain "id".
        prompt: Instruction text placed in front of each listing.

    Returns:
        A DataFrame built from the tagged rows, in input order. Note that the
        AI "parking" tag replaces whatever the listing's "parking" column held.
    """
    # Define AI placeholder fields (once, not per listing)
    ai_fields = [
        "name", "style", "pitch", "insight", "total_costs", "living_space",
        "size", "layout", "condition", "natural_light", "storage_space",
        "updates", "energy_efficiency", "outdoor_space", "parking", "noise_level",
        "neighborhood", "school_district", "commute", "walkability", "zoning",
        "HOA", "resale_trends", "enviormental_risks", "community_features", "lifestyle_fit"
    ]
    model = genai.GenerativeModel("gemini-1.5-flash")
    results = []

    for _, row in tqdm.tqdm(df.iterrows(), total=len(df), desc="Tagging listings"):
        row_dict = row.to_dict()
        parsed = {}

        try:
            response = model.generate_content(f"{prompt}\n{row_dict}")
            # Reading .text can itself fail (presumably when the reply has no
            # usable text, e.g. a blocked response - confirm against the SDK).
            response_text = response.text
        except Exception as e:
            # Best effort per listing: one failed call must not discard the
            # tags already collected for the other rows.
            logging.error(f"AI tagging failed for ID {row['id']}: {e}")
        else:
            # Greedy match: from the first "{" to the last "}" of the reply.
            match = re.search(r"\{.*\}", response_text, re.DOTALL)
            candidate = match.group(0) if match else response_text
            parsed = safe_json_parse(candidate)
            if not isinstance(parsed, dict) or not parsed:
                logging.error(f"AI tagging failed for ID {row['id']}: Invalid JSON response")
                parsed = {}

        for field in ai_fields:
            row[field] = parsed.get(field, "")

        results.append(row)

    return pd.DataFrame(results)

# -----------------------
# Phase 7. Save Outputs
# -----------------------
#def save_outputs(df: pd.DataFrame, df_images: pd.DataFrame):
def save_outputs(df_main, df_images):
    """Write the tagged listings and the image table to timestamped CSVs.

    Both files go to the "Doma Project" folder on the mounted Drive and share
    one timestamp taken at call time. The saved paths are printed and logged.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    main_path = f"/content/drive/MyDrive/Colab Notebooks/Doma Project/response_{timestamp}.csv"
    img_path = f"/content/drive/MyDrive/Colab Notebooks/Doma Project/response_images_{timestamp}.csv"

    # Write both frames first (main table, then images), without the index.
    for frame, destination in ((df_main, main_path), (df_images, img_path)):
        frame.to_csv(destination, index=False)

    print(f"✅ Saved: {main_path}")
    print(f"✅ Saved: {img_path}")
    logging.info(f"Saved outputs: {main_path}, {img_path}")

# -----------------------
# Phase 8. Pipeline Runner
# -----------------------

# Run the pipeline only when this code is executed directly (a notebook cell
# or script run has __name__ == "__main__"); importing it will not start it.
if __name__ == "__main__":
    main()

# END OF NOTEBOOK

Mounted at /content/drive


Tagging listings:  20%|██        | 1/5 [00:05<00:22,  5.51s/it]

```json
{
  "name": "Redhawk Vista Estate",
  "style": "Contemporary",
  "pitch": "Spacious family home with stunning views!",
  "insight": "This 6-bedroom home significantly surpasses the average size for homes in this gated community of Redhawk.  The inclusion of solar panels is a substantial upgrade, adding significant value and reducing long-term energy costs. The large lot offers impressive privacy compared to many others in the area.",
  "total_costs": "$1,058,000 (excluding taxes, insurance, and HOA fees)",
  "living_space": "3,971 sq ft",
  "size": "6 beds, 4 baths",
  "layout": "Two-story with multiple living areas, open kitchen/family room, downstairs bedroom (potential guest suite).",
  "condition": "Built in 2002, well-maintained; recent updates mentioned in listing.",
  "natural_light": "Abundant natural light throughout; descriptions suggest large windows and potentially a sunroom.",
  "storage_space": "Information not explicitly provided but likely includes multiple clos

Tagging listings:  40%|████      | 2/5 [00:10<00:16,  5.43s/it]

```json
{
  "name": "Woodbridge Jewel",
  "style": "Contemporary",
  "pitch": "Modern 5-bed, ready now!",
  "insight": "This new construction home offers exceptional value compared to similar properties in Wylie, boasting a larger square footage and updated features at a competitive price point.  The location within the Manors at Woodbridge community provides access to amenities and a desirable school district, while still maintaining affordability relative to other areas with similar schools.",
  "total_costs": "$599,000 (plus taxes, insurance, and $40/month HOA)",
  "living_space": "3023 sq ft",
  "size": "5 beds, 3.5 baths",
  "layout": "Open-concept main floor with 2-story family room,  separate dining, game room and 4 upstairs bedrooms; 1 downstairs bedroom. ",
  "condition": "New construction (2023), excellent condition",
  "natural_light": "Abundant natural light throughout",
  "storage_space": "2-car garage, walk-in closets, and ample storage",
  "updates": "Modern kitchen with

Tagging listings:  60%|██████    | 3/5 [00:15<00:09,  4.98s/it]

```json
{
  "name": "Sapphire Grove Sanctuary",
  "style": "Contemporary",
  "pitch": "New build, open concept, family-friendly!",
  "insight": "This new construction home offers a surprisingly spacious layout for its price point compared to other similar homes in the Southeast San Antonio area.  The inclusion of a loft adds significant living space, often absent in homes of this size. The community amenities are a plus, particularly appealing to families.",
  "total_costs": "$243,999 + taxes, insurance, $107/month HOA",
  "living_space": "1952 sq ft",
  "size": "4 beds, 2.5 baths",
  "layout": "Open-concept main floor, loft upstairs, all bedrooms upstairs",
  "condition": "New construction (2024), excellent condition",
  "natural_light": "Abundant natural light throughout",
  "storage_space": "Two-car garage, walk-in closets, loft storage",
  "updates": "All new appliances and finishes",
  "energy_efficiency": "Energy-efficient appliances, potentially solar-ready",
  "outdoor_space": 

Tagging listings:  80%|████████  | 4/5 [00:19<00:04,  4.56s/it]

```json
{
  "name": "Marion Oaks Oasis",
  "style": "Ranch",
  "pitch": "Spacious 4-bed ranch, new construction!",
  "insight": "This new construction Ranch home offers significantly more living space than comparable properties in Marion Oaks at a competitive price point.  The open-concept design is highly desirable for families and entertaining.  While located in a growing area, the neighborhood offers a quiet and family-friendly setting.",
  "total_costs": "$325,990 (estimated, buyer to verify taxes, insurance)",
  "living_space": "1828 sq ft",
  "size": "4 beds, 2 baths",
  "layout": "Open-concept main floor",
  "condition": "New construction (completion March 2025)",
  "natural_light": "Northeast facing, good sunlight",
  "storage_space": "2-car garage, closets",
  "updates": "All new appliances, smart home system",
  "energy_efficiency": "Central A/C, electric heating",
  "outdoor_space": "Yard, potential for landscaping",
  "parking": "2-car garage",
  "noise_level": "Quiet, subu

Tagging listings: 100%|██████████| 5/5 [00:24<00:00,  4.85s/it]

```json
{
  "name": "Lanthorne Lodge",
  "style": "Raised Ranch",
  "pitch": "Monroe oasis: Pool, 3 garages, updated!",
  "insight": "This raised ranch offers exceptional value compared to similar properties in Monroe, boasting a larger-than-average lot size and substantial living space.  The extensive updates, including the kitchen and bathrooms, are significant upgrades rarely found in homes of this age. The three-car garage is a standout feature, and while the property is situated in a quiet neighborhood, its proximity to amenities is a key advantage.",
  "total_costs": "$649,000 (plus taxes, insurance)",
  "living_space": "2913 sq ft",
  "size": "3 beds, 3 baths",
  "layout": "Split-level with formal dining and open family room",
  "condition": "Built 1963, extensively updated",
  "natural_light": "Abundant natural light throughout",
  "storage_space": "3-car garage, partially finished basement, ample closets",
  "updates": "Custom kitchen, updated bathrooms, new boiler, herringbon




In [3]:
def safe_json_parse(json_string: str) -> dict:
    """
    Safely attempts to parse a JSON object out of a string.

    The string is first cleaned (surrounding whitespace and a Markdown code
    fence are removed, then the text is narrowed to the span between the first
    "{" and the last "}"). It is then parsed with json.loads; if that fails,
    ast.literal_eval is tried so Python-style dict literals (single quotes,
    trailing commas) are still accepted.

    Args:
        json_string: The string to parse.

    Returns:
        A dictionary if parsing is successful, otherwise an empty dictionary
        (also when the input is not a string or the parsed value is not a
        dict). Failures are logged, never raised.
    """
    if not isinstance(json_string, str):
        logging.error(f"Error parsing JSON: expected str, got {type(json_string).__name__}")
        return {}

    text = json_string.strip()

    # If the string is enclosed in a code block, drop the fence markers.
    if text.startswith("```"):
        text = text[3:]
        if text.startswith("json"):
            text = text[4:]
        if text.endswith("```"):
            text = text[:-3]

    # Narrow to the outermost {...} span so prose around the object is ignored.
    start = text.find("{")
    end = text.rfind("}")
    if start != -1 and end > start:
        text = text[start:end + 1]

    try:
        # Strict JSON first: literal_eval alone rejects true/false/null.
        parsed = json.loads(text)
    except json.JSONDecodeError as json_err:
        try:
            # Fallback for less strict, Python-literal style output.
            parsed = literal_eval(text)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            logging.error(f"Error parsing JSON: {json_err}")
            return {}

    return parsed if isinstance(parsed, dict) else {}

In [None]:
# Hand-pasted sample of a complete model reply (plain JSON object, no code
# fence), used below to check that it parses with json.loads.
response = """{
  "name": "Redhawk Ridge Retreat",
  "style": "Contemporary",
  "pitch": "Luxury living with panoramic views!",
  "insight": "This 6-bedroom home boasts significantly more living space than comparable properties in Redhawk, offering ample room for a large family.  The inclusion of solar panels is a rare and valuable asset, significantly lowering energy costs. The location within the gated Vintage community provides extra security and privacy.",
  "total_costs": "$1,058,000 (price) + Property Taxes + Insurance + HOA Fees",
  "living_space": "3971 sq ft",
  "size": "6 beds, 4 baths",
  "layout": "Two-story; formal dining and living rooms, open-concept kitchen and family room. Large master suite and 5 additional bedrooms.",
  "condition": "Built 2002, well-maintained; recent updates based on listing description.",
  "natural_light": "Abundant natural light, especially with the view.",
  "storage_space": "Ample closet space; walk-in pantry.",
  "updates": "Gourmet kitchen, updated bathrooms based on descriptions, new carpet and paint mentioned in some listings.",
  "energy_efficiency": "Solar panels; whole-house fan mentioned in some listings.",
  "outdoor_space": "Large backyard; saltwater pool and spa; built-in BBQ; fire pit; koi pond.",
  "parking": "Attached garage (size unspecified).",
  "noise_level": "Quiet, gated community.",
  "neighborhood": "Gated community in Redhawk, Temecula; known for its luxury homes and amenities.",
  "school_district": "Temecula Valley Unified School District; check specific school ratings.",
  "commute": "Proximity to amenities and freeways to be determined based on buyer's needs.",
  "walkability": "Limited walkability; car-dependent community.",
  "zoning": "Residential; check specific zoning regulations.",
  "HOA": "HOA fees applicable within the gated community; inquire for details.",
  "resale_trends": "Strong appreciation in Redhawk; check recent sales data.",
  "enviormental_risks": "High earthquake risk in the area; check specific flood risk.",
  "community_features": "Gated community; pool; resort-style amenities; views.",
  "lifestyle_fit": "Families seeking luxury, privacy, and resort-style living."
}"""

# Scratch check: show the raw sample and confirm it is a plain str.
print(response)
print(type(response))

import json

# Strict JSON parsing of the sample; raises json.JSONDecodeError if it is invalid.
parsed = json.loads(response)
print(parsed["name"])        # should print: Redhawk Ridge Retreat
print(parsed["style"])       # should print: Contemporary
print(parsed["pitch"])       # should print: Luxury living...

{
  "name": "Redhawk Ridge Retreat",
  "style": "Contemporary",
  "pitch": "Luxury living with panoramic views!",
  "insight": "This 6-bedroom home boasts significantly more living space than comparable properties in Redhawk, offering ample room for a large family.  The inclusion of solar panels is a rare and valuable asset, significantly lowering energy costs. The location within the gated Vintage community provides extra security and privacy.",
  "total_costs": "$1,058,000 (price) + Property Taxes + Insurance + HOA Fees",
  "living_space": "3971 sq ft",
  "size": "6 beds, 4 baths",
  "layout": "Two-story; formal dining and living rooms, open-concept kitchen and family room. Large master suite and 5 additional bedrooms.",
  "condition": "Built 2002, well-maintained; recent updates based on listing description.",
  "natural_light": "Abundant natural light, especially with the view.",
  "storage_space": "Ample closet space; walk-in pantry.",
  "updates": "Gourmet kitchen, updated bathro

In [None]:
# Standalone cell: mount Google Drive at /content/drive.
# Unlike setup_environment(), this call does not pass force_remount=True.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
