<a href="https://colab.research.google.com/github/Domauser/data_labelling/blob/main/Doma_Home_Classification_0_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ==========================================================
# Doma Property Image Tagging
# Author: Daniel Accetta
# Created: Aug 3, 2024
# Last Revised: Sept 7, 2025
# Goal: Apply AI tags to home images to power Doma Search
# -----------------------
# Phase 1. Imports
# -----------------------
!pip install -U -q google-generativeai

import os #creating files and directories
import re #regular expression
import random #for the initial model
import logging
import tqdm #allows fast, extensible progress bars for loops and iterations
import numpy as np #mathematical operations
import pandas as pd #manipulations in dataframes
from io import StringIO #needed for reading and writing files
from datetime import datetime
import google.generativeai as genai
from google.colab import userdata, drive, files
# Import the Image class from PIL
from PIL import Image
import cv2
import glob
from ast import literal_eval
import json

# -----------------------
# Phase 2. Configuration
# -----------------------
# CSV of property listings on the mounted Google Drive (passed to load_data by main).
dataset_path = "/content/drive/MyDrive/Company Ideas/Real Estate Mobile App/Product/Sample Data/properties.csv"
# Text file whose contents are placed ahead of each listing in the model request.
prompt_path = "/content/drive/MyDrive/Colab Notebooks/Doma Project/prompt_final.txt"
# Emit INFO-and-above records, each prefixed with a timestamp and the level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")

# -----------------------
# Phase 3. Setup API + Drive
# -----------------------
def setup_environment(api_key: str = None):
    """Prepare the Colab runtime: mount Drive, then register the API key.

    Args:
        api_key: Key to use. When falsy, the key is looked up in Colab
            Secrets under the name ``GOOGLE_API_KEY``.

    Raises:
        ValueError: If no key was passed and the secret lookup returned
            nothing usable.
    """
    # force_remount=True re-attaches Drive even if it is already mounted.
    drive.mount("/content/drive", force_remount=True)

    # An explicit argument wins; otherwise fall back to the Colab secret.
    resolved_key = api_key or userdata.get("GOOGLE_API_KEY")
    if not resolved_key:
        raise ValueError("Google API Key not found in Colab Secrets.")

    # Publish the key both through the environment and to the genai client.
    os.environ["GOOGLE_API_KEY"] = resolved_key
    genai.configure(api_key=resolved_key)

def main(api_key: str = None):
    """Run the tagging pipeline end to end.

    Steps, in order: set up Drive and the API key, load the listings and
    the prompt, derive the image table, clean the listings, tag them with
    the model, and write both tables to CSV.

    Args:
        api_key: Optional API key forwarded to ``setup_environment``.
    """
    setup_environment(api_key)

    # nrows=1 keeps test runs small; raise it for a real run.
    listings = load_data(dataset_path, nrows=1)
    tagging_prompt = load_prompt(prompt_path)

    image_table = extract_images(listings)
    tagged_listings = apply_ai_tags(clean_dataset(listings), tagging_prompt)

    save_outputs(tagged_listings, image_table)
    logging.info("Pipeline completed successfully ✅")

# -----------------------
# Phase 4. Data Loading
# -----------------------
def load_data(path: str, nrows: int = 1000) -> pd.DataFrame:
    """Read up to ``nrows`` rows of the CSV at ``path`` into a DataFrame.

    Args:
        path: Location of the CSV file.
        nrows: Maximum number of data rows to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Whatever ``pandas.read_csv`` raised; the error is logged
            first and then re-raised unchanged.
    """
    try:
        df = pd.read_csv(path, nrows=nrows)
        logging.info(f"Loaded dataset with {df.shape[0]} rows and {df.shape[1]} columns.")
    except Exception as e:
        # Record the failure for the notebook log, then let the caller see it.
        logging.error(f"Error loading dataset: {e}")
        raise
    return df

def load_prompt(path: str) -> str:
    """Return the full text of the prompt file at ``path``.

    The file is decoded as UTF-8 explicitly so the result does not depend
    on the locale's default encoding of the machine running the notebook.

    Args:
        path: Location of the prompt text file.

    Returns:
        The file contents as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

# -----------------------
# Phase 5. Preprocessing
# -----------------------
def extract_images(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-listing table of additional image URLs.

    ``imageURLs`` is split on commas and entries 2 through 12 (zero-based)
    become the columns ``image 1`` .. ``image 11``. Listings with fewer
    URLs get missing values in the remaining columns.

    Args:
        df: Listings with ``id`` and ``imageURLs`` columns.

    Returns:
        A DataFrame with ``id`` followed by ``image 1`` .. ``image 11``.
    """
    # Split once and reuse the resulting lists for every column.
    url_lists = df["imageURLs"].str.split(",")

    # NOTE(review): entry 0 of the URL list is never read here, and
    # clean_dataset takes entry 1 as the main image -- confirm that skipping
    # the first URL is intended.
    image_columns = {f"image {i}": url_lists.str.get(i + 1) for i in range(1, 12)}
    return pd.DataFrame({"id": df["id"], **image_columns})

def clean_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """Select and rename the listing fields used for tagging.

    Besides the renamed source columns, the result carries:

    * ``propertyImage`` -- entry 1 (zero-based) of the comma-separated
      ``imageURLs`` value.
    * ``recommendation`` -- a random integer from 1 to 99 per row
      (``np.random.randint`` excludes the upper bound).
    * one empty-string placeholder column per AI field that is not already
      a source column.

    Args:
        df: Raw listings as loaded from the dataset CSV.

    Returns:
        A new DataFrame with the cleaned columns and AI placeholders.

    Raises:
        KeyError: If ``df`` lacks one of the expected source columns.
    """
    clean = pd.DataFrame({
        "id": df["id"],
        "propertyImage": df["imageURLs"].str.split(",").str.get(1),
        "propertyType": df["propertyType"],
        "bedrooms": df["numBedroom"],
        "bathrooms": df["numBathroom"],
        "floors": df["numFloor"],
        "rooms": df["numRoom"],
        "parking": df["parking"],
        "floorSizeValue": df["floorSizeValue"],
        "floorSizeUnit": df["floorSizeUnit"],
        "lotSize": df["lotSizeValue"],
        "lotSizeUnit": df["lotSizeUnit"],
        "yearBuilt": df["yearBuilt"],
        "address": df["address"],
        "city": df["city"],
        "state": df["province"],
        "country": df["country"],
        "county": df["county"],
        "zipCode": df["postalCode"],
        "latitude": df["latitude"],
        "longitude": df["longitude"],
        "listingName": df["listingName"],
        "price": df["mostRecentPriceAmount"],
        "status": df["mostRecentStatus"],
        "statusDate": df["mostRecentStatusDate"],
        "mlsNumber": df["mlsNumber"],
        "neighborhoods": df["neighborhoods"],
        "features": df["features"],
        "description": df["descriptions"],
        "recommendation": np.random.randint(1, 100, size=len(df)),
    })

    # Define AI placeholder fields
    ai_fields = [
        "name", "style", "pitch", "insight", "total_costs", "living_space",
        "size", "layout", "condition", "natural_light", "storage_space",
        "updates", "energy_efficiency", "outdoor_space", "parking", "noise_level",
        "neighborhood", "school_district", "commute", "walkability", "zoning",
        "HOA", "resale_trends", "enviormental_risks", "community_features", "lifestyle_fit"
    ]
    for field in ai_fields:
        # "parking" is both a source column and an AI field. Blanking it here
        # would erase the listing's parking data before the row is sent to
        # the model, so only create placeholders for columns that are new.
        if field not in clean.columns:
            clean[field] = ""

    return clean

# -----------------------
# Phase 6. AI Tagging
# -----------------------
def _extract_json_object(text: str) -> dict:
    """Parse the JSON object embedded in ``text``.

    The reply may carry extra text around the JSON (for example Markdown
    code fences), so the span from the first ``{`` to the last ``}`` is
    isolated before parsing.

    Raises:
        json.JSONDecodeError: If no ``{...}`` span exists or the span is
            not valid JSON.
    """
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match is None:
        raise json.JSONDecodeError("No JSON object found", text, 0)
    return json.loads(match.group(0))


def apply_ai_tags(df: pd.DataFrame, prompt: str) -> pd.DataFrame:
    """Ask the model to tag every listing and merge the tags into the rows.

    For each row, the prompt followed by the row's dict representation is
    sent to the model. The JSON object found in the reply supplies the AI
    fields; any field missing from the reply, and every field when the
    reply cannot be parsed, is set to ``""``.

    Args:
        df: Cleaned listings; must contain an ``id`` column (used in logs).
        prompt: Instruction text placed ahead of each listing.

    Returns:
        A new DataFrame with one row per input row and the AI fields filled.
    """
    model = genai.GenerativeModel("gemini-1.5-flash")

    # Fields expected in the model's JSON reply; defined once, outside the loop.
    ai_fields = [
        "name", "style", "pitch", "insight", "total_costs", "living_space",
        "size", "layout", "condition", "natural_light", "storage_space",
        "updates", "energy_efficiency", "outdoor_space", "parking", "noise_level",
        "neighborhood", "school_district", "commute", "walkability", "zoning",
        "HOA", "resale_trends", "enviormental_risks", "community_features", "lifestyle_fit"
    ]
    results = []

    for _, row in tqdm.tqdm(df.iterrows(), total=len(df), desc="Tagging listings"):
        response = model.generate_content(f"{prompt}\n{row.to_dict()}")

        try:
            # Parse the extracted {...} span, not the raw reply text.
            parsed = _extract_json_object(response.text)
        except json.JSONDecodeError:
            logging.error(f"AI tagging failed for ID {row['id']}: Invalid JSON response")
            parsed = {}
        except ValueError as exc:
            # NOTE(review): response.text is understood to raise ValueError
            # when the reply has no text part (e.g. a blocked response) --
            # confirm against the google-generativeai documentation.
            logging.error(f"AI tagging failed for ID {row['id']}: {exc}")
            parsed = {}
        logging.debug("Parsed JSON for ID %s: %s", row["id"], parsed)

        # .get() keeps a partially filled reply from raising KeyError.
        for field in ai_fields:
            row[field] = parsed.get(field, "")

        results.append(row)

    return pd.DataFrame(results)


# -----------------------
# Phase 7. Save Outputs
# -----------------------
#def save_outputs(df: pd.DataFrame, df_images: pd.DataFrame):
def save_outputs(df_main, df_images):
    """Write the tagged listings and the image table to timestamped CSVs.

    Both files go to the Doma Project folder on Drive and share one
    ``YYYYMMDD_HHMMSS`` timestamp taken when the function starts.

    Args:
        df_main: Tagged listings table.
        df_images: Per-listing image URL table.
    """
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    main_path = f"/content/drive/MyDrive/Colab Notebooks/Doma Project/response_{timestamp}.csv"
    img_path = f"/content/drive/MyDrive/Colab Notebooks/Doma Project/response_images_{timestamp}.csv"

    # Write both tables first, without the index column, then report.
    for frame, destination in ((df_main, main_path), (df_images, img_path)):
        frame.to_csv(destination, index=False)

    print(f"✅ Saved: {main_path}")
    print(f"✅ Saved: {img_path}")
    logging.info(f"Saved outputs: {main_path}, {img_path}")

# -----------------------
# Phase 8. Pipeline Runner
# -----------------------

# Run the whole pipeline when this cell executes; with no argument, the API key is read from Colab Secrets.
main()

# END OF NOTEBOOK

# Iterate through each row of the DataFrame
#for index, row in df_temp.iterrows():
#    # Convert the row to a dictionary
#    row_dict = row.to_dict()
    # Pass the dictionary and the prompt to the model
    # Modified to pass the row data as part of the prompt string
#    response = model.generate_content(f"{prompt}\n{row_dict}")
    # Process the response
#    temp = pd.DataFrame({response.text}, index=[0])
#    df = pd.concat([df, temp], ignore_index=True)

# Need to validate that df is writing temp from response

IndentationError: unexpected indent (ipython-input-2715087291.py, line 158)

In [None]:
# Sample JSON reply held as a string; the statements below parse it with json.loads.
response = """{
  "name": "Redhawk Ridge Retreat",
  "style": "Contemporary",
  "pitch": "Luxury living with panoramic views!",
  "insight": "This 6-bedroom home boasts significantly more living space than comparable properties in Redhawk, offering ample room for a large family.  The inclusion of solar panels is a rare and valuable asset, significantly lowering energy costs. The location within the gated Vintage community provides extra security and privacy.",
  "total_costs": "$1,058,000 (price) + Property Taxes + Insurance + HOA Fees",
  "living_space": "3971 sq ft",
  "size": "6 beds, 4 baths",
  "layout": "Two-story; formal dining and living rooms, open-concept kitchen and family room. Large master suite and 5 additional bedrooms.",
  "condition": "Built 2002, well-maintained; recent updates based on listing description.",
  "natural_light": "Abundant natural light, especially with the view.",
  "storage_space": "Ample closet space; walk-in pantry.",
  "updates": "Gourmet kitchen, updated bathrooms based on descriptions, new carpet and paint mentioned in some listings.",
  "energy_efficiency": "Solar panels; whole-house fan mentioned in some listings.",
  "outdoor_space": "Large backyard; saltwater pool and spa; built-in BBQ; fire pit; koi pond.",
  "parking": "Attached garage (size unspecified).",
  "noise_level": "Quiet, gated community.",
  "neighborhood": "Gated community in Redhawk, Temecula; known for its luxury homes and amenities.",
  "school_district": "Temecula Valley Unified School District; check specific school ratings.",
  "commute": "Proximity to amenities and freeways to be determined based on buyer's needs.",
  "walkability": "Limited walkability; car-dependent community.",
  "zoning": "Residential; check specific zoning regulations.",
  "HOA": "HOA fees applicable within the gated community; inquire for details.",
  "resale_trends": "Strong appreciation in Redhawk; check recent sales data.",
  "enviormental_risks": "High earthquake risk in the area; check specific flood risk.",
  "community_features": "Gated community; pool; resort-style amenities; views.",
  "lifestyle_fit": "Families seeking luxury, privacy, and resort-style living."
}"""

import json

# Echo the raw sample and its Python type before parsing.
print(response)
print(type(response))

# Parse the sample and show the three headline fields, one per line:
# name, style, pitch.
parsed = json.loads(response)
for key in ("name", "style", "pitch"):
    print(parsed[key])

{
  "name": "Redhawk Ridge Retreat",
  "style": "Contemporary",
  "pitch": "Luxury living with panoramic views!",
  "insight": "This 6-bedroom home boasts significantly more living space than comparable properties in Redhawk, offering ample room for a large family.  The inclusion of solar panels is a rare and valuable asset, significantly lowering energy costs. The location within the gated Vintage community provides extra security and privacy.",
  "total_costs": "$1,058,000 (price) + Property Taxes + Insurance + HOA Fees",
  "living_space": "3971 sq ft",
  "size": "6 beds, 4 baths",
  "layout": "Two-story; formal dining and living rooms, open-concept kitchen and family room. Large master suite and 5 additional bedrooms.",
  "condition": "Built 2002, well-maintained; recent updates based on listing description.",
  "natural_light": "Abundant natural light, especially with the view.",
  "storage_space": "Ample closet space; walk-in pantry.",
  "updates": "Gourmet kitchen, updated bathro

In [None]:
# Mount Google Drive at /content/drive so files under MyDrive are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
