In [0]:
%sql
-- Peek at ten rows of the home-features table.
SELECT *
FROM mycatalog.hp_prd_data.fp_home_features
LIMIT 10

In [0]:
%sql
-- Peek at ten rows of the parsed-listing table.
SELECT *
FROM mycatalog.hp_prd_data.fp_parsed_data
LIMIT 10

In [0]:
%sql
-- Preview the join used below: every parsed-listing column plus top_features,
-- matched on propertyId.
SELECT
    fp_parsed.*,
    fp_hm_feat.top_features
FROM mycatalog.hp_prd_data.fp_parsed_data AS fp_parsed
INNER JOIN mycatalog.hp_prd_data.fp_home_features AS fp_hm_feat
    ON fp_parsed.propertyId = fp_hm_feat.propertyId
LIMIT 10

In [0]:
# List the column names of `_sqldf`. This name is never assigned in the notebook;
# presumably it is the implicit result of the most recent %sql cell on Databricks
# (TODO confirm) — so this cell only works right after that SQL cell has run.
_sqldf.columns

In [0]:
from pyspark.sql.functions import struct, udf
from pyspark.sql.types import StringType

def create_template(row):
    """Render one property record as a single descriptive sentence block.

    Args:
        row: Any object exposing the property fields as attributes
            (propertyType, location, city, state, countryCode, postalCode,
            yearBuilt, sqft, lot_size, beds, baths, fullbaths, partialBaths,
            house_price, stories, top_features, zip_population, zip_density),
            e.g. a Spark ``Row`` built from a struct column.

    Returns:
        str: The formatted description. Fields whose value is ``None`` are
        rendered as "N/A" instead of the literal text "None", and the result
        carries no trailing whitespace, so the output matches the
        per-column template defined later in this notebook.
    """
    def field(name):
        # Missing values must not leak into the embedding text as "None".
        value = getattr(row, name)
        return "N/A" if value is None else str(value)

    return (
        f"Property Type: {field('propertyType')}. "
        f"Located in {field('location')}, {field('city')}, {field('state')}, {field('countryCode')}, "
        f"ZIP {field('postalCode')}. "
        f"Built in {field('yearBuilt')}, approximately {field('sqft')} sqft "
        f"on a {field('lot_size')} sqft lot with {field('beds')} bedrooms and {field('baths')} bathrooms "
        f"({field('fullbaths')} full, {field('partialBaths')} partial). "
        f"Listed price is ${field('house_price')}. "
        f"Stories: {field('stories')}. "
        f"Additional Features: {field('top_features')}. "
        f"Neighborhood population: {field('zip_population')}, density: {field('zip_density')}."
    )

# Wrap create_template as a Spark UDF whose declared return type is string.
template_udf = udf(create_template, StringType())



In [0]:
# Load the full (un-limited) join: all parsed-listing columns plus top_features,
# matched on propertyId. Same query as the %sql preview, without the LIMIT.
df = spark.sql("""
            select 
            fp_parsed.*,
            fp_hm_feat.top_features
            from mycatalog.hp_prd_data.fp_parsed_data fp_parsed
            inner join mycatalog.hp_prd_data.fp_home_features fp_hm_feat
            on fp_parsed.propertyId = fp_hm_feat.propertyId
""")

In [0]:
# Row count of the joined frame (runs the join as a Spark job).
df.count()

In [0]:
# Show four joined rows as a pandas DataFrame for a readable preview.
df.limit(4).toPandas()

In [0]:
# Create a single column 'embedding_text' from the property attributes.
# The order of this list does not matter here: fields are looked up by name.
columns_needed = [
 'propertyType','location','city','state','countryCode','postalCode',
 'yearBuilt','sqft','lot_size','beds','baths','fullbaths','partialBaths',
 'house_price','stories','top_features','zip_population','zip_density'
]

df_text = df.select(*columns_needed)
# create_template(row) takes ONE row-like argument. Passing the 18 columns
# individually would call it with 18 positional arguments and fail with a
# TypeError on the executors, so pack them into a single struct column; the
# UDF then receives a Row whose fields are read by attribute name.
df_text = df_text.withColumn("embedding_text", template_udf(struct(*columns_needed)))


In [0]:
# Preview four rows of just the columns that feed the text template.
df.select(*columns_needed).limit(4).toPandas()

In [0]:
# Print df_text's schema (column names and types).
df_text.printSchema()

In [0]:
# Display five rows without truncating long values, so the full embedding_text is visible.
df_text.show(5, truncate=False)

In [0]:
from pyspark.sql.functions import udf, col, when, coalesce, lit
from pyspark.sql.types import StringType, Row

def create_template(propertyType, location, city, state, countryCode, postalCode,
                   yearBuilt, sqft, lot_size, beds, baths, fullbaths, partialBaths,
                   house_price, stories, top_features, zip_population, zip_density):
    """
    Build the descriptive text for one property from its individual fields.

    Every argument is converted with ``str``; an argument that is ``None``
    is shown as "N/A". Returns the assembled description as a single string.
    """
    placeholder = "N/A"

    def text_or_na(raw):
        # Only None counts as missing; falsy values such as 0 are kept as-is.
        if raw is None:
            return placeholder
        return str(raw)

    description = (
        f"Property Type: {text_or_na(propertyType)}. "
        f"Located in {text_or_na(location)}, {text_or_na(city)}, {text_or_na(state)}, {text_or_na(countryCode)}, "
        f"ZIP {text_or_na(postalCode)}. "
        f"Built in {text_or_na(yearBuilt)}, approximately {text_or_na(sqft)} sqft "
        f"on a {text_or_na(lot_size)} sqft lot with {text_or_na(beds)} bedrooms and {text_or_na(baths)} bathrooms "
        f"({text_or_na(fullbaths)} full, {text_or_na(partialBaths)} partial). "
        f"Listed price is ${text_or_na(house_price)}. "
        f"Stories: {text_or_na(stories)}. "
        f"Additional Features: {text_or_na(top_features)}. "
        f"Neighborhood population: {text_or_na(zip_population)}, density: {text_or_na(zip_density)}."
    )
    return description

# Expose the null-safe, per-field formatter as a string-returning Spark UDF.
template_udf = udf(create_template, StringType())

# Joined listing data: all parsed columns plus top_features, keyed on propertyId.
df = spark.sql("""
    SELECT
        fp_parsed.*,
        fp_hm_feat.top_features
    FROM mycatalog.hp_prd_data.fp_parsed_data fp_parsed
    INNER JOIN mycatalog.hp_prd_data.fp_home_features fp_hm_feat
        ON fp_parsed.propertyId = fp_hm_feat.propertyId
""")

# Inputs for the template. The ORDER matters: it must match the positional
# parameter order of create_template, because the columns are passed positionally.
columns_needed = [
    'propertyType', 'location', 'city', 'state', 'countryCode', 'postalCode',
    'yearBuilt', 'sqft', 'lot_size', 'beds', 'baths', 'fullbaths', 'partialBaths',
    'house_price', 'stories', 'top_features', 'zip_population', 'zip_density'
]

# Keep only the template inputs, then append the rendered text as a new column.
# The column references are derived from columns_needed rather than listed twice.
df_text = (
    df.select(*columns_needed)
    .withColumn(
        "embedding_text",
        template_udf(*(col(name) for name in columns_needed)),
    )
)

In [0]:
# Show five rendered descriptions in full (no truncation).
df_text.select(col("embedding_text")).show(5, truncate=False)

In [0]:
# Next step: turn the embedding_text column into embedding vectors.

In [0]:
# In Databricks notebook
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch

# Take a 20-row sample of the rendered text to experiment with.
# Note: this rebinds `df`, which previously held the full joined frame.
df = (
    df_text
    .select("embedding_text")
    .limit(20)
)

# Clean and chunk your text data
def chunk_text(text, chunk_size=512, overlap=50):
    """Split text into overlapping chunks of whitespace-separated words.

    Args:
        text: The text to split. ``None`` or an empty/blank string yields
            an empty list.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared between consecutive chunks; must
            satisfy ``0 <= overlap < chunk_size``.

    Returns:
        list[str]: Chunks in document order, each re-joined with single spaces.

    Raises:
        ValueError: If ``chunk_size`` or ``overlap`` is out of range. Without
            this check, ``overlap == chunk_size`` fails with an opaque
            range-step error and ``overlap > chunk_size`` silently returns
            no chunks at all.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    if not 0 <= overlap < chunk_size:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")

    words = text.split() if text else []
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        chunks.append(' '.join(words[start:start + chunk_size]))
        # Once a window reaches the last word, any further window would only
        # repeat words already covered by this chunk, so stop here.
        if start + chunk_size >= len(words):
            break
    return chunks

# Apply chunking to the sampled documents.
# `df` holds a single column, "embedding_text"; it has no id, content or
# metadata fields, so reading those attributes from a Row raises AttributeError.
# A positional id is therefore generated with zipWithIndex, the text is read
# from embedding_text, and "metadata" is kept as a null placeholder so the
# output keeps the same three column names.
# An explicit schema is required because Spark cannot infer a type for an
# all-null column.
df_chunked = (
    df.rdd
    .zipWithIndex()  # yields (Row, index) pairs
    .flatMap(lambda pair: [
        (pair[1], chunk, None)
        for chunk in chunk_text(pair[0].embedding_text)
    ])
    .toDF("doc_id BIGINT, chunk STRING, metadata STRING")
)