### Understand Product Structure Stats

In [None]:
%%bigquery
-- Catalog cardinality: returns a single row with the total number of product
-- rows and the number of distinct categories, brands and departments.
SELECT
    COUNT(*) AS total_products,
    COUNT(DISTINCT products.category) AS categories,
    COUNT(DISTINCT products.brand) AS brands,
    COUNT(DISTINCT products.department) AS departments
FROM `bigquery-public-data.thelook_ecommerce.products` AS products;

### Sample Product Structure

In [None]:
%%bigquery
-- Peek at five catalog rows to see what each column looks like.
-- There is no ORDER BY, so which five rows come back is up to the engine.
SELECT
    prod.id,
    prod.name,
    prod.category,
    prod.brand,
    prod.department,
    prod.retail_price,
    prod.cost
FROM `bigquery-public-data.thelook_ecommerce.products` AS prod
LIMIT 5;

### Dataset Overview

In [None]:
%%bigquery
-- One-row summary of the catalog: counts plus average / minimum / maximum
-- retail price. Only rows that have a product name are included.
SELECT
    '📊 TheLook E-commerce Dataset Overview' AS analysis,
    COUNT(*) AS total_products,
    COUNT(DISTINCT prod.category) AS categories,
    COUNT(DISTINCT prod.brand) AS brands,
    COUNT(DISTINCT prod.department) AS departments,
    AVG(prod.retail_price) AS avg_price,
    MIN(prod.retail_price) AS min_price,
    MAX(prod.retail_price) AS max_price
FROM `bigquery-public-data.thelook_ecommerce.products` AS prod
WHERE prod.name IS NOT NULL;

### Create Product Features Table

In [None]:
%%bigquery
-- Build the feature table that feeds the embedding model: the raw product
-- attributes plus `semantic_description`, a single sentence-like string that
-- summarises name, category, brand, department and a coarse price band.
--
-- BigQuery's CONCAT returns NULL as soon as any argument is NULL, so each
-- nullable text attribute is wrapped in COALESCE. Without it, a product with a
-- missing name or brand would get a NULL description and no usable embedding.
CREATE OR REPLACE TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_features` AS
SELECT
  p.id,
  p.name,
  p.category,
  p.brand,
  p.department,
  p.retail_price,
  p.cost,
  CONCAT(
    'Product: ', COALESCE(p.name, 'Unknown'), '. ',
    'Category: ', COALESCE(p.category, 'Unknown'), '. ',
    'Brand: ', COALESCE(p.brand, 'Unknown'), '. ',
    'Department: ', COALESCE(p.department, 'Unknown'), '. ',
    'Price range: ',
    CASE
      -- A missing price must not fall through to the ELSE branch ('luxury').
      WHEN p.retail_price IS NULL THEN 'unknown'
      WHEN p.retail_price < 20 THEN 'budget-friendly'
      WHEN p.retail_price < 50 THEN 'mid-range'
      WHEN p.retail_price < 100 THEN 'premium'
      ELSE 'luxury'
    END
  ) AS semantic_description
FROM `bigquery-public-data.thelook_ecommerce.products` AS p;

### Create Text Embedding Model

In [None]:
%%bigquery
-- Register a remote model backed by the 'text-embedding-004' endpoint, using
-- the project's default connection.
-- OR REPLACE makes the cell re-runnable, matching the CREATE OR REPLACE TABLE
-- statements elsewhere in this notebook; a plain CREATE MODEL fails once the
-- model already exists.
CREATE OR REPLACE MODEL `bigquery-hackathon-471715.thelook_ecommerce.thelook_model_text_embedding_004`
REMOTE WITH CONNECTION DEFAULT
OPTIONS(
  ENDPOINT = 'text-embedding-004'
);

### Generate Product Embeddings

In [None]:
%%bigquery
-- Embed every product description and persist the result.
-- The inner query exposes `semantic_description` under the name `content`,
-- which is the column handed to ML.GENERATE_EMBEDDING here.
-- The outer SELECT keeps every column the function returns and additionally
-- publishes `ml_generate_embedding_result` under the shorter name `embedding`.
-- NOTE(review): this stores the vector twice (once under each name); confirm
-- nothing reads `ml_generate_embedding_result` before dropping the duplicate.
CREATE OR REPLACE TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings` AS
SELECT
  *,
  ml_generate_embedding_result AS embedding
FROM
  ML.GENERATE_EMBEDDING(
    MODEL `bigquery-hackathon-471715.thelook_ecommerce.thelook_model_text_embedding_004`,
    (
      SELECT
        features.id,
        features.name,
        features.category,
        features.brand,
        features.department,
        features.retail_price,
        features.semantic_description AS content
      FROM `bigquery-hackathon-471715.thelook_ecommerce.product_features` AS features
    )
  );

### Clean Product Embeddings

In [None]:
%%bigquery
-- Keep only rows whose embedding is present and has exactly 768 elements, so
-- every vector in the resulting table has the same length.
CREATE OR REPLACE TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS
SELECT *
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings` AS raw
WHERE
  ARRAY_LENGTH(raw.embedding) = 768
  AND raw.embedding IS NOT NULL;

### Create Vector Index

In [None]:
%%bigquery
-- Build an IVF vector index (cosine distance, 1000 lists) over the cleaned
-- embeddings.
-- OR REPLACE keeps the cell re-runnable: without it a second run fails because
-- the index already exists.
CREATE OR REPLACE VECTOR INDEX product_similarity_index
ON `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`(embedding)
OPTIONS(
  index_type='IVF',
  distance_type='COSINE',
  ivf_options='{"num_lists": 1000}'
);

### Test 0: Basic Similar Product

In [None]:
%%bigquery
-- Top 5 products closest to product 21018.
-- Every other product is compared against the anchor's embedding with
-- COSINE_DISTANCE; similarity is reported as 1 - distance, rounded to 3 places.
WITH anchor AS (
  SELECT
    src.embedding AS anchor_embedding,
    src.name AS target_name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS src
  WHERE src.id = 21018
)
SELECT
  candidate.id,
  candidate.name,
  anchor.target_name,
  candidate.category,
  candidate.brand,
  candidate.retail_price,
  ROUND(1 - COSINE_DISTANCE(anchor.anchor_embedding, candidate.embedding), 3) AS similarity_score
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS candidate
CROSS JOIN anchor
-- The anchor product must not be recommended as its own neighbour.
WHERE candidate.id <> 21018
ORDER BY similarity_score DESC
LIMIT 5;

### Test 1: Core Recommendation Function

In [None]:
%%bigquery
-- Core recommendation query: the 5 products most similar to product 21018,
-- returned best match first.
WITH target_product AS (
  -- The product whose neighbours are being looked up.
  SELECT
    id,
    embedding
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE id = 21018
),
similar_products AS (
  SELECT
    base.id,
    base.name,
    base.category,
    base.brand,
    base.retail_price,
    ROUND((1 - COSINE_DISTANCE(target.embedding, base.embedding)), 3) AS similarity_score
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS base
  CROSS JOIN target_product AS target
  WHERE base.id != target.id
  -- ORDER BY + LIMIT here only decides WHICH five rows survive; `id` breaks
  -- ties between equal rounded scores so the selection is reproducible.
  ORDER BY similarity_score DESC, base.id
  LIMIT 5
)
SELECT
  id,
  name,
  category,
  brand,
  retail_price,
  similarity_score
FROM similar_products
-- The ordering inside the CTE is not guaranteed to carry over to the outer
-- query, so the final result is sorted explicitly.
ORDER BY similarity_score DESC, id;

### Test 2: Smart Multi-Factor Recommendations

In [None]:
%%bigquery
-- Blend four signals into one ranking score for neighbours of product 21018:
--   semantic_score = rounded cosine similarity        * 0.6
--   price_score    = rounded relative price closeness * 0.25
--   category_bonus = 0.1 when the category matches the anchor's
--   brand_bonus    = 0.05 when the brand matches the anchor's
-- The 8 rows with the highest combined score are returned, each tagged with a
-- human-readable match type.
WITH anchor AS (
  SELECT
    src.embedding,
    src.retail_price,
    src.category,
    src.brand,
    src.name AS target_name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS src
  WHERE src.id = 21018
),
smart_recommendations AS (
  SELECT
    cand.id,
    cand.name,
    anchor.target_name,
    cand.category,
    cand.brand,
    cand.retail_price,
    ROUND(1 - COSINE_DISTANCE(anchor.embedding, cand.embedding), 3) * 0.6 AS semantic_score,
    -- Price gap relative to the more expensive of the two products.
    ROUND(
      1 - ABS(cand.retail_price - anchor.retail_price)
            / GREATEST(cand.retail_price, anchor.retail_price),
      3
    ) * 0.25 AS price_score,
    CASE WHEN cand.category = anchor.category THEN 0.1 ELSE 0 END AS category_bonus,
    CASE WHEN cand.brand = anchor.brand THEN 0.05 ELSE 0 END AS brand_bonus
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS cand
  CROSS JOIN anchor
  WHERE cand.id <> 21018
)
SELECT
  scored.id,
  scored.name,
  scored.target_name,
  scored.category,
  scored.brand,
  scored.retail_price,
  scored.semantic_score,
  scored.price_score,
  scored.category_bonus,
  scored.brand_bonus,
  ROUND(
    scored.semantic_score + scored.price_score + scored.category_bonus + scored.brand_bonus,
    3
  ) AS total_score,
  -- Note: the 0.4 threshold below applies to the already-weighted semantic_score.
  CASE
    WHEN scored.category_bonus > 0 AND scored.brand_bonus > 0 THEN '🎯 Perfect Match'
    WHEN scored.category_bonus > 0 THEN '📂 Same Category'
    WHEN scored.brand_bonus > 0 THEN '🏷️ Same Brand'
    WHEN scored.semantic_score > 0.4 THEN '🧠 Semantically Similar'
    ELSE '🔄 Alternative'
  END AS match_type
FROM smart_recommendations AS scored
ORDER BY total_score DESC
LIMIT 8;

### Test 3: Cross-Category Discovery

In [None]:
%%bigquery
-- Cross-category discovery: the 8 products most similar to product 21018 that
-- belong to a DIFFERENT category than the target.
WITH target AS (
  -- Only the columns used below are selected.
  SELECT
    id,
    embedding,
    category AS target_category,
    name AS target_name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE id = 21018
)
SELECT
  p.id, p.name, t.target_name,
  p.category, t.target_category,
  p.brand, p.retail_price,
  ROUND((1 - COSINE_DISTANCE(t.embedding, p.embedding)), 3) AS similarity,
  -- The WHERE clause keeps only rows whose category differs from the target's,
  -- so a 'same category' label could never be produced; the column is kept for
  -- output compatibility as the constant it always evaluated to.
  '🔀 Different' AS category_match
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS p
CROSS JOIN target AS t
WHERE p.id != t.id
  AND p.category != t.target_category
ORDER BY similarity DESC
LIMIT 8;

### Test 4: Price-Conscious Recommendations

In [None]:
%%bigquery
-- Price-conscious recommendations: the 10 products most similar to product
-- 21018 whose retail price is within 20% of the original product's price.
WITH anchor AS (
  SELECT
    src.embedding,
    src.retail_price,
    src.name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS src
  WHERE src.id = 21018
)
SELECT
  cand.id,
  cand.name,
  anchor.name AS original_product,
  cand.category,
  cand.brand,
  cand.retail_price,
  anchor.retail_price AS original_price,
  ROUND(1 - COSINE_DISTANCE(anchor.embedding, cand.embedding), 3) AS similarity,
  -- Signed price difference as a percentage of the original price.
  ROUND((cand.retail_price - anchor.retail_price) / anchor.retail_price * 100, 1) AS price_change_pct,
  CASE
    WHEN cand.retail_price < anchor.retail_price THEN '💰 Cheaper'
    WHEN cand.retail_price = anchor.retail_price THEN '💯 Same Price'
    ELSE '💎 Premium'
  END AS price_category
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS cand
CROSS JOIN anchor
WHERE cand.id <> 21018
  AND ABS(cand.retail_price - anchor.retail_price) / anchor.retail_price <= 0.2
ORDER BY similarity DESC
LIMIT 10;

### Test 5: Trend-Aware Recommendations

In [None]:
%%bigquery
-- Trend-aware recommendations: rank neighbours of product 21018 by a blend of
-- embedding similarity (weight 0.7) and log-scaled order volume (weight 0.3).
-- NOTE(review): LOG10(order_count) is not capped at 1, so for products with
-- more than 10 order lines the popularity term can outweigh the similarity
-- term; confirm this is the intended balance.
WITH anchor AS (
  SELECT
    src.id,
    src.embedding,
    src.name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS src
  WHERE src.id = 21018
),
order_volume AS (
  -- Number of order_items rows per product.
  SELECT
    items.product_id,
    COUNT(*) AS order_count
  FROM `bigquery-public-data.thelook_ecommerce.order_items` AS items
  GROUP BY items.product_id
)
SELECT
  cand.id,
  cand.name,
  anchor.name AS original_product,
  cand.category,
  cand.brand,
  cand.retail_price,
  COALESCE(vol.order_count, 0) AS popularity,
  ROUND(1 - COSINE_DISTANCE(anchor.embedding, cand.embedding), 3) AS similarity,
  -- Products with no order rows fall back to a count of 1, i.e. LOG10(1) = 0.
  ROUND(1 - COSINE_DISTANCE(anchor.embedding, cand.embedding), 3) * 0.7
    + LOG10(COALESCE(vol.order_count, 1)) * 0.3 AS weighted_score
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS cand
CROSS JOIN anchor
LEFT JOIN order_volume AS vol
  ON vol.product_id = cand.id
WHERE cand.id <> anchor.id
ORDER BY weighted_score DESC
LIMIT 10;

### Test 6: Department-Level Exploration

In [None]:
%%bigquery
-- Department-level exploration: the 10 products most similar to product 21018
-- that sit in a different department than the target.
WITH anchor AS (
  SELECT
    src.embedding,
    src.department AS target_department,
    src.name AS target_name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS src
  WHERE src.id = 21018
)
SELECT
  cand.id,
  cand.name,
  anchor.target_name,
  cand.department,
  anchor.target_department,
  ROUND(1 - COSINE_DISTANCE(anchor.embedding, cand.embedding), 3) AS similarity
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS cand
CROSS JOIN anchor
WHERE cand.id <> 21018
  AND cand.department <> anchor.target_department
ORDER BY similarity DESC
LIMIT 10;

In [None]:
%%bigquery
-- The 5 products most similar to product 21018 that are flagged as in stock.
--
-- The stock check is an EXISTS semi-join rather than LEFT JOIN + WHERE:
--   * filtering on i.in_stock in WHERE discarded the NULL-extended rows, so
--     the LEFT JOIN already behaved like an inner join;
--   * a join would repeat a product once per matching inventory row, letting
--     one product take several of the five result slots.
-- NOTE(review): the `inventory` table is not created anywhere in this
-- notebook; confirm it exists in the project dataset with `product_id` and
-- `in_stock` columns.
WITH target AS (
  SELECT id, embedding, name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE id = 21018
)
SELECT
  p.id, p.name, t.name AS original_product,
  ROUND((1 - COSINE_DISTANCE(t.embedding, p.embedding)), 3) AS similarity
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS p
CROSS JOIN target AS t
WHERE p.id != t.id
  AND EXISTS (
    SELECT 1
    FROM `bigquery-hackathon-471715.thelook_ecommerce.inventory` AS i
    WHERE i.product_id = p.id
      AND i.in_stock = TRUE
  )
ORDER BY similarity DESC
LIMIT 5;

### Python: Generate Embeddings

In [None]:
import bigframes.bigquery as bq
import bigframes.ml.llm as llm
import bigframes.pandas as bpd

# Embed every product description with BigQuery DataFrames and attach the
# vectors to the `products` frame as an `embedding` column.
generator = llm.TextEmbeddingGenerator(model_name="text-embedding-004")

# read_gbq is provided by bigframes.pandas; bigframes.bigquery (`bq`) only
# hosts BigQuery-specific functions and is kept imported for the cells below.
products = bpd.read_gbq("bigquery-hackathon-471715.thelook_ecommerce.product_features")

# TextEmbeddingGenerator exposes predict(); it returns a DataFrame whose
# `ml_generate_embedding_result` column holds the vectors.
embeddings = generator.predict(products["semantic_description"])
products["embedding"] = embeddings["ml_generate_embedding_result"]

### Python: Create Vector Index

In [None]:
# Create the IVF / cosine vector index over the cleaned embeddings table.
# The table id and column name are passed positionally because
# create_vector_index names them `table_id` / `column_name`, and the IVF
# settings go in `ivf_options`.
# replace=True plus the explicit index name keeps the cell re-runnable and
# targets the same index that the SQL "Create Vector Index" cell defines.
bq.create_vector_index(
    "bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean",
    "embedding",
    replace=True,
    index_name="product_similarity_index",
    index_type="IVF",
    distance_type="COSINE",
    ivf_options={"num_lists": 1000},
)

### Python: Run Vector Search

In [None]:
from bigframes.bigquery import vector_search

# vector_search takes the query as a DataFrame/Series, not a raw vector, so
# build a one-row frame holding the target product's embedding. The column is
# renamed so it cannot clash with the base table's own `embedding` column in
# the result.
query_embedding = products[products["id"] == 21018][["embedding"]].rename(
    columns={"embedding": "query_embedding"}
)

results = vector_search(
    base_table="bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean",
    column_to_search="embedding",
    query=query_embedding,
    query_column_to_search="query_embedding",
    # One extra neighbour: the target product is its own closest match and is
    # dropped below, as the SQL tests do.
    top_k=6,
    distance_type="cosine",
)

# The search returns a `distance` column; similarity is 1 - cosine distance,
# the same definition the SQL cells use.
results = results[results["id"] != 21018]
results["similarity"] = 1 - results["distance"]
results = results.sort_values("distance").head(5)
print(results[["id", "name", "similarity"]])

### Python: Multi-Criteria Search

In [None]:
# Multi-criteria search: nearest neighbours of product 21018 whose retail
# price is within +/-20% of the target's price.
target_rows = products[products["id"] == 21018]
target = target_rows.iloc[0]

# vector_search expects the query as a DataFrame/Series; the column is renamed
# so it cannot clash with the base table's `embedding` column in the result.
query_vector = target_rows[["embedding"]].rename(
    columns={"embedding": "query_embedding"}
)

results = vector_search(
    base_table="bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean",
    column_to_search="embedding",
    query=query_vector,
    query_column_to_search="query_embedding",
    # One extra neighbour because the target itself is returned and removed.
    top_k=21,
    distance_type="cosine",
)

# The search returns `distance`; similarity is 1 - cosine distance.
results = results[results["id"] != 21018]
results["similarity"] = 1 - results["distance"]

filtered = results[
    (results["retail_price"] >= 0.8 * target["retail_price"]) &
    (results["retail_price"] <= 1.2 * target["retail_price"])
].sort_values("distance")
print(filtered[["id", "name", "retail_price", "similarity"]].head(10))

### Test 7: In-Stock Substitutes (using inventory_items)

In [None]:
%%bigquery
-- In-stock substitutes: the 5 products most similar to product 21018 that
-- still have at least one unsold inventory item.
WITH anchor AS (
  SELECT
    src.id,
    src.embedding,
    src.name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS src
  WHERE src.id = 21018
),
inventory_status AS (
  -- Per product: how many inventory items have no sale timestamp, or one that
  -- lies in the future. Products without any such item are dropped here.
  SELECT
    items.product_id,
    COUNTIF(items.sold_at IS NULL OR items.sold_at > CURRENT_TIMESTAMP()) AS active_stock
  FROM `bigquery-public-data.thelook_ecommerce.inventory_items` AS items
  GROUP BY items.product_id
  HAVING COUNTIF(items.sold_at IS NULL OR items.sold_at > CURRENT_TIMESTAMP()) > 0
)
SELECT
  cand.id,
  cand.name,
  anchor.name AS original_product,
  ROUND(1 - COSINE_DISTANCE(anchor.embedding, cand.embedding), 3) AS similarity,
  CASE
    WHEN inv.active_stock > 0 THEN '✅ In Stock'
    ELSE '❌ Out of Stock'
  END AS stock_status
FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS cand
CROSS JOIN anchor
-- Only products that survived the HAVING filter can match, so this inner join
-- keeps exactly the in-stock candidates.
INNER JOIN inventory_status AS inv
  ON inv.product_id = cand.id
WHERE cand.id <> anchor.id
ORDER BY similarity DESC
LIMIT 5;