# BigQuery Version

### Local Setup in Jupyter Notebook or JupyterLab (optional)

In [None]:
# from dotenv import load_dotenv
# import os

# load_dotenv()
# os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = os.getenv('GOOGLE_APPLICATION_CREDENTIALS')
# or: export GOOGLE_APPLICATION_CREDENTIALS=<path to your Google service-account JSON key file>
# Make sure the BigQuery API is enabled and the account is allowed to create jobs, run queries and write tables (typically BigQuery Admin).
# %load_ext google.cloud.bigquery
# %reload_ext google.cloud.bigquery

### Understand Product Structure Stats

In [None]:
%%bigquery
-- Catalog cardinality check: returns a single row with the number of rows in
-- `products` and the number of distinct category, brand and department values.
SELECT
  COUNT(*)                     AS total_products,
  COUNT(DISTINCT p.category)   AS categories,
  COUNT(DISTINCT p.brand)      AS brands,
  COUNT(DISTINCT p.department) AS departments
FROM
  `bigquery-public-data.thelook_ecommerce.products` AS p;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,total_products,categories,brands,departments
0,29120,26,2756,2


### Sample Product Structure

In [None]:
%%bigquery
-- Peek at five rows of `products` to see which attributes are available.
-- There is no ORDER BY, so which five rows come back is up to BigQuery.
SELECT
  p.id,
  p.name,
  p.category,
  p.brand,
  p.department,
  p.retail_price,
  p.cost
FROM
  `bigquery-public-data.thelook_ecommerce.products` AS p
LIMIT 5;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,category,brand,department,retail_price,cost
0,13842,Low Profile Dyed Cotton Twill Cap - Navy W39S55D,Accessories,MG,Women,6.25,2.51875
1,13928,Low Profile Dyed Cotton Twill Cap - Putty W39S55D,Accessories,MG,Women,5.95,2.33835
2,14115,Enzyme Regular Solid Army Caps-Black W35S45D,Accessories,MG,Women,10.99,4.87956
3,14157,Enzyme Regular Solid Army Caps-Olive W35S45D (...,Accessories,MG,Women,10.99,4.64877
4,14273,Washed Canvas Ivy Cap - Black W11S64C,Accessories,MG,Women,15.99,6.50793


### Dataset Overview

In [None]:
%%bigquery
-- One-row summary of the catalog: row count, distinct categories / brands /
-- departments, and average, minimum and maximum retail price.
-- Only products that have a name are included (see the WHERE clause), so
-- total_products can be lower than a plain COUNT(*) over the table.
SELECT
  '📊 TheLook E-commerce Dataset Overview' AS analysis,
  COUNT(*)                     AS total_products,
  COUNT(DISTINCT p.category)   AS categories,
  COUNT(DISTINCT p.brand)      AS brands,
  COUNT(DISTINCT p.department) AS departments,
  AVG(p.retail_price)          AS avg_price,
  MIN(p.retail_price)          AS min_price,
  MAX(p.retail_price)          AS max_price
FROM
  `bigquery-public-data.thelook_ecommerce.products` AS p
WHERE
  p.name IS NOT NULL;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,analysis,total_products,categories,brands,departments,avg_price,min_price,max_price
0,📊 TheLook E-commerce Dataset Overview,29118,26,2756,2,59.217913,0.02,999.0


### Create Product Features Table

In [None]:
%%bigquery
-- Builds `product_features`: one row per catalog product plus a
-- `semantic_description` sentence that is later fed to the embedding model.
--
-- CONCAT returns NULL as soon as any argument is NULL, so every nullable text
-- attribute is wrapped in IFNULL. Without that, a product with a missing name
-- or brand ends up with a NULL description and no usable embedding.
--
-- Price bands used in the description:
--   < 20 budget-friendly, < 50 mid-range, < 100 premium, otherwise luxury.
-- A NULL retail_price is labelled 'unknown' instead of falling through to
-- the ELSE branch.
CREATE OR REPLACE TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_features` AS
SELECT
  p.id,
  p.name,
  p.category,
  p.brand,
  p.department,
  p.retail_price,
  p.cost,
  CONCAT(
    'Product: ', IFNULL(p.name, 'Unknown'), '. ',
    'Category: ', IFNULL(p.category, 'Unknown'), '. ',
    'Brand: ', IFNULL(p.brand, 'Unknown'), '. ',
    'Department: ', IFNULL(p.department, 'Unknown'), '. ',
    'Price range: ',
    CASE
      WHEN p.retail_price IS NULL THEN 'unknown'
      WHEN p.retail_price < 20 THEN 'budget-friendly'
      WHEN p.retail_price < 50 THEN 'mid-range'
      WHEN p.retail_price < 100 THEN 'premium'
      ELSE 'luxury'
    END
  ) AS semantic_description
FROM `bigquery-public-data.thelook_ecommerce.products` p;

Query is running:   0%|          |

### Create Text Embedding Model

In [None]:
%%bigquery
-- Registers a remote model backed by the 'text-embedding-004' endpoint, reached
-- through the project's default connection.
-- CREATE OR REPLACE keeps this cell re-runnable (Restart & Run All): a plain
-- CREATE MODEL fails once the model already exists.
CREATE OR REPLACE MODEL `bigquery-hackathon-471715.thelook_ecommerce.thelook_model_text_embedding_004`
REMOTE WITH CONNECTION DEFAULT
OPTIONS(
  ENDPOINT = 'text-embedding-004'
);

Query is running:   0%|          |

### BigQuery: Generate Product Embeddings

In [None]:
%%bigquery
-- Runs the remote embedding model over every row of `product_features` and
-- materialises the result as `product_embeddings`.
-- The description is passed in under the column name `content`; the remaining
-- selected columns are carried through next to the model output.
-- `SELECT *` keeps all ml_generate_embedding_* columns, and the vector is
-- additionally exposed under the shorter name `embedding`.
CREATE OR REPLACE TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings` AS
SELECT
  *,
  ml_generate_embedding_result AS embedding
FROM ML.GENERATE_EMBEDDING(
  MODEL `bigquery-hackathon-471715.thelook_ecommerce.thelook_model_text_embedding_004`,
  (
    SELECT
      pf.id,
      pf.name,
      pf.category,
      pf.brand,
      pf.department,
      pf.retail_price,
      pf.semantic_description AS content
    FROM
      `bigquery-hackathon-471715.thelook_ecommerce.product_features` AS pf
  )
);

Query is running:   0%|          |

### BigQuery: Clean Product Embeddings

In [None]:
%%bigquery
-- Copies into `product_embeddings_clean` only the rows whose embedding is
-- present and has exactly 768 elements; all other rows of `product_embeddings`
-- are left out.
CREATE OR REPLACE TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS
SELECT
  emb.*
FROM
  `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings` AS emb
WHERE
  ARRAY_LENGTH(emb.embedding) = 768
  AND emb.embedding IS NOT NULL;

Query is running:   0%|          |

### BigQuery: Create Vector Index

In [None]:
%%bigquery
-- Builds an IVF vector index (cosine distance, 1000 lists) on the `embedding`
-- column of `product_embeddings_clean`.
-- CREATE OR REPLACE keeps this cell re-runnable: a plain CREATE VECTOR INDEX
-- fails when an index with this name already exists on the table.
CREATE OR REPLACE VECTOR INDEX product_similarity_index
ON `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`(embedding)
OPTIONS(
  index_type='IVF',
  distance_type='COSINE',
  ivf_options='{"num_lists": 1000}'
);

Query is running:   0%|          |

### Target product structure

In [None]:
%%bigquery
-- Shows the full `product_embeddings_clean` row for product 21018, the product
-- used as the search target in the cells that follow.
SELECT
  *
FROM
  `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
WHERE
  id = 21018;

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,ml_generate_embedding_result,ml_generate_embedding_statistics,ml_generate_embedding_status,id,name,category,brand,department,retail_price,content,embedding
0,"[0.032505184412002563, -0.024057593196630478, ...","{""token_count"":36,""truncated"":false}",,21018,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,Men,49.990002,Product: Wrangler Men's Western Traditional Bo...,"[0.032505184412002563, -0.024057593196630478, ..."


### Normal Search

In [None]:
%%bigquery
-- Keyword baseline (no embeddings): products that share the category and brand
-- of product 21018 and whose name contains "jeans", cheapest first, at most 5.
WITH anchor AS (
  SELECT
    category,
    brand
  FROM
    `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE
    id = 21018
)
SELECT
  candidate.id,
  candidate.name,
  candidate.category,
  candidate.brand,
  candidate.retail_price
FROM
  `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean` AS candidate
INNER JOIN anchor
  ON candidate.category = anchor.category
  AND candidate.brand = anchor.brand
WHERE
  candidate.id != 21018
  AND LOWER(candidate.name) LIKE '%jeans%'
ORDER BY
  candidate.retail_price ASC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,category,brand,retail_price
0,20645,Genuine Wrangler Men's Loose Fit Jeans,Jeans,Wrangler,40.0
1,20741,Genuine Wrangler Men's Regular Fit Jeans,Jeans,Wrangler,40.0


### Test 1: Basic Similar Product (semantic)

In [None]:
%%bigquery
-- Test 1: the five products whose embeddings are closest (cosine distance) to
-- product 21018.
--
-- The query vector is read from the very table being searched, so the target is
-- expected to come back as its own nearest neighbour. top_k is therefore 6: one
-- slot for the target (removed in WHERE) plus five recommendations. With
-- top_k => 5 the filter could leave only four rows. LIMIT 5 caps the result in
-- case the target is not among the returned neighbours.
--
-- similarity_score is 1 - cosine distance, rounded to 3 decimals; the exact
-- distance breaks ties between equal rounded scores.
WITH target_product AS (
  SELECT embedding, name AS target_name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE id = 21018
)
SELECT
  base.id AS id,
  base.name AS name,
  target.target_name AS target_name,
  base.category AS category,
  base.brand AS brand,
  base.retail_price AS retail_price,
  ROUND(1 - query.distance, 3) AS similarity_score
FROM VECTOR_SEARCH(
  TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
  'embedding',
  (SELECT embedding FROM target_product),
  top_k => 6,
  distance_type => 'COSINE'
) AS query
CROSS JOIN target_product target
WHERE base.id != 21018
ORDER BY similarity_score DESC, query.distance ASC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,target_name,category,brand,retail_price,similarity_score
0,20850,Wrangler Men's Western Slim Boot Cut Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,0.99
1,21284,Wrangler Men's Western Boot Cut Slim Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,35.950001,0.99
2,20921,Wrangler Men's Cowboy Cut Western Slim Fit Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,40.720001,0.974
3,21255,Wrangler Men's Western Boot Cut Jean Regular,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,37.0,0.972
4,20585,Wrangler Men's Cowboy Cut Slim Fit Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,0.967


### Test 2: Smart Multi-Factor Recommendations (semantic)

In [None]:
%%bigquery
-- Test 2: re-ranks the 20 nearest neighbours of product 21018 with a blended score.
--   semantic_score = ROUND(1 - cosine distance, 3) * 0.6
--   price_score    = ROUND(1 - |price difference| / larger of the two prices, 3) * 0.25
--   category_bonus = 0.1 when the category matches the target, else 0
--   brand_bonus    = 0.05 when the brand matches the target, else 0
--   total_score    = sum of the four parts, rounded to 3 decimals
-- The `scored` CTE computes each component once; the outer query adds them up,
-- labels the kind of match and keeps the five best rows.
WITH target_product AS (
  SELECT
    embedding,
    retail_price,
    category,
    brand,
    name AS target_name
  FROM
    `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE
    id = 21018
),
scored AS (
  SELECT
    base.id AS id,
    base.name AS name,
    target.target_name AS target_name,
    base.category AS category,
    base.brand AS brand,
    base.retail_price AS retail_price,
    ROUND(1 - query.distance, 3) * 0.6 AS semantic_score,
    ROUND(
      1 - ABS(base.retail_price - target.retail_price)
          / GREATEST(base.retail_price, target.retail_price),
      3
    ) * 0.25 AS price_score,
    IF(base.category = target.category, 0.1, 0) AS category_bonus,
    IF(base.brand = target.brand, 0.05, 0) AS brand_bonus,
    base.category = target.category AS same_category,
    base.brand = target.brand AS same_brand
  FROM VECTOR_SEARCH(
    TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
    'embedding',
    (SELECT embedding FROM target_product),
    top_k => 20,
    distance_type => 'COSINE'
  ) AS query
  CROSS JOIN target_product AS target
  WHERE
    base.id != 21018
)
SELECT
  id,
  name,
  target_name,
  category,
  brand,
  retail_price,
  semantic_score,
  price_score,
  category_bonus,
  brand_bonus,
  ROUND(semantic_score + price_score + category_bonus + brand_bonus, 3) AS total_score,
  CASE
    WHEN same_category AND same_brand THEN 'Perfect Match'
    WHEN same_category THEN 'Same Category'
    WHEN same_brand THEN 'Same Brand'
    WHEN semantic_score > 0.4 THEN 'Semantically Similar'
    ELSE 'Alternative'
  END AS match_type
FROM
  scored
ORDER BY
  total_score DESC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,target_name,category,brand,retail_price,semantic_score,price_score,category_bonus,brand_bonus,total_score,match_type
0,20850,Wrangler Men's Western Slim Boot Cut Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,0.594,0.25,0.1,0.05,0.994,Perfect Match
1,20585,Wrangler Men's Cowboy Cut Slim Fit Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,0.5802,0.25,0.1,0.05,0.98,Perfect Match
2,20632,Wrangler Men's Cowboy Cut Original Fit Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,0.5676,0.25,0.1,0.05,0.968,Perfect Match
3,20968,Wrangler Men's Original Cowboy Cut Relaxed Fit...,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,0.564,0.25,0.1,0.05,0.964,Perfect Match
4,20971,Wrangler Men's Premium Performance Cowboy Cut ...,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,0.5586,0.25,0.1,0.05,0.959,Perfect Match


### Test 3: Price-Conscious Recommendations (semantic)

In [None]:
%%bigquery
-- Test 3: semantic neighbours of product 21018 priced within 20% of it.
-- The price filter runs after VECTOR_SEARCH has picked its 20 nearest rows, so
-- fewer than five rows can be returned when few neighbours are in range.
-- price_change_pct is the candidate's price relative to the target, in percent;
-- price_category labels it Cheaper / Same Price / Premium.
WITH anchor AS (
  SELECT
    embedding,
    retail_price,
    name AS target_name,
    category,
    brand
  FROM
    `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE
    id = 21018
)
SELECT
  base.id,
  base.name,
  anchor.target_name AS original_product,
  base.category,
  base.brand,
  base.retail_price,
  anchor.retail_price AS original_price,
  ROUND(1 - hit.distance, 3) AS similarity,
  ROUND(((base.retail_price - anchor.retail_price) / anchor.retail_price * 100), 1) AS price_change_pct,
  CASE
    WHEN base.retail_price < anchor.retail_price THEN 'Cheaper'
    WHEN base.retail_price = anchor.retail_price THEN 'Same Price'
    ELSE 'Premium'
  END AS price_category
FROM VECTOR_SEARCH(
  TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
  'embedding',
  (SELECT embedding FROM anchor),
  top_k => 20,
  distance_type => 'COSINE'
) AS hit
CROSS JOIN anchor
WHERE
  base.id != 21018
  AND ABS(base.retail_price - anchor.retail_price) / anchor.retail_price <= 0.2
ORDER BY
  similarity DESC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,target_name,category,target_category,brand,retail_price,similarity,category_match
0,22035,Men's Wrangler George Straight Cowboy Cut Casu...,Wrangler Men's Western Traditional Boot Cut Sl...,Pants,Jeans,Wrangler,34.990002,0.871,Different
1,21993,Wrangler Men's Ranger Pant,Wrangler Men's Western Traditional Boot Cut Sl...,Pants,Jeans,Wrangler,49.880001,0.862,Different
2,21762,Wrangler Men's Ranger Pant,Wrangler Men's Western Traditional Boot Cut Sl...,Pants,Jeans,Wrangler,49.880001,0.862,Different
3,23363,Wrangler Men's Ranger Short,Wrangler Men's Western Traditional Boot Cut Sl...,Shorts,Jeans,Wrangler,37.0,0.844,Different
4,22888,Wrangler Men's Rugged Wear Relaxed Fit Short,Wrangler Men's Western Traditional Boot Cut Sl...,Shorts,Jeans,Wrangler,25.0,0.842,Different


### Test 4: Trend-Aware Recommendations (semantic)

##### **order_items** table structure

In [None]:
%%bigquery
-- Five arbitrary rows of `order_items`, shown only to expose the column layout.
SELECT
  oi.*
FROM
  `bigquery-public-data.thelook_ecommerce.order_items` AS oi
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,order_id,user_id,product_id,inventory_item_id,status,created_at,shipped_at,delivered_at,returned_at,sale_price
0,19087,13048,10310,14235,51224,Complete,2022-06-06 06:09:55+00:00,2022-06-06 13:02:00+00:00,2022-06-09 11:29:00+00:00,NaT,0.02
1,66494,45842,36630,14159,179448,Cancelled,2025-04-29 04:47:33+00:00,NaT,NaT,NaT,0.49
2,14356,9838,7777,14159,38482,Complete,2024-01-30 05:23:50+00:00,2024-02-01 13:20:00+00:00,2024-02-02 16:34:00+00:00,NaT,0.49
3,85780,59229,47352,14159,231587,Processing,2023-02-21 03:59:11+00:00,NaT,NaT,NaT,0.49
4,117309,80983,64522,14159,316651,Returned,2021-07-11 18:19:49+00:00,2021-07-10 21:56:00+00:00,2021-07-15 13:42:00+00:00,2021-07-17 10:34:00+00:00,0.49


In [None]:
%%bigquery
-- Test 4: blends semantic similarity with how often a product has been ordered.
--   popularity     = number of `order_items` rows for the product (every status
--                    is counted), 0 when the product has none
--   weighted_score = ROUND(1 - cosine distance, 3) * 0.7
--                    + LOG10(order count, or 1 when there are no orders) * 0.3
-- The 20 nearest neighbours of product 21018 are scored; the top five are kept.
WITH anchor AS (
  SELECT
    id,
    embedding,
    name AS target_name,
    category,
    brand,
    retail_price
  FROM
    `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE
    id = 21018
),
order_counts AS (
  SELECT
    product_id,
    COUNT(*) AS order_count
  FROM
    `bigquery-public-data.thelook_ecommerce.order_items`
  GROUP BY
    product_id
)
SELECT
  base.id,
  base.name,
  anchor.target_name AS original_product,
  base.category,
  base.brand,
  base.retail_price,
  IFNULL(oc.order_count, 0) AS popularity,
  ROUND(1 - hit.distance, 3) AS similarity,
  ROUND(1 - hit.distance, 3) * 0.7 + LOG10(IFNULL(oc.order_count, 1)) * 0.3 AS weighted_score
FROM VECTOR_SEARCH(
  TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
  'embedding',
  (SELECT embedding FROM anchor),
  top_k => 20,
  distance_type => 'COSINE'
) AS hit
CROSS JOIN anchor
LEFT JOIN order_counts AS oc
  ON base.id = oc.product_id
WHERE
  base.id != anchor.id
ORDER BY
  weighted_score DESC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,original_product,category,brand,retail_price,popularity,similarity,weighted_score
0,20741,Genuine Wrangler Men's Regular Fit Jeans,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,40.0,15,0.926,1.001027
1,20585,Wrangler Men's Cowboy Cut Slim Fit Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,11,0.967,0.989318
2,20682,George Strait by Wrangler Men's Cowboy Cut Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,49.990002,13,0.901,0.964883
3,21148,Wrangler Men's Retro Slim Straight Jean,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,51.950001,12,0.91,0.960754
4,21373,Wrangler Men's Tall Original Cowboy Cut Relaxe...,Wrangler Men's Western Traditional Boot Cut Sl...,Jeans,Wrangler,39.5,11,0.923,0.958518


### Test 5: Inventory-Level in-Stock Substitutes (semantic)

##### **inventory_items** table structure

In [None]:
%%bigquery
-- Five arbitrary rows of `inventory_items`, shown only to expose the column layout.
SELECT
  inv.*
FROM
  `bigquery-public-data.thelook_ecommerce.inventory_items` AS inv
LIMIT 5

Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,product_id,created_at,sold_at,cost,product_category,product_name,product_brand,product_retail_price,product_department,product_sku,product_distribution_center_id
0,38530,8059,2025-09-17 19:47:00+00:00,2025-09-19 22:05:00+00:00,25.753561,Clothing Sets,Aeropostale Juniors' Times Square 87 Graphic T...,Aeropostale,39.990002,Women,E382F91E2C82C3853AEB0D3948275232,9
1,38531,8059,2024-08-11 09:16:00+00:00,NaT,25.753561,Clothing Sets,Aeropostale Juniors' Times Square 87 Graphic T...,Aeropostale,39.990002,Women,E382F91E2C82C3853AEB0D3948275232,9
2,38532,8059,2023-07-16 03:30:00+00:00,NaT,25.753561,Clothing Sets,Aeropostale Juniors' Times Square 87 Graphic T...,Aeropostale,39.990002,Women,E382F91E2C82C3853AEB0D3948275232,9
3,212994,8059,2023-04-28 09:02:06+00:00,2023-05-21 05:50:06+00:00,25.753561,Clothing Sets,Aeropostale Juniors' Times Square 87 Graphic T...,Aeropostale,39.990002,Women,E382F91E2C82C3853AEB0D3948275232,9
4,212995,8059,2020-02-27 17:30:00+00:00,NaT,25.753561,Clothing Sets,Aeropostale Juniors' Times Square 87 Graphic T...,Aeropostale,39.990002,Women,E382F91E2C82C3853AEB0D3948275232,9


In [None]:
%%bigquery
-- Test 5: the five nearest semantic neighbours of product 21018 that still have
-- stock.
--
-- Only products with active_stock > 0 are returned, so every result row is in
-- stock by construction. The join is therefore an INNER JOIN and stock_status
-- is a constant: the former LEFT JOIN + COALESCE and the 'Out of Stock' CASE
-- branch could never affect the output because the WHERE clause removed those
-- rows anyway.
WITH target AS (
  SELECT id, embedding, name AS target_name
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE id = 21018
),
inventory_status AS (
  -- Per product: inventory items that are unsold (sold_at IS NULL) or whose
  -- sold_at timestamp lies in the future.
  SELECT
    product_id,
    COUNTIF(sold_at IS NULL OR sold_at > CURRENT_TIMESTAMP()) AS active_stock
  FROM `bigquery-public-data.thelook_ecommerce.inventory_items`
  GROUP BY product_id
)
SELECT
  base.id,
  base.name,
  t.target_name AS original_product,
  ROUND(1 - query.distance, 3) AS similarity,
  '✅ In Stock' AS stock_status
FROM VECTOR_SEARCH(
  TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
  'embedding',
  (SELECT embedding FROM target),
  top_k => 20,
  distance_type => 'COSINE'
) AS query
CROSS JOIN target t
INNER JOIN inventory_status inv ON base.id = inv.product_id
WHERE base.id != t.id
  AND inv.active_stock > 0
ORDER BY similarity DESC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,original_product,similarity,stock_status
0,21284,Wrangler Men's Western Boot Cut Slim Jean,Wrangler Men's Western Traditional Boot Cut Sl...,0.99,✅ In Stock
1,20850,Wrangler Men's Western Slim Boot Cut Jean,Wrangler Men's Western Traditional Boot Cut Sl...,0.99,✅ In Stock
2,20921,Wrangler Men's Cowboy Cut Western Slim Fit Jean,Wrangler Men's Western Traditional Boot Cut Sl...,0.974,✅ In Stock
3,21255,Wrangler Men's Western Boot Cut Jean Regular,Wrangler Men's Western Traditional Boot Cut Sl...,0.972,✅ In Stock
4,20585,Wrangler Men's Cowboy Cut Slim Fit Jean,Wrangler Men's Western Traditional Boot Cut Sl...,0.967,✅ In Stock


### Test 6: Seasonal/Occasion-Based Semantic Recommendations (semantic)

In [None]:
%%bigquery
-- Test 6: occasion-aware re-ranking of the 50 nearest neighbours of product 21018.
--   occasion_relevance - keyword tiers matched against the lower-cased name:
--       formal / dress / elegant / cocktail            -> 1.0
--       party / evening / special                      -> 0.8
--       jean / western / boot cut (target is 'Jeans')  -> 0.75
--       same category as the target                    -> 0.6
--       anything else                                  -> 0.3
--   price_tier_match   - 1.0 within +/-30% of the target price, 0.8 within
--                        +/-50%, otherwise 0.5
--   occasion_aware_score = similarity * 0.5 + occasion * 0.3 + price tier * 0.2
-- Each REGEXP_CONTAINS alternation is a plain substring test on the name.
WITH target AS (
  SELECT
    id,
    embedding,
    name,
    category,
    retail_price
  FROM
    `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE
    id = 21018
),
occasion_context AS (
  SELECT
    base.id,
    base.name,
    base.category,
    base.brand,
    base.retail_price,
    ROUND(1 - query.distance, 3) AS base_similarity,
    CASE
      WHEN REGEXP_CONTAINS(LOWER(base.name), r'formal|dress|elegant|cocktail') THEN 1.0
      WHEN REGEXP_CONTAINS(LOWER(base.name), r'party|evening|special') THEN 0.8
      WHEN REGEXP_CONTAINS(LOWER(base.name), r'jean|western|boot cut')
           AND t.category = 'Jeans' THEN 0.75
      WHEN base.category = t.category THEN 0.6
      ELSE 0.3
    END AS occasion_relevance,
    CASE
      WHEN base.retail_price BETWEEN t.retail_price * 0.7 AND t.retail_price * 1.3 THEN 1.0
      WHEN base.retail_price BETWEEN t.retail_price * 0.5 AND t.retail_price * 1.5 THEN 0.8
      ELSE 0.5
    END AS price_tier_match
  FROM VECTOR_SEARCH(
    TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
    'embedding',
    (SELECT embedding FROM target),
    top_k => 50,
    distance_type => 'COSINE'
  ) AS query
  CROSS JOIN target AS t
  WHERE
    base.id != t.id
    AND (1 - query.distance) > 0.4
)
SELECT
  id,
  name,
  category,
  retail_price,
  brand,
  base_similarity,
  occasion_relevance,
  ROUND(
    base_similarity * 0.5 + occasion_relevance * 0.3 + price_tier_match * 0.2,
    3
  ) AS occasion_aware_score,
  CASE
    WHEN occasion_relevance >= 0.8 THEN 'Perfect for Occasion'
    WHEN occasion_relevance >= 0.6 THEN 'Good Alternative'
    ELSE 'Creative Option'
  END AS recommendation_type
FROM
  occasion_context
WHERE
  base_similarity > 0.5
ORDER BY
  occasion_aware_score DESC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,category,retail_price,brand,base_similarity,occasion_relevance,occasion_aware_score,recommendation_type
0,20718,Wrangler Men's Wrancher Dress Jean,Jeans,40.400002,Wrangler,0.904,1.0,0.952,Perfect for Occasion
1,20801,Wrangler Men's Wrancher Dress Jean,Jeans,40.400002,Wrangler,0.904,1.0,0.952,Perfect for Occasion
2,20972,Wrangler Men's Wrancher Dress Jean,Jeans,40.400002,Wrangler,0.904,1.0,0.952,Perfect for Occasion
3,20896,Wrangler Men's Wrancher Dress Jean,Jeans,40.400002,Wrangler,0.904,1.0,0.952,Perfect for Occasion
4,20850,Wrangler Men's Western Slim Boot Cut Jean,Jeans,49.990002,Wrangler,0.99,0.75,0.92,Good Alternative


### Test 7: Size/Fit-Aware Semantic Substitutes (semantic)

In [None]:
%%bigquery
-- Test 7: fit-aware re-ranking of the 50 nearest neighbours of product 21018.
-- `candidates` gathers the neighbours together with lower-cased names and the
-- target's attributes; `fit_analysis` turns them into three heuristic signals:
--   fit_compatibility   - 1.0 when candidate and target names share one of the
--                         keywords slim / regular / boot / straight, 0.7 for
--                         the same category, otherwise 0.4
--   sizing_consistency  - 1.0 for the same brand, 0.8 when the price is within
--                         +/-20% of the target, otherwise 0.6
--   material_similarity - 1.0 when both names mention denim, 0.9 when both
--                         mention cotton, 0.7 for the same category, else 0.5
-- fit_aware_score = similarity * 0.4 + fit * 0.25 + sizing * 0.2 + material * 0.15
WITH target AS (
  SELECT
    id,
    embedding,
    name,
    category,
    brand,
    retail_price
  FROM
    `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE
    id = 21018
),
candidates AS (
  SELECT
    base.id AS id,
    base.name AS name,
    base.category AS category,
    base.brand AS brand,
    base.retail_price AS retail_price,
    LOWER(base.name) AS name_lc,
    LOWER(t.name) AS target_name_lc,
    t.category AS target_category,
    t.brand AS target_brand,
    t.retail_price AS target_price,
    ROUND(1 - query.distance, 3) AS semantic_similarity
  FROM VECTOR_SEARCH(
    TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
    'embedding',
    (SELECT embedding FROM target),
    top_k => 50,
    distance_type => 'COSINE'
  ) AS query
  CROSS JOIN target AS t
  WHERE
    base.id != t.id
    AND (1 - query.distance) > 0.5
),
fit_analysis AS (
  SELECT
    id,
    name,
    category,
    brand,
    retail_price,
    semantic_similarity,
    -- Shared fit keyword in both names, falling back to a category match.
    CASE
      WHEN name_lc LIKE '%slim%' AND target_name_lc LIKE '%slim%' THEN 1.0
      WHEN name_lc LIKE '%regular%' AND target_name_lc LIKE '%regular%' THEN 1.0
      WHEN name_lc LIKE '%boot%' AND target_name_lc LIKE '%boot%' THEN 1.0
      WHEN name_lc LIKE '%straight%' AND target_name_lc LIKE '%straight%' THEN 1.0
      WHEN category = target_category THEN 0.7
      ELSE 0.4
    END AS fit_compatibility,
    -- Same brand first, then a comparable price point.
    CASE
      WHEN brand = target_brand THEN 1.0
      WHEN retail_price BETWEEN target_price * 0.8 AND target_price * 1.2 THEN 0.8
      ELSE 0.6
    END AS sizing_consistency,
    -- Shared material keyword in both names, falling back to a category match.
    CASE
      WHEN name_lc LIKE '%denim%' AND target_name_lc LIKE '%denim%' THEN 1.0
      WHEN name_lc LIKE '%cotton%' AND target_name_lc LIKE '%cotton%' THEN 0.9
      WHEN category = target_category THEN 0.7
      ELSE 0.5
    END AS material_similarity
  FROM
    candidates
)
SELECT
  id,
  name,
  category,
  brand,
  retail_price,
  semantic_similarity,
  fit_compatibility,
  sizing_consistency,
  material_similarity,
  ROUND(
    semantic_similarity * 0.4 +
    fit_compatibility * 0.25 +
    sizing_consistency * 0.2 +
    material_similarity * 0.15,
    3
  ) AS fit_aware_score,
  CASE
    WHEN fit_compatibility >= 0.8 AND sizing_consistency >= 0.8 THEN 'High Fit Confidence'
    WHEN fit_compatibility >= 0.6 THEN 'Good Fit Match'
    ELSE 'Try Before Buy'
  END AS fit_confidence
FROM
  fit_analysis
WHERE
  semantic_similarity > 0.6
ORDER BY
  fit_aware_score DESC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,category,brand,retail_price,semantic_similarity,fit_compatibility,sizing_consistency,material_similarity,fit_aware_score,fit_confidence
0,21284,Wrangler Men's Western Boot Cut Slim Jean,Jeans,Wrangler,35.950001,0.99,1.0,1.0,0.7,0.951,High Fit Confidence
1,20850,Wrangler Men's Western Slim Boot Cut Jean,Jeans,Wrangler,49.990002,0.99,1.0,1.0,0.7,0.951,High Fit Confidence
2,20921,Wrangler Men's Cowboy Cut Western Slim Fit Jean,Jeans,Wrangler,40.720001,0.974,1.0,1.0,0.7,0.945,High Fit Confidence
3,21255,Wrangler Men's Western Boot Cut Jean Regular,Jeans,Wrangler,37.0,0.972,1.0,1.0,0.7,0.944,High Fit Confidence
4,20585,Wrangler Men's Cowboy Cut Slim Fit Jean,Jeans,Wrangler,49.990002,0.967,1.0,1.0,0.7,0.942,High Fit Confidence


### Test 8: Brand Affinity Semantic Matching

In [None]:
%%bigquery
-- Test 8: brand-aware re-ranking of the 50 nearest neighbours of product 21018.
-- `brand_ecosystem` derives three heuristic signals per candidate; the outer
-- query blends them with the semantic similarity:
--   brand_aware_score = similarity * 0.35 + tier affinity * 0.25
--                       + positioning * 0.25 + journey fit * 0.15
-- All three signals are 1.0 when the candidate has the target's brand. The
-- WHEN branches are evaluated top-down, so their order is significant.
WITH target AS (
  SELECT id, embedding, name, category, brand, retail_price
  FROM `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`
  WHERE id = 21018
),
brand_ecosystem AS (
  SELECT
    base.id,
    base.name,
    base.category,
    base.brand,
    base.retail_price,
    -- Carried along so the outer query can compute price_difference_pct.
    t.retail_price AS target_retail_price,
    -- Core semantic similarity: 1 - cosine distance, rounded to 3 decimals.
    ROUND(1 - query.distance, 3) AS semantic_similarity,
    -- Tier affinity: otherwise scored by whether candidate and target prices
    -- fall in the same band (> 100 -> 0.8, 50..100 -> 0.7, < 50 -> 0.6), else 0.4.
    CASE
      WHEN base.brand = t.brand THEN 1.0
      WHEN base.retail_price > 100 AND t.retail_price > 100 THEN 0.8
      WHEN base.retail_price BETWEEN 50 AND 100 AND t.retail_price BETWEEN 50 AND 100 THEN 0.7
      WHEN base.retail_price < 50 AND t.retail_price < 50 THEN 0.6
      ELSE 0.4
    END AS brand_tier_affinity,
    -- Positioning: 0.7 when both brand names are longer than 10 characters,
    -- 0.6 when the price ratio is within 0.7..1.3, else 0.4.
    -- NOTE(review): brand-name length looks like an arbitrary proxy for
    -- positioning - confirm this is intended.
    CASE
      WHEN base.brand = t.brand THEN 1.0
      WHEN LENGTH(base.brand) > 10 AND LENGTH(t.brand) > 10 THEN 0.7
      WHEN base.retail_price / t.retail_price BETWEEN 0.7 AND 1.3 THEN 0.6
      ELSE 0.4
    END AS brand_positioning,
    -- Journey fit: 0.8 when the candidate costs more than 120% of the target,
    -- 0.6 when it costs less than 80%, otherwise 0.5.
    CASE
      WHEN base.brand = t.brand THEN 1.0
      WHEN base.retail_price > t.retail_price * 1.2 THEN 0.8
      WHEN base.retail_price < t.retail_price * 0.8 THEN 0.6
      ELSE 0.5
    END AS brand_journey_fit
  FROM VECTOR_SEARCH(
    TABLE `bigquery-hackathon-471715.thelook_ecommerce.product_embeddings_clean`,
    'embedding',
    (SELECT embedding FROM target),
    top_k => 50,
    distance_type => 'COSINE'
  ) AS query
  CROSS JOIN target t
  -- Drop the target itself and neighbours with similarity of 0.4 or less.
  WHERE base.id != t.id
    AND (1 - query.distance) > 0.4
)
SELECT
  id,
  name,
  category,
  brand,
  retail_price,
  semantic_similarity,
  brand_tier_affinity,
  brand_positioning,
  brand_journey_fit,
  -- Weighted blend of the four signals, rounded to 3 decimals.
  ROUND(
    semantic_similarity * 0.35 +
    brand_tier_affinity * 0.25 +
    brand_positioning * 0.25 +
    brand_journey_fit * 0.15, 3
  ) AS brand_aware_score,
  -- Label derived from brand_tier_affinity alone; 1.0 only occurs for the
  -- target's own brand.
  CASE
    WHEN brand_tier_affinity = 1.0 THEN 'Same Brand Loyalty'
    WHEN brand_tier_affinity >= 0.8 THEN 'Premium Brand Match'
    WHEN brand_tier_affinity >= 0.6 THEN 'Compatible Brand Tier'
    ELSE 'Brand Discovery'
  END AS brand_strategy,
  -- Candidate price relative to the target price, in percent (negative = cheaper).
  ROUND(((retail_price - target_retail_price) / target_retail_price) * 100, 1) AS price_difference_pct
FROM brand_ecosystem
-- Stricter than the 0.4 cut-off applied inside the CTE.
WHERE semantic_similarity > 0.5
ORDER BY brand_aware_score DESC
LIMIT 5;


Query is running:   0%|          |

Downloading:   0%|          |

Unnamed: 0,id,name,category,brand,retail_price,semantic_similarity,brand_tier_affinity,brand_positioning,brand_journey_fit,brand_aware_score,brand_strategy,price_difference_pct
0,20850,Wrangler Men's Western Slim Boot Cut Jean,Jeans,Wrangler,49.990002,0.99,1.0,1.0,1.0,0.997,Same Brand Loyalty,0.0
1,21284,Wrangler Men's Western Boot Cut Slim Jean,Jeans,Wrangler,35.950001,0.99,1.0,1.0,1.0,0.997,Same Brand Loyalty,-28.1
2,20921,Wrangler Men's Cowboy Cut Western Slim Fit Jean,Jeans,Wrangler,40.720001,0.974,1.0,1.0,1.0,0.991,Same Brand Loyalty,-18.5
3,21255,Wrangler Men's Western Boot Cut Jean Regular,Jeans,Wrangler,37.0,0.972,1.0,1.0,1.0,0.99,Same Brand Loyalty,-26.0
4,20585,Wrangler Men's Cowboy Cut Slim Fit Jean,Jeans,Wrangler,49.990002,0.967,1.0,1.0,1.0,0.988,Same Brand Loyalty,0.0
