In [24]:
print(5)

5


In [25]:

!pip install -U unidecode rapidfuzz emoji pyarrow



In [26]:
import os, re, sys, math, json, gc, string, hashlib, textwrap, unicodedata
from pathlib import Path
import numpy as np
import pandas as pd

from collections import Counter, defaultdict
from unidecode import unidecode
from rapidfuzz import fuzz
from tqdm.auto import tqdm
import emoji

# Reproducibility
SEED = 42
np.random.seed(SEED)

# Paths
WORK_DIR  = Path("/kaggle/working")
CACHE_DIR = WORK_DIR / "cache"
CACHE_DIR.mkdir(parents=True, exist_ok=True)

def _auto_find_file(fname="train.csv"):
    base = Path("/kaggle/input")
    candidates = []
    for p, d, files in os.walk(base):
        if fname in files:
            candidates.append(Path(p)/fname)
    return candidates

FOUND_TRAIN = _auto_find_file("train.csv")
FOUND_TEST  = _auto_find_file("test.csv")

if len(FOUND_TRAIN)==0 or len(FOUND_TEST)==0:
    raise FileNotFoundError("Could not auto-discover train.csv/test.csv under /kaggle/input. "
                            "Add your dataset to the notebook and re-run.")
TRAIN_PATH = FOUND_TRAIN[0]
TEST_PATH  = FOUND_TEST[0]

print("Using files:")
print("  TRAIN:", TRAIN_PATH)
print("  TEST :", TEST_PATH)


Using files:
  TRAIN: /kaggle/input/dataset-products/train.csv
  TEST : /kaggle/input/dataset-products/test.csv


In [27]:
train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)

expected_cols_train = {"sample_id","catalog_content","image_link","price"}
expected_cols_test  = {"sample_id","catalog_content","image_link"}

assert expected_cols_train.issubset(set(train.columns)), f"train missing required columns: {expected_cols_train - set(train.columns)}"
assert expected_cols_test.issubset(set(test.columns)),   f"test missing required columns:  {expected_cols_test  - set(test.columns)}"

# Basic integrity
assert train["sample_id"].is_unique, "train.sample_id not unique"
assert test["sample_id"].is_unique,  "test.sample_id not unique"
assert (~train["price"].isna()).all(), "price has NaNs in train"
assert (train["price"] > 0).all(), "price must be positive"

print(train.shape, test.shape)
train.head(3)


(75000, 4) (75000, 3)


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97


In [28]:
def smape(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred) + eps)/2.0
    return 100.0 * np.mean(num/den)

def to_ascii(s: str) -> str:
    s = s or ""
    s = unidecode(str(s))
    # Keep emojis out; they often confuse regex and add noise
    s = emoji.replace_emoji(s, replace="")
    # Normalize whitespace
    s = re.sub(r"\s+", " ", s).strip()
    return s

def normalize_text_for_parsing(s: str) -> str:
    s = to_ascii(s)
    # Lower for robust regex; we’ll keep original for brand extraction
    return s.lower()

def only_words(s: str) -> str:
    return re.sub(r"[^a-z0-9\s\-\.x%/]+", " ", s.lower()).strip()


In [29]:
# Unit canonicalization maps
WEIGHT_UNITS = {
    "g": ("g", 1.0),
    "gram": ("g", 1.0),
    "grams": ("g", 1.0),
    "kg": ("g", 1000.0),
    "kilogram": ("g", 1000.0),
    "kilograms": ("g", 1000.0),
    "oz": ("g", 28.3495),
    "ounce": ("g", 28.3495),
    "ounces": ("g", 28.3495),
    "lb": ("g", 453.592),
    "lbs": ("g", 453.592),
    "pound": ("g", 453.592),
    "pounds": ("g", 453.592),
}

VOLUME_UNITS = {
    "ml": ("ml", 1.0),
    "milliliter": ("ml", 1.0),
    "milliliters": ("ml", 1.0),
    "l": ("ml", 1000.0),
    "liter": ("ml", 1000.0),
    "liters": ("ml", 1000.0),
    "litre": ("ml", 1000.0),
    "litres": ("ml", 1000.0),
    "fl oz": ("ml", 29.5735),
    "floz": ("ml", 29.5735),
    "fluid ounce": ("ml", 29.5735),
    "fluid ounces": ("ml", 29.5735),
}

COUNT_TOKENS = {"count","ct","cts","pk","pack","packs","pcs","pieces","tabs","tablets","capsules","pods"}

# Regex patterns
NUM = r"(\d{1,4}(?:[.,]\d{1,3})?)"
SP = r"[ \-]*"
UNIT_PATTERN = (
    r"(fl\.?\s?oz|fluid\s?ounce(?:s)?|floz|oz|ounce(?:s)?|g|gram(?:s)?|kg|kilogram(?:s)?|ml|milliliter(?:s)?|l|liter(?:s)?|litre(?:s)?|lb|lbs|pound(?:s)?)"
)
# e.g., "12 oz", "12-oz", "12oz"
RE_QTY_UNIT = re.compile(NUM + SP + UNIT_PATTERN, flags=re.IGNORECASE)

# e.g., "pack of 6", "6 pack", "6pk", "6 ct", "x6", "6 x 12 oz"
RE_PACK_PATTERNS = [
    re.compile(r"pack of\s+(\d{1,4})", re.IGNORECASE),
    re.compile(r"(\d{1,4})\s*(?:pack|pk)\b", re.IGNORECASE),
    re.compile(r"(\d{1,4})\s*(?:ct|count|pcs|pieces)\b", re.IGNORECASE),
    re.compile(r"\bx\s*(\d{1,4})\b", re.IGNORECASE),  # "x6"
    re.compile(r"(\d{1,4})\s*[xX]\s*(\d{1,4}(?:[.,]\d{1,3})?)\s*"+UNIT_PATTERN, re.IGNORECASE),  # "6 x 12 oz"
]

def _as_float(num_str: str) -> float:
    if num_str is None: return np.nan
    return float(num_str.replace(",", "."))

def extract_pack_count(text_norm: str) -> float:
    # Try multiple patterns; choose the largest (often correct for multi-mentions)
    vals = []
    for pat in RE_PACK_PATTERNS:
        for m in pat.findall(text_norm):
            if isinstance(m, tuple):
                # could be (pack, size, unit)
                if len(m) >= 1:
                    vals.append(_as_float(m[0]))
            else:
                vals.append(_as_float(m))
    vals = [v for v in vals if v==v and v>0]
    return float(max(vals)) if vals else np.nan

def _canon_unit(unit_raw: str):
    u = unit_raw.lower().strip().replace("fl. oz","fl oz").replace("fl oz.","fl oz")
    u = re.sub(r"\s+", " ", u)
    if u in VOLUME_UNITS: return VOLUME_UNITS[u]
    if u in WEIGHT_UNITS: return WEIGHT_UNITS[u]
    # minor cleanup
    if u.endswith("s") and u[:-1] in VOLUME_UNITS: return VOLUME_UNITS[u[:-1]]
    if u.endswith("s") and u[:-1] in WEIGHT_UNITS: return WEIGHT_UNITS[u[:-1]]
    return None

def extract_qty_units(text_norm: str):
    """Return list of (value_in_native, native_unit, canon_unit, canon_value) found in text."""
    out = []
    for m in RE_QTY_UNIT.finditer(text_norm):
        qty_raw, unit_raw = m.group(1), m.group(2)
        q = _as_float(qty_raw)
        cu = _canon_unit(unit_raw)
        if not cu or not (q==q and q>0):
            continue
        canon_name, factor = cu
        out.append((q, unit_raw.lower(), canon_name, q*factor))
    return out

def pick_totals_and_per_item(text_norm: str):
    """
    Heuristics:
    - Find pack_count if present.
    - Parse all qty-units; split by canon unit type (g/ml vs weight vs volume).
    - Prefer 'per-item size * pack_count' when a clear xN pattern exists; otherwise take max total per unit-type.
    """
    pack_count = extract_pack_count(text_norm)
    pairs = extract_qty_units(text_norm)
    if not pairs:
        return pack_count, np.nan, np.nan, np.nan  # no totals

    grams = [v for (q,u,cn,v) in pairs if cn=="g"]
    mls   = [v for (q,u,cn,v) in pairs if cn=="ml"]

    # Try to detect "N x SIZE UNIT": RE_PACK_PATTERNS captures some, but we’ll recompute per-item via surrounding context
    per_item_g = np.nan
    per_item_ml = np.nan

    # Simple heuristic: if multiple same-unit sizes exist, smaller values often are 'per item'
    if len(grams) >= 2:
        per_item_g = min(grams)
    if len(mls) >= 2:
        per_item_ml = min(mls)

    total_g = max(grams) if grams else np.nan
    total_ml = max(mls) if mls else np.nan

    # If we have pack_count and a plausible per-item size, prefer derived totals
    if pack_count==pack_count and pack_count>1:
        if per_item_g==per_item_g:
            total_g = max(total_g, per_item_g*pack_count) if total_g==total_g else per_item_g*pack_count
        if per_item_ml==per_item_ml:
            total_ml = max(total_ml, per_item_ml*pack_count) if total_ml==total_ml else per_item_ml*pack_count

    return pack_count, total_g, total_ml, per_item_g if per_item_g==per_item_g else per_item_ml


In [30]:
# ====== Cell 5R: Canonical quantity helper (add below your current Cell 5) ======

def choose_canonical_qty(total_g, total_ml, pack_count):
    """
    Choose exactly ONE canonical quantity:
      - If a volume exists, prefer ml (liquids price ~ volume).
      - Else if a weight exists, use grams.
      - Else if count exists, use count.
      - Else unknown (NaN).
    """
    if pd.notna(total_ml) and float(total_ml) > 0:
        return "ml", float(total_ml)
    if pd.notna(total_g) and float(total_g) > 0:
        return "g", float(total_g)
    if pd.notna(pack_count) and float(pack_count) > 0:
        return "count", float(pack_count)
    return "unknown", np.nan


In [31]:
# ====== Cell 6R: Brand lexicon from "Item Name" + robust mapping ======

RE_ITEMNAME = re.compile(r"item\s*name[:\s]+(.+)", re.IGNORECASE)
RE_STOP     = re.compile(r"(?:\b(?:value|unit|item\s*pack\s*quantity|ipq)\b|[\|\n•]| - )", re.IGNORECASE)

def extract_itemname_phrase(text_raw: str) -> str:
    s = (text_raw or "").strip()
    m = RE_ITEMNAME.search(s)
    if m:
        tail = m.group(1)
        # cut at stop marker or punctuation divider
        tail = RE_STOP.split(tail)[0]
        tail = re.sub(r"^\s*[-:–—|•]\s*", "", tail).strip()
        # avoid empty tails
        if len(tail) >= 2:
            return tail[:160]
    # fallback: use the first "title-like" piece
    return extract_title_like(s)

def brand_tokens(phrase: str):
    # Keep tokens with letters/digits/&/'/+
    return re.findall(r"[A-Za-z][A-Za-z0-9&'’+-]*", phrase)

def build_brand_lexicon(train_df: pd.DataFrame, min_count=25, lookahead=6):
    """
    Build brand lexicon from first up-to-6 tokens of the Item Name phrase:
    collect 1-3 gram prefixes sliding over the first 6 tokens; keep frequent ngrams.
    """
    counts = Counter()
    phrases = train_df["item_name_phrase"].fillna("").tolist()
    for ph in phrases:
        toks = brand_tokens(ph)
        L = min(len(toks), lookahead)
        for i in range(L):
            for n in (3,2,1):
                if i+n <= L:
                    ng = " ".join(toks[i:i+n])
                    # Drop trivial ngrams
                    if len(ng) < 2: 
                        continue
                    # Filter extremely generic starters
                    if ng.lower() in {"item name","brand","food","foods"}:
                        continue
                    counts[ng] += 1
    # Keep high-signal ngrams
    lexicon = {ng for ng,c in counts.items() if c >= min_count}
    return lexicon, counts

def map_brand_from_phrase(phrase: str, lexicon, counts, lookahead=6):
    toks = brand_tokens(phrase)
    L = min(len(toks), lookahead)
    best = None
    best_pos, best_len, best_cnt = 1e9, -1, -1
    for i in range(L):
        for n in (3,2,1):  # prefer longer n
            if i+n <= L:
                ng = " ".join(toks[i:i+n])
                if ng in lexicon:
                    cnt = counts[ng]
                    # choose earliest; tie-break by length then global count
                    if (i < best_pos) or (i == best_pos and n > best_len) or (i == best_pos and n == best_len and cnt > best_cnt):
                        best = ng
                        best_pos, best_len, best_cnt = i, n, cnt
    return best or "__other__"


In [32]:
STOP_TOKENS = set("""
and & with for of the a an by in on to from new pack set combo bundle value size
""".split())

def extract_title_like(text_raw: str) -> str:
    """First ~120 chars pre-colon or first sentence is often the product title."""
    s = (text_raw or "").strip()
    # Try split by newline or period or dash; otherwise keep first 120 chars
    for sep in ["\n", "•", " - ", " | ", " — ", " – ", ". "]:
        if sep in s:
            s = s.split(sep, 1)[0]
            break
    return s[:160]

def guess_brand_from_title(text_raw: str) -> str:
    t = extract_title_like(text_raw)
    tokens = re.split(r"[^\w'+&]+", t)  # keep & and ' inside tokens
    acc = []
    for tok in tokens[:6]:  # first few tokens
        if not tok: break
        # brand tokens typically start with uppercase or are all-caps; keep numerics out
        if tok.lower() in STOP_TOKENS: break
        if re.match(r"^[A-Z][\w'+&-]*$", tok) or re.match(r"^[A-Z0-9&'+-]{2,}$", tok):
            acc.append(tok)
        else:
            break
    if not acc and tokens:
        # fallback: first token (even if lowercase), sometimes brands are lowercase stylistically
        acc = [tokens[0]]
    brand = " ".join(acc).strip()
    brand = re.sub(r"^by\s+", "", brand, flags=re.IGNORECASE)
    return brand if brand else "__unknown__"

CATEGORY_KEYWORDS = {
    "beverage": ["drink","beverage","juice","soda","cola","sparkling","water","fl oz","bottle","can","ml","l "],
    "coffee_tea": ["coffee","arabica","espresso","k-cup","kcup","k cup","brew","roast","tea","chai","green tea","herbal"],
    "snack": ["chips","crisps","cookie","cracker","snack","bar","trail mix","granola"],
    "breakfast": ["cereal","oats","oatmeal","pancake","syrup"],
    "baking": ["flour","sugar","yeast","baking","cocoa","chocolate chip"],
    "condiment": ["sauce","ketchup","mustard","mayo","mayonnaise","dressing","salsa","hot sauce","soy"],
    "baby": ["baby","infant","toddler","diaper","formula","pouch"],
    "pet": ["dog","cat","kitten","puppy","pet","kibble","litter"],
    "personal_care": ["shampoo","conditioner","soap","body wash","lotion","deodorant","toothpaste","toothbrush","razor"],
    "household": ["detergent","cleaner","wipes","trash","bag","paper towel","tissue"],
    "health": ["vitamin","supplement","capsule","tablet","gummy","probiotic","omega","electrolyte"],
}

BOOL_KEYWORDS = {
    "organic": ["organic"],
    "gluten_free": ["gluten free","gf"],
    "keto": ["keto"],
    "sugar_free": ["sugar free","no sugar"],
    "premium": ["premium","gourmet","artisan"],
    "non_gmo": ["non-gmo","nongmo","non gmo"],
    "kosher": ["kosher"],
    "decaf": ["decaf","decaffeinated"],
    "instant": ["instant"],
    "refill": ["refill"],
    "bulk": ["bulk","family size","value size"],
    "arabica": ["arabica"],
}

def coarse_category(text_norm: str) -> str:
    for cat, kws in CATEGORY_KEYWORDS.items():
        for kw in kws:
            if kw in text_norm:
                return cat
    return "__other__"

def keyword_flags(text_norm: str) -> dict:
    flags = {}
    for name, kws in BOOL_KEYWORDS.items():
        flags[name] = int(any(kw in text_norm for kw in kws))
    return flags


In [33]:
%%time
def process_df(df: pd.DataFrame, is_train: bool, top_brands=None, top_k=4000):
    out = df.copy()
    out["catalog_raw"]  = df["catalog_content"].fillna("")
    out["catalog_norm"] = out["catalog_raw"].map(normalize_text_for_parsing)

    # Core parsing
    packs, tots_g, tots_ml, per_item = [], [], [], []
    lens_char, lens_word, digits_cnt = [], [], []

    cats = []
    bool_cols = list(BOOL_KEYWORDS.keys())
    bool_matrix = {k: [] for k in bool_cols}

    brands_guess = []

    for raw, norm in tqdm(zip(out["catalog_raw"].tolist(), out["catalog_norm"].tolist()), total=len(out)):
        p, tg, tm, pi = pick_totals_and_per_item(norm)
        packs.append(p)
        tots_g.append(tg)
        tots_ml.append(tm)
        per_item.append(pi)
        lens_char.append(len(raw))
        lens_word.append(len(raw.split()))
        digits_cnt.append(len(re.findall(r"\d", raw)))
        cats.append(coarse_category(norm))
        for k in bool_cols:
            bool_matrix[k].append(int(any(kw in norm for kw in BOOL_KEYWORDS[k])))
        brands_guess.append(guess_brand_from_title(raw))

    out["pack_count"]     = packs
    out["total_g"]        = tots_g
    out["total_ml"]       = tots_ml
    out["per_item_size"]  = per_item
    out["len_chars"]      = lens_char
    out["len_words"]      = lens_word
    out["num_digits"]     = digits_cnt
    out["coarse_category"]= cats
    for k in bool_cols:
        out[k] = bool_matrix[k]
    out["brand_guess"]    = brands_guess

    # Choose unit type with priority: if ml present use ml; else if g present use g; else count-based
    out["unit_type"] = np.where(out["total_ml"].notna(), "ml",
                          np.where(out["total_g"].notna(), "g",
                          np.where(out["pack_count"].notna(), "count", "unknown")))

    # Log transforms (safe)
    def safelog(v):
        return np.log1p(np.clip(v.astype(float), 0, None))
    out["log_total_g"]   = safelog(out["total_g"].fillna(0))
    out["log_total_ml"]  = safelog(out["total_ml"].fillna(0))
    out["log_pack_count"]= safelog(out["pack_count"].fillna(0))
    out["log_len_words"] = safelog(out["len_words"])

    # Build/Apply brand whitelist
    if is_train:
        freq = Counter(out["brand_guess"].fillna("__unknown__"))
        top = [b for b,_ in freq.most_common(top_k)]
        top = [t for t in top if t != "__unknown__"]
        brand_whitelist = set(top)
    else:
        brand_whitelist = set(top_brands or [])

    def map_brand(b):
        b = (b or "").strip()
        return b if b in brand_whitelist else "__other__"

    out["brand_mapped"] = out["brand_guess"].map(map_brand)

    meta = {
        "brand_whitelist": sorted(list(brand_whitelist))
    }
    return out, meta

# Process train
proc_train, meta = process_df(train, is_train=True, top_k=4000)
brand_whitelist = meta["brand_whitelist"]

# Process test using the same brand whitelist
proc_test, _ = process_df(test, is_train=False, top_brands=brand_whitelist)

# Save caches
train_out_path = CACHE_DIR / "processed_train.parquet"
test_out_path  = CACHE_DIR / "processed_test.parquet"
meta_path      = CACHE_DIR / "meta.json"

proc_train.to_parquet(train_out_path, index=False)
proc_test.to_parquet(test_out_path, index=False)
with open(meta_path, "w") as f:
    json.dump(meta, f)

print("Saved:")
print(" ", train_out_path)
print(" ", test_out_path)
print(" ", meta_path)


  0%|          | 0/75000 [00:00<?, ?it/s]

  0%|          | 0/75000 [00:00<?, ?it/s]

Saved:
  /kaggle/working/cache/processed_train.parquet
  /kaggle/working/cache/processed_test.parquet
  /kaggle/working/cache/meta.json
CPU times: user 2min 14s, sys: 826 ms, total: 2min 15s
Wall time: 2min 15s


In [34]:
def coverage_report(df: pd.DataFrame, name="train"):
    print(f"=== Coverage report: {name} ===")
    for col in ["pack_count","total_g","total_ml","per_item_size"]:
        cov = df[col].notna().mean()*100
        print(f"{col:15s}: {cov:6.2f}% non-null")
    print("\nunit_type distribution:")
    print(df["unit_type"].value_counts(dropna=False, normalize=True).mul(100).round(2).to_string())
    print("\nbrand_mapped top 15:")
    print(df["brand_mapped"].value_counts().head(15).to_string())
    if "price" in df.columns:
        print("\nPrice summary (train):")
        print(df["price"].describe(percentiles=[.5,.9,.99]).to_string())
    print("\nExample rows:")
    display(df.sample(5, random_state=SEED)[["sample_id","brand_guess","brand_mapped","pack_count","total_g","total_ml","unit_type","coarse_category","len_words"]])

coverage_report(proc_train, "train")
coverage_report(proc_test, "test")


=== Coverage report: train ===
pack_count     :  46.01% non-null
total_g        :  72.35% non-null
total_ml       :  19.90% non-null
per_item_size  :  38.33% non-null

unit_type distribution:
unit_type
g          63.66
ml         19.90
count       8.31
unknown     8.13

brand_mapped top 15:
brand_mapped
__other__                                   52718
Item Name                                    1906
Item Name Food                                962
Item Name Frontier Co                         136
Item Name Pride                               112
Item Name Morton                               99
Item Name Amazon Brand Happy Belly             87
Item Name Harney                               86
Item Name Snyder's                             81
Item Name Great                                80
Item Name Big Dot                              79
Item Name Crystal Light                        76
Item Name Green Mountain Coffee Roasters       74
Item Name Marshall                           

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,sample_id,brand_guess,brand_mapped,pack_count,total_g,total_ml,unit_type,coarse_category,len_words
26837,158784,Item Name Log Cabin Sugar Free,__other__,12.0,680.388,709.764,ml,beverage,101
2592,4095,Item Name Raspberry Ginseng Oolong Tea,__other__,2.0,,1000.0,ml,beverage,399
18359,172021,Item Name Walden Farms Honey Dijon,__other__,2.0,340.194,,g,beverage,117
73292,268276,Item Name Vlasic Ovals Hamburger Dill,Item Name Vlasic Ovals Hamburger Dill,,,473.176,ml,beverage,82
60127,154791,Item Name Amoretti Premium Syrup Grand,__other__,12.0,720.0773,,g,beverage,65


=== Coverage report: test ===
pack_count     :  45.83% non-null
total_g        :  72.41% non-null
total_ml       :  19.94% non-null
per_item_size  :  38.72% non-null

unit_type distribution:
unit_type
g          63.74
ml         19.94
count       8.45
unknown     7.88

brand_mapped top 15:
brand_mapped
__other__                                   58492
Item Name                                    1916
Item Name Food                                964
Item Name Frontier Co                         123
Item Name Harney                               93
Item Name Pride                                91
Item Name Morton                               85
Item Name Amazon Brand Happy Belly             84
Item Name Great                                79
Item Name Marshall                             75
Item Name Marshalls Creek Spices               67
Item Name Big Dot                              67
Item Name Green Mountain Coffee Roasters       66
Item Name Chicken                             

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,sample_id,brand_guess,brand_mapped,pack_count,total_g,total_ml,unit_type,coarse_category,len_words
26837,217392,Item Name Gift Basket Village Gourmet,__other__,,198.4465,,g,beverage,545
2592,209156,Item Name NPG Dried Lotus Seeds,__other__,,454.0,,g,beverage,187
18359,262333,Item Name Annies Homegrown Macaroni,__other__,,170.097,,g,beverage,54
73292,295979,Item Name Bear Creek Country Kitchens,__other__,,286.32995,,g,beverage,112
60127,50604,Item Name Japanese Kelp Kombu Umami,__other__,10.0,396.893,591.47,ml,beverage,195


In [35]:
keep_cols_common = [
    "sample_id","catalog_raw","catalog_norm",
    "brand_guess","brand_mapped","coarse_category",
    "pack_count","total_g","total_ml","per_item_size",
    "len_chars","len_words","num_digits",
    "log_total_g","log_total_ml","log_pack_count","log_len_words",
    "unit_type",
] + list(BOOL_KEYWORDS.keys())

train_feats = proc_train[keep_cols_common + ["price"]].copy()
test_feats  = proc_test[keep_cols_common].copy()

# Re-save slim versions
train_feats_path = CACHE_DIR / "train_feats.parquet"
test_feats_path  = CACHE_DIR / "test_feats.parquet"
train_feats.to_parquet(train_feats_path, index=False)
test_feats.to_parquet(test_feats_path, index=False)

print("Slim features saved:")
print(" ", train_feats_path)
print(" ", test_feats_path)


Slim features saved:
  /kaggle/working/cache/train_feats.parquet
  /kaggle/working/cache/test_feats.parquet


In [36]:
%%time
def process_df_v2(df: pd.DataFrame, is_train: bool, brand_lexicon=None, brand_counts=None, top_k=4000):
    out = df.copy()
    out["catalog_raw"]  = df["catalog_content"].fillna("")
    out["catalog_norm"] = out["catalog_raw"].map(normalize_text_for_parsing)

    # ----- core parsing (re-use existing pick_totals_and_per_item) -----
    packs, tots_g, tots_ml, per_item = [], [], [], []
    lens_char, lens_word, digits_cnt = [], [], []

    cats = []
    bool_cols = list(BOOL_KEYWORDS.keys())
    bool_matrix = {k: [] for k in bool_cols}

    # NEW: item name phrase
    itemname_phrases = []

    for raw, norm in tqdm(zip(out["catalog_raw"].tolist(), out["catalog_norm"].tolist()), total=len(out)):
        p, tg, tm, pi = pick_totals_and_per_item(norm)
        packs.append(p)
        tots_g.append(tg)
        tots_ml.append(tm)
        per_item.append(pi)
        lens_char.append(len(raw))
        lens_word.append(len(raw.split()))
        digits_cnt.append(len(re.findall(r"\d", raw)))
        cats.append(coarse_category(norm))
        for k in bool_cols:
            bool_matrix[k].append(int(any(kw in norm for kw in BOOL_KEYWORDS[k])))
        itemname_phrases.append(extract_itemname_phrase(raw))

    out["pack_count"]     = packs
    out["total_g"]        = tots_g
    out["total_ml"]       = tots_ml
    out["per_item_size"]  = per_item
    out["len_chars"]      = lens_char
    out["len_words"]      = lens_word
    out["num_digits"]     = digits_cnt
    out["coarse_category"]= cats
    for k in bool_cols:
        out[k] = bool_matrix[k]
    out["item_name_phrase"] = itemname_phrases

    # ----- canonical quantity -----
    qty_types, qty_vals = [], []
    for g, ml, pk in zip(out["total_g"], out["total_ml"], out["pack_count"]):
        t, v = choose_canonical_qty(g, ml, pk)
        qty_types.append(t); qty_vals.append(v)
    out["qty_type"]        = qty_types
    out["total_qty_std"]   = qty_vals

    def safelog(v):
        return np.log1p(np.clip(pd.Series(v, dtype="float64"), 0, None))
    out["log_total_qty_std"] = safelog(out["total_qty_std"])
    out["log_len_words"]     = safelog(out["len_words"])
    out["log_pack_count"]    = safelog(out["pack_count"].fillna(0))

    # ----- brand lexicon build/map -----
    if is_train:
        # build lexicon from train phrases
        tmp = out[["item_name_phrase"]].copy()
        brand_lexicon, brand_counts = build_brand_lexicon(tmp, min_count=25, lookahead=6)

    # map brand for both train/test using the train-built lexicon
    mapped = [map_brand_from_phrase(ph, brand_lexicon, brand_counts) for ph in out["item_name_phrase"]]
    out["brand_mapped_v2"] = mapped

    meta = {
        "brand_lexicon": sorted(list(brand_lexicon)),
        "brand_counts": {k:int(v) for k,v in list(brand_counts.items())[:100000]}  # cap for JSON size
    }
    return out, meta

# ---- Run on train to build lexicon ----
proc_train_v2, meta_v2 = process_df_v2(train, is_train=True)
lexicon = set(meta_v2["brand_lexicon"])
counts_map = Counter(meta_v2["brand_counts"])

# ---- Run on test using the same lexicon ----
proc_test_v2, _ = process_df_v2(test, is_train=False, brand_lexicon=lexicon, brand_counts=counts_map)

# ---- Save v2 caches ----
V2_DIR = CACHE_DIR / "v2"
V2_DIR.mkdir(parents=True, exist_ok=True)

proc_train_v2.to_parquet(V2_DIR/"processed_train_v2.parquet", index=False)
proc_test_v2.to_parquet(V2_DIR/"processed_test_v2.parquet", index=False)

# Slim feature sets for modeling (v2)
keep_cols_common_v2 = [
    "sample_id","catalog_raw","catalog_norm","item_name_phrase",
    "brand_mapped_v2","coarse_category",
    "pack_count","total_g","total_ml","total_qty_std","qty_type","per_item_size",
    "len_chars","len_words","num_digits",
    "log_total_qty_std","log_len_words","log_pack_count",
] + list(BOOL_KEYWORDS.keys())

train_feats_v2 = proc_train_v2[keep_cols_common_v2 + ["price"]].copy()
test_feats_v2  = proc_test_v2[keep_cols_common_v2].copy()

train_feats_v2.to_parquet(V2_DIR/"train_feats_v2.parquet", index=False)
test_feats_v2.to_parquet(V2_DIR/"test_feats_v2.parquet", index=False)

with open(V2_DIR/"meta_v2.json", "w") as f:
    json.dump({"n_lexicon": len(lexicon)}, f)

print("Saved v2:")
print(" ", V2_DIR/"processed_train_v2.parquet")
print(" ", V2_DIR/"processed_test_v2.parquet")
print(" ", V2_DIR/"train_feats_v2.parquet")
print(" ", V2_DIR/"test_feats_v2.parquet")


def coverage_report_v2(df: pd.DataFrame, name="train_v2"):
    print(f"=== Coverage report: {name} ===")
    cov = df["total_qty_std"].notna().mean()*100
    print(f"total_qty_std : {cov:6.2f}% non-null")
    print("qty_type distribution (%):")
    print(df["qty_type"].value_counts(dropna=False, normalize=True).mul(100).round(2).to_string())
    print("\nbrand_mapped_v2 top 20:")
    print(df["brand_mapped_v2"].value_counts().head(20).to_string())
    if "price" in df.columns:
        print("\nPrice summary:")
        print(df["price"].describe(percentiles=[.5,.9,.99]).to_string())

train_v2 = pd.read_parquet(V2_DIR/"processed_train_v2.parquet")
test_v2  = pd.read_parquet(V2_DIR/"processed_test_v2.parquet")

coverage_report_v2(train_v2, "train_v2")
coverage_report_v2(test_v2,  "test_v2")

print("\nSample rows:")
print(train_v2.sample(5, random_state=SEED)[["sample_id","item_name_phrase","brand_mapped_v2","qty_type","total_qty_std","pack_count"]].to_string(index=False))


  0%|          | 0/75000 [00:00<?, ?it/s]

  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)


  0%|          | 0/75000 [00:00<?, ?it/s]

  return op(a, b)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Saved v2:
  /kaggle/working/cache/v2/processed_train_v2.parquet
  /kaggle/working/cache/v2/processed_test_v2.parquet
  /kaggle/working/cache/v2/train_feats_v2.parquet
  /kaggle/working/cache/v2/test_feats_v2.parquet
=== Coverage report: train_v2 ===
total_qty_std :  91.87% non-null
qty_type distribution (%):
qty_type
g          63.66
ml         19.90
count       8.31
unknown     8.13

brand_mapped_v2 top 20:
brand_mapped_v2
__other__         1213
Organic           1013
Food to Live       945
Gourmet            389
Original           373
Rani               358
McCormick          339
The                326
Betty Crocker      315
Premium            302
Badia              285
Bob's Red Mill     272
Goya               261
Coffee             258
Starbucks          247
Kraft              247
Amoretti           235
Natural            230
Red                229
Chocolate          216

Price summary:
count    75000.000000
mean        23.647654
std         33.376932
min          0.130000
50%     

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


In [37]:
%%capture
!pip install -U transformers accelerate sentencepiece safetensors

In [38]:
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cuda.matmul.allow_tf32 = True

MODEL_NAME = "google/flan-t5-small"  # ~80M params
tok = AutoTokenizer.from_pretrained(MODEL_NAME)
mdl = AutoModelForSeq2SeqLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.float16 if device=="cuda" else torch.float32
).to(device)

print("Device:", device)


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!
2025-10-12 11:16:12.208258: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760267772.579254      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760267772.692339      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Device: cuda


In [39]:
import json, re
from typing import List, Tuple, Optional

# Words that are *not* brands by themselves
GENERIC_SINGLE_WORDS = {
    "Organic","Gourmet","Original","Premium","Natural","The",
    "Chocolate","Coffee","Candy","Red","Blue","Green","Classic","Fresh","New","Value"
}
# Phrases to trim if LLM returns full phrases like "Amazon Brand Happy Belly"
TRIM_PATTERNS = [
    (re.compile(r"amazon brand\s+", re.I), ""),   # Keep just "Happy Belly"
    (re.compile(r"\bbrand\b", re.I), ""),         # trailing "Brand"
]

def clean_brand_text(b: str) -> str:
    if not b: return ""
    s = b.strip()
    # Trim known wrappers
    for pat, repl in TRIM_PATTERNS:
        s = pat.sub(repl, s).strip()
    # Normalize spaces/punctuation
    s = re.sub(r"\s{2,}", " ", s)
    # Keep reasonable chars
    s = re.sub(r"[^A-Za-z0-9&'’\-\.\s]+", "", s).strip()
    # Special fixes
    s = s.replace("Mccormick", "McCormick")
    s = s.replace("Bobs Red Mill", "Bob's Red Mill")
    # Uppercase sequences are fine; otherwise title-case lightly (preserve Mc prefixes)
    def smart_tc(word):
        if re.fullmatch(r"[A-Z0-9&'’\-]+", word):  # all-caps tokens
            return word
        if word.lower().startswith("mc") and len(word)>=3:
            return "Mc" + word[2:].capitalize()
        return word.capitalize()
    s = " ".join(smart_tc(w) for w in s.split())
    return s

def is_valid_brand(b: str) -> bool:
    if not b: return False
    # Reject single generic words
    if b in GENERIC_SINGLE_WORDS: return False
    # Length and alphabetic check
    if len(re.sub(r"[^A-Za-z]+","", b)) < 2: return False
    # Heuristic: at least one letter, not pure category words
    bad = {"Sugar Free","Original","Gourmet","Organic","Premium","The","Candy","Chocolate","Coffee","Natural"}
    if b in bad: return False
    return True

FEWSHOTS = [
    # Keep these short; T5 small learns JSON structure with a couple of examples
    {
      "text": "Log Cabin Sugar Free Syrup, 24 FL OZ (Pack of 12)",
      "json": {"brand": "Log Cabin", "pack_count": 12}
    },
    {
      "text": "Amazon Brand - Happy Belly Mixed Nuts, 16 oz, 2 Pack",
      "json": {"brand": "Happy Belly", "pack_count": 2}
    },
    {
      "text": "Starbucks Pike Place Roast K-Cup Coffee Pods, 44 ct",
      "json": {"brand": "Starbucks", "pack_count": 44}
    },
]

INSTR = (
  "Extract the MANUFACTURER BRAND and PACK_COUNT (integer if clearly present) from the product text.\n"
  "Rules: brand must be a proper name (not adjectives like Organic/Gourmet/Premium). "
  "If text says 'Amazon Brand - Happy Belly', return 'Happy Belly' as the brand. "
  "Return JSON only with keys: brand (string or null) and pack_count (integer or null)."
)

def make_prompt(text: str) -> str:
    shots = ""
    for ex in FEWSHOTS:
        shots += f"Text: {ex['text']}\nJSON: {json.dumps(ex['json'])}\n\n"
    return f"{INSTR}\n\n{shots}Text: {text}\nJSON:"

def parse_json(s: str) -> dict:
    m = re.search(r"\{.*\}", s, flags=re.S)
    if not m:
        return {}
    try:
        return json.loads(m.group(0))
    except Exception:
        # crude fixes for single quotes or trailing commas
        t = m.group(0).replace("'", '"')
        t = re.sub(r",\s*}", "}", t)
        try:
            return json.loads(t)
        except Exception:
            return {}

@torch.no_grad()
def llm_extract_batch(texts: List[str], max_new_tokens=48) -> Tuple[List[Optional[str]], List[Optional[int]]]:
    prompts = [make_prompt(t[:600]) for t in texts]  # truncate long texts
    enc = tok(prompts, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
    gen = mdl.generate(**enc, do_sample=False, num_beams=1, max_new_tokens=max_new_tokens)
    outs = tok.batch_decode(gen, skip_special_tokens=True)
    brands, packs = [], []
    for o in outs:
        js = parse_json(o)
        b = js.get("brand") if isinstance(js, dict) else None
        p = js.get("pack_count") if isinstance(js, dict) else None
        b = clean_brand_text(b) if b else ""
        if not is_valid_brand(b):
            b = ""
        try:
            p = int(p) if p is not None else None
            if p is not None and (p <= 0 or p > 10000):
                p = None
        except Exception:
            p = None
        brands.append(b if b else None)
        packs.append(p)
    return brands, packs


In [40]:
# Load v2 frames created earlier
V2_DIR   = CACHE_DIR / "v2"
train_v2 = pd.read_parquet(V2_DIR/"processed_train_v2.parquet")
test_v2  = pd.read_parquet(V2_DIR/"processed_test_v2.parquet")

# Rows that look suspicious: __other__ or single generic words
SUSPECT = GENERIC_SINGLE_WORDS | {"__other__"}
mask_train = (train_v2["brand_mapped_v2"].isin(SUSPECT))
mask_test  = (test_v2["brand_mapped_v2"].isin(SUSPECT))

print("Suspicious in train:", int(mask_train.sum()), " / ", len(train_v2))
print("Suspicious in test :", int(mask_test.sum()),  " / ", len(test_v2))

BATCH = 128  # safe on T4; reduce to 64 if you OOM

def run_llm_on_frame(df, mask):
    idx = df.index[mask].tolist()
    texts = df.loc[idx, "catalog_raw"].tolist()
    brands, packs = [], []
    for i in range(0, len(texts), BATCH):
        br, pk = llm_extract_batch(texts[i:i+BATCH])
        brands.extend(br); packs.extend(pk)
    out = pd.DataFrame({
        "idx": idx,
        "brand_llm": brands,
        "pack_count_llm": packs
    })
    return out

train_llm = run_llm_on_frame(train_v2, mask_train)
test_llm  = run_llm_on_frame(test_v2,  mask_test)

# Merge back
train_v2 = train_v2.merge(train_llm, how="left", left_index=True, right_on="idx").drop(columns=["idx"])
test_v2  = test_v2.merge(test_llm,  how="left", left_index=True, right_on="idx").drop(columns=["idx"])

# Fill NaNs for non-suspicious rows
train_v2["brand_llm"] = train_v2["brand_llm"].fillna("")
test_v2["brand_llm"]  = test_v2["brand_llm"].fillna("")


Suspicious in train: 5312  /  75000
Suspicious in test : 5460  /  75000


In [41]:
def choose_brand_final(row):
    # Prefer LLM brand if present and valid; else keep v2 rule
    b_llm = row.get("brand_llm") or ""
    if b_llm and is_valid_brand(b_llm):
        return clean_brand_text(b_llm)
    return row["brand_mapped_v2"]

def pick_pack_final(row):
    # prefer parsed pack if original missing or zero
    pk0 = row["pack_count"]
    pkl = row.get("pack_count_llm", None)
    if pd.isna(pk0) or (isinstance(pk0, (int,float)) and pk0 <= 0):
        return float(pkl) if pkl is not None else pk0
    return pk0

for df in (train_v2, test_v2):
    df["brand_final"] = df.apply(choose_brand_final, axis=1)
    df["pack_count_final"] = df.apply(pick_pack_final, axis=1)

# Update canonical quantity for rows that had unknown and now have pack_count
def recompute_qty_cols(df):
    qty_type_final = []
    total_qty_final = []
    for qt, tq, pk in zip(df["qty_type"], df["total_qty_std"], df["pack_count_final"]):
        if qt != "unknown":
            qty_type_final.append(qt)
            total_qty_final.append(tq)
        else:
            if pd.notna(pk) and float(pk) > 0:
                qty_type_final.append("count")
                total_qty_final.append(float(pk))
            else:
                qty_type_final.append(qt)
                total_qty_final.append(np.nan)
    df["qty_type_final"] = qty_type_final
    df["total_qty_std_final"] = total_qty_final
    df["log_total_qty_std_final"] = np.log1p(pd.Series(total_qty_final, dtype="float64").fillna(0.0))

recompute_qty_cols(train_v2)
recompute_qty_cols(test_v2)

# Quick peek
print("brand_mapped_v2  -> brand_final (train) top 20")
print(train_v2["brand_final"].value_counts().head(20).to_string())


brand_mapped_v2  -> brand_final (train) top 20
brand_final
__other__         1213
Organic           1013
Food to Live       945
Gourmet            389
Original           373
Rani               358
McCormick          339
The                326
Betty Crocker      315
Premium            302
Badia              285
Bob's Red Mill     272
Goya               261
Coffee             258
Starbucks          247
Kraft              247
Amoretti           235
Natural            230
Red                229
Chocolate          216


In [42]:
V2LLM_DIR = CACHE_DIR / "v2_llm"
V2LLM_DIR.mkdir(parents=True, exist_ok=True)

train_v2.to_parquet(V2LLM_DIR/"processed_train_v2_llm.parquet", index=False)
test_v2.to_parquet(V2LLM_DIR/"processed_test_v2_llm.parquet", index=False)

# Slim feature views (we will use brand_final & qty_type_final going forward)
keep_cols_common_v2llm = [
    "sample_id","catalog_raw","catalog_norm","item_name_phrase",
    "brand_final","coarse_category",
    "pack_count_final","total_g","total_ml","per_item_size",
    "qty_type_final","total_qty_std_final","log_total_qty_std_final",
    "len_chars","len_words","num_digits","log_len_words","log_pack_count",
] + list(BOOL_KEYWORDS.keys())

train_feats_v2llm = train_v2[keep_cols_common_v2llm + ["price"]].copy()
test_feats_v2llm  = test_v2[keep_cols_common_v2llm].copy()

train_feats_v2llm.to_parquet(V2LLM_DIR/"train_feats_v2_llm.parquet", index=False)
test_feats_v2llm.to_parquet(V2LLM_DIR/"test_feats_v2_llm.parquet", index=False)

print("Saved v2_llm features:")
print(" ", V2LLM_DIR/"train_feats_v2_llm.parquet")
print(" ", V2LLM_DIR/"test_feats_v2_llm.parquet")


Saved v2_llm features:
  /kaggle/working/cache/v2_llm/train_feats_v2_llm.parquet
  /kaggle/working/cache/v2_llm/test_feats_v2_llm.parquet


In [43]:
def sanity_after_llm(df, name="train_v2_llm"):
    print(f"=== {name} ===")
    print("brand_final top 20:")
    print(df["brand_final"].value_counts().head(20).to_string())
    print("\nqty_type_final distribution (%):")
    print(df["qty_type_final"].value_counts(dropna=False, normalize=True).mul(100).round(2).to_string())
    if "price" in df.columns:
        print("\nPrice summary:")
        print(df["price"].describe(percentiles=[.5,.9,.99]).to_string())
    # coverage for final qty
    cov = df["total_qty_std_final"].notna().mean()*100
    print(f"\nfinal total_qty_std coverage: {cov:0.2f}%")

train_v2_llm = pd.read_parquet(V2LLM_DIR/"processed_train_v2_llm.parquet")
test_v2_llm  = pd.read_parquet(V2LLM_DIR/"processed_test_v2_llm.parquet")

sanity_after_llm(train_v2_llm, "train_v2_llm")
sanity_after_llm(test_v2_llm,  "test_v2_llm")


=== train_v2_llm ===
brand_final top 20:
brand_final
__other__         1213
Organic           1013
Food to Live       945
Gourmet            389
Original           373
Rani               358
McCormick          339
The                326
Betty Crocker      315
Premium            302
Badia              285
Bob's Red Mill     272
Goya               261
Coffee             258
Starbucks          247
Kraft              247
Amoretti           235
Natural            230
Red                229
Chocolate          216

qty_type_final distribution (%):
qty_type_final
g          63.66
ml         19.90
count       8.31
unknown     8.13

Price summary:
count    75000.000000
mean        23.647654
std         33.376932
min          0.130000
50%         14.000000
90%         52.301000
99%        145.250300
max       2796.000000

final total_qty_std coverage: 91.87%
=== test_v2_llm ===
brand_final top 20:
brand_final
__other__         1228
Organic           1052
Food to Live       947
Gourmet            

In [44]:
print(5)

5


In [45]:
import os, gc, re, json, hashlib
import numpy as np
import pandas as pd
from pathlib import Path
from tqdm.auto import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
from scipy import sparse

# Our SMAPE from earlier
def smape(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred) + eps)/2.0
    return 100.0 * np.mean(num/den)

SEED = 42
np.random.seed(SEED)

WORK_DIR  = Path("/kaggle/working")
CACHE_DIR = WORK_DIR / "cache"
V2LLM_DIR = CACHE_DIR / "v2_llm"

train = pd.read_parquet(V2LLM_DIR / "train_feats_v2_llm.parquet")
test  = pd.read_parquet(V2LLM_DIR / "test_feats_v2_llm.parquet")

print(train.shape, test.shape)
train.head(2)


(75000, 31) (75000, 30)


  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,sample_id,catalog_raw,catalog_norm,item_name_phrase,brand_final,coarse_category,pack_count_final,total_g,total_ml,per_item_size,...,sugar_free,premium,non_gmo,kosher,decaf,instant,refill,bulk,arabica,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...","item name: la victoria green taco sauce mild, ...","La Victoria Green Taco Sauce Mild, 12 Ounce (P...",La Victoria,beverage,6.0,340.194,,,...,0,0,0,0,0,0,0,0,0,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...","item name: salerno cookies, the original butte...","Salerno Cookies, The Original Butter Cookies, ...",Cookies,beverage,4.0,226.796,,,...,0,0,0,0,0,0,0,0,0,13.12


In [46]:
def build_text(df: pd.DataFrame) -> pd.Series:
    # Title-like phrase + truncated normalized body
    title = df["item_name_phrase"].fillna("").astype(str)
    body  = df["catalog_norm"].fillna("").astype(str).str[:500]
    # Join with a separator token so char ngrams cross less
    return (title + " ␟ " + body).str.strip()

train_text = build_text(train)
test_text  = build_text(test)

print("Example text:\n", train_text.iloc[0][:200])


Example text:
 La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6) ␟ item name: la victoria green taco sauce mild, 12 ounce (pack of 6) value: 72.0 unit: fl oz


In [47]:
%%time
from sklearn.utils import murmurhash3_32

# Word n-grams
word_vec = TfidfVectorizer(
    analyzer="word",
    ngram_range=(1,2),
    min_df=3,
    max_features=150_000,
    strip_accents="unicode",
    sublinear_tf=True,
)
Xw_tr = word_vec.fit_transform(train_text)
Xw_te = word_vec.transform(test_text)

# Char n-grams
char_vec = TfidfVectorizer(
    analyzer="char",
    ngram_range=(3,5),
    min_df=10,
    max_features=120_000,
    sublinear_tf=True,
)
Xc_tr = char_vec.fit_transform(train_text)
Xc_te = char_vec.transform(test_text)

# Cast down to float32
Xw_tr = Xw_tr.astype(np.float32); Xw_te = Xw_te.astype(np.float32)
Xc_tr = Xc_tr.astype(np.float32); Xc_te = Xc_te.astype(np.float32)

print("Word tfidf:", Xw_tr.shape, "Char tfidf:", Xc_tr.shape)
gc.collect();


Word tfidf: (75000, 150000) Char tfidf: (75000, 120000)
CPU times: user 2min 8s, sys: 4.01 s, total: 2min 12s
Wall time: 2min 11s


30

In [48]:
num_cols = [
    "log_total_qty_std_final",
    "log_len_words",
    "log_pack_count",
    "len_chars","len_words","num_digits",
] + [  # boolean flags we built earlier
    "organic","gluten_free","keto","sugar_free","premium","non_gmo","kosher",
    "decaf","instant","refill","bulk","arabica"
]

# Ensure the boolean columns exist (if any were missing earlier)
for c in num_cols:
    if c not in train.columns:
        train[c] = 0
        test[c]  = 0

num_tr = train[num_cols].fillna(0.0).astype(np.float32).values
num_te = test[num_cols].fillna(0.0).astype(np.float32).values

# Standardize the continuous ones lightly (except pure booleans)
cont_idx = [num_cols.index(c) for c in ["log_total_qty_std_final","log_len_words","log_pack_count","len_chars","len_words","num_digits"]]
scaler = StandardScaler(with_mean=False)  # with_mean=False for sparse compatibility
num_tr_scaled = num_tr.copy()
num_te_scaled = num_te.copy()
num_tr_scaled[:, cont_idx] = scaler.fit_transform(num_tr[:, cont_idx])
num_te_scaled[:, cont_idx] = scaler.transform(num_te[:, cont_idx])

Xs_tr = sparse.csr_matrix(num_tr_scaled)
Xs_te = sparse.csr_matrix(num_te_scaled)

# Final design matrix = [word tfidf | char tfidf | small numeric]
X_tr = sparse.hstack([Xw_tr, Xc_tr, Xs_tr], format="csr")
X_te = sparse.hstack([Xw_te, Xc_te, Xs_te], format="csr")

del Xw_tr, Xc_tr, Xs_tr, Xw_te, Xc_te, Xs_te
gc.collect();

print("Final shapes:", X_tr.shape, X_te.shape)


Final shapes: (75000, 270018) (75000, 270018)


In [49]:
def make_groups(df: pd.DataFrame) -> np.ndarray:
    # Use item_name_phrase, strip numbers, normalize tokens, keep first ~6 tokens
    t = df["item_name_phrase"].fillna("").astype(str).str.lower()
    t = t.str.replace(r"\d+", " ", regex=True).str.replace(r"[^a-z]+", " ", regex=True)
    key = t.str.split().str[:6].str.join(" ")
    # Hash to int32
    gids = key.apply(lambda s: int(hashlib.md5(s.encode()).hexdigest()[:8], 16))
    return gids.values

groups = make_groups(train)
y = train["price"].values.astype(np.float32)
y_log = np.log1p(y)
print("Groups OK:", len(groups), "unique:", len(np.unique(groups)))


Groups OK: 75000 unique: 62039


In [50]:
%%time
ALPHAS = [0.05, 0.1, 0.2, 0.5, 1.0]
FOLDS  = 5
EPS_FLOOR = 0.10  # to avoid SMAPE blowups on tiny preds

gkf = GroupKFold(n_splits=FOLDS)

oof_pred_price = np.zeros(len(train), dtype=np.float32)
test_pred_price_folds = []

fold_smapes = []
for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_tr, y_log, groups)):
    Xtr, Xva = X_tr[tr_idx], X_tr[va_idx]
    ytr, yva = y_log[tr_idx], y_log[va_idx]

    best_alpha, best_smape, best_va_pred = None, 1e9, None

    for a in ALPHAS:
        # 👇 Force a solver that is stable with sparse matrices on the current SciPy
        model = Ridge(alpha=a, solver="lsqr", fit_intercept=True, random_state=SEED)
        model.fit(Xtr, ytr)
        va_pred_log = model.predict(Xva)
        va_pred = np.expm1(va_pred_log).astype(np.float32)
        va_pred = np.clip(va_pred, EPS_FLOOR, None)  # floor to avoid SMAPE blow-ups
        s = smape(np.expm1(yva), va_pred)
        if s < best_smape:
            best_smape = s
            best_alpha = a
            best_va_pred = va_pred

    # lock best model for the fold
    model = Ridge(alpha=best_alpha, solver="lsqr", fit_intercept=True, random_state=SEED)
    model.fit(Xtr, ytr)

    # Store OOF
    oof_pred_price[va_idx] = best_va_pred

    # Predict test for this fold
    te_pred_log = model.predict(X_te)
    te_pred = np.expm1(te_pred_log).astype(np.float32)
    te_pred = np.clip(te_pred, EPS_FLOOR, None)
    test_pred_price_folds.append(te_pred)

    fold_smapes.append(best_smape)
    print(f"[Fold {fold}] alpha={best_alpha} SMAPE={best_smape:.4f}")

# OOF score
oof_smape = smape(y, np.clip(oof_pred_price, EPS_FLOOR, None))
print(f"\nOOF SMAPE (Ridge TF-IDF): {oof_smape:.4f}")

# Average test over folds
test_pred_price = np.mean(np.vstack(test_pred_price_folds), axis=0).astype(np.float32)


[Fold 0] alpha=1.0 SMAPE=51.7975
[Fold 1] alpha=1.0 SMAPE=51.6529
[Fold 2] alpha=1.0 SMAPE=51.9198
[Fold 3] alpha=1.0 SMAPE=51.6584
[Fold 4] alpha=1.0 SMAPE=51.7228

OOF SMAPE (Ridge TF-IDF): 51.7503
CPU times: user 29min 13s, sys: 3.96 s, total: 29min 17s
Wall time: 25min 21s


In [51]:
RIDGE_DIR = V2LLM_DIR / "ridge_tfidf"
RIDGE_DIR.mkdir(parents=True, exist_ok=True)

np.save(RIDGE_DIR/"oof_price.npy", oof_pred_price)
np.save(RIDGE_DIR/"test_price.npy", test_pred_price)

# Save diagnostics
with open(RIDGE_DIR/"oof_metrics.json", "w") as f:
    json.dump({
        "fold_smapes": [float(s) for s in fold_smapes],
        "oof_smape": float(oof_smape),
        "alphas": ALPHAS
    }, f, indent=2)

# Optional sanity submission (not final; we'll re-blend later)
sub = test[["sample_id"]].copy()
sub["price"] = test_pred_price
sub_path = RIDGE_DIR / "baseline_ridge_tfidf_submission.csv"
sub.to_csv(sub_path, index=False)
print("Saved:")
print("  OOF preds ->", RIDGE_DIR/"oof_price.npy")
print("  Test preds->", RIDGE_DIR/"test_price.npy")
print("  OOF JSON  ->", RIDGE_DIR/"oof_metrics.json")
print("  Submission->", sub_path)


Saved:
  OOF preds -> /kaggle/working/cache/v2_llm/ridge_tfidf/oof_price.npy
  Test preds-> /kaggle/working/cache/v2_llm/ridge_tfidf/test_price.npy
  OOF JSON  -> /kaggle/working/cache/v2_llm/ridge_tfidf/oof_metrics.json
  Submission-> /kaggle/working/cache/v2_llm/ridge_tfidf/baseline_ridge_tfidf_submission.csv


In [52]:
%%capture
!pip install -U faiss-cpu sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [53]:
!pip install -U sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [54]:
!pip install -U sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [55]:
import numpy as np, pandas as pd, gc, json, os, re, hashlib, math
from pathlib import Path
import torch
from sentence_transformers import SentenceTransformer

SEED = 42
np.random.seed(SEED)
device = "cuda" if torch.cuda.is_available() else "cpu"

WORK_DIR  = Path("/kaggle/working")
CACHE_DIR = WORK_DIR / "cache"
V2LLM_DIR = CACHE_DIR / "v2_llm"
KNN_DIR   = V2LLM_DIR / "knn_faiss"
KNN_DIR.mkdir(parents=True, exist_ok=True)

# Load the same feature frames we used for Ridge
train = pd.read_parquet(V2LLM_DIR / "train_feats_v2_llm.parquet")
test  = pd.read_parquet(V2LLM_DIR / "test_feats_v2_llm.parquet")

def build_embed_text(df: pd.DataFrame) -> list[str]:
    """
    Short, informative sentence: prefer item_name_phrase + (trusted brand when available).
    Keep numbers (sizes) — they help match same variants.
    """
    title = df["item_name_phrase"].fillna("").astype(str)
    brand = df["brand_final"].fillna("").astype(str)
    # If brand is generic like "Organic"/"Gourmet", it won't hurt; MiniLM is robust.
    txt = (brand.str.strip() + " || " + title.str.strip()).str[:256]
    return txt.tolist()

train_sents = build_embed_text(train)
test_sents  = build_embed_text(test)

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedder = SentenceTransformer(MODEL_NAME, device=device)
# encode returns float32; normalize=True gives L2-normalized vectors suitable for IP search
emb_tr = embedder.encode(train_sents, batch_size=1024, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True)
emb_te = embedder.encode(test_sents,  batch_size=1024, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True)

np.save(KNN_DIR/"emb_tr.npy", emb_tr)
np.save(KNN_DIR/"emb_te.npy", emb_te)
print("Embeddings:", emb_tr.shape, emb_te.shape)


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Batches:   0%|          | 0/74 [00:00<?, ?it/s]

Embeddings: (75000, 384) (75000, 384)


In [56]:
import faiss
from sklearn.model_selection import GroupKFold

def smape(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred) + eps)/2.0
    return 100.0 * np.mean(num/den)

# Prepare arrays
y          = train["price"].values.astype(np.float32)
qty        = train["total_qty_std_final"].values.astype(np.float32)
qty_type_s = train["qty_type_final"].fillna("unknown").astype(str).values
qty_map = {"ml":0,"g":1,"count":2,"unknown":3}
qty_type   = np.array([qty_map.get(v,3) for v in qty_type_s], dtype=np.int8)

# Per-unit price (train only) for rows with known qty
ppu = np.full_like(y, np.nan, dtype=np.float32)
mask_qty = (qty > 0) & np.isfinite(qty)
ppu[mask_qty] = y[mask_qty] / qty[mask_qty]

# Global anchors (fold-safe usage later)
df_tmp = train.copy()
df_tmp["qty_type_id"] = qty_type
global_anchor_price = df_tmp.groupby("qty_type_id")["price"].median().to_dict()
global_anchor_ppu   = df_tmp.loc[mask_qty].groupby("qty_type_id").apply(lambda d: np.median(d["price"]/d["total_qty_std_final"])).to_dict()

# Build groups (as in Ridge)
def make_groups(df: pd.DataFrame) -> np.ndarray:
    t = df["item_name_phrase"].fillna("").astype(str).str.lower()
    t = t.str.replace(r"\d+", " ", regex=True).str.replace(r"[^a-z]+", " ", regex=True)
    key = t.str.split().str[:6].str.join(" ")
    gids = key.apply(lambda s: int(hashlib.md5(s.encode()).hexdigest()[:8], 16))
    return gids.values

groups = make_groups(train)

# Helper: weighted quantiles
def weighted_quantile(v, w, qs):
    v = np.asarray(v, dtype=np.float32)
    w = np.asarray(w, dtype=np.float32)
    m = np.isfinite(v) & np.isfinite(w) & (w>0)
    v = v[m]; w = w[m]
    if v.size == 0:
        return [np.nan for _ in qs]
    order = np.argsort(v)
    v = v[order]; w = w[order]
    cw = np.cumsum(w); cw = cw / cw[-1]
    return [np.interp(q, cw, v) for q in qs]

def weighted_median(v, w):
    return weighted_quantile(v, w, [0.5])[0]

# Core retrieval per fold
K = 100          # retrieve this many neighbors before filtering
MIN_SIM = 0.35   # discard low-sim neighbors
MIN_VALID = 5    # need at least this many neighbors after filtering

gkf = GroupKFold(n_splits=5)

oof_knn_pred      = np.zeros(len(train), dtype=np.float32)  # main KNN price
oof_knn_pred_rbu  = np.zeros(len(train), dtype=np.float32)  # re-based via per-unit when available
oof_knn_valid_ct  = np.zeros(len(train), dtype=np.int32)    # neighbor count after filtering

# Also collect neighbor stats as features for later LGBM
feat_cols = ["knn_mean","knn_med","knn_p10","knn_p25","knn_p75","knn_p90","knn_std","knn_s1","knn_s2","knn_n"]
oof_feats = np.zeros((len(train), len(feat_cols)), dtype=np.float32)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(emb_tr, y, groups)):
    xb = emb_tr[tr_idx].astype(np.float32).copy()
    xq = emb_tr[va_idx].astype(np.float32).copy()

    # FAISS IP index (embeddings already L2-normalized)
    index = faiss.IndexFlatIP(xb.shape[1])
    index.add(xb)

    sims, idxs = index.search(xq, K)  # (n_val, K)
    # map local neighbor indices to global train indices
    idxs_global = tr_idx[idxs]

    # For easy masking
    qty_type_va = qty_type[va_idx]
    qty_type_nb = qty_type[idxs_global]
    price_nb    = y[idxs_global]
    ppu_nb      = ppu[idxs_global]
    qty_nb      = qty[idxs_global]

    # For each validation row, compute aggregates
    for j, vid in enumerate(va_idx):
        s   = sims[j]
        nbi = idxs_global[j]
        # filter by similarity
        keep = s >= MIN_SIM
        # same qty_type
        keep = keep & (qty_type_nb[j] == qty_type_va[j])

        if not np.any(keep):
            # fallback: no same-type neighbors above threshold → use top K anyway (weak)
            keep = sims[j] >= np.sort(sims[j])[-MIN_VALID]  # best MIN_VALID

        nbi = nbi[keep]
        s   = s[keep]
        prices = price_nb[j][keep]

        # weights
        w = np.clip(s, 0, 1)**2

        # aggregates (price)
        med = weighted_median(prices, w)
        mean = np.average(prices, weights=w) if prices.size>0 else np.nan
        p10, p25, p75, p90 = weighted_quantile(prices, w, [0.10,0.25,0.75,0.90])
        std = np.sqrt(np.average((prices - mean)**2, weights=w)) if prices.size>1 and np.isfinite(mean) else 0.0
        s1 = float(np.max(s)) if s.size>0 else 0.0
        s2 = float(np.partition(s, -2)[-2]) if s.size>=2 else s1
        n_valid = int(prices.size)

        # rebase via per-unit if we can (use only neighbors with valid ppu)
        pred_rbu = np.nan
        if np.isfinite(train.loc[vid, "total_qty_std_final"]) and train.loc[vid, "total_qty_std_final"]>0:
            m_ppu = ppu_nb[j][keep]
            m_ppu = m_ppu[np.isfinite(m_ppu)]
            w_ppu = w[np.isfinite(ppu_nb[j][keep])]
            if m_ppu.size >= MIN_VALID:
                ppu_med = weighted_median(m_ppu, w_ppu)
                pred_rbu = ppu_med * float(train.loc[vid, "total_qty_std_final"])

        # store
        oof_knn_pred[vid]     = med if np.isfinite(med) else (global_anchor_price.get(int(qty_type_va[j]), np.median(y)))
        oof_knn_pred_rbu[vid] = pred_rbu if np.isfinite(pred_rbu) else np.nan
        oof_knn_valid_ct[vid] = n_valid

        oof_feats[vid] = np.array([mean, med, p10, p25, p75, p90, std, s1, s2, n_valid], dtype=np.float32)

    print(f"[Fold {fold}] done; avg valid neighbors: {oof_knn_valid_ct[va_idx].mean():.2f}")

# Choose best of (re-based vs plain) per row
final_oof_knn = np.where(np.isfinite(oof_knn_pred_rbu), oof_knn_pred_rbu, oof_knn_pred)

# SMAPE of KNN OOF
EPS_FLOOR = 0.10
knn_oof_smape = smape(y, np.clip(final_oof_knn, EPS_FLOOR, None))
print(f"\nOOF SMAPE (KNN embeddings): {knn_oof_smape:.4f}")


  mask_qty = (qty > 0) & np.isfinite(qty)
  global_anchor_ppu   = df_tmp.loc[mask_qty].groupby("qty_type_id").apply(lambda d: np.median(d["price"]/d["total_qty_std_final"])).to_dict()


[Fold 0] done; avg valid neighbors: 61.08
[Fold 1] done; avg valid neighbors: 61.67
[Fold 2] done; avg valid neighbors: 61.47
[Fold 3] done; avg valid neighbors: 60.72
[Fold 4] done; avg valid neighbors: 61.28

OOF SMAPE (KNN embeddings): 74.9601


In [57]:
# Build one full-train index for test inference
index = faiss.IndexFlatIP(emb_tr.shape[1])
index.add(emb_tr)

K = 100
MIN_SIM = 0.35
test_sims, test_idxs = index.search(emb_te, K)
qty_type_te = np.array([qty_map.get(v,3) for v in test["qty_type_final"].fillna("unknown").astype(str).values], dtype=np.int8)

# Precompute arrays for convenience
price_tr  = y
ppu_tr    = ppu
qty_tr    = qty
qtytype_tr= qty_type

test_pred_med   = np.zeros(len(test), dtype=np.float32)
test_pred_rbu   = np.full(len(test), np.nan, dtype=np.float32)
test_feat_mat   = np.zeros((len(test), len(feat_cols)), dtype=np.float32)
test_valid_ct   = np.zeros(len(test), dtype=np.int32)

for i in range(len(test)):
    s = test_sims[i]
    gi = test_idxs[i]  # indices into train
    # filter by similarity and same qty_type
    keep = (s >= MIN_SIM) & (qtytype_tr[gi] == qty_type_te[i])
    if not np.any(keep):
        keep = s >= np.sort(s)[-min(MIN_VALID, s.size)]
    gi = gi[keep]; s = s[keep]
    prices = price_tr[gi]
    w = np.clip(s, 0, 1)**2

    # aggregates
    med = weighted_median(prices, w)
    mean = np.average(prices, weights=w) if prices.size>0 else np.nan
    p10, p25, p75, p90 = weighted_quantile(prices, w, [0.10,0.25,0.75,0.90])
    std = np.sqrt(np.average((prices - mean)**2, weights=w)) if prices.size>1 and np.isfinite(mean) else 0.0
    s1 = float(np.max(s)) if s.size>0 else 0.0
    s2 = float(np.partition(s, -2)[-2]) if s.size>=2 else s1
    n_valid = int(prices.size)

    test_pred_med[i] = med if np.isfinite(med) else float(np.median(y))
    test_valid_ct[i] = n_valid
    test_feat_mat[i] = np.array([mean, med, p10, p25, p75, p90, std, s1, s2, n_valid], dtype=np.float32)

    # re-base via per-unit when test qty is known
    tqty = test.loc[i, "total_qty_std_final"]
    if np.isfinite(tqty) and tqty>0:
        m_ppu = ppu_tr[gi]
        m_ppu = m_ppu[np.isfinite(m_ppu)]
        w_ppu = w[np.isfinite(ppu_tr[gi])]
        if m_ppu.size >= MIN_VALID:
            ppu_med = weighted_median(m_ppu, w_ppu)
            test_pred_rbu[i] = ppu_med * float(tqty)

final_test_knn = np.where(np.isfinite(test_pred_rbu), test_pred_rbu, test_pred_med)

print("Test KNN predictions computed:", len(final_test_knn))


Test KNN predictions computed: 75000


In [58]:
# OOF/test predictions
np.save(KNN_DIR/"oof_knn_price.npy", final_oof_knn.astype(np.float32))
np.save(KNN_DIR/"test_knn_price.npy", final_test_knn.astype(np.float32))

# Neighbor features for later LightGBM
oof_knn_feats_df  = pd.DataFrame(oof_feats, columns=feat_cols)
oof_knn_feats_df.insert(0, "sample_id", train["sample_id"].values)
oof_knn_feats_df.to_parquet(KNN_DIR/"oof_knn_features.parquet", index=False)

test_knn_feats_df = pd.DataFrame(test_feat_mat, columns=feat_cols)
test_knn_feats_df.insert(0, "sample_id", test["sample_id"].values)
test_knn_feats_df.to_parquet(KNN_DIR/"test_knn_features.parquet", index=False)

# Quick submission from KNN alone (for sanity)
sub_knn = test[["sample_id"]].copy()
sub_knn["price"] = final_test_knn
sub_knn_path = KNN_DIR/"baseline_knn_submission.csv"
sub_knn.to_csv(sub_knn_path, index=False)

print("Saved:")
print("  OOF  ->", KNN_DIR/"oof_knn_price.npy")
print("  TEST ->", KNN_DIR/"test_knn_price.npy")
print("  OOF  features ->", KNN_DIR/"oof_knn_features.parquet")
print("  TEST features ->", KNN_DIR/"test_knn_features.parquet")
print("  KNN submission ->", sub_knn_path)


Saved:
  OOF  -> /kaggle/working/cache/v2_llm/knn_faiss/oof_knn_price.npy
  TEST -> /kaggle/working/cache/v2_llm/knn_faiss/test_knn_price.npy
  OOF  features -> /kaggle/working/cache/v2_llm/knn_faiss/oof_knn_features.parquet
  TEST features -> /kaggle/working/cache/v2_llm/knn_faiss/test_knn_features.parquet
  KNN submission -> /kaggle/working/cache/v2_llm/knn_faiss/baseline_knn_submission.csv


In [59]:
# Load ridge oof saved earlier
RIDGE_DIR = V2LLM_DIR / "ridge_tfidf"
ridge_oof = np.load(RIDGE_DIR/"oof_price.npy")

EPS_FLOOR = 0.10
print("Ridge OOF SMAPE: ", smape(y, np.clip(ridge_oof, EPS_FLOOR, None)).round(4))
print("KNN   OOF SMAPE: ", smape(y, np.clip(final_oof_knn, EPS_FLOOR, None)).round(4))

# Naive blend (50/50) just to see if it improves (we'll do proper stacking later)
blend_oof = 0.5*ridge_oof + 0.5*final_oof_knn
print("Naive 50/50 blend OOF SMAPE:", smape(y, np.clip(blend_oof, EPS_FLOOR, None)).round(4))


Ridge OOF SMAPE:  51.7503
KNN   OOF SMAPE:  74.9601
Naive 50/50 blend OOF SMAPE: 57.6303


In [60]:
print(5)

5


In [61]:
%%capture
!pip install -U lightgbm

import os, gc, re, json, hashlib, math
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter, defaultdict

import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import LabelEncoder

SEED = 42
np.random.seed(SEED)

def smape(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_pred)
    den = (np.abs(y_true) + np.abs(y_pred) + eps)/2.0
    return 100.0 * np.mean(num/den)

# OOPS: fixed version (typo above—keep this one!)
def smape(y_true, y_pred, eps=1e-9):
    num = np.abs(y_pred - y_true)
    den = (np.abs(y_true) + np.abs(y_pred) + eps)/2.0
    return 100.0 * np.mean(num/den)

WORK_DIR  = Path("/kaggle/working")
CACHE_DIR = WORK_DIR / "cache"
V2LLM_DIR = CACHE_DIR / "v2_llm"
KNN_DIR   = V2LLM_DIR / "knn_faiss"
LGB_DIR   = V2LLM_DIR / "lightgbm"
LGB_DIR.mkdir(parents=True, exist_ok=True)

train = pd.read_parquet(V2LLM_DIR/"train_feats_v2_llm.parquet")
test  = pd.read_parquet(V2LLM_DIR/"test_feats_v2_llm.parquet")

# Load embeddings (already saved) for clustering
emb_tr = np.load(KNN_DIR/"emb_tr.npy")
emb_te = np.load(KNN_DIR/"emb_te.npy")

# Load neighbor features + OOF preds we saved earlier
oof_knn_feats = pd.read_parquet(KNN_DIR/"oof_knn_features.parquet")
test_knn_feats= pd.read_parquet(KNN_DIR/"test_knn_features.parquet")
ridge_oof     = np.load(V2LLM_DIR/"ridge_tfidf/oof_price.npy")
ridge_test    = np.load(V2LLM_DIR/"ridge_tfidf/test_price.npy")
knn_oof       = np.load(KNN_DIR/"oof_knn_price.npy")
knn_test      = np.load(KNN_DIR/"test_knn_price.npy")

# Ensure alignment by sample_id
assert (oof_knn_feats["sample_id"].values == train["sample_id"].values).all()
assert (test_knn_feats["sample_id"].values == test["sample_id"].values).all()

# Attach meta preds + knn feats
train["ridge_oof"] = ridge_oof
train["knn_oof"]   = knn_oof
test["ridge_test"] = ridge_test
test["knn_test"]   = knn_test

train = train.merge(oof_knn_feats, on="sample_id", how="left")
test  = test.merge(test_knn_feats, on="sample_id", how="left")

print(train.shape, test.shape)
train.head(3)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [62]:
from sklearn.cluster import KMeans

def make_groups(df: pd.DataFrame) -> np.ndarray:
    t = df["item_name_phrase"].fillna("").astype(str).str.lower()
    t = t.str.replace(r"\d+", " ", regex=True).str.replace(r"[^a-z]+", " ", regex=True)
    key = t.str.split().str[:6].str.join(" ")
    gids = key.apply(lambda s: int(hashlib.md5(s.encode()).hexdigest()[:8], 16))
    return gids.values

groups = make_groups(train)

K_CLUST = 128  # modest; good tradeoff
km = KMeans(n_clusters=K_CLUST, random_state=SEED, n_init="auto", max_iter=200)
embed_cluster_tr = km.fit_predict(emb_tr)
embed_cluster_te = km.predict(emb_te)

train["embed_cluster"] = embed_cluster_tr.astype(np.int32)
test["embed_cluster"]  = embed_cluster_te.astype(np.int32)


In [63]:
# Prepare core arrays
y = train["price"].values.astype(np.float32)

qty_tr = train["total_qty_std_final"].astype(float).values
qty_te = test["total_qty_std_final"].astype(float).values
has_qty_tr = np.isfinite(qty_tr) & (qty_tr>0)
has_qty_te = np.isfinite(qty_te) & (qty_te>0)
ppu_tr = np.full_like(y, np.nan, dtype=np.float32)
ppu_tr[has_qty_tr] = y[has_qty_tr] / qty_tr[has_qty_tr]

# Helper to compute fold-safe medians
def foldwise_median_map(keys, values, tr_idx):
    """Return dict: key -> median (computed only on tr_idx)."""
    d = {}
    tmp = pd.DataFrame({"k":keys[tr_idx], "v":values[tr_idx]})
    g = tmp.groupby("k")["v"].median()
    d = g.to_dict()
    med_all = np.nanmedian(values[tr_idx])
    return d, float(med_all)

def apply_map(keys, d, default):
    return np.array([d.get(k, default) for k in keys], dtype=np.float32)

# Categorical keys
brand_key_tr = train["brand_final"].astype(str).values
brand_key_te = test["brand_final"].astype(str).values
cat_key_tr   = train["coarse_category"].astype(str).values
cat_key_te   = test["coarse_category"].astype(str).values
qtytype_tr   = train["qty_type_final"].astype(str).values
qtytype_te   = test["qty_type_final"].astype(str).values
cluster_tr   = train["embed_cluster"].astype(int).values
cluster_te   = test["embed_cluster"].astype(int).values

FOLDS = 5
gkf = GroupKFold(n_splits=FOLDS)

# Placeholders for fold-safe features
fold_te_cols = [
    "te_brand_price","te_brand_ppu",
    "te_cluster_ppu","te_cat_ppu","te_qtytype_ppu"
]
oof_te_feats = {c: np.zeros(len(train), dtype=np.float32) * np.nan for c in fold_te_cols}

# For test we will use medians computed on FULL train
full_brand_price_med = pd.DataFrame({"k":brand_key_tr, "v":y}).groupby("k")["v"].median().to_dict()
full_brand_ppu_med   = pd.DataFrame({"k":brand_key_tr[has_qty_tr], "v":ppu_tr[has_qty_tr]}).groupby("k")["v"].median().to_dict()
full_cluster_ppu_med = pd.DataFrame({"k":cluster_tr[has_qty_tr], "v":ppu_tr[has_qty_tr]}).groupby("k")["v"].median().to_dict()
full_cat_ppu_med     = pd.DataFrame({"k":cat_key_tr[has_qty_tr], "v":ppu_tr[has_qty_tr]}).groupby("k")["v"].median().to_dict()
full_qtytype_ppu_med = pd.DataFrame({"k":qtytype_tr[has_qty_tr], "v":ppu_tr[has_qty_tr]}).groupby("k")["v"].median().to_dict()

# Global fallbacks
global_price_med = float(np.median(y))
global_ppu_med   = float(np.nanmedian(ppu_tr))

for fold, (tr_idx, va_idx) in enumerate(gkf.split(train, y, groups)):
    # brand price / ppu
    m_brand_price, def_bp = foldwise_median_map(brand_key_tr, y, tr_idx)
    m_brand_ppu,   def_bu = foldwise_median_map(brand_key_tr, ppu_tr, tr_idx)

    # cluster ppu
    m_cluster_ppu, def_cp = foldwise_median_map(cluster_tr, ppu_tr, tr_idx)

    # category ppu
    m_cat_ppu, def_cat = foldwise_median_map(cat_key_tr, ppu_tr, tr_idx)

    # qtytype ppu
    m_qtytype_ppu, def_qt = foldwise_median_map(qtytype_tr, ppu_tr, tr_idx)

    oof_te_feats["te_brand_price"][va_idx]  = apply_map(brand_key_tr[va_idx],   m_brand_price, def_bp if not math.isnan(def_bp) else global_price_med)
    oof_te_feats["te_brand_ppu"][va_idx]    = apply_map(brand_key_tr[va_idx],   m_brand_ppu,   def_bu if not math.isnan(def_bu) else global_ppu_med)
    oof_te_feats["te_cluster_ppu"][va_idx]  = apply_map(cluster_tr[va_idx],     m_cluster_ppu, def_cp if not math.isnan(def_cp) else global_ppu_med)
    oof_te_feats["te_cat_ppu"][va_idx]      = apply_map(cat_key_tr[va_idx],     m_cat_ppu,     def_cat if not math.isnan(def_cat) else global_ppu_med)
    oof_te_feats["te_qtytype_ppu"][va_idx]  = apply_map(qtytype_tr[va_idx],     m_qtytype_ppu, def_qt if not math.isnan(def_qt) else global_ppu_med)

# Assemble OOF TE features
for k,v in oof_te_feats.items():
    train[k] = v

# Test TE features from FULL medians
test["te_brand_price"]  = apply_map(brand_key_te, full_brand_price_med, global_price_med)
test["te_brand_ppu"]    = apply_map(brand_key_te, full_brand_ppu_med,   global_ppu_med)
test["te_cluster_ppu"]  = apply_map(cluster_te,   full_cluster_ppu_med, global_ppu_med)
test["te_cat_ppu"]      = apply_map(cat_key_te,   full_cat_ppu_med,     global_ppu_med)
test["te_qtytype_ppu"]  = apply_map(qtytype_te,   full_qtytype_ppu_med, global_ppu_med)

# Anchor prices (multiply ppu medians by qty)
def anchor(ppu_arr, qty_arr):
    out = np.full_like(qty_arr, np.nan, dtype=np.float32)
    m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
    out[m] = ppu_arr[m] * qty_arr[m]
    return out

for pref in ["brand","cluster","cat","qtytype"]:
    train[f"anchor_{pref}"] = anchor(train[f"te_{pref}_ppu"].values, qty_tr)
    test[f"anchor_{pref}"]  = anchor(test[f"te_{pref}_ppu"].values,  qty_te)


  has_qty_tr = np.isfinite(qty_tr) & (qty_tr>0)
  has_qty_te = np.isfinite(qty_te) & (qty_te>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)
  m = np.isfinite(ppu_arr) & np.isfinite(qty_arr) & (qty_arr>0)


In [64]:
# Map brand to top-K, others -> "__other__" to keep cardinality tame
TOPK_BRANDS = 1000
top_brands = [b for b,_ in Counter(train["brand_final"].astype(str)).most_common(TOPK_BRANDS)]
train["brand_top"] = np.where(train["brand_final"].isin(top_brands), train["brand_final"], "__other__")
test["brand_top"]  = np.where(test["brand_final"].isin(top_brands),  test["brand_final"],  "__other__")

# Feature lists
num_cols = [
    "log_total_qty_std_final","log_len_words","log_pack_count",
    "len_chars","len_words","num_digits",
    # neighbor stats
    "knn_mean","knn_med","knn_p10","knn_p25","knn_p75","knn_p90","knn_std","knn_s1","knn_s2","knn_n",
    # TE + anchors
    "te_brand_price","te_brand_ppu","te_cluster_ppu","te_cat_ppu","te_qtytype_ppu",
    "anchor_brand","anchor_cluster","anchor_cat","anchor_qtytype",
    # meta preds (OOF/test)
    "ridge_oof","knn_oof",
]
num_cols_test = [c.replace("_oof","_test") if c.endswith("_oof") else c for c in num_cols]

# Booleans
bool_cols = ["organic","gluten_free","keto","sugar_free","premium","non_gmo","kosher","decaf","instant","refill","bulk","arabica"]
for c in bool_cols:
    if c not in train.columns: train[c]=0
    if c not in test.columns:  test[c]=0
num_cols += bool_cols
num_cols_test += bool_cols

# Categorical columns
cat_cols = ["brand_top","coarse_category","qty_type_final","embed_cluster"]

# Build X/y
X_num  = train[num_cols].fillna(0.0).astype(np.float32)
Xc     = train[cat_cols].copy()
Xt_num = test[num_cols_test].fillna(0.0).astype(np.float32)
Xtc    = test[cat_cols].copy()

# Label encode categoricals and keep as category dtype
cat_map = {}
for c in cat_cols:
    le = LabelEncoder()
    all_vals = pd.concat([Xc[c].astype(str), Xtc[c].astype(str)], axis=0)
    le.fit(all_vals)
    Xc[c]  = pd.Categorical(le.transform(Xc[c].astype(str)))
    Xtc[c] = pd.Categorical(le.transform(Xtc[c].astype(str)))
    cat_map[c] = le

# Final design frames
X_train = pd.concat([X_num, Xc], axis=1)
X_test  = pd.concat([Xt_num, Xtc], axis=1)

# LightGBM understands pandas categorical dtype
y = train["price"].values.astype(np.float32)

print("Train/Test shapes:", X_train.shape, X_test.shape)


Train/Test shapes: (75000, 43) (75000, 43)


In [65]:
def smape_weight_scheme(y):
    return 1.0 / np.sqrt(np.clip(y, 1e-2, None))

params = dict(
    objective="regression_l1",
    metric="mae",
    learning_rate=0.05,
    num_leaves=64,
    max_depth=-1,
    min_data_in_leaf=40,
    feature_fraction=0.9,
    bagging_fraction=0.8,
    bagging_freq=1,
    lambda_l1=0.0,
    lambda_l2=1.0,
    verbosity=-1,
    seed=SEED,
)

FOLDS = 5
gkf = GroupKFold(n_splits=FOLDS)

oof_lgb = np.zeros(len(X_train), dtype=np.float32)
preds_test_folds = []
fold_metrics = []

weights = smape_weight_scheme(y)

for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_train, y, groups)):
    dtr = lgb.Dataset(X_train.iloc[tr_idx], label=y[tr_idx], weight=weights[tr_idx], categorical_feature=cat_cols, free_raw_data=False)
    dva = lgb.Dataset(X_train.iloc[va_idx], label=y[va_idx], weight=weights[va_idx], categorical_feature=cat_cols, free_raw_data=False)

    model = lgb.train(
        params,
        dtr,
        num_boost_round=5000,
        valid_sets=[dtr, dva],
        valid_names=["tr","va"],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200, verbose=False),
            lgb.log_evaluation(period=200),
        ],
    )

    va_pred = model.predict(X_train.iloc[va_idx], num_iteration=model.best_iteration).astype(np.float32)
    oof_lgb[va_idx] = va_pred

    te_pred = model.predict(X_test, num_iteration=model.best_iteration).astype(np.float32)
    preds_test_folds.append(te_pred)

    score = smape(y[va_idx], np.clip(va_pred, 0.10, None))
    fold_metrics.append(score)
    print(f"[Fold {fold}] best_iter={model.best_iteration} SMAPE={score:.4f}")

oof_smape = smape(y, np.clip(oof_lgb, 0.10, None))
print(f"\nOOF SMAPE (LightGBM): {oof_smape:.4f}")
preds_lgb_test = np.mean(np.vstack(preds_test_folds), axis=0).astype(np.float32)


[200]	tr's l1: 6.00844	va's l1: 6.71909
[400]	tr's l1: 5.84117	va's l1: 6.71006
[Fold 0] best_iter=327 SMAPE=51.4214
[200]	tr's l1: 6.03331	va's l1: 6.5895
[400]	tr's l1: 5.86663	va's l1: 6.57698
[600]	tr's l1: 5.76088	va's l1: 6.57166
[800]	tr's l1: 5.69085	va's l1: 6.57089
[1000]	tr's l1: 5.6334	va's l1: 6.56857
[1200]	tr's l1: 5.58655	va's l1: 6.56701
[1400]	tr's l1: 5.54121	va's l1: 6.5659
[1600]	tr's l1: 5.50521	va's l1: 6.56604
[Fold 1] best_iter=1539 SMAPE=50.4869
[200]	tr's l1: 6.01139	va's l1: 6.70761
[400]	tr's l1: 5.8516	va's l1: 6.69818
[600]	tr's l1: 5.74509	va's l1: 6.692
[800]	tr's l1: 5.66699	va's l1: 6.69039
[1000]	tr's l1: 5.60443	va's l1: 6.6909
[1200]	tr's l1: 5.55578	va's l1: 6.68983
[1400]	tr's l1: 5.5166	va's l1: 6.69201
[Fold 2] best_iter=1216 SMAPE=51.2191
[200]	tr's l1: 6.01188	va's l1: 6.685
[400]	tr's l1: 5.84953	va's l1: 6.67726
[600]	tr's l1: 5.74959	va's l1: 6.67304
[800]	tr's l1: 5.67637	va's l1: 6.6704
[1000]	tr's l1: 5.61982	va's l1: 6.66937
[1200]	tr'

In [66]:
def train_quantile(alpha):
    qparams = params.copy()
    qparams.update(dict(objective="quantile", alpha=alpha, metric="quantile"))
    oof_q = np.zeros(len(X_train), dtype=np.float32)
    preds_q_folds = []
    for fold, (tr_idx, va_idx) in enumerate(gkf.split(X_train, y, groups)):
        dtr = lgb.Dataset(X_train.iloc[tr_idx], label=y[tr_idx], weight=weights[tr_idx], categorical_feature=cat_cols, free_raw_data=False)
        dva = lgb.Dataset(X_train.iloc[va_idx], label=y[va_idx], weight=weights[va_idx], categorical_feature=cat_cols, free_raw_data=False)
        mdl = lgb.train(
            qparams, dtr, num_boost_round=2000, valid_sets=[dtr, dva],
            callbacks=[lgb.early_stopping(200, verbose=False)]
        )
        oof_q[va_idx] = mdl.predict(X_train.iloc[va_idx], num_iteration=mdl.best_iteration).astype(np.float32)
        preds_q_folds.append(mdl.predict(X_test, num_iteration=mdl.best_iteration).astype(np.float32))
    pred_q_test = np.mean(np.vstack(preds_q_folds), axis=0).astype(np.float32)
    return oof_q, pred_q_test

print("Training quantile p10 ...")
oof_p10, te_p10 = train_quantile(alpha=0.10)
print("Training quantile p90 ...")
oof_p90, te_p90 = train_quantile(alpha=0.90)

# Clip LGB predictions to [p10,p90]
oof_lgb_clipped = np.clip(oof_lgb, a_min=oof_p10, a_max=oof_p90)
test_lgb_clipped = np.clip(preds_lgb_test, a_min=te_p10, a_max=te_p90)

print("OOF SMAPE (LGB clipped):", smape(y, np.clip(oof_lgb_clipped, 0.10, None)).round(4))


Training quantile p10 ...
Training quantile p90 ...
OOF SMAPE (LGB clipped): 51.1569


In [67]:
np.save(LGB_DIR/"oof_lgb.npy", oof_lgb)
np.save(LGB_DIR/"test_lgb.npy", preds_lgb_test)
np.save(LGB_DIR/"oof_lgb_clipped.npy", oof_lgb_clipped if 'oof_lgb_clipped' in locals() else oof_lgb)
np.save(LGB_DIR/"test_lgb_clipped.npy", test_lgb_clipped if 'test_lgb_clipped' in locals() else preds_lgb_test)

with open(LGB_DIR/"oof_metrics.json", "w") as f:
    json.dump({
        "fold_smapes": [float(s) for s in fold_metrics],
        "oof_smape": float(oof_smape)
    }, f, indent=2)

# quick submission (we will re-blend in next phase)
sub_lgb = test[["sample_id"]].copy()
sub_lgb["price"] = test_lgb_clipped if 'test_lgb_clipped' in locals() else preds_lgb_test
sub_lgb_path = LGB_DIR/"baseline_lgb_submission.csv"
sub_lgb.to_csv(sub_lgb_path, index=False)
print("Saved:", sub_lgb_path)


Saved: /kaggle/working/cache/v2_llm/lightgbm/baseline_lgb_submission.csv


In [72]:
df= pd.read_csv('/kaggle/working/cache/v2_llm/lightgbm/baseline_lgb_submission.csv')

In [73]:
df.head()

Unnamed: 0,sample_id,price
0,100179,11.714991
1,245611,10.738874
2,146263,16.404509
3,95658,4.567592
4,36806,27.749157


In [None]:
df.to_csv('my_dataframe.csv', index=False)