# i. Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re

from typing import Dict, Any, List
from coffee_rec import UserPref, build_profile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# ii. Load Data

In [2]:
df = pd.read_csv("EDA/tokopedia_products_updated.csv")

In [3]:
# Sanity check
print(df.shape)
print(df.columns)
df.head()

(614, 8)
Index(['source', 'name', 'price', 'description', 'origin', 'process',
       'roast_level', 'notes'],
      dtype='object')


Unnamed: 0,source,name,price,description,origin,process,roast_level,notes
0,instinct roastery,candy apple espresso roast instinct roastery ...,195000,50 toraja washed 20 colombia sequoias waterme...,"colombia, toraja, flores","washed, natural",unknown,"fruity, sweet"
1,instinct roastery,ethiopia tima washed instinct roastery 150 gram,185000,region guji process washed variety 74110 al...,ethiopia,washed,light,floral
2,instinct roastery,espresso roast kenya tatu instinct roastery ...,195000,process natural region kiambu county farm t...,kenya,"washed, natural",unknown,sweet
3,instinct roastery,ethiopia sidama baturo instinct roastery 150...,220000,process anaerobic natural variety 74158 regi...,ethiopia,"natural, anaerobic",unknown,"fruity, herbal"
4,instinct roastery,colombia eliecer ordez instinct roastery 150...,175000,process washed variety pink bourbon region ...,colombia,washed,unknown,sweet


In [7]:
# Check data type
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source       614 non-null    object
 1   name         614 non-null    object
 2   price        614 non-null    int64 
 3   description  614 non-null    object
 4   origin       614 non-null    object
 5   process      614 non-null    object
 6   roast_level  614 non-null    object
 7   notes        614 non-null    object
dtypes: int64(1), object(7)
memory usage: 38.5+ KB


# iii. Feature Engineering

## 3.1 Expected Feature-Engineered Output

In this step, we freeze an output contract: the exact set of columns our Feature Engineering must produce so the downstream pipeline (TF-IDF, cosine similarity, reranking) remains stable. This matters because datasets evolve; without a clear schema contract, a small change (missing/renamed columns) can break the notebook or silently degrade recommendation quality.

In [6]:
# Columns we need from the RAW dataset (minimum to run FE safely)
REQUIRED_RAW_COLS = ["source", "name", "price", "description"]

# Columns we promise to produce after FEATURE ENGINEERING (downstream contract)
REQUIRED_FE_COLS = [
    # identity
    "source", "name", "price", "description",
    # normalized text
    "name_clean", "desc_clean", "raw_text",
    # ranking flags
    "is_house_blend", "is_blend", "is_capsule", "is_liquid", "is_merch_tools",
    # structured coffee profile
    "process", "roast", "bean_type", "notes", "notes_str",
    # TF-IDF main input
    "match_text",
    # optional but recommended to keep if available
    "origin",
]

# Create a function to validate columns
def assert_columns_exist(df, required_cols, stage_name=""):
    missing = [c for c in required_cols if c not in df.columns]
    if missing:
        raise ValueError(f"[{stage_name}] Missing required columns: {missing}")
    return True

## 3.2 Text Normalization

In this step, we create clean, consistent text versions of name and description through simple normalization: lowercasing, whitespace cleanup, and safe missing-value handling. The outputs are name_clean, desc_clean, and a combined raw_text. This matters because marketplace text is noisy (random casing, messy spacing, inconsistent formatting). Normalized text makes keyword-based flags (capsule/liquid/tools) more reliable, and it gives us a strong fallback source in case new structured columns (process/notes) are missing or inconsistent.

In [None]:
def normalize_text(s: Any) -> str:
    """
    Normalize text safely for downstream keyword/regex logic.
    """

    if pd.isna(s):
        return ""
    s = str(s).lower()
    s = re.sub(r"\s+", " ", s).strip()
    return s

# Normalize key text fields
df["name_clean"] = df["name"].apply(normalize_text)
df["desc_clean"] = df["description"].apply(normalize_text)

# Build raw_text (foundation for flags + fallback extraction)
df["raw_text"] = (df["name_clean"] + " " + df["desc_clean"]).str.strip()

# Quick check
print(df[["name_clean", "desc_clean", "raw_text"]].head())

                                          name_clean  \
0  candy apple espresso roast instinct roastery 2...   
1    ethiopia tima washed instinct roastery 150 gram   
2  espresso roast kenya tatu instinct roastery 20...   
3  ethiopia sidama baturo instinct roastery 150 gram   
4  colombia eliecer ordez instinct roastery 150 gram   

                                          desc_clean  \
0  50 toraja washed 20 colombia sequoias watermel...   
1  region guji process washed variety 74110 altit...   
2  process natural region kiambu county farm tatu...   
3  process anaerobic natural variety 74158 region...   
4  process washed variety pink bourbon region hui...   

                                            raw_text  
0  candy apple espresso roast instinct roastery 2...  
1  ethiopia tima washed instinct roastery 150 gra...  
2  espresso roast kenya tatu instinct roastery 20...  
3  ethiopia sidama baturo instinct roastery 150 g...  
4  colombia eliecer ordez instinct roastery 150 g..

## 3.3 Create Important Flags

In this step, we create boolean flags (True/False) to label product types such as capsules, ready-to-drink/liquid, brewing tools/merch, blend, and house blend. This matters because our recommender is meant to serve “coffee beans,” but marketplace listings often include look-alike items with similar wording (e.g., “filter” appears in tools, not beans). These flags become business guardrails for the ranking/penalty layer: if users want beans, we should avoid pushing V60, drip bags, capsules, or RTD drinks.

In [9]:
def contains_any(text: str, keywords: List[str]) -> bool:
    """
    Return True if any keyword appears in the given text.
    """

    if not text:
        return False
    return any(k in text for k in keywords)

# Keyword sets
kw_house_blend = ["house blend", "signature blend"]
kw_blend = [" blend", "espresso blend", "arabica robusta blend"]  # note the leading space to reduce false hits
kw_capsule = ["capsule", "capsules", "nespresso", "dolce gusto", "k-cup", "kcup", "pod", "coffee pod"]
kw_liquid = [
    "ml", "m l", "liter", "litre", "milliliter", "ready to drink", "rtd",
    "bottle", "botol", "can", "kaleng", "cold brew", "latte", "milk coffee"
]
kw_merch_tools = [
    "v60", "drip bag", "dripbag", "filter paper", "paper filter", "aeropress",
    "french press", "moka pot", "grinder", "hand grinder", "tamper", "server",
    "kettle", "gooseneck", "scale", "milk jug", "pitcher"
]

# Build flags
df["is_house_blend"] = df["raw_text"].apply(lambda t: contains_any(t, kw_house_blend))
df["is_blend"] = df["raw_text"].apply(lambda t: contains_any(t, kw_blend))
df["is_capsule"] = df["raw_text"].apply(lambda t: contains_any(t, kw_capsule))
df["is_liquid"] = df["raw_text"].apply(lambda t: contains_any(t, kw_liquid))
df["is_merch_tools"] = df["raw_text"].apply(lambda t: contains_any(t, kw_merch_tools))

# Quick check
print(df[["raw_text", "is_house_blend", "is_blend", "is_capsule", "is_liquid", "is_merch_tools"]].head(5))
print("\nFlag counts:")
print(df[["is_house_blend", "is_blend", "is_capsule", "is_liquid", "is_merch_tools"]].sum())

                                            raw_text  is_house_blend  \
0  candy apple espresso roast instinct roastery 2...           False   
1  ethiopia tima washed instinct roastery 150 gra...           False   
2  espresso roast kenya tatu instinct roastery 20...           False   
3  ethiopia sidama baturo instinct roastery 150 g...           False   
4  colombia eliecer ordez instinct roastery 150 g...           False   

   is_blend  is_capsule  is_liquid  is_merch_tools  
0      True       False       True           False  
1     False       False      False           False  
2     False       False      False           False  
3     False       False      False           False  
4     False       False       True           False  

Flag counts:
is_house_blend     10
is_blend           90
is_capsule          8
is_liquid         217
is_merch_tools    402
dtype: int64


## 3.4 Create Coffee Profile Attribute Columns

In this step, we rebuild the coffee profile attributes (process, roast, notes, origin) using a hybrid approach. We trust explicit columns from the updated dataset as the primary source of truth (they’re usually cleaner and more consistent), then fallback to raw_text extraction only when those explicit values are missing or invalid. This matters because it improves recommendation quality (better signals) while keeping the system robust when marketplace data is messy or incomplete. The outputs here feed directly into match_text (TF-IDF) and the ranking/penalty layer.

In [None]:
# Helpers for normalization/mapping
def norm_str(x) -> str:
    """
    Safely normalize any value into a clean lowercase string.
    """

    if pd.isna(x):
        return ""
    return re.sub(r"\s+", " ", str(x).strip().lower())

def normalize_process(val: str) -> str:
    """
    Map process text into our canonical labels: washed/natural/honey/anaerobic (or empty).
    """
    
    s = norm_str(val)
    if not s:
        return ""
    if "wash" in s:
        return "washed"
    if "natural" in s:
        return "natural"
    if "honey" in s:
        return "honey"
    if "anaer" in s:
        return "anaerobic"
    return ""

def map_roast_level(val: str) -> str:
    """
    Map dataset roast_level into: light/medium/dark/espresso (or empty).
    Adjust mapping here if your dataset uses different labels.
    """

    s = norm_str(val)
    if not s:
        return ""

    # Common variants
    if "espresso" in s:
        return "espresso"
    if "dark" in s:
        return "dark"
    if "medium" in s:
        return "medium"
    if "light" in s:
        return "light"

    # Sometimes datasets use "city/full city/french/italian" etc.
    if "french" in s or "italian" in s:
        return "dark"
    if "full city" in s or "city+" in s:
        return "medium"
    if "city" in s:
        return "light"

    return ""

def normalize_notes(val: str) -> str:
    """
    Make notes consistent.
    Output: a single string, e.g. "fruity floral chocolate_nutty"
    """
    s = norm_str(val)
    if not s:
        return ""

    # Split by commas/slashes/pipes
    parts = re.split(r"[,\|/;]+", s)
    parts = [p.strip() for p in parts if p.strip()]

    # Lightweight synonym mapping
    mapped = []
    for p in parts:
        if "chocolate" in p or "cocoa" in p or "nut" in p:
            mapped.append("chocolate_nutty")
        elif "floral" in p or "jasmine" in p or "rose" in p:
            mapped.append("floral")
        elif "citrus" in p or "berry" in p or "apple" in p or "grape" in p or "tropical" in p or "fruity" in p:
            mapped.append("fruity")
        elif "spice" in p or "cinnamon" in p or "clove" in p:
            mapped.append("spicy")
        elif "sweet" in p or "honey" in p or "vanilla" in p or "brown sugar" in p:
            mapped.append("sweet")
        else:
            # Keep raw token
            mapped.append(p.replace(" ", "_"))

    # Dedup while preserving order
    mapped = list(dict.fromkeys(mapped))
    return " ".join(mapped)

def normalize_origin(val: str) -> str:
    """Keep origin as-is but normalized lightly (string, lowercase, trimmed)."""
    return norm_str(val)


# Fallback extractors from raw_text (only used when explicit is missing/invalid)

def extract_one(text: str, patterns: dict) -> str:
    """Return label if any regex pattern matches the text."""
    for label, pat in patterns.items():
        if re.search(pat, text):
            return label
    return ""

PROCESS_PATTERNS = {
    "washed": r"\bwashed\b",
    "natural": r"\bnatural\b",
    "honey": r"\bhoney\b",
    "anaerobic": r"\banaerobic\b",
}

ROAST_PATTERNS = {
    "espresso": r"\bespresso\b|\bespresso roast\b",
    "dark": r"\bdark\b|\bdark roast\b|\bfrench\b|\bitalian\b",
    "medium": r"\bmedium\b|\bmedium roast\b|\bfull city\b|\bcity\+\b",
    "light": r"\blight\b|\blight roast\b|\bcity\b",
}

NOTES_DICT = {
    "fruity": ["fruity", "berry", "citrus", "apple", "grape", "tropical", "mango", "pineapple"],
    "floral": ["floral", "jasmine", "rose"],
    "chocolate_nutty": ["chocolate", "cocoa", "nutty", "almond", "hazelnut", "caramel"],
    "spicy": ["spice", "cinnamon", "clove"],
    "sweet": ["sweet", "honey", "brown sugar", "vanilla"],
}

def extract_notes_from_raw(text: str) -> str:
    """Return normalized notes string using keyword matching."""
    found = []
    for label, kws in NOTES_DICT.items():
        if any(kw in text for kw in kws):
            found.append(label)
    found = list(dict.fromkeys(found))
    return " ".join(found)


# Apply HYBRID logic
# - Use explicit columns first
# - Fallback to raw_text if missing/invalid

# Process (explicit -> normalize -> fallback)
df["process_clean"] = df["process"].apply(normalize_process) if "process" in df.columns else ""
missing_proc = df["process_clean"].eq("")
df.loc[missing_proc, "process_clean"] = df.loc[missing_proc, "raw_text"].apply(
    lambda t: extract_one(t, PROCESS_PATTERNS)
)

# Roast (explicit roast_level -> map -> fallback)
df["roast_clean"] = df["roast_level"].apply(map_roast_level) if "roast_level" in df.columns else ""
missing_roast = df["roast_clean"].eq("")
df.loc[missing_roast, "roast_clean"] = df.loc[missing_roast, "raw_text"].apply(
    lambda t: extract_one(t, ROAST_PATTERNS)
)

# Notes (explicit -> normalize -> fallback)
df["notes_clean"] = df["notes"].apply(normalize_notes) if "notes" in df.columns else ""
missing_notes = df["notes_clean"].eq("")
df.loc[missing_notes, "notes_clean"] = df.loc[missing_notes, "raw_text"].apply(extract_notes_from_raw)

# Origin (keep if exists)
df["origin_clean"] = df["origin"].apply(normalize_origin) if "origin" in df.columns else ""

# Keep a consistent downstream schema:
# - process, roast, notes_str, origin (clean versions)
df["process"] = df["process_clean"]
df["roast"] = df["roast_clean"]
df["notes_str"] = df["notes_clean"]
df["origin"] = df["origin_clean"]

# Quick checks
print("Null/empty checks (count of empty strings):")
print({
    "process_empty": int((df["process"] == "").sum()),
    "roast_empty": int((df["roast"] == "").sum()),
    "notes_empty": int((df["notes_str"] == "").sum()),
    "origin_empty": int((df["origin"] == "").sum()) if "origin" in df.columns else None,
})

df[["raw_text", "process", "roast", "notes_str", "origin"]].head()

Null/empty checks (count of empty strings):
{'process_empty': 168, 'roast_empty': 95, 'notes_empty': 0, 'origin_empty': 0}


Unnamed: 0,raw_text,process,roast,notes_str,origin
0,candy apple espresso roast instinct roastery 2...,washed,espresso,fruity sweet,"colombia, toraja, flores"
1,ethiopia tima washed instinct roastery 150 gra...,washed,light,floral,ethiopia
2,espresso roast kenya tatu instinct roastery 20...,washed,espresso,sweet,kenya
3,ethiopia sidama baturo instinct roastery 150 g...,natural,,fruity herbal,ethiopia
4,colombia eliecer ordez instinct roastery 150 g...,washed,,sweet,colombia


## 3.5 Clean and Standarize `notes` Column

Here we clean and standardize the notes column into a consistent notes_str, ready for TF-IDF tokens like notes_fruity, notes_floral, etc. In real datasets, notes can come as a list, a long free-text string, or already bucketed labels. If we keep it inconsistent, TF-IDF becomes noisy and rankings fluctuate. A clean notes_str makes the similarity layer more stable and the recommendations easier to reason about.

In [None]:
# Notes buckets
NOTES_BUCKETS = {
    "fruity": ["fruity", "berry", "citrus", "apple", "grape", "tropical", "mango", "pineapple"],
    "floral": ["floral", "jasmine", "rose"],
    "chocolate_nutty": ["chocolate", "cocoa", "nutty", "almond", "hazelnut", "caramel"],
    "spicy": ["spice", "cinnamon", "clove"],
    "sweet": ["sweet", "honey", "brown sugar", "vanilla"],
    "herbal": ["herbal", "tea", "green tea", "mint", "lemongrass"],
    "woody": ["woody", "wood", "cedar", "oak"]
}

CANONICAL = set(NOTES_BUCKETS.keys())

def norm_str(x) -> str:
    if pd.isna(x):
        return ""
    return re.sub(r"\s+", " ", str(x).strip().lower())

def split_notes_string(s: str) -> list[str]:
    """
    Split a notes string that may use commas, slashes, pipes, semicolons.
    Example: 'fruity/floral/chocolate_nutty' -> ['fruity','floral','chocolate_nutty']
    """

    s = norm_str(s)
    if not s:
        return []
    parts = re.split(r"[,\|/;]+", s)
    parts = [p.strip() for p in parts if p.strip()]
    return parts

def map_to_bucket(token: str) -> str:
    """
    Map any token into our canonical buckets if possible.
    - If token already one of canonical buckets => keep it
    - Else find bucket by keyword matching
    """

    t = norm_str(token).replace(" ", "_")
    if t in CANONICAL:
        return t

    t_space = norm_str(token)
    # boundary match (kata utuh)
    for bucket, kws in NOTES_BUCKETS.items():
        for kw in kws:
            if re.search(rf"\b{re.escape(kw)}\b", t_space):
                return bucket
    # Drop unknown token to reduce noise
    return ""

def normalize_notes_to_str(val) -> str:
    """
    Accepts:
    - list: ['citrus','jasmine']
    - string: 'citrus, jasmine'
    - bucketed string: 'fruity/floral/chocolate_nutty'
    Output:
    - 'fruity floral' (space-separated buckets, deduped)
    """
    if val is None or (isinstance(val, float) and pd.isna(val)):
        return ""

    # If list-like
    if isinstance(val, list):
        tokens = [norm_str(x) for x in val if x is not None and str(x).strip()]
    else:
        # If it’s a string representation of a list: "['citrus', 'jasmine']"
        s = norm_str(val)
        if s.startswith("[") and s.endswith("]"):
            # very light parse: remove brackets and quotes
            s = s.strip("[]")
            s = s.replace('"', "").replace("'", "")
            tokens = split_notes_string(s)
        else:
            tokens = split_notes_string(s)

    # Map to canonical buckets
    buckets = []
    for tok in tokens:
        b = map_to_bucket(tok)
        if b:
            buckets.append(b)

    # Deduplicate while preserving order
    buckets = list(dict.fromkeys(buckets))

    return " ".join(buckets)

# Apply
df["notes_str"] = df["notes"].apply(normalize_notes_to_str)

# Quick checks
print("notes_str empty count:", int((df["notes_str"] == "").sum()))
df[["notes", "notes_str"]].head(15)

notes_str empty count: 59


Unnamed: 0,notes,notes_str
0,"fruity, sweet",fruity sweet
1,floral,floral
2,sweet,sweet
3,"fruity, herbal",fruity herbal
4,sweet,sweet
5,"chocolate, sweet",chocolate_nutty sweet
6,"caramel, floral, sweet",chocolate_nutty floral sweet
7,"fruity, sweet",fruity sweet
8,"fruity, sweet",fruity sweet
9,"chocolate, fruity",chocolate_nutty fruity


## 3.6 Create `bean_type` Column

We explicitly define bean_type as a structured feature because it represents one of the most fundamental decision factors in coffee selection. For many users, the difference between arabica and robusta is not just technical—it directly affects flavor expectation, caffeine level, and overall drinking experience. By extracting bean_type into a dedicated column, we give the recommender a clear, explainable signal that can later be used for ranking boosts, penalties, or user preference matching, instead of relying purely on noisy free-text similarity.

From a system design perspective, this also keeps the recommendation logic modular. Even if bean_type is not heavily weighted today, having it as a first-class feature allows us to evolve the ranking logic (for example, “prefer arabica for filter drinkers”) without refactoring the entire pipeline.

In [20]:
def infer_bean_type(text: str):
    if pd.isna(text):
        return None
    s = str(text).lower()
    has_arabica = "arabica" in s
    has_robusta = "robusta" in s
    if has_arabica and has_robusta:
        return "arabica_robusta_blend"
    if has_arabica:
        return "arabica"
    if has_robusta:
        return "robusta"
    return None

if "bean_type" not in df.columns:
    df["bean_type"] = df["raw_text"].apply(infer_bean_type)

In [28]:
# Quick sanity check
print(df["bean_type"].unique())
print("\n")
print(df["bean_type"].value_counts())
print("\n")
print(df["bean_type"])

[None 'arabica' 'robusta']


bean_type
arabica    173
robusta     95
Name: count, dtype: int64


0         None
1         None
2         None
3         None
4         None
        ...   
609    robusta
610    arabica
611    robusta
612       None
613       None
Name: bean_type, Length: 614, dtype: object


The large number of None values is expected and normal for marketplace data. In real product listings, sellers often do not explicitly state the bean type, especially when it is assumed (e.g., many specialty coffees default to arabica and don’t bother mentioning it). Some listings focus more on origin, roast level, or flavor notes, leaving bean composition implicit.

This means None here does not indicate a data quality failure. It simply reflects that the information is not observable from text, and forcing a guess would introduce noise and false confidence. From an industry standpoint, it is better to leave a feature unknown than to hallucinate a value that could mislead downstream logic.

**None is completely safe and intentional in this setup.**

In our pipeline, bean_type is used as an optional structured signal, not a hard requirement. When it is present, it can contribute a boost during reranking (e.g., matching user preference). When it is missing, the system simply falls back to other strong signals such as TF-IDF similarity, process, roast level, and flavor notes.

Crucially, bean_type is not a mandatory input for TF-IDF vectorization, so missing values do not break modeling, degrade embeddings, or skew similarity scores. Instead, this design preserves robustness: the recommender remains stable even when structured attributes are incomplete—which is exactly what we want in a real production environment.

## 3.7 Create `match_text` Column

In this step we build a single final text field called match_text. It combines:
- marketplace text: name_clean + desc_clean
- tructured tokens: process_{...}, roast_{...}, bean_{...}, notes_{...}
- optional guardrail tags: tag_house_blend, tag_blend
- origin_{...} if origin is clean

This makes TF-IDF matching more consistent because both user queries and products speak the same “token language.”

In [29]:
def clean_token(x) -> str:
    """
    Make a safe token chunk:
    - handle NaN/None
    - lowercase
    - replace spaces with underscore
    - remove weird punctuation (keep letters, numbers, underscore)
    """

    if pd.isna(x):
        return ""
    s = str(x).strip().lower()
    s = re.sub(r"\s+", "_", s)
    s = re.sub(r"[^a-z0-9_]+", "", s)
    return s

def build_match_text(row) -> str:
    parts = []

    # Base text
    base_name = row.get("name_clean", "")
    base_desc = row.get("desc_clean", "")
    parts.append(base_name)
    parts.append(base_desc)

    # Structured tokens (only if exist)
    proc = clean_token(row.get("process"))
    if proc:
        parts.append(f"process_{proc}")

    roast = clean_token(row.get("roast"))
    if roast:
        parts.append(f"roast_{roast}")

    bean = clean_token(row.get("bean_type"))
    if bean:
        parts.append(f"bean_{bean}")

    notes_str = clean_token(row.get("notes_str"))
    # Give prefix notes_ to make it token-based
    if notes_str:
        parts.append(f"notes_{notes_str}")

    # Tag flags (guardrails/extra signals)
    if bool(row.get("is_house_blend", False)):
        parts.append("tag_house_blend")
    if bool(row.get("is_blend", False)):
        parts.append("tag_blend")

    # Origin token
    origin = clean_token(row.get("origin"))
    if origin:
        parts.append(f"origin_{origin}")

    # Join and strip
    return " ".join([p for p in parts if p]).strip()

df["match_text"] = df.apply(build_match_text, axis=1)

# Quick check
df[["name_clean", "desc_clean", "process", "roast", "bean_type", "notes_str", "origin", "match_text"]].head(5)

Unnamed: 0,name_clean,desc_clean,process,roast,bean_type,notes_str,origin,match_text
0,candy apple espresso roast instinct roastery 2...,50 toraja washed 20 colombia sequoias watermel...,washed,espresso,,fruity sweet,"colombia, toraja, flores",candy apple espresso roast instinct roastery 2...
1,ethiopia tima washed instinct roastery 150 gram,region guji process washed variety 74110 altit...,washed,light,,floral,ethiopia,ethiopia tima washed instinct roastery 150 gra...
2,espresso roast kenya tatu instinct roastery 20...,process natural region kiambu county farm tatu...,washed,espresso,,sweet,kenya,espresso roast kenya tatu instinct roastery 20...
3,ethiopia sidama baturo instinct roastery 150 gram,process anaerobic natural variety 74158 region...,natural,,,fruity herbal,ethiopia,ethiopia sidama baturo instinct roastery 150 g...
4,colombia eliecer ordez instinct roastery 150 gram,process washed variety pink bourbon region hui...,washed,,,sweet,colombia,colombia eliecer ordez instinct roastery 150 g...


## 3.8 Quality Check before Modeling

In this QC step, we verify that the Feature Engineering output from the updated dataset is still consistent, valid, and safe for TF-IDF similarity and the reranking layer. Since the data changed (more rows, new patterns, different seller wording), QC acts as the last guardrail to catch silent issues—like empty match_text, unexpected values in process/roast, or overly-triggered flags (capsule/tools) due to noisy keywords. In practice, this is what keeps a recommender reliable when the dataset grows and evolves.

In [32]:
# Shape check
print("QC 1: Shape check\n")
print("Rows, Cols:", df.shape)

# Duplicates check (based on key identity columns)
key_cols = [c for c in ["source", "name", "price"] if c in df.columns]
if len(key_cols) >= 2:
    dup_count = df.duplicated(subset=key_cols).sum()
    print(f"Duplicate rows (subset={key_cols}):", int(dup_count))
else:
    print("Skip duplicate check (not enough key columns).")

QC 1: Shape check

Rows, Cols: (614, 24)
Duplicate rows (subset=['source', 'name', 'price']): 1


In [36]:
# Missing rate
print("QC 2: Missing rate")
qc_cols = [c for c in ["process", "roast", "notes", "notes_str", "bean_type"] if c in df.columns]
missing_report = {}

for c in qc_cols:
    # Treat "", None, NaN as missing
    miss = df[c].isna().mean()
    empty = (df[c].astype(str).str.strip() == "").mean()
    missing_report[c] = {
        "missing_na_rate": miss,
        "empty_string_rate": empty,
        "total_effective_missing": max(miss, empty)
    }

missing_df = pd.DataFrame(missing_report).T.sort_values("total_effective_missing", ascending=False)
display(missing_df)

# how many "unknown" after hybrid/fallback?
if "notes" in df.columns:
    unknown_count = (df["notes"].astype(str).str.lower().str.strip() == "unknown").sum()
    print("Notes == 'unknown' count:", int(unknown_count))

QC 2: Missing rate


Unnamed: 0,missing_na_rate,empty_string_rate,total_effective_missing
bean_type,0.563518,0.0,0.563518
process,0.0,0.273616,0.273616
roast,0.0,0.154723,0.154723
notes_str,0.0,0.096091,0.096091
notes,0.0,0.0,0.0


Notes == 'unknown' count: 59


In [40]:
# Value sanity
print("QC 3: Value sanity\n")

allowed_process = {None, "washed", "natural", "honey", "anaerobic"}
allowed_roast   = {None, "light", "medium", "dark", "espresso"}
allowed_bean    = {None, "arabica", "robusta", "arabica_robusta_blend"}

def unexpected_values(series: pd.Series, allowed: set):
    vals = series.dropna().astype(str).str.strip().unique().tolist()
    bad = [v for v in vals if v not in {x for x in allowed if x is not None}]
    return bad

if "process" in df.columns:
    bad_proc = unexpected_values(df["process"], allowed_process)
    print("Unexpected process values      :", bad_proc[:20])

if "roast" in df.columns:
    bad_roast = unexpected_values(df["roast"], allowed_roast)
    print("Unexpected roast values        :", bad_roast[:20])

if "bean_type" in df.columns:
    bad_bean = unexpected_values(df["bean_type"], allowed_bean)
    print("Unexpected bean_type values    :", bad_bean[:20])

QC 3: Value sanity

Unexpected process values      : ['']
Unexpected roast values        : ['']
Unexpected bean_type values    : []


In [41]:
# Non-empty match_text
print("QC 4: match_text non-empty\n")

if "match_text" in df.columns:
    match_empty = (df["match_text"].isna() | (df["match_text"].astype(str).str.strip() == "")).sum()
    print("match_text empty count:", int(match_empty))

    print("\nSample 5 rows (to eyeball tokens):")
    sample_cols = [c for c in ["name", "match_text", "process", "roast", "bean_type", "notes_str"] if c in df.columns]
    display(df[sample_cols].sample(5, random_state=42))
else:
    print("ERROR: match_text column not found. TF-IDF step will break.")


QC 4: match_text non-empty

match_text empty count: 0

Sample 5 rows (to eyeball tokens):


Unnamed: 0,name,match_text,process,roast,bean_type,notes_str
350,kopi arabika bali kintamani natural arabica co...,kopi arabika bali kintamani natural arabica co...,natural,medium,arabica,fruity sweet
377,kopi arabika lintong sumatra arabica coffee be...,kopi arabika lintong sumatra arabica coffee be...,washed,dark,arabica,chocolate_nutty fruity sweet
163,kopi arabika kamojang green apple 1 kg bijibub...,kopi arabika kamojang green apple 1 kg bijibub...,washed,medium,,fruity herbal
609,arutala kopi robusta jawa java coffee indonesi...,arutala kopi robusta jawa java coffee indonesi...,,dark,robusta,chocolate_nutty sweet
132,sulawesi tana luwu 200gr single origin special...,sulawesi tana luwu 200gr single origin special...,honey,medium,,fruity


In [43]:
# Flag distribution
print("QC 5: Flag distribution\n")
flag_cols = [c for c in ["is_house_blend", "is_blend", "is_capsule", "is_liquid", "is_merch_tools"] if c in df.columns]

if flag_cols:
    dist = {}
    for c in flag_cols:
        dist[c] = {
            "true_count": int((df[c] == True).sum()),
            "true_rate": float((df[c] == True).mean())
        }
    dist_df = pd.DataFrame(dist).T.sort_values("true_rate", ascending=False)
    display(dist_df)
else:
    print("No flag columns found. (Skip)")

QC 5: Flag distribution



Unnamed: 0,true_count,true_rate
is_merch_tools,402.0,0.654723
is_liquid,217.0,0.35342
is_blend,90.0,0.14658
is_house_blend,10.0,0.016287
is_capsule,8.0,0.013029


---
**Final Check Conclusion**

---

Overall, this updated FE output looks production-safe for the next steps (TF-IDF retrieval + reranking). The core “output contract” is intact: the dataset shape is reasonable, match_text is never empty, and your categorical fields are not drifting outside the allowed label set.

Three takeaways matter most:

1. The dataset has 614 rows and 24 cols and we only have 1 duplicate when checking source + name + price. That’s not a serious issue because marketplace data often contains repeated listings, and we have dropped this duplicated data.

2. Missing/empty values exist, but they’re not blockers—they’re mostly a data reality:
    - process and roast are not NaN, but they have empty strings (~27% and ~15%). That suggests the “explicit” columns aren’t always filled. Our pipeline is still fine because it can fall back to raw text signals and still build a usable match_text.
    - bean_type has a high missing rate (~56%). That’s expected: many product listings simply don’t state arabica/robusta. This won’t break TF-IDF—missing just means we don’t inject the bean_{...} token for those rows.
    - notes == "unknown" appears 59 times. This is also reasonable: “unknown” is better than forcing a wrong guess. The key is to treat “unknown” consistently (typically: don’t convert it into notes tokens).

3. Flag distribution gives business insight and one soft warning:
    - is_merch_tools firing on 65% of rows is unusually high. This likely means our tools/merch keyword list is too aggressive (e.g., the word “filter” shows up in “filter coffee beans” but gets interpreted as a brewing tool). This won’t crash anything, but it can hurt relevance if our hard filters or penalties start eliminating too many valid beans.
    - is_liquid at 35% is plausible (RTD is common).
    - is_capsule and is_house_blend are small (1–2%), which looks realistic.

Final takeaway: the FE pipeline is technically solid and ready to proceed.

## 3.9 Export FE Dataset

In [48]:
df.to_csv("engineered_dataset.csv", index=False)

# iv. Modeling

---
**Why we use TF-IDF fot Product Matching**

In this project, the core task is text-to-text matching: we take a user’s preference or query and retrieve products that are most relevant, not predict a numeric value or a class label. This is fundamentally an information retrieval / ranking problem.

TF-IDF is a strong fit because it is:

- A reliable retrieval baseline: For product titles, short descriptions, categories, and tags, TF-IDF often delivers surprisingly solid relevance.

- Interpretable: We can explain why an item is retrieved (high-weight terms overlap), which is valuable for business stakeholders.

- Efficient and deployment-friendly: Fast to train and fast to serve, no GPU dependency, stable in production.

- Label-free: It does not require query–product relevance labels, which we often do not have in early-stage systems.

In short, TF-IDF aligns with the nature of the problem: retrieve and rank, not “predict a label”.

---
**Why we do not use other models**

Supervised models such as Logistic Regression, Random Forest, or XGBoost are designed for prediction tasks with clear labels, whereas this project focuses on retrieving and ranking products based on textual relevance rather than predicting a class or value. Without reliable ground truth indicating which products should match a given query, applying such models would introduce artificial assumptions and unnecessary complexity, potentially producing results that look sophisticated but do not actually solve the business problem. A text-based retrieval approach like TF-IDF aligns more naturally with the problem structure and remains easier to interpret and validate from a business perspective.

---
**Why embeddings are not required at this stage**

While embeddings can capture deeper semantic relationships, their added complexity is not yet justified for the current scope of this project. The vocabulary used in user preferences and product descriptions still overlaps sufficiently, allowing TF-IDF to capture relevance effectively without additional infrastructure, higher computational costs, or reliance on external models. By starting with TF-IDF as a strong baseline, we maintain a lightweight and explainable solution, while keeping the option open to introduce embeddings later if semantic gaps become a recurring limitation.



## 4.1 Query Construction

We make a function to converts the structured user preferences (decision rule output in key–value form) into a standardized query string whose tokens are aligned with the product match_text tokens. The resulting query is then used as TF-IDF input to compute similarity scores and rank the most relevant products.

It strongly matters because TF-IDF operates on text, not dictionaries/JSON, so we need a consistent “shared language” between user preferences and the product catalog (e.g., roast_dark, bean_arabica, notes_fruity). A standardized query improves retrieval stability, makes debugging easier, and supports production-friendly behavior.

In [49]:
def build_query_text(user_profile: Dict[str, Any]) -> str:
    """
    Expected keys:
    - roast_level: "light"|"medium"|"dark"|None
    - process_preference: "washed"|"natural"|"honey"|"anaerobic"|"any"|None
    - bean_pref: "arabica"|"robusta"|"any"|None
    - flavor_direction: e.g. "fruity", "floral", "chocolate_nutty", "spicy", "sweet" (or list)
    - acidity_level: optional; we can map to note proxies if you want
    """

    tokens: List[str] = []

    roast = user_profile.get("roast_level")
    if roast and roast != "any":
        tokens.append(f"roast_{roast}")

    proc = user_profile.get("process_preference")
    if proc and proc != "any":
        tokens.append(f"process_{proc}")

    bean = user_profile.get("bean_pref")
    if bean and bean != "any":
        tokens.append(f"bean_{bean}")

    # Flavor_direction could be str or list[str]
    flavor = user_profile.get("flavor_direction")
    if isinstance(flavor, str) and flavor:
        tokens.append(f"notes_{flavor}")
    elif isinstance(flavor, list):
        for f in flavor:
            if f:
                tokens.append(f"notes_{f}")

    # Map acidity preference to note proxies
    # This is a lightweight heuristic, not mandatory.
    acidity = user_profile.get("acidity_level")
    if acidity == "high":
        tokens.append("notes_fruity")
        tokens.append("notes_floral")
    elif acidity == "low":
        tokens.append("notes_chocolate_nutty")

    # Join tokens into a compact query string
    query = " ".join(tokens).strip()
    return query

In [50]:
# Quick check

# Create a sample user input
sample_profile = {
    "roast_level": "light",
    "process_preference": "washed",
    "bean_pref": "arabica",
    "flavor_direction": ["fruity", "floral"],
    "acidity_level": "high",
}

# Run query function
query = build_query_text(sample_profile)
print(query)

roast_light process_washed bean_arabica notes_fruity notes_floral notes_fruity notes_floral


In [51]:
# Deduplicate tokens
def build_query_text_dedup(user_profile: Dict[str, Any]) -> str:
    """
    This function removes duplicate tokens from the constructed query 
    text while preserving their original order. It ensures that each 
    user preference contributes only once to the TF-IDF similarity \
    calculation.
    """

    q = build_query_text(user_profile)
    tokens = q.split()
    tokens = list(dict.fromkeys(tokens))  # preserve order, remove duplicates
    return " ".join(tokens)

## 4.2 TF-IDF and Cosine Similarity

### Load Feature Engineered Dataset

We load the feature-engineered dataset (engineered_dataset.csv) and enforce a minimal column contract required for TF-IDF-based product matching. The match_text column is then standardized to remove missing values and ensure string type consistency so it is safe for vectorization.

It's important because retrieval pipelines rely on a clear input data contract. Validating required columns early enables a fail-fast workflow and prevents hard-to-trace downstream errors (e.g., during TF-IDF fit/transform). Standardizing match_text is also critical because TF-IDF expects clean text input with no NaN values.

In [53]:
# Load featured engineered dataset
df_fe = pd.read_csv("engineered_dataset.csv")

In [54]:
# Define the required columns
required_cols = ["source", "name", "price", "description", "match_text"]
missing = [c for c in required_cols if c not in df_fe.columns]
if missing:
    raise ValueError(f"Missing columns in FE dataset: {missing}")

df_fe["match_text"] = df_fe["match_text"].fillna("").astype(str)

# Quick sanity check
print(df_fe.shape)
df_fe[["name", "match_text"]].head()

(614, 24)


Unnamed: 0,name,match_text
0,candy apple espresso roast instinct roastery ...,candy apple espresso roast instinct roastery 2...
1,ethiopia tima washed instinct roastery 150 gram,ethiopia tima washed instinct roastery 150 gra...
2,espresso roast kenya tatu instinct roastery ...,espresso roast kenya tatu instinct roastery 20...
3,ethiopia sidama baturo instinct roastery 150...,ethiopia sidama baturo instinct roastery 150 g...
4,colombia eliecer ordez instinct roastery 150...,colombia eliecer ordez instinct roastery 150 g...


### Fit TF-IDF on match_text

This step converts the product match_text corpus into numeric TF-IDF vectors. The resulting TF-IDF matrix serves as the retrieval “index” used to compare user queries against products via cosine similarity.

In [56]:
# Defune the vectorizer
vectorizer = TfidfVectorizer(
    lowercase=True,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
    token_pattern=r"(?u)\b\w[\w_]+\b"
)

# Define the TF-IDF matrix by fit_transform the vectorizer to match_text
X_tfidf = vectorizer.fit_transform(df_fe["match_text"])

# Sanity check
print("TF-IDF matrix shape:", X_tfidf.shape)

TF-IDF matrix shape: (614, 8910)


### Build Query Text from User Profile

This cell runs a lightweight end-to-end check from user preferences to decision rules (build_profile) to profile dictionary output to query construction (build_query_text_dedup). The final output is a token-based query string aligned with product match_text, ready for TF-IDF and cosine similarity ranking.

This acts as a small integration test to confirm there is no hidden format mismatch between components (user input, rule outputs, and query builder). Printing both user_profile and query_text makes it easy to validate that user intent is correctly translated into retrieval tokens that drive product ranking.

In [57]:
# Example of user input
user = UserPref(
    stomach_sensitivity="high",
    caffeine_sensitivity="medium",
    time_of_day="morning",
    purpose="focus",
    flavor_direction=["fruity", "floral"],  # bisa string atau list
    brew_method="filter",
)

profile_out = build_profile(user)

# Take dictionary profile that will be used as a query
user_profile = profile_out["profile"] if "profile" in profile_out else profile_out

query_text = build_query_text_dedup(user_profile)

print("User profile:", user_profile)
print("Query text:", query_text)

User profile: {'acidity_level': 'low', 'caffeine_tendency': 'medium', 'roast_level': 'medium', 'flavor_direction': 'fruity', 'process_preference': 'washed', 'brew_suitability': 'filter', 'bean_preference': 'arabica'}
Query text: roast_medium process_washed notes_fruity notes_chocolate_nutty


### Transform Query and Cosine Similarity

This process converts the user query into a TF-IDF vector, compute cosine similarity against all product vectors, and retrieve the Top-K most relevant products along with their similarity scores, because cosine similarity provides a numeric measure of contextual alignment between user preferences and product descriptions. This enables consistent, explainable ranking instead of arbitrary selection.

In [58]:
q_vec = vectorizer.transform([query_text])

# Cosine similarity: (1 x N)
similarity = cosine_similarity(q_vec, X_tfidf).ravel()

# Take top-K
top_n = 10
top_idx = np.argsort(similarity)[::-1][:top_n]

results = df_fe.loc[top_idx, ["source", "name", "price", "description"]].copy()
results["similarity"] = similarity[top_idx]

results

Unnamed: 0,source,name,price,description,similarity
187,tatido coffee roasters,costa rica mirazu dcafetal espresso roast sin...,150000,costa rica tarrazu mirazu dcafetal black honey...,0.097683
186,tatido coffee roasters,indonesia gulang 2817 natural anaerobic 150g,150000,indonesia flores manggarai lot gulang 2817 nat...,0.086921
299,collins roasters,bali karana madu honey anaerobic collins roas...,114000,bali karana madu honey anaerobic collins roas...,0.068407
172,fugol coffee roasters,kopi arabika watermelon smash 100 gram natural...,81000,halal id32110016691860324 pirt308327301034727 ...,0.064859
275,swargi roasters,mid grade espresso malt crema 100 arabica 200 gr,65000,malt crema beans yang diroasting khusus untuk ...,0.064742
301,collins roasters,ethiopia yirgacheffe arabica specialty coffee...,301500,ethiopia yirgacheffe specialty coffee collin...,0.063927
236,a roastworks,black gold espresso blend a roastworks biji...,245000,black gold espresso blend brazil alto caparao...,0.062753
288,collins roasters,yunnan mengzhu honey asd filter specialty cof...,130000,yunnan mengzhu honey asd filter specialty cof...,0.056469
586,arutala coffee,arutala kopi vietnam central highland robusta ...,260900,kenapa arutala coffee dengan memiliki roastery...,0.053666
602,arutala coffee,arutala kopi colombia decaf arabika 200 gram ...,139900,kenapa arutala coffee dengan memiliki roastery...,0.053493


The similarity column represents relative relevance scores (range 0–1). Higher values indicate stronger textual alignment with the query. These scores are not probabilities; they are meaningful only for ranking within the same query. Top-ranked products share important, high-weight tokens with the query (e.g., washed, fruity), making them more contextually suitable for the user.

In [60]:
# Sanity check

# Check small similarity distribution
print("Similarity stats:")
print(pd.Series(similarity).describe())

Similarity stats:
count    614.000000
mean       0.014676
std        0.015750
min        0.000000
25%        0.000000
50%        0.008372
75%        0.022902
max        0.097683
dtype: float64


In [61]:
# Sanity check

# Check top 3 match and the match_text
debug_cols = ["name", "match_text"]
df_fe.loc[top_idx[:3], debug_cols]

Unnamed: 0,name,match_text
187,costa rica mirazu dcafetal espresso roast sin...,costa rica mirazu dcafetal espresso roast sing...
186,indonesia gulang 2817 natural anaerobic 150g,indonesia gulang 2817 natural anaerobic 150g i...
299,bali karana madu honey anaerobic collins roas...,bali karana madu honey anaerobic collins roast...


## 4.3 Ranking and Penalty Layer

In the ranking and penalty layer, the goal is no longer to find products that are merely textually similar, but to refine the recommendation order so it makes sense in real usage. TF-IDF and cosine similarity provide a strong relevance baseline based on product descriptions, but in real business scenarios, textual similarity alone is rarely sufficient. This layer allows us to adjust the ranking using practical considerations that are not always captured in text, such as user constraints, consumption context, or suitability rules (e.g., sensitivity, time of use, or brewing method). The penalty mechanism does not discard relevant products outright, but softly deprioritizes options that are less appropriate, ensuring the final recommendations are not just linguistically relevant, but context-aware and user-aligned.

In [62]:
# Define columns of ranking and penalty
rank_cols = [
    "is_capsule", "is_liquid", "is_merch_tools", "is_house_blend", "is_blend",
    "process", "roast", "bean_type", "notes"
]

# Sanity check
[c for c in rank_cols if c not in df_fe.columns]

[]

In [63]:
# Create scoring config
ranking_config = {
    # Hard filters (if True, throw away)
    "hard_filter": {
        "exclude_merch_tools": True,
        "exclude_liquid": True,
    },

    # Penalties
    "penalty": {
        "capsule": 0.08,
        "house_blend": 0.03,
        "blend": 0.015,
    },
    
    # Boosts
    "boost": {
        "process_match": 0.015,
        "roast_match": 0.010,
        "bean_match": 0.010,
        "notes_match": 0.008,
    },
}

In [64]:
# Define rerank function
def rerank_candidates(
    candidate_df: pd.DataFrame,
    user_profile: Dict[str, Any],
    config: Dict[str, Any],
    ) -> pd.DataFrame:
    """
    This function performs post-ranking on TF-IDF candidates. It starts 
    from the base similarity score and adjusts it using business rules: 
    hard filters (remove items), penalties (softly demote), and boosts 
    (promote items that match user preferences). The final output is 
    sorted by final_score
    """

    df = candidate_df.copy()

    # Hard filter
    if config["hard_filter"].get("exclude_merch_tools", False) and "is_merch_tools" in df.columns:
        df = df[df["is_merch_tools"] == False]
    if config["hard_filter"].get("exclude_liquid", False) and "is_liquid" in df.columns:
        df = df[df["is_liquid"] == False]

    # If the filter output is none, fallback: skip filtering step
    if df.empty:
        df = candidate_df.copy()

    # Start from similarity
    df["final_score"] = df["similarity"].astype(float)

    # Apply penalties
    pen = config["penalty"]

    if "is_capsule" in df.columns:
        df.loc[df["is_capsule"] == True, "final_score"] -= pen.get("capsule", 0.0)

    if "is_house_blend" in df.columns:
        df.loc[df["is_house_blend"] == True, "final_score"] -= pen.get("house_blend", 0.0)

    if "is_blend" in df.columns:
        df.loc[df["is_blend"] == True, "final_score"] -= pen.get("blend", 0.0)

    # Apply boosts (match with user_profile)
    boo = config["boost"]

    # Process match
    proc = user_profile.get("process_preference")
    if proc and proc != "any" and "process" in df.columns:
        df.loc[df["process"] == proc, "final_score"] += boo.get("process_match", 0.0)

    # Roast match
    roast = user_profile.get("roast_level")
    if roast and roast != "any" and "roast" in df.columns:
        df.loc[df["roast"] == roast, "final_score"] += boo.get("roast_match", 0.0)

    # Bean match
    bean = user_profile.get("bean_pref") or user_profile.get("bean_preference")
    if bean and bean != "any" and "bean_type" in df.columns:
        # Dataset bean_type can be "arabica" / "robusta" / "arabica_robusta_blend"
        df.loc[df["bean_type"] == bean, "final_score"] += boo.get("bean_match", 0.0)

    # Notes match
    flavor = user_profile.get("flavor_direction")
    if flavor and "notes" in df.columns:
        if isinstance(flavor, str):
            flavor_set = {flavor}
        else:
            flavor_set = set([f for f in flavor if f])

        def notes_has_overlap(notes_val) -> bool:
            if pd.isna(notes_val):
                return False
            # If the result is saved in string
            s = str(notes_val).lower()
            return any(f in s for f in flavor_set)

        overlap_mask = df["notes"].apply(notes_has_overlap)
        df.loc[overlap_mask, "final_score"] += boo.get("notes_match", 0.0)

    # Sort by final_score
    df = df.sort_values("final_score", ascending=False).reset_index(drop=True)

    return df

In [65]:
# QUick check
final_results = rerank_candidates(results, user_profile, ranking_config)

final_results.head(10)

Unnamed: 0,source,name,price,description,similarity,final_score
0,tatido coffee roasters,costa rica mirazu dcafetal espresso roast sin...,150000,costa rica tarrazu mirazu dcafetal black honey...,0.097683,0.097683
1,tatido coffee roasters,indonesia gulang 2817 natural anaerobic 150g,150000,indonesia flores manggarai lot gulang 2817 nat...,0.086921,0.086921
2,collins roasters,bali karana madu honey anaerobic collins roas...,114000,bali karana madu honey anaerobic collins roas...,0.068407,0.068407
3,fugol coffee roasters,kopi arabika watermelon smash 100 gram natural...,81000,halal id32110016691860324 pirt308327301034727 ...,0.064859,0.064859
4,swargi roasters,mid grade espresso malt crema 100 arabica 200 gr,65000,malt crema beans yang diroasting khusus untuk ...,0.064742,0.064742
5,collins roasters,ethiopia yirgacheffe arabica specialty coffee...,301500,ethiopia yirgacheffe specialty coffee collin...,0.063927,0.063927
6,a roastworks,black gold espresso blend a roastworks biji...,245000,black gold espresso blend brazil alto caparao...,0.062753,0.062753
7,collins roasters,yunnan mengzhu honey asd filter specialty cof...,130000,yunnan mengzhu honey asd filter specialty cof...,0.056469,0.056469
8,arutala coffee,arutala kopi vietnam central highland robusta ...,260900,kenapa arutala coffee dengan memiliki roastery...,0.053666,0.053666
9,arutala coffee,arutala kopi colombia decaf arabika 200 gram ...,139900,kenapa arutala coffee dengan memiliki roastery...,0.053493,0.053493


Based on the final_results, the final recommendation order aligns well with both business expectations and the user’s contextual preferences. The top-ranked products are dominated by washed single-origin coffees with clean and fruity profiles—items that were already strong candidates at the TF-IDF stage and were not subject to any penalties. This indicates that the ranking and penalty layer does not distort the initial retrieval results, but instead acts as a refinement and validation step.

Notably, the final_score for most top candidates remains identical to their original similarity score. From a business perspective, this is a positive signal: it shows that the system does not enforce rules aggressively by default, and only intervenes when necessary. In other words, the ranking layer is designed to be non-intrusive, applying corrections only when contextual mismatches are detected.

From a recommendation quality standpoint, the top results come from multiple roasteries with consistent product characteristics (washed, filter-friendly, non-capsule, non-tools). This demonstrates that the system avoids brand bias and prioritizes contextual alignment with user preferences. Additionally, the presence of a reasonable price range among top-ranked items allows users to make trade-offs based on budget without sacrificing relevance.

Overall, these results confirm that the recommendation pipeline functions as intended: TF-IDF serves as a robust retrieval mechanism for identifying relevant candidates, while the ranking and penalty layer operates as a business guardrail—ensuring the final recommendations are practical, context-aware, and aligned with a responsible user experience.

In [66]:
# Sanity check
print("Before rerank (top 5 by similarity):")
display(results.head(5))

print("After rerank (top 5 by final_score):")
display(final_results.head(5))

Before rerank (top 5 by similarity):


Unnamed: 0,source,name,price,description,similarity
187,tatido coffee roasters,costa rica mirazu dcafetal espresso roast sin...,150000,costa rica tarrazu mirazu dcafetal black honey...,0.097683
186,tatido coffee roasters,indonesia gulang 2817 natural anaerobic 150g,150000,indonesia flores manggarai lot gulang 2817 nat...,0.086921
299,collins roasters,bali karana madu honey anaerobic collins roas...,114000,bali karana madu honey anaerobic collins roas...,0.068407
172,fugol coffee roasters,kopi arabika watermelon smash 100 gram natural...,81000,halal id32110016691860324 pirt308327301034727 ...,0.064859
275,swargi roasters,mid grade espresso malt crema 100 arabica 200 gr,65000,malt crema beans yang diroasting khusus untuk ...,0.064742


After rerank (top 5 by final_score):


Unnamed: 0,source,name,price,description,similarity,final_score
0,tatido coffee roasters,costa rica mirazu dcafetal espresso roast sin...,150000,costa rica tarrazu mirazu dcafetal black honey...,0.097683,0.097683
1,tatido coffee roasters,indonesia gulang 2817 natural anaerobic 150g,150000,indonesia flores manggarai lot gulang 2817 nat...,0.086921,0.086921
2,collins roasters,bali karana madu honey anaerobic collins roas...,114000,bali karana madu honey anaerobic collins roas...,0.068407,0.068407
3,fugol coffee roasters,kopi arabika watermelon smash 100 gram natural...,81000,halal id32110016691860324 pirt308327301034727 ...,0.064859,0.064859
4,swargi roasters,mid grade espresso malt crema 100 arabica 200 gr,65000,malt crema beans yang diroasting khusus untuk ...,0.064742,0.064742


---
**Conclusion**

---

Overall, the recommendation system developed in this project produces final results that are consistent, relevant, and defensible from a business perspective. The TF-IDF and cosine similarity approach serves effectively as a retrieval engine, identifying candidate products based on textual alignment between user preferences and product descriptions. At this stage, the system successfully narrows down the catalog to candidates that are already meaningful in terms of descriptive relevance.

The key strength of this solution lies in the ranking and penalty layer. This layer ensures that recommendations are not only textually similar, but also context-aware and practically suitable for the user. By combining hard filters, penalties, and boosts, the system applies corrections selectively—adjusting rankings only when necessary, while remaining non-intrusive when the initial candidates are already appropriate. The observed stability of the final ranking in certain cases confirms that the system avoids unnecessary intervention and is not over-engineered.

From a business standpoint, this pipeline strikes a balanced approach between flexibility and control. It does not rely solely on rigid rules, nor does it depend entirely on similarity scores. This balance results in recommendations that are more trustworthy, explainable, and easier to evaluate. With its modular design and transparent logic, the system provides a solid foundation for future enhancements, including rule refinement, data expansion, and eventual production deployment.

# v. Model Saving

In [67]:
# Create new directory folder
from pathlib import Path

artifact_dir = Path("artifacts")
artifact_dir.mkdir(parents=True, exist_ok=True)
artifact_dir

WindowsPath('artifacts')

In [68]:
# Save vectorizer
import joblib

joblib.dump(vectorizer, artifact_dir / "tfidf_vectorizer.joblib")

['artifacts\\tfidf_vectorizer.joblib']

In [69]:
# Save feature engineered dataset to directory
df_fe.to_csv(artifact_dir / "products_fe.csv", index=False)

In [None]:
# Check saved dataset
df_loaded = pd.read_csv(artifact_dir / "products_fe.csv")
print(df_loaded.shape)
print(df_loaded.columns[:10])

(614, 24)
Index(['source', 'name', 'price', 'description', 'origin', 'process',
       'roast_level', 'notes', 'name_clean', 'desc_clean'],
      dtype='object')


In [71]:
# Save ranking config
import json

with open(artifact_dir / "ranking_config.json", "w", encoding="utf-8") as f:
    json.dump(ranking_config, f, indent=2)

In [None]:
# Check saved ranking config
with open(artifact_dir / "ranking_config.json", "r", encoding="utf-8") as f:
    cfg_loaded = json.load(f)

print(cfg_loaded.keys())

dict_keys(['hard_filter', 'penalty', 'boost'])


In [73]:
# Save X_tfidf
joblib.dump(X_tfidf, artifact_dir / "products_tfidf_matrix.joblib")
print("Saved:", artifact_dir / "products_tfidf_matrix.joblib")

Saved: artifacts\products_tfidf_matrix.joblib


In [None]:
# Check saved X_tfidf
X_loaded = joblib.load(artifact_dir / "products_tfidf_matrix.joblib")
print("Matrix shape:", X_loaded.shape)

Matrix shape: (614, 8910)


In [76]:
# Save feature names
joblib.dump(vectorizer.get_feature_names_out(), artifact_dir / "tfidf_features.joblib")

['artifacts\\tfidf_features.joblib']

In [None]:
# Save metadata
import platform
import sklearn

meta = {
    "python_version": platform.python_version(),
    "sklearn_version": sklearn.__version__,
    "numpy_version": np.__version__,
    "n_products": int(df_fe.shape[0]),
    "n_features": int(X_tfidf.shape[1]) if "X_tfidf" in globals() else None,
}

with open(artifact_dir / "artifact_meta.json", "w", encoding="utf-8") as f:
    json.dump(meta, f, indent=2)

# Quick check
meta

{'python_version': '3.9.25',
 'sklearn_version': '1.6.1',
 'numpy_version': '1.26.4',
 'n_products': 614,
 'n_features': 8910}