In [1]:
pip install pandas numpy rank-bm25

Collecting rank-bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank-bm25
Successfully installed rank-bm25-0.2.2
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
import re
import pandas as pd
from rank_bm25 import BM25Okapi

# ========= CONFIG =========
# Input file, its text encoding, and how many hits a search returns.
CSV_PATH = "aksesoriWanita_enriched.csv"   # change to match your file
ENCODING = "utf-8"                         # if decoding fails, try "utf-8-sig"
TOPK = 20

# Columns concatenated (in this order) to form each searchable document.
# Feel free to change them; nested exports may instead use names such as
# "categoryBreadcrumbs" / "category.breadcrumb" or "shop.city".
TEXT_COLS = ["name", "category_breadcrumb", "shop_city"]

# Columns shown in the search output, grouped as product fields followed by
# shop fields.
OUT_COLS = (
    ["id", "name", "url"]
    + ["category_breadcrumb", "price_number", "rating", "discountPercentage"]
    + ["shop_id", "shop_name", "shop_city", "shop_tier"]
)

# ========= UTIL =========
def safe_get_col(df: pd.DataFrame, col: str) -> pd.Series:
    """Return column ``col`` of ``df`` as strings, or blanks if it is absent.

    Args:
        df: Frame to read from.
        col: Name of the column to fetch.

    Returns:
        A Series with one string per row of ``df``, carrying ``df``'s index.
        Missing values in an existing column become ``""``; when the column
        does not exist, every entry is ``""``.
    """
    if col in df.columns:
        return df[col].fillna("").astype(str)
    # Reuse df's index for the placeholder. A default RangeIndex would be
    # aligned by label when this Series is concatenated with real columns,
    # producing NaN / extra rows on any frame that is not 0..n-1 indexed.
    return pd.Series([""] * len(df), index=df.index)

def tokenize(text: str):
    """Split ``text`` into lowercase tokens made of ``a-z`` and ``0-9`` only.

    The text is lowercased, every character that is neither ``a-z``, ``0-9``
    nor whitespace is replaced by a space, whitespace runs are collapsed, and
    the result is split on whitespace.

    Returns:
        A list of token strings (empty when nothing usable remains).
    """
    lowered = text.lower()
    alnum_only = re.sub(r"[^a-z0-9\s]", " ", lowered)
    single_spaced = re.sub(r"\s+", " ", alnum_only)
    return single_spaced.strip().split()

def build_document(df: pd.DataFrame, text_cols=None) -> pd.Series:
    """Concatenate the text columns of ``df`` into one document per row.

    Args:
        df: Frame holding the source columns.
        text_cols: Optional iterable of column names to join, in order.
            Defaults to the module-level ``TEXT_COLS``.

    Returns:
        A Series of strings, one per row, with the column values joined by a
        single space. Each column is fetched through ``safe_get_col``. When
        no columns are configured, every document is ``""``.
    """
    cols = TEXT_COLS if text_cols is None else list(text_cols)
    if not cols:
        # No columns configured: return blank documents rather than failing
        # with an IndexError on the first element.
        return pd.Series([""] * len(df), index=df.index)

    doc = safe_get_col(df, cols[0])
    for name in cols[1:]:
        doc = doc + " " + safe_get_col(df, name)
    return doc

# ========= LOAD =========
df = pd.read_csv(CSV_PATH, encoding=ENCODING)

# Make sure the important columns exist under the names used in this notebook.
# If the CSV uses nested-style names (e.g. "shop.id"), they are renamed to the
# flat names below; extend the mapping manually if your export differs.
rename_map = {
    # commonly seen nested -> flat mappings:
    "shop.id": "shop_id",
    "shop.name": "shop_name",
    "shop.city": "shop_city",
    "shop.tier": "shop_tier",
    "category.breadcrumb": "category_breadcrumb",
    "price.number": "price_number",
    "price.discountPercentage": "discountPercentage",
    "mediaURL.image": "mediaURL_image",
}
for old, new in rename_map.items():
    # Skip when there is nothing to rename, or when the flat name already
    # exists (never overwrite an existing column).
    if old not in df.columns or new in df.columns:
        continue
    df = df.rename({old: new}, axis="columns")

# ========= BUILD CORPUS =========
# One text document per row, tokenised, then indexed with BM25.
df["doc"] = build_document(df)
corpus_tokens = [tokenize(doc) for doc in df["doc"]]

bm25 = BM25Okapi(corpus_tokens)

# ========= SEARCH =========
def bm25_search(query: str, topk: int = 20) -> pd.DataFrame:
    """Rank the rows of the module-level ``df`` against ``query`` with BM25.

    Args:
        query: Free-text query; it is tokenised with ``tokenize``.
        topk: Maximum number of rows to return.

    Returns:
        A frame with a fresh 0..n-1 index holding the ``OUT_COLS`` columns
        that exist in ``df`` plus a ``bm25_score`` column, ordered by
        descending score. Rows with equal scores keep their original order.
        The frame has zero rows when ``topk`` is not positive or when the
        query contains no usable tokens.
    """
    q_tokens = tokenize(query)

    if topk <= 0 or not q_tokens:
        # Nothing to rank. A negative topk would otherwise slice off the end
        # of the ranking, and a token-less query would return the first rows
        # with score 0 as if they were hits.
        order = []
        top_scores = []
    else:
        # Coerce to a float ndarray through pandas so the ranking below can
        # be vectorised without importing anything new.
        scores = pd.Series(bm25.get_scores(q_tokens), dtype=float).to_numpy()
        # Stable sort on the negated scores gives descending order while
        # keeping ties in original row order.
        order = (-scores).argsort(kind="stable")[:topk]
        top_scores = scores[order]

    out = df.iloc[order].copy()
    out["bm25_score"] = pd.Series(top_scores, dtype=float).to_numpy()

    # Keep only the requested output columns that are actually present.
    cols = [c for c in OUT_COLS if c in out.columns]
    cols = cols + ["bm25_score"]
    return out[cols].reset_index(drop=True)

# Interactive demo: runs only when __name__ is "__main__".
if __name__ == "__main__":
    print("BM25 siap. Contoh query:")
    # Read one query from the user and strip surrounding whitespace.
    q = input("Masukkan query (contoh: 'baju wanita korea'): ").strip()
    res = bm25_search(q, topk=TOPK)
    # Print the full result table as plain text, without the index column.
    print(res.to_string(index=False))

    # Save the same results to a CSV file, encoded as "utf-8-sig".
    res.to_csv("bm25_results.csv", index=False, encoding="utf-8-sig")
    print("\n Disimpan: bm25_results.csv")
