### import các thư viện cần thiết

In [39]:
import os
import random
import time
import itertools

import numpy as np
import pandas as pd

from PIL import Image
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

import scipy.sparse as sp
from scipy.sparse import csr_matrix

import torch
import torch.nn as nn
import torch.nn.functional as F

print("PyTorch:", torch.__version__)

PyTorch: 2.6.0+cu124


# Chuẩn bị dữ liệu H&M

In [6]:
DATASET_NAME = "hm"  # used for logging / file names

BASE_PATH = "/kaggle/input/h-and-m-personalized-fashion-recommendations"

# Image directory: adjust to the path available in your environment.
# Example layout when using a Kaggle community image set:
#   images/010/0108775015.jpg
IMAGE_DIR = "/kaggle/input/hm-images/images"   # <-- change to match your setup

CFG = {
    "outfits_file": "articles.csv",
    "pictures_file": None,   # H&M has no separate picture table (unlike Vibrent)
    "user_activity_file": "transactions_train.csv",
    "csv_sep": ",",

    # user / item / time columns in transactions_train
    "user_col": "customer_id",
    "item_col": "article_id",
    "start_time_col": "t_dat",
    "end_time_col": "t_dat",

    # "picture" columns - derived from article_id by the notebook itself
    "picture_outfit_col": "article_id",
    "picture_filename_col": "file_name",   # will be synthesised in code
    "picture_displayorder_col": None,

    # text columns joined into the description sentence fed to FashionCLIP
    "outfit_text_cols": [
        "prod_name",
        "product_type_name",
        "product_group_name",
        "graphical_appearance_name",
        "colour_group_name",
        "perceived_colour_value_name",
        "perceived_colour_master_name",
        "department_name",
        "index_name",
        "index_group_name",
        "section_name",
        "garment_group_name",
        "detail_desc",
    ],
}

# Echo the active configuration so the cell output records what was used.
print("BASE_PATH:", BASE_PATH)
print("IMAGE_DIR:", IMAGE_DIR)
print("CFG:", CFG)


BASE_PATH: /kaggle/input/h-and-m-personalized-fashion-recommendations
IMAGE_DIR: /kaggle/input/hm-images/images
CFG: {'outfits_file': 'articles.csv', 'pictures_file': None, 'user_activity_file': 'transactions_train.csv', 'csv_sep': ',', 'user_col': 'customer_id', 'item_col': 'article_id', 'start_time_col': 't_dat', 'end_time_col': 't_dat', 'picture_outfit_col': 'article_id', 'picture_filename_col': 'file_name', 'picture_displayorder_col': None, 'outfit_text_cols': ['prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'colour_group_name', 'perceived_colour_value_name', 'perceived_colour_master_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc']}


In [7]:
def load_dataset(base_path, cfg):
    """Read the item, (optional) picture and user-activity CSV files of a dataset.

    Args:
        base_path: Directory that contains the CSV files.
        cfg: Dataset configuration. Reads ``outfits_file`` and
            ``user_activity_file`` (required), ``pictures_file`` (optional,
            may be None) and ``csv_sep`` (optional, defaults to ``","``).

    Returns:
        Tuple ``(outfits, pictures, user_activity)``; ``pictures`` is None
        when no picture file is configured.
    """
    delimiter = cfg.get("csv_sep", ",")

    def _read(file_name, **extra):
        # Every table shares the separator, quote character and parser engine.
        return pd.read_csv(
            os.path.join(base_path, file_name),
            sep=delimiter,
            quotechar='"',
            engine="python",
            **extra,
        )

    # Only the item table tolerates malformed rows (they are skipped).
    outfits = _read(cfg["outfits_file"], on_bad_lines="skip")

    pictures_file = cfg.get("pictures_file", None)
    pictures = _read(pictures_file) if pictures_file is not None else None

    user_activity = _read(cfg["user_activity_file"])

    print("Outfits (articles):", outfits.shape)
    print("Pictures:", None if pictures is None else pictures.shape)
    print("User activity (transactions):", user_activity.shape)
    return outfits, pictures, user_activity


# Load the H&M tables; `pictures` is None here because CFG["pictures_file"] is None.
outfits, pictures, user_activity = load_dataset(BASE_PATH, CFG)


Outfits (articles): (105542, 25)
Pictures: None
User activity (transactions): (31788324, 5)


In [8]:
def article_id_to_image_path(article_id, image_root):
    """Return ``<image_root>/<first 3 chars>/<id>.jpg`` for an article id.

    The id is converted to a string and left-padded with zeros to 10
    characters; the first three characters of the padded id name the
    sub-folder.
    """
    padded_id = str(article_id).zfill(10)
    sub_folder = padded_id[:3]
    return os.path.join(image_root, sub_folder, f"{padded_id}.jpg")


In [9]:
def load_image(path):
    """Load the image stored at ``path`` as an RGB PIL image.

    Args:
        path: Full path of the image file.

    Returns:
        A fully loaded RGB ``PIL.Image.Image``, or None when the file does
        not exist or cannot be opened/decoded.
    """
    if not os.path.exists(path):
        return None
    try:
        # The context manager closes the file handle; convert() returns an
        # independent in-memory image, so it stays usable afterwards.
        with Image.open(path) as img:
            return img.convert("RGB")
    except Exception:
        # Best effort: a corrupt image is treated as missing. `Exception`
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through.
        return None


In [10]:
def build_group_images(outfits: pd.DataFrame, cfg, image_dir: str):
    """Map every article to its product group and expected image path.

    Args:
        outfits: Article table; must contain ``product_code`` and the column
            named by ``cfg["item_col"]``.
        cfg: Dataset configuration (only ``item_col`` is read).
        image_dir: Root directory passed to ``article_id_to_image_path``.

    Returns:
        DataFrame with the columns ``product_code``, ``article_id`` and
        ``image_path``, one row per input row, in input order. The path is
        built by rule only; the file is not checked for existence here.

    Raises:
        ValueError: If ``outfits`` has no ``product_code`` column.
    """
    if "product_code" not in outfits.columns:
        raise ValueError("articles.csv không có cột product_code")

    # Work column-wise instead of DataFrame.iterrows(): iterrows builds one
    # Series per row (slow on ~100k rows) and does not preserve dtypes, so an
    # integer id could be upcast to float and break the zero-padded file name.
    article_ids = outfits[cfg["item_col"]]
    return pd.DataFrame({
        "product_code": outfits["product_code"].to_numpy(),
        "article_id": article_ids.to_numpy(),
        "image_path": [
            article_id_to_image_path(article_id, image_dir)
            for article_id in article_ids
        ],
    })


# Build the article -> (product_code, image_path) table; paths follow the naming
# rule only, so the count below is the number of articles, not of existing files.
group_images = build_group_images(outfits, CFG, IMAGE_DIR)
print("Số article có path ảnh (theo rule):", len(group_images))
print(group_images.head())


Số article có path ảnh (theo rule): 105542
   product_code  article_id                                         image_path
0        108775   108775015  /kaggle/input/hm-images/images/010/0108775015.jpg
1        108775   108775044  /kaggle/input/hm-images/images/010/0108775044.jpg
2        108775   108775051  /kaggle/input/hm-images/images/010/0108775051.jpg
3        110065   110065001  /kaggle/input/hm-images/images/011/0110065001.jpg
4        110065   110065002  /kaggle/input/hm-images/images/011/0110065002.jpg


In [11]:
def build_text_column(outfits: pd.DataFrame, cfg):
    """Join the configured text columns of every row into one string.

    Columns listed in ``cfg["outfit_text_cols"]`` are visited in order;
    columns absent from the frame and missing values are skipped, and the
    remaining values are stringified and joined with single spaces.

    Returns:
        The result of ``outfits.apply(..., axis=1)``: one string per row.
    """
    text_cols = cfg.get("outfit_text_cols", [])

    def _join_row(row):
        return " ".join(
            str(row[col])
            for col in text_cols
            if col in row and pd.notna(row[col])
        )

    return outfits.apply(_join_row, axis=1)


In [15]:
import sys
import subprocess
# Install fashion-clip with the interpreter that runs this kernel (sys.executable).
# NOTE(review): the package version is not pinned and the return code is not checked.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "fashion-clip"])

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 363.4/363.4 MB 5.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 13.8/13.8 MB 106.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 24.6/24.6 MB 83.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 883.7/883.7 kB 37.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 664.8/664.8 MB 1.2 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 211.5/211.5 MB 5.5 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 56.3/56.3 MB 35.7 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 127.9/127.9 MB 15.4 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 207.5/207.5 MB 7.6 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.1/21.1 MB 13.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 44.7/44.7 kB 2.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 193.6/193.6 kB 11.1 MB/s eta 0:00:00


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
bigframes 2.8.0 requires google-cloud-bigquery-storage<3.0.0,>=2.30.0, which is not installed.
cesium 0.12.4 requires numpy<3.0,>=2.0, but you have numpy 1.26.4 which is incompatible.
gcsfs 2025.3.2 requires fsspec==2025.3.2, but you have fsspec 2025.3.0 which is incompatible.
bigframes 2.8.0 requires google-cloud-bigquery[bqstorage,pandas]>=3.31.0, but you have google-cloud-bigquery 3.25.0 which is incompatible.
bigframes 2.8.0 requires rich<14,>=12.4.4, but you have rich 14.0.0 which is incompatible.


CompletedProcess(args=['/usr/bin/python3', '-m', 'pip', 'install', '-q', 'fashion-clip'], returncode=0)

In [None]:
from fashion_clip.fashion_clip import FashionCLIP
fclip = FashionCLIP('fashion-clip')

# Cache location of the article embeddings.
# NOTE(review): the Vibrent section of this notebook resolves to the same path
# (/kaggle/working/outfit_embeddings.npy), so one dataset's cache can be picked up
# by the other - confirm which file is present before relying on it.
EMB_FILE_NAME = "outfit_embeddings.npy"
OUTFIT_EMB_PATH = os.path.join("/kaggle/working", EMB_FILE_NAME)

outfit_embeddings = None

if os.path.exists(OUTFIT_EMB_PATH):
    print("=> Đang load outfit_embeddings từ", OUTFIT_EMB_PATH)
    # allow_pickle=True is needed because the cache is a pickled dict (see np.save below).
    # NOTE(review): unpickling executes code from the file; only load caches you created.
    arr = np.load(OUTFIT_EMB_PATH, allow_pickle=True)
    if isinstance(arr, np.ndarray) and arr.dtype == object and arr.size == 1:
        # A dict saved with np.save comes back as a one-element object array.
        outfit_embeddings = arr.item()
    else:
        raise ValueError("Định dạng file embeddings cũ không đúng (mong đợi dict).")
else:
    print("=> Không tìm thấy", OUTFIT_EMB_PATH, "→ tính lại bằng FashionCLIP...")

    # ---- TEXT EMBEDDING ----
    # Adds a helper column to `outfits` in place.
    outfits["__text_for_clip"] = build_text_column(outfits, CFG)
    texts = outfits["__text_for_clip"].fillna("").tolist()
    article_ids = outfits[CFG["item_col"]].values  # article_id

    text_embeddings = fclip.encode_text(texts, batch_size=32)
    text_emb_map = dict(zip(article_ids, text_embeddings))

    # ---- IMAGE EMBEDDING PER product_code GROUP ----
    image_emb_group = {}   # product_code -> vector
    for g, df_g in group_images.groupby("product_code"):
        imgs = []
        for _, row in df_g.iterrows():
            img = load_image(row["image_path"])
            if img is not None:
                imgs.append(img)
        if not imgs:
            # No loadable image for this group: it gets no image vector.
            continue
        embs = fclip.encode_images(imgs, batch_size=16)
        # One vector per group: the mean over the group's image embeddings.
        image_emb_group[g] = embs.mean(axis=0)

    # ---- MERGE INTO outfit_embeddings (article level) ----
    outfit_embeddings = {}
    for _, row in outfits.iterrows():
        aid = row[CFG["item_col"]]            # article_id
        g = row["product_code"]               # product group
        t = text_emb_map.get(aid)
        v = image_emb_group.get(g)            # group-level image embedding

        # Equal-weight average when both modalities exist, otherwise whichever is available.
        if (t is not None) and (v is not None):
            outfit_embeddings[aid] = 0.5 * t + 0.5 * v
        elif t is not None:
            outfit_embeddings[aid] = t
        elif v is not None:
            outfit_embeddings[aid] = v

    print("=> Lưu", OUTFIT_EMB_PATH)
    # Saving a dict stores it as a pickled object array (hence allow_pickle on load).
    np.save(OUTFIT_EMB_PATH, outfit_embeddings)

print("Số article có embedding:", len(outfit_embeddings))
# Embedding width is read from the first stored vector.
feat_dim = next(iter(outfit_embeddings.values())).shape[0]
print("Kích thước embedding:", feat_dim)


# Chuẩn bị dữ liệu vibrent-clothes-rental-dataset

In [40]:
# ========= CONFIG DATASET =========
# Only this block needs editing when switching to another dataset.

DATASET_NAME = "vibrent"  # name intended for output embedding files, logs, ...

# local:
# BASE_PATH = r"D:\DACNTT"
# IMAGE_DIR = os.path.join(BASE_PATH, "images")

# Kaggle:
BASE_PATH = "/kaggle/input/vibrent-clothes-rental-dataset"
IMAGE_DIR = os.path.join(BASE_PATH, "images")
# Files and columns of the Vibrent dataset
CFG = {
    "outfits_file": "outfits.csv",
    "pictures_file": "picture_triplets.csv",
    "user_activity_file": "user_activity_triplets.csv",
    "csv_sep": ";",
    # user / item / time columns in user_activity
    "user_col": "customer.id",
    "item_col": "outfit.id",
    "start_time_col": "rentalPeriod.start",
    "end_time_col": "rentalPeriod.end",
    # columns of the pictures table
    "picture_outfit_col": "outfit.id",
    "picture_filename_col": "file_name",
    "picture_displayorder_col": "displayOrder",
    # columns used to build the text fed to FashionCLIP;
    # they are joined with single spaces, skipping missing values
    "outfit_text_cols": ["description", "outfit_tags"],
}

# Echo the active configuration so the cell output records what was used.
print("BASE_PATH:", BASE_PATH)
print("IMAGE_DIR:", IMAGE_DIR)
print("CFG:", CFG)


BASE_PATH: /kaggle/input/vibrent-clothes-rental-dataset
IMAGE_DIR: /kaggle/input/vibrent-clothes-rental-dataset/images
CFG: {'outfits_file': 'outfits.csv', 'pictures_file': 'picture_triplets.csv', 'user_activity_file': 'user_activity_triplets.csv', 'csv_sep': ';', 'user_col': 'customer.id', 'item_col': 'outfit.id', 'start_time_col': 'rentalPeriod.start', 'end_time_col': 'rentalPeriod.end', 'picture_outfit_col': 'outfit.id', 'picture_filename_col': 'file_name', 'picture_displayorder_col': 'displayOrder', 'outfit_text_cols': ['description', 'outfit_tags']}


Load dữ liệu (outfits, pictures, user activity)

In [41]:
def load_dataset(base_path, cfg):
    """Read the outfit, (optional) picture and user-activity CSV files.

    This definition replaces the earlier ``load_dataset`` in the notebook, so
    it accepts the same configurations: a missing or None ``pictures_file``
    yields ``pictures = None`` instead of failing on the path join.

    Args:
        base_path: Directory that contains the CSV files.
        cfg: Dataset configuration. Reads ``outfits_file`` and
            ``user_activity_file`` (required), ``pictures_file`` (optional)
            and ``csv_sep`` (optional, defaults to ``","``).

    Returns:
        Tuple ``(outfits, pictures, user_activity)``; ``pictures`` is None
        when no picture file is configured.
    """
    sep = cfg.get("csv_sep", ",")

    # Malformed rows are skipped for the outfit table only.
    outfits = pd.read_csv(
        os.path.join(base_path, cfg["outfits_file"]),
        sep=sep,
        quotechar='"',
        engine="python",
        on_bad_lines="skip"
    )

    pictures_file = cfg.get("pictures_file", None)
    if pictures_file is not None:
        pictures = pd.read_csv(
            os.path.join(base_path, pictures_file),
            sep=sep,
            quotechar='"',
            engine="python"
        )
    else:
        pictures = None

    user_activity = pd.read_csv(
        os.path.join(base_path, cfg["user_activity_file"]),
        sep=sep,
        quotechar='"',
        engine="python"
    )

    print("Outfits:", outfits.shape)
    print("Pictures:", None if pictures is None else pictures.shape)
    print("User activity:", user_activity.shape)
    return outfits, pictures, user_activity


# Load the Vibrent tables; this rebinds the names previously holding the H&M data.
outfits, pictures, user_activity = load_dataset(BASE_PATH, CFG)

Outfits: (15649, 11)
Pictures: (50193, 4)
User activity: (64419, 4)


In [42]:
image_dir = "/kaggle/input/vibrent-clothes-rental-dataset/images"

def load_image(file_name, image_root=None):
    """Load an image by file name from the image directory.

    Args:
        file_name: File name relative to the image directory.
        image_root: Optional directory to resolve ``file_name`` against;
            defaults to the module-level ``image_dir``.

    Returns:
        A fully loaded ``PIL.Image.Image`` (original mode kept), or None when
        the file does not exist or cannot be opened/decoded.
    """
    root = image_dir if image_root is None else image_root
    path = os.path.join(root, file_name)
    if not os.path.exists(path):
        return None
    try:
        # Image.open is lazy and keeps the file open; copy() forces the pixel
        # data to be read so the handle can be closed right here.
        with Image.open(path) as img:
            return img.copy()
    except Exception:
        # Best effort: a corrupt image is treated as missing. `Exception`
        # (instead of a bare except) lets KeyboardInterrupt/SystemExit through.
        return None


def build_main_pictures(pictures: pd.DataFrame, cfg, image_dir: str):
    """Pick one picture row per outfit and attach its full image path.

    When the configured display-order column exists, rows are sorted by it
    before grouping; otherwise the input row order decides. The grouped
    ``first()`` result is returned with the outfit column restored and an
    extra ``image_path`` column (``image_dir`` joined with the file name).
    The input frame is not modified.
    """
    outfit_col = cfg["picture_outfit_col"]
    fname_col = cfg["picture_filename_col"]
    disp_col = cfg.get("picture_displayorder_col", None)

    candidates = pictures
    if disp_col is not None and disp_col in pictures.columns:
        candidates = pictures.sort_values(disp_col)

    main = candidates.groupby(outfit_col).first().reset_index()

    def _full_path(file_name):
        return os.path.join(image_dir, file_name)

    main["image_path"] = main[fname_col].apply(_full_path)
    return main


# One representative picture row per outfit, taken from the picture CSV.
main_pictures = build_main_pictures(pictures, CFG, IMAGE_DIR)
print("Số outfit có main picture (theo CSV):", len(main_pictures))
print(main_pictures.head())

Số outfit có main picture (theo CSV): 15157
                                 outfit.id  \
0  outfit.00004b4d01ca4ab0a70cf073ba74fefa   
1  outfit.0013691ff35b440e9dcfe1748ec184c7   
2  outfit.0014a5c89b244077a3d7cffd4549718e   
3  outfit.0018701ce6b049ebadc314d16623caa8   
4  outfit.001bf665330140cf854dcfb1cbff6b5f   

                                 picture.id  displayOrder  \
0  picture.a2b794c7ef83495a8997e7b0c318d65a             1   
1  picture.9c821ecbecb14c959f35078010fb91f3             1   
2  picture.b9aa39eb40f5410fa4fe101236241b19             1   
3  picture.b944a50f20fd4c7f954213dc7c38a776             1   
4  picture.fb1ff67a0bbc418b88ebb5560fac88a1             0   

                              file_name  \
0  a2b794c7ef83495a8997e7b0c318d65a.jpg   
1  9c821ecbecb14c959f35078010fb91f3.jpg   
2  b9aa39eb40f5410fa4fe101236241b19.jpg   
3  b944a50f20fd4c7f954213dc7c38a776.jpg   
4  fb1ff67a0bbc418b88ebb5560fac88a1.jpg   

                                          image_path 

# Tạo outfitsEmbedding hoặc load lên nếu đã có

In [43]:
import sys
import subprocess

# Install fashion-clip if it is not available yet (left disabled here).
#subprocess.run([sys.executable, "-m", "pip", "install", "-q", "fashion-clip"])

# from fashion_clip.fashion_clip import FashionCLIP

# fclip = FashionCLIP('fashion-clip')
# NOTE(review): with the two lines above commented out, `fclip` must already
# exist in the kernel (it is created in the H&M section) before the embeddings
# can be recomputed or a text query can be encoded.

# Cache file for the outfit embeddings. The previous code joined BASE_PATH with
# an absolute file name, which os.path.join resolves to the absolute name alone;
# the same location is now spelled out directly, as in the H&M section.
# NOTE(review): the H&M section uses this exact path too, so a cache written for
# one dataset can be loaded for the other - consider including DATASET_NAME.
EMB_FILE_NAME = "outfit_embeddings.npy"
OUTFIT_EMB_PATH = os.path.join("/kaggle/working", EMB_FILE_NAME)

outfit_embeddings = None

def build_text_column(outfits: pd.DataFrame, cfg):
    """Build one text string per row from the configured text columns.

    Reads the column list from ``cfg["outfit_text_cols"]`` (empty when the
    key is absent). For each row, configured columns that exist contribute
    their value unless it is missing; contributions are stringified and
    separated by single spaces.
    """
    wanted_cols = cfg.get("outfit_text_cols", [])

    def _row_to_text(row):
        pieces = []
        for col in wanted_cols:
            if col not in row:
                continue
            value = row[col]
            if pd.notna(value):
                pieces.append(str(value))
        return " ".join(pieces)

    return outfits.apply(_row_to_text, axis=1)


# Load the cached outfit embeddings when present, otherwise compute them with
# FashionCLIP (text + main picture) and write the cache.
if os.path.exists(OUTFIT_EMB_PATH):
    print("=> Đang load outfit_embeddings từ", OUTFIT_EMB_PATH)
    # NOTE(review): allow_pickle=True unpickles the file content, which can run
    # code; only load cache files created by this notebook.
    arr = np.load(OUTFIT_EMB_PATH, allow_pickle=True)
    print("  - Loaded shape:", getattr(arr, "shape", None), "| dtype:", getattr(arr, "dtype", None))

    if isinstance(arr, np.ndarray) and arr.dtype == object and arr.size == 1:
        # Cache written with np.save(dict): comes back as a one-element object array.
        outfit_embeddings = arr.item()
        print("  - Giải mã thành dict, số outfit:", len(outfit_embeddings))

    elif isinstance(arr, np.ndarray) and arr.ndim == 2:
        # Cache written as a plain (N, D) array: rows are re-attached to outfit ids
        # by position, using the row order of main_pictures.
        outfit_col = CFG["picture_outfit_col"]
        outfit_ids = main_pictures[outfit_col].values

        # Only the row count can be checked; the order itself is taken on trust.
        if arr.shape[0] != len(outfit_ids):
            raise ValueError(
                f"Số embedding ({arr.shape[0]}) khác số main_pictures ({len(outfit_ids)}). "
                "Cần chắc chắn thứ tự khi encode và khi load giống nhau."
            )

        outfit_embeddings = {
            oid: emb for oid, emb in zip(outfit_ids, arr)
        }
        print("  - Đã build dict outfit_embeddings từ array: số outfit =", len(outfit_embeddings))

    else:
        raise ValueError(
            f"File {EMB_FILE_NAME} có kiểu không hỗ trợ (shape={getattr(arr, 'shape', None)}, "
            f"dtype={getattr(arr, 'dtype', None)})."
        )

else:
    print("=> Không tìm thấy", EMB_FILE_NAME, "→ tính lại bằng FashionCLIP...")

    # Text embedding per outfit (adds a helper column to `outfits` in place).
    outfits["__text_for_clip"] = build_text_column(outfits, CFG)
    texts = outfits["__text_for_clip"].fillna("").tolist()
    text_embeddings = fclip.encode_text(texts, batch_size=32)
    text_emb_map = dict(zip(outfits["id"], text_embeddings))

    # Image embedding of each outfit's main picture.
    images = []
    image_outfit_ids = []

    outfit_col = CFG["picture_outfit_col"]
    fname_col = CFG["picture_filename_col"]

    for oid, file_name in zip(main_pictures[outfit_col], main_pictures[fname_col]):
        # load_image takes the file name only and resolves it against its own
        # image directory; the former two-argument call raised TypeError.
        img = load_image(file_name)
        if img is not None:
            images.append(img)
            image_outfit_ids.append(oid)

    print("Số ảnh encode được:", len(images))
    if images:
        image_embeddings = fclip.encode_images(images, batch_size=32)
        image_emb_map = dict(zip(image_outfit_ids, image_embeddings))
    else:
        image_emb_map = {}

    # Merge text + image: equal-weight average when both exist, otherwise
    # whichever modality is available.
    outfit_embeddings = {}
    for oid in outfits["id"]:
        t = text_emb_map.get(oid)
        v = image_emb_map.get(oid)
        if t is not None and v is not None:
            outfit_embeddings[oid] = 0.5 * t + 0.5 * v
        elif t is not None:
            outfit_embeddings[oid] = t
        elif v is not None:
            outfit_embeddings[oid] = v

    # Report the real destination: the cache is written to OUTFIT_EMB_PATH,
    # not into BASE_PATH as the old message claimed.
    print("=> Lưu", OUTFIT_EMB_PATH)
    np.save(OUTFIT_EMB_PATH, outfit_embeddings)

print("Số outfit có embedding:", len(outfit_embeddings))
# Embedding width is read from the first stored vector.
feat_dim = next(iter(outfit_embeddings.values())).shape[0]
print("Kích thước embedding:", feat_dim)

=> Đang load outfit_embeddings từ /kaggle/working/outfit_embeddings.npy
  - Loaded shape: (15157, 512) | dtype: float32
  - Đã build dict outfit_embeddings từ array: số outfit = 15157
Số outfit có embedding: 15157
Kích thước embedding: 512


Hàm tìm top-K outfit phù hợp với một câu truy vấn văn bản (text query)

In [44]:
import numpy as np

def find_outfits_from_text(query_text, outfit_embeddings, model, topk=10):
    """Rank outfits by cosine similarity between a text query and their embeddings.

    Args:
        query_text: Query string; encoded with
            ``model.encode_text([query_text], batch_size=1)``.
        outfit_embeddings: Mapping ``outfit id -> embedding vector``.
        model: Object exposing ``encode_text`` as used above.
        topk: Maximum number of results.

    Returns:
        List of ``(outfit_id, score)`` tuples sorted by descending score,
        with ``score`` as a Python float.
    """
    def _unit(values, **norm_kwargs):
        # L2-normalise; the small epsilon guards against zero-length vectors.
        return values / (np.linalg.norm(values, **norm_kwargs) + 1e-8)

    # 1) Query embedding, unit length.
    query_vec = model.encode_text([query_text], batch_size=1)[0]
    query_vec = _unit(query_vec.astype(np.float32))

    # 2) Item matrix in dict order, each row unit length.
    outfit_ids = list(outfit_embeddings)
    item_matrix = np.vstack(list(outfit_embeddings.values())).astype(np.float32)
    item_matrix = _unit(item_matrix, axis=1, keepdims=True)

    # 3) Dot products of unit vectors are the cosine similarities.
    scores = item_matrix @ query_vec

    # 4) Best-first selection.
    ranked = np.argsort(-scores)[:topk]
    return [(outfit_ids[idx], float(scores[idx])) for idx in ranked]
def get_images_for_query(query_text, topk=5):
    """Return the main pictures of the outfits that best match a text query.

    Uses the notebook-level ``outfit_embeddings``, ``fclip``, ``main_pictures``,
    ``CFG`` and ``load_image``.

    Args:
        query_text: Text query passed to ``find_outfits_from_text``.
        topk: Number of outfits requested from the ranking.

    Returns:
        List of ``(image, outfit_id, score)`` tuples in ranking order. Outfits
        without a row in ``main_pictures`` or whose image cannot be loaded are
        left out, so the list may be shorter than ``topk``.
    """
    # Same column names as the rest of the picture pipeline (from CFG).
    outfit_col = CFG["picture_outfit_col"]
    fname_col = CFG["picture_filename_col"]

    results = find_outfits_from_text(query_text, outfit_embeddings, fclip, topk=topk)

    pics = []
    for oid, score in results:
        rows = main_pictures[main_pictures[outfit_col] == oid]
        if rows.empty:
            continue
        img = load_image(rows.iloc[0][fname_col])
        if img is None:
            # Missing/unreadable file: skip it instead of handing None to display().
            continue
        pics.append((img, oid, score))
    return pics


In [None]:
from IPython.display import display

# Text query to search for. NOTE(review): it is an empty string here - fill it in
# before running, otherwise the ranking is computed for an empty query.
query = ""
pics = get_images_for_query(query, topk=5)

# Show every returned picture together with its outfit id and similarity score.
for img, oid, score in pics:
    print(f"Outfit: {oid} | score={score:.3f}")
    display(img)


{"timestamp":"2025-12-04T09:50:02.938906Z","level":"WARN","fields":{"message":"Status Code: 429. Retrying...","request_id":""},"filename":"/home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":236}
{"timestamp":"2025-12-04T09:50:02.938940Z","level":"WARN","fields":{"message":"Retry attempt #4. Sleeping 24.820109788s before the next attempt"},"filename":"/root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","line_number":171}
{"timestamp":"2025-12-04T09:50:04.465757Z","level":"WARN","fields":{"message":"Status Code: 429. Retrying...","request_id":""},"filename":"/home/runner/work/xet-core/xet-core/cas_client/src/http_client.rs","line_number":236}
{"timestamp":"2025-12-04T09:50:04.465779Z","level":"WARN","fields":{"message":"Retry attempt #4. Sleeping 23.296358132s before the next attempt"},"filename":"/root/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/reqwest-retry-0.7.0/src/middleware.rs","line_number":171}


# BERT+ImageEmbedding có sẵn

In [None]:
# import sys
# import subprocess
# subprocess.run([sys.executable, "-m", "pip", "install", "-q", "transformers"])

In [None]:
# from transformers import BertTokenizer, BertModel
# import torch
# from tqdm import tqdm

# device = "cuda" if torch.cuda.is_available() else "cpu"

# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# bert_model = BertModel.from_pretrained("bert-base-uncased").to(device)
# bert_model.eval()

# def encode_texts_bert(texts, batch_size=16):
#     embeddings = []
#     for i in range(0, len(texts), batch_size):
#         batch = texts[i:i+batch_size]
#         tokens = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=64).to(device)
#         with torch.no_grad():
#             outputs = bert_model(**tokens)
#             cls_emb = outputs.pooler_output  # vector [CLS]
#             cls_emb = torch.nn.functional.normalize(cls_emb, p=2, dim=1)
#             embeddings.append(cls_emb.cpu().numpy())
#     return np.vstack(embeddings)

# # === Tạo embedding văn bản bằng BERT ===
# texts = outfits["text"].fillna("").tolist()
# bert_text_embeddings = encode_texts_bert(texts, batch_size=16)
# print("BERT embedding shape:", bert_text_embeddings.shape)



In [None]:
# embedding_dir = os.path.join(base_path, "embeddings", "EfficientNet_V2_L_final")
# dataset_image_embeddings = {}
# for _, row in main_pictures.iterrows():
#     picture_id = row["picture.id"]
#     file_path = os.path.join(embedding_dir, f"picture.{picture_id}.npy")
#     if os.path.exists(file_path):
#         try:
#             vec = np.load(file_path)
#             dataset_image_embeddings[row["outfit.id"]] = vec
#         except Exception as e:
#             print(f"Lỗi load {file_path}: {e}")

# CFRS 

Sort giao dịch user theo thời gian và mapping

In [53]:
#CF preprocessing – encode user/item, split train/test, build A_train

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

u_col = CFG["user_col"]
i_col = CFG["item_col"]
t_start = CFG["start_time_col"]
t_end = CFG["end_time_col"]

# 1) Parse timestamps and sort
# Unparseable values become NaT (errors="coerce") and those rows are dropped.
# NOTE: this rewrites `user_activity` in place (columns and the name itself).
for col in [t_start, t_end]:
    user_activity[col] = pd.to_datetime(user_activity[col], errors="coerce", utc=True)
user_activity = user_activity.dropna(subset=[t_start, t_end])
user_activity = user_activity.sort_values([u_col, t_start, t_end])
print("[OK] Parse & sort thời gian.")

# 2) Encode user / item ids as integers (the encoders are fitted on the
#    string form of the raw ids)
user_enc = LabelEncoder().fit(user_activity[u_col].astype(str))
item_enc = LabelEncoder().fit(user_activity[i_col].astype(str))

user_activity["user_id"] = user_enc.transform(user_activity[u_col].astype(str))
user_activity["item_id"] = item_enc.transform(user_activity[i_col].astype(str))

num_users = user_activity["user_id"].nunique()
num_items = user_activity["item_id"].nunique()

# mapping between item_id (matrix index) and the original outfit id (raw value,
# not the string form used for fitting the encoder)
oid_from_item = dict(zip(user_activity["item_id"], user_activity[i_col]))
item_from_oid = {v: k for k, v in oid_from_item.items()}

print(f"[COUNTS] users={num_users} | items={num_items} | interactions={len(user_activity)}")

# 3) Per-user chronological train/test split
TEST_RATIO = 0.2
user_train_items, user_test_items = {}, {}

for u, g in user_activity.groupby("user_id", sort=False):
    g = g.sort_values([t_start, t_end])
    items = list(g["item_id"])
    # Users with fewer than 3 interactions are left out of the split.
    if len(items) < 3:
        continue
    # The last 20% of a user's interactions (at least one) form the test set.
    test_size = max(1, int(len(items) * TEST_RATIO))
    user_train_items[u] = items[:-test_size]
    user_test_items[u] = items[-test_size:]

print(f"[SPLIT] users trong split = {len(user_train_items)}")

# Drop users without a valid test set: test items that never occur in any
# user's train list are removed, and a user with no test item left is removed
# from both dictionaries.
train_item_set = set(i for items in user_train_items.values() for i in items)
for u in list(user_test_items.keys()):
    filtered = [i for i in user_test_items[u] if i in train_item_set]
    if len(filtered) == 0:
        user_test_items.pop(u, None)
        user_train_items.pop(u, None)
    else:
        user_test_items[u] = filtered

print(f"[SPLIT after filter] users = {len(user_train_items)}")

# 4) Build A_train (CSR), the user x item training interaction matrix
rows, cols = [], []
for u, items in user_train_items.items():
    rows.extend([u] * len(items))
    cols.extend(items)

if not rows:
    raise RuntimeError("Không còn tương tác train nào sau khi split.")

# The shape uses all encoded users/items, so users removed above keep an empty row.
A_train = csr_matrix(
    (np.ones(len(rows), dtype=np.float32),
     (np.array(rows), np.array(cols))),
    shape=(num_users, num_items),
    dtype=np.float32
)
# Reset every stored value to 1.0 so the matrix is binary.
A_train.data[:] = 1.0
A_train.eliminate_zeros()

print(f"[MATRIX] A_train shape={A_train.shape} | nnz={A_train.nnz}")


Device: cuda
[OK] Parse & sort thời gian.
[COUNTS] users=2293 | items=10986 | interactions=64419
[SPLIT] users trong split = 1979
[SPLIT after filter] users = 1956
[MATRIX] A_train shape=(2293, 10986) | nnz=47356


In [32]:
#Các hàm cần thiết
class BPRLoss(nn.Module):
    """Bayesian Personalized Ranking loss with L2 regularisation.

    ``forward`` returns ``mean(softplus(-(pos_score - neg_score)))`` plus
    ``reg_lambda`` times the sum of the squared L2 norms of the three
    embedding batches divided by the batch size.
    """

    def __init__(self, reg_lambda=1e-4):
        super().__init__()
        self.reg_lambda = reg_lambda

    def forward(self, user_emb, pos_emb, neg_emb):
        # Dot-product scores of each user with its positive / negative item.
        score_pos = (user_emb * pos_emb).sum(dim=1)
        score_neg = (user_emb * neg_emb).sum(dim=1)
        margin = score_pos - score_neg
        ranking_loss = F.softplus(-margin).mean()

        batch_size = user_emb.size(0)
        squared_norms = sum(
            emb.norm(2).pow(2) for emb in (user_emb, pos_emb, neg_emb)
        )
        penalty = squared_norms / batch_size
        return ranking_loss + self.reg_lambda * penalty


def l2n_t(x, eps=1e-8):
    """Divide every row of ``x`` by its L2 norm (norm taken over dim 1) plus ``eps``."""
    row_norms = x.norm(dim=1, keepdim=True)
    return x / (row_norms + eps)


def sample_triplets(user_train_items, num_items, n_samples_per_user=5):
    """Sample (user, positive item, random negative item) triplets.

    Args:
        user_train_items: Mapping ``user -> list of positive item ids``.
        num_items: Negatives are drawn uniformly from ``[0, num_items)``.
        n_samples_per_user: Triplets drawn for every eligible user.

    Returns:
        Three ``torch.long`` tensors ``(users, pos_items, neg_items)`` of equal
        length. Users with no positives, or whose positives already cover
        every id in ``[0, num_items)``, contribute nothing. If no triplet can
        be drawn at all, the single placeholder triplet ``(0, 0, 1)`` is
        returned.
    """
    users, pos_items, neg_items = [], [], []
    for u, pos_list in user_train_items.items():
        if not pos_list:
            continue
        pos_set = set(pos_list)
        # No id is left to serve as a negative: the rejection loop below would
        # never terminate, so skip the user. The cheap length test short-circuits
        # the full scan for ordinary users.
        if len(pos_set) >= num_items and all(i in pos_set for i in range(num_items)):
            continue
        for _ in range(n_samples_per_user):
            p = random.choice(pos_list)
            # Rejection sampling: redraw until the item is not a positive.
            n = np.random.randint(0, num_items)
            while n in pos_set:
                n = np.random.randint(0, num_items)
            users.append(u)
            pos_items.append(p)
            neg_items.append(n)
    if not users:
        # Placeholder so callers always receive non-empty tensors.
        users, pos_items, neg_items = [0], [0], [1]
    return (
        torch.tensor(users, dtype=torch.long),
        torch.tensor(pos_items, dtype=torch.long),
        torch.tensor(neg_items, dtype=torch.long),
    )


@torch.no_grad()
def build_item_neighbors(item_emb, topk=100, batch=2048):
    """Return, for every item, the indices of its most similar other items.

    Rows of ``item_emb`` are normalised with ``l2n_t`` and compared by dot
    product, ``batch`` rows at a time. An item's similarity to itself is
    overwritten with ``-1e9`` before the top ``min(topk, N - 1)`` indices per
    row are taken.

    Returns:
        NumPy array with one row of neighbour indices per item.
    """
    normed = l2n_t(item_emb)
    n_items, _ = normed.shape
    chunks = []
    for start in range(0, n_items, batch):
        sims = normed[start : start + batch] @ normed.T
        # Mask the diagonal entries belonging to this slice in one indexed write.
        local_rows = torch.arange(sims.shape[0], device=sims.device)
        sims[local_rows, local_rows + start] = -1e9
        neighbour_idx = torch.topk(sims, k=min(topk, n_items - 1), dim=1).indices
        chunks.append(neighbour_idx.cpu().numpy())
    return np.vstack(chunks)


def hard_negative_sampling_from_neighbors(user_train_items, neighbors_idx, n_samples_per_user=5):
    """Sample triplets whose negative is a neighbour of the sampled positive.

    For each user, ``n_samples_per_user`` positives are drawn; the negative is
    picked at random among ``neighbors_idx[positive]`` entries that are not in
    the user's positives. A draw with no such candidate is skipped. If no
    triplet results at all, ``sample_triplets`` is used instead (this
    fallback reads the notebook-level ``num_items``).

    Returns:
        Three ``torch.long`` tensors ``(users, pos_items, neg_items)``.
    """
    triplets = []
    for user, positives in user_train_items.items():
        if not positives:
            continue
        seen = set(positives)
        for _ in range(n_samples_per_user):
            anchor = random.choice(positives)
            candidates = [c for c in neighbors_idx[anchor] if c not in seen]
            if candidates:
                triplets.append((user, anchor, random.choice(candidates)))

    if not triplets:
        return sample_triplets(user_train_items, num_items, n_samples_per_user)

    users = [t[0] for t in triplets]
    pos_items = [t[1] for t in triplets]
    neg_items = [t[2] for t in triplets]
    return (
        torch.tensor(users, dtype=torch.long),
        torch.tensor(pos_items, dtype=torch.long),
        torch.tensor(neg_items, dtype=torch.long),
    )


def mixed_negative_sampling(user_train_items, num_items, item_emb=None, neighbors_idx=None,
                            n_samples_per_user=10, mix=0.5):
    """Choose between neighbour-based and uniform negative sampling for one call.

    When both ``item_emb`` and ``neighbors_idx`` are given, neighbour-based
    sampling is used if a uniform random draw falls below ``mix``; in every
    other case ``sample_triplets`` is used. The random draw only happens when
    both optional arguments are present.
    """
    hard_possible = (item_emb is not None) and (neighbors_idx is not None)
    if hard_possible and np.random.rand() < mix:
        return hard_negative_sampling_from_neighbors(user_train_items, neighbors_idx, n_samples_per_user)
    return sample_triplets(user_train_items, num_items, n_samples_per_user)


def build_norm_adj_sparse(A_csr: csr_matrix) -> sp.csr_matrix:
    """Build the symmetrically normalised bipartite adjacency matrix.

    The (users + items) square matrix ``[[0, R], [R^T, 0]]`` is assembled from
    the interaction matrix ``R`` and scaled to ``D^-1/2 · A · D^-1/2``, where
    ``D`` holds the row sums. Rows with zero degree receive a zero scale.

    Returns:
        The normalised adjacency in CSR format.
    """
    n_users, n_items = A_csr.shape
    interactions = A_csr.tocsr()

    zero_uu = sp.csr_matrix((n_users, n_users), dtype=np.float32)
    zero_ii = sp.csr_matrix((n_items, n_items), dtype=np.float32)
    top_rows = sp.hstack([zero_uu, interactions], format="csr")
    bottom_rows = sp.hstack([interactions.T, zero_ii], format="csr")
    adjacency = sp.vstack([top_rows, bottom_rows], format="csr").astype(np.float32)

    degrees = np.array(adjacency.sum(axis=1)).flatten().astype(np.float32)
    inv_sqrt = np.zeros_like(degrees, dtype=np.float32)
    connected = degrees > 0
    # Only connected nodes get a scale; isolated ones stay at zero.
    inv_sqrt[connected] = np.power(degrees[connected], -0.5, dtype=np.float32)
    scaling = sp.diags(inv_sqrt, format="csr", dtype=np.float32)

    return (scaling @ adjacency @ scaling).tocsr()


def scipy_to_torch_sparse(mat: sp.csr_matrix) -> torch.Tensor:
    """Convert a SciPy sparse matrix into a coalesced torch sparse COO tensor.

    Indices are stored as int64 and values as float32.
    """
    coo = mat.tocoo()
    index_array = np.stack((coo.row, coo.col), axis=0).astype(np.int64)
    value_array = coo.data.astype(np.float32)
    sparse = torch.sparse_coo_tensor(
        torch.from_numpy(index_array),
        torch.from_numpy(value_array),
        coo.shape,
    )
    return sparse.coalesce()


def evaluate_embeddings(user_emb, item_emb, user_train_items, user_test_items, k=10):
    """Compute top-k ranking metrics from user / item embedding matrices.

    For every user in ``user_test_items`` that has at least one held-out item,
    all items are scored by cosine similarity to the user's embedding, items
    seen in training are masked out, and the k best remaining items are
    compared with the held-out set.

    Args:
        user_emb: torch tensor of user embeddings, indexed by user id.
        item_emb: torch tensor of item embeddings, indexed by item id.
        user_train_items: dict user id -> iterable of training item ids.
        user_test_items: dict user id -> iterable of held-out item ids.
        k: ranking cut-off. If it exceeds the number of items, the ranking is
            truncated to the catalogue size (precision still divides by ``k``).

    Returns:
        dict with ``Precision@k``, ``Recall@k``, ``NDCG@k``, ``HitRate@k``
        (means over evaluated users, 0.0 when no user was evaluated) and
        ``Users_eval`` (number of evaluated users).
    """
    U = l2n_t(user_emb.cpu())
    I = l2n_t(item_emb.cpu())
    I_np = I.numpy()

    # np.argpartition raises when the requested rank exceeds the array size,
    # so never ask for more items than exist.
    top_n = min(k, I_np.shape[0])

    precisions, recalls, ndcgs, hit_rates = [], [], [], []

    for u in user_test_items.keys():
        train_items = set(user_train_items.get(u, []))
        test_items = set(user_test_items.get(u, []))
        if not test_items:
            continue

        scores = cosine_similarity(U[u].unsqueeze(0).numpy(), I_np)[0]
        if train_items:
            # Push already-seen items to the bottom of the ranking.
            scores[list(train_items)] = -1e9

        # Unordered top-n via partial sort, then order those n by descending score.
        top_k = np.argpartition(scores, -top_n)[-top_n:]
        top_k = top_k[np.argsort(scores[top_k])[::-1]]

        hits = len(set(top_k) & test_items)

        # ---- Precision / Recall ----
        precision = hits / k
        recall = hits / len(test_items)

        # ---- NDCG with binary relevance ----
        dcg = sum(1 / np.log2(i + 2) for i, iid in enumerate(top_k) if iid in test_items)
        idcg = sum(1 / np.log2(i + 2) for i in range(min(len(test_items), k)))
        ndcg = dcg / idcg if idcg > 0 else 0

        # ---- HitRate@K = 1 if there is at least one hit ----
        hit = 1.0 if hits > 0 else 0.0

        precisions.append(precision)
        recalls.append(recall)
        ndcgs.append(ndcg)
        hit_rates.append(hit)

    # np.mean([]) would return NaN (with a RuntimeWarning); report 0.0 instead,
    # matching the other evaluators in this notebook.
    return {
        f"Precision@{k}": float(np.mean(precisions)) if precisions else 0.0,
        f"Recall@{k}": float(np.mean(recalls)) if recalls else 0.0,
        f"NDCG@{k}": float(np.mean(ndcgs)) if ndcgs else 0.0,
        f"HitRate@{k}": float(np.mean(hit_rates)) if hit_rates else 0.0,
        "Users_eval": int(len(precisions)),
    }



In [60]:
# Item-item similarity from FashionCLIP embeddings + build norm_adj_dev.
# Relies on names defined in earlier cells (not visible in this cell):
# outfit_embeddings, oid_from_item, num_items, user_train_items,
# user_test_items and device.

# Dense item feature matrix; a row stays all-zero when the item has no embedding.
feat_dim = next(iter(outfit_embeddings.values())).shape[0]
item_feat = np.zeros((num_items, feat_dim), dtype=np.float32)
has_emb = np.zeros(num_items, dtype=bool)

for item_id, oid in oid_from_item.items():
    v = outfit_embeddings.get(oid)
    if v is not None:
        item_feat[item_id] = v.astype(np.float32)
        has_emb[item_id] = True

print("Số item có embedding CLIP:", has_emb.sum(), "/", num_items)

# Row-wise L2 normalisation; the epsilon keeps all-zero rows at zero instead of NaN.
item_feat = item_feat / (np.linalg.norm(item_feat, axis=1, keepdims=True) + 1e-8)

# Dense similarity (sparsified to top-K right below).
# NOTE(review): this materialises a dense num_items x num_items float32 matrix.
sim_item_item = item_feat @ item_feat.T
print("Shape sim_item_item:", sim_item_item.shape)

# Keep, for every item, at most K neighbours with strictly positive similarity.
K = 50
rows, cols, vals = [], [], []
for i in range(num_items):
    row = sim_item_item[i]
    # argpartition on the negated row puts the K+1 largest similarities (unordered)
    # in the first K+1 slots; the extra slot leaves room to drop the item itself.
    topk_idx = np.argpartition(-row, min(K + 1, num_items - 1))[: K + 1]
    topk_idx = topk_idx[topk_idx != i]
    topk_idx = topk_idx[:K]
    for j in topk_idx:
        sim_ij = float(row[j])
        if sim_ij > 0:
            rows.append(i)
            cols.append(j)
            vals.append(sim_ij)

S_sparse = csr_matrix((vals, (rows, cols)), shape=(num_items, num_items), dtype=np.float32)
print("S_sparse: shape =", S_sparse.shape,
      "| nnz =", S_sparse.nnz,
      "| avg neighbors per item ~", S_sparse.nnz / num_items)

# Filter users + remap ids, build A_train_new and norm_adj_dev (reused across datasets).
# A user is kept only with at least MIN_TRAIN train items and MIN_TEST test items.
MIN_TRAIN = 3
MIN_TEST = 1

kept_users = [
    u for u in user_train_items.keys()
    if len(user_train_items[u]) >= MIN_TRAIN and len(user_test_items.get(u, [])) >= MIN_TEST
]
print(f"[FILTER] kept users: {len(kept_users)} / {len(user_train_items)}")

# Contiguous new ids 0..num_users_new-1, assigned in sorted order of the old ids.
old2new_user = {u_old: i for i, u_old in enumerate(sorted(kept_users))}
new2old_user = {i: u_old for u_old, i in old2new_user.items()}
num_users_new = len(old2new_user)

user_train_items_new, user_test_items_new = {}, {}
for u_old in kept_users:
    u_new = old2new_user[u_old]
    user_train_items_new[u_new] = list(user_train_items[u_old])
    user_test_items_new[u_new] = list(user_test_items[u_old])

# Coordinate lists (user row, item column) of the training interactions.
# Note: this rebinds `rows` / `cols` used above for S_sparse.
rows, cols = [], []
for u_new, items in user_train_items_new.items():
    rows.extend([u_new] * len(items))
    cols.extend(items)

A_train_new = csr_matrix(
    (np.ones(len(rows), dtype=np.float32), (np.array(rows), np.array(cols))),
    shape=(num_users_new, num_items),
    dtype=np.float32
)
# Reset every stored value to 1.0 so the matrix is binary even if a
# (user, item) pair occurred more than once.
A_train_new.data[:] = 1.0
A_train_new.eliminate_zeros()
print(f"[MATRIX] A_train_new shape={A_train_new.shape} | nnz={A_train_new.nnz}")

# "Soft" interactions: the observed matrix plus lambda_s times the interactions
# propagated through the item-item similarity graph.
lambda_s = 0.3
A_train_soft = A_train_new @ S_sparse
A_train_soft = A_train_new + lambda_s * A_train_soft
A_train_soft.eliminate_zeros()

# Matrix actually used for the graph: currently the binary one, so A_train_soft
# above is computed but unused unless the alternative line is enabled.
A_for_adj = A_train_new      
# A_for_adj = A_train_soft

norm_adj_sp_new = build_norm_adj_sparse(A_for_adj)
norm_adj_new = scipy_to_torch_sparse(norm_adj_sp_new)

# Rebind the notebook-level names to the filtered / remapped versions. From here
# on user ids are the NEW ids; results produced by cells executed before this
# one refer to the old user set.
user_train_items = user_train_items_new
user_test_items = user_test_items_new
A_train = A_train_new
num_users = num_users_new

norm_adj_dev = norm_adj_new.to(device)
print("[READY] norm_adj_dev shape:", norm_adj_dev.shape,
      "| num_users + num_items =", num_users + num_items)


Số item có embedding CLIP: 10980 / 10986
Shape sim_item_item: (10986, 10986)
S_sparse: shape = (10986, 10986) | nnz = 549000 | avg neighbors per item ~ 49.97269251774986
[FILTER] kept users: 1828 / 1956
[MATRIX] A_train_new shape=(1828, 10986) | nnz=47102
[READY] norm_adj_dev shape: torch.Size([12814, 12814]) | num_users + num_items = 12814


In [38]:
class NGCF(nn.Module):
    """NGCF-style graph encoder over a joint user + item node set.

    One embedding table holds users first (rows ``0..n_users-1``) and items
    after them. Each propagation layer combines a linear map of
    ``(neighbourhood + self)`` with a linear map of the element-wise product
    ``(neighbourhood * self)``, applies LeakyReLU(0.2), dropout and row-wise
    L2 normalisation. ``forward`` returns the concatenation of the raw
    embeddings and every layer output, i.e. ``dim * (layers + 1)`` columns.
    """

    def __init__(self, n_users, n_items, dim=128, layers=1, dropout=0.2):
        super().__init__()
        self.n_users, self.n_items = n_users, n_items
        self.dim, self.layers = dim, layers

        self.embedding = nn.Embedding(n_users + n_items, dim)
        nn.init.xavier_uniform_(self.embedding.weight)

        # All W1 layers are created before any W2 layer (keeps init order stable).
        sum_transforms = [nn.Linear(dim, dim) for _ in range(layers)]
        product_transforms = [nn.Linear(dim, dim) for _ in range(layers)]
        self.W1 = nn.ModuleList(sum_transforms)
        self.W2 = nn.ModuleList(product_transforms)
        self.dropout = nn.Dropout(dropout)

    def forward(self, norm_adj):
        """Propagate over ``norm_adj`` (sparse, nodes x nodes) and return all layers concatenated."""
        hidden = self.embedding.weight
        layer_outputs = [hidden]
        for depth in range(self.layers):
            neighbourhood = torch.sparse.mm(norm_adj, hidden)
            additive = self.W1[depth](neighbourhood + hidden)
            interaction = self.W2[depth](neighbourhood * hidden)
            activated = F.leaky_relu(additive + interaction, 0.2)
            hidden = F.normalize(self.dropout(activated), dim=1)
            layer_outputs.append(hidden)
        return torch.cat(layer_outputs, dim=1)



# ---- NGCF training: BPR loss with a mix of uniform and hard negatives ----
ngcf = NGCF(num_users, num_items, dim=128, layers=2, dropout=0.1).to(device)
optimizer = torch.optim.AdamW(ngcf.parameters(), lr=0.01, weight_decay=1e-4)
bpr = BPRLoss(reg_lambda=1e-4)

epochs = 200
refresh_every = 5          # rebuild the hard-negative neighbour lists every N epochs
n_samples_per_user = 20
hard_ratio = 0.3           # probability that an epoch uses hard negatives

loss_history_NGCF = []
neighbors_idx = None
item_emb_cache = None

print("Training NGCF ...")
for epoch in range(epochs):
    # Refresh the item-neighbour cache only when it is due. Previously a full
    # no-grad forward pass ran on every epoch although its result was only
    # consumed on refresh epochs, and it ran in train mode, so the neighbours
    # were mined from dropout-perturbed embeddings.
    if epoch % refresh_every == 0:
        ngcf.eval()
        with torch.no_grad():
            emb_cache = ngcf(norm_adj_dev)
            item_emb_cache = emb_cache[num_users:]
            neighbors_idx = build_item_neighbors(item_emb_cache, topk=100, batch=1024)

    ngcf.train()

    # item_emb_cache is only used by the sampler as an "embeddings available" flag.
    users, pos_items, neg_items = mixed_negative_sampling(
        user_train_items,
        num_items,
        item_emb=item_emb_cache,
        neighbors_idx=neighbors_idx,
        n_samples_per_user=n_samples_per_user,
        mix=hard_ratio,
    )
    users = users.to(device)
    pos_items = pos_items.to(device)
    neg_items = neg_items.to(device)

    # Full-graph forward pass; the first num_users rows are users, the rest items.
    emb = ngcf(norm_adj_dev)
    u_emb = emb[:num_users]
    i_emb = emb[num_users:]

    user_batch_emb = u_emb[users]
    pos_emb = i_emb[pos_items]
    neg_emb = i_emb[neg_items]

    loss = bpr(user_batch_emb, pos_emb, neg_emb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    loss_history_NGCF.append(loss.item())

    # Periodic evaluation on the held-out items (dropout disabled).
    if (epoch + 1) % 5 == 0:
        ngcf.eval()
        with torch.no_grad():
            emb_eval = ngcf(norm_adj_dev)
            u_eval = emb_eval[:num_users]
            i_eval = emb_eval[num_users:]
            metrics = evaluate_embeddings(
                u_eval, i_eval, user_train_items, user_test_items, k=10
            )
        print(
            f"Epoch {epoch+1:03d} | loss={loss.item():.4f} | "
            f"HR@10={metrics['HitRate@10']:.4f} | "
            f"Recall@10={metrics['Recall@10']:.4f} | "
            f"NDCG@10={metrics['NDCG@10']:.4f} | "
            f"Precision@10={metrics['Precision@10']:.4f} | "
            f"Users_eval={metrics['Users_eval']}"
        )

# Final embeddings used by the downstream cells.
ngcf.eval()
with torch.no_grad():
    emb_final = ngcf(norm_adj_dev)
    user_emb_ngcf = emb_final[:num_users]
    item_emb_ngcf = emb_final[num_users:]

print("Done. user_emb_ngcf:", user_emb_ngcf.shape,
      "| item_emb_ngcf:", item_emb_ngcf.shape)

Training NGCF ...
Epoch 005 | loss=0.4054 | HR@10=0.0126 | Recall@10=0.0028 | NDCG@10=0.0019 | Precision@10=0.0013 | Users_eval=1828
Epoch 010 | loss=0.2931 | HR@10=0.0159 | Recall@10=0.0031 | NDCG@10=0.0030 | Precision@10=0.0016 | Users_eval=1828
Epoch 015 | loss=0.4682 | HR@10=0.0181 | Recall@10=0.0040 | NDCG@10=0.0031 | Precision@10=0.0019 | Users_eval=1828
Epoch 020 | loss=0.1993 | HR@10=0.0202 | Recall@10=0.0050 | NDCG@10=0.0037 | Precision@10=0.0020 | Users_eval=1828
Epoch 025 | loss=0.4036 | HR@10=0.0235 | Recall@10=0.0055 | NDCG@10=0.0044 | Precision@10=0.0024 | Users_eval=1828
Epoch 030 | loss=0.1396 | HR@10=0.0208 | Recall@10=0.0050 | NDCG@10=0.0041 | Precision@10=0.0021 | Users_eval=1828
Epoch 035 | loss=0.3092 | HR@10=0.0263 | Recall@10=0.0048 | NDCG@10=0.0043 | Precision@10=0.0026 | Users_eval=1828
Epoch 155 | loss=0.0107 | HR@10=0.0186 | Recall@10=0.0035 | NDCG@10=0.0031 | Precision@10=0.0020 | Users_eval=1828
Epoch 160 | loss=0.0356 | HR@10=0.0197 | Recall@10=0.0036 | ND

In [54]:
# Sanity check: shapes of the final NGCF user / item embedding matrices.
print(
    "Done. user_emb_ngcf:", user_emb_ngcf.shape,
    "| item_emb_ngcf:", item_emb_ngcf.shape,
)

Done. user_emb_ngcf: torch.Size([1828, 384]) | item_emb_ngcf: torch.Size([10986, 384])


# CBFRS (Content-Based Filtering Recommender System)

In [55]:
# ==== CBF: prepare the item content embeddings (from CLIP) ====
import torch
import torch.nn.functional as F
import numpy as np

# Inputs built in the similarity cell:
#   item_feat: np.ndarray [num_items, feat_dim]
#   has_emb:   np.ndarray [num_items] bool

# Boolean mask of items that have a content embedding.
has_emb_t = torch.from_numpy(has_emb).to(device)               # [num_items]

# Content features on the target device, L2-normalised per row.
item_feat_t = F.normalize(
    torch.from_numpy(item_feat).float().to(device),            # [num_items, D]
    p=2,
    dim=1,
)

print("CBF item_feat_t:", item_feat_t.shape, "| has_emb_t:", has_emb_t.shape)

CBF item_feat_t: torch.Size([10986, 512]) | has_emb_t: torch.Size([10986])


In [56]:
# ==== CBF: build user profile + recommend + evaluate (same train/test split as CF) ====

def build_user_profile_cbf(u, user_train_items, item_feat_t, has_emb_t=None):
    """Build a content profile for user ``u``.

    The profile is the L2-normalised mean of the content embeddings of the
    items the user interacted with in TRAIN.

    Args:
        u: encoded user id, used as key of ``user_train_items``.
        user_train_items: dict user id -> list of item ids.
        item_feat_t: torch.Tensor [num_items, D] of item content embeddings.
        has_emb_t: optional torch.BoolTensor [num_items]; when given, items
            flagged False are ignored.

    Returns:
        torch.Tensor [D], or None when the user has no (usable) train items.
    """
    history = user_train_items.get(u, [])
    if not history:
        return None

    item_idx = torch.tensor(history, dtype=torch.long, device=item_feat_t.device)

    if has_emb_t is not None:
        # Keep only the items that actually have a content embedding.
        item_idx = item_idx[has_emb_t[item_idx]]
        if item_idx.numel() == 0:
            return None

    mean_vec = item_feat_t[item_idx].mean(dim=0)   # [D]
    return F.normalize(mean_vec, p=2, dim=0)


def recommend_cbf_for_user_cf_idx(
    u,
    user_train_items,
    item_feat_t,
    has_emb_t=None,
    topk=10,
    exclude_seen=True,
):
    """Pure content-based recommendation for user ``u`` (encoded id).

    Items are scored by the dot product between their content embedding and
    the user's content profile (a cosine when the embeddings are normalised).
    Optionally masks items seen in train and items without an embedding.

    Returns:
        list of ``(item_id, score)`` tuples, best first; empty when no
        profile can be built for the user.
    """
    profile = build_user_profile_cbf(u, user_train_items, item_feat_t, has_emb_t)
    if profile is None:
        return []

    scores = item_feat_t @ profile   # [num_items]

    # Drop the items already seen in train, if requested.
    already_seen = set(user_train_items.get(u, [])) if exclude_seen else set()
    if already_seen:
        seen_idx = torch.tensor(list(already_seen), dtype=torch.long, device=item_feat_t.device)
        scores[seen_idx] = -1e9

    # Drop the items that have no content embedding.
    if has_emb_t is not None:
        scores[~has_emb_t] = -1e9

    n_out = min(topk, scores.shape[0])
    best = torch.topk(scores, k=n_out)
    return list(zip(best.indices.tolist(), best.values.tolist()))


def evaluate_cbf_cf_idx(
    item_feat_t,
    user_train_items,
    user_test_items,
    has_emb_t=None,
    k=10,
):
    """Evaluate the content-only recommender on the same train/test split as CF.

    Users without held-out items, or for whom no content profile can be
    built, are skipped.

    Returns:
        dict with Precision@k, Recall@k, NDCG@k, HitRate@k (means over the
        evaluated users, 0.0 when none) and ``Users_eval``.
    """
    feats = F.normalize(item_feat_t, p=2, dim=1)
    dev = item_feat_t.device

    precisions, recalls, ndcgs, hit_rates = [], [], [], []

    for u in user_test_items.keys():
        relevant = set(user_test_items.get(u, []))
        if not relevant:
            continue

        profile = build_user_profile_cbf(u, user_train_items, feats, has_emb_t)
        if profile is None:
            continue

        scores = feats @ profile   # [num_items]

        # Mask train items, then items without content.
        seen = set(user_train_items.get(u, []))
        if seen:
            scores[torch.tensor(list(seen), dtype=torch.long, device=dev)] = -1e9
        if has_emb_t is not None:
            scores[~has_emb_t] = -1e9

        ranked = torch.topk(scores, k=min(k, scores.shape[0])).indices.cpu().numpy()
        n_hits = len(set(ranked) & relevant)

        # NDCG with binary relevance.
        gains = [1.0 / np.log2(pos + 2) for pos, iid in enumerate(ranked) if iid in relevant]
        dcg = sum(gains, 0.0)
        idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant), k)))

        precisions.append(n_hits / k)
        recalls.append(n_hits / len(relevant))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)
        hit_rates.append(1.0 if n_hits > 0 else 0.0)

    def _mean(values):
        return float(np.mean(values)) if values else 0.0

    return {
        f"Precision@{k}": _mean(precisions),
        f"Recall@{k}": _mean(recalls),
        f"NDCG@{k}": _mean(ndcgs),
        f"HitRate@{k}": _mean(hit_rates),
        "Users_eval": int(len(precisions)),
    }

In [57]:
# Evaluate the content-only recommender on the shared train/test split.
metrics_cbf = evaluate_cbf_cf_idx(
    item_feat_t, user_train_items, user_test_items, has_emb_t=has_emb_t, k=10
)
print("CBF only:", metrics_cbf)

# Show a few content-based recommendations for the first user of the test split.
u_test = list(user_test_items.keys())[0]
print("Test user:", u_test)
print(
    "CBF recs:",
    recommend_cbf_for_user_cf_idx(
        u_test,
        user_train_items,
        item_feat_t,
        has_emb_t=has_emb_t,
        topk=5,
        exclude_seen=True,
    ),
)

CBF only: {'Precision@10': 0.002658486707566462, 'Recall@10': 0.008637337331332075, 'NDCG@10': 0.006343949110171408, 'HitRate@10': 0.02607361963190184, 'Users_eval': 1956}
Test user: 706
CBF recs: [(8280, 0.8371842503547668), (667, 0.8301780819892883), (1078, 0.8289486169815063), (6199, 0.8246238231658936), (10848, 0.8242899775505066)]


In [58]:
# ==== Ensemble CF (NGCF) + CBF on the same train/test split ====

def evaluate_cf_cbf_ensemble(
    user_emb_cf,          # user_emb_ngcf: [num_users, D_cf]
    item_emb_cf,          # item_emb_ngcf: [num_items, D_cf]
    item_feat_t,          # content embedding: [num_items, D_cb]
    user_train_items,
    user_test_items,
    has_emb_t=None,
    alpha_cf=0.7,         # weight of the CF score
    alpha_cb=0.3,         # weight of the CBF score
    k=10,
):
    """Evaluate a weighted blend of CF and content-based cosine scores.

    For each user with held-out items the score of an item is
    ``alpha_cf * cos(user_cf, item_cf) + alpha_cb * cos(profile_cb, item_cb)``;
    the content part is zero when no content profile can be built. Train
    items and (if ``has_emb_t`` is given) items without content are masked.

    Returns:
        dict with Precision@k, Recall@k, NDCG@k, HitRate@k (means over the
        evaluated users, 0.0 when none) and ``Users_eval``.
    """
    # L2-normalise both embedding spaces so dot products are cosines.
    users_cf = F.normalize(user_emb_cf, dim=1)       # [num_users, D_cf]
    items_cf = F.normalize(item_emb_cf, dim=1)       # [num_items, D_cf]
    items_cb = F.normalize(item_feat_t, dim=1)       # [num_items, D_cb]
    dev = item_feat_t.device

    precisions, recalls, ndcgs, hit_rates = [], [], [], []

    for u in user_test_items.keys():
        relevant = set(user_test_items.get(u, []))
        if not relevant:
            continue

        # Collaborative part.
        cf_part = items_cf @ users_cf[u]             # [num_items]

        # Content part (all zeros when the user has no usable profile).
        profile = build_user_profile_cbf(u, user_train_items, items_cb, has_emb_t)
        cb_part = torch.zeros_like(cf_part) if profile is None else items_cb @ profile

        scores = alpha_cf * cf_part + alpha_cb * cb_part

        # Mask train items, then items without content.
        seen = set(user_train_items.get(u, []))
        if seen:
            scores[torch.tensor(list(seen), dtype=torch.long, device=dev)] = -1e9
        if has_emb_t is not None:
            scores[~has_emb_t] = -1e9

        ranked = torch.topk(scores, k=min(k, scores.shape[0])).indices.cpu().numpy()
        n_hits = len(set(ranked) & relevant)

        # NDCG with binary relevance.
        gains = [1.0 / np.log2(pos + 2) for pos, iid in enumerate(ranked) if iid in relevant]
        dcg = sum(gains, 0.0)
        idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant), k)))

        precisions.append(n_hits / k)
        recalls.append(n_hits / len(relevant))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)
        hit_rates.append(1.0 if n_hits > 0 else 0.0)

    def _mean(values):
        return float(np.mean(values)) if values else 0.0

    return {
        f"Precision@{k}": _mean(precisions),
        f"Recall@{k}": _mean(recalls),
        f"NDCG@{k}": _mean(ndcgs),
        f"HitRate@{k}": _mean(hit_rates),
        "Users_eval": int(len(precisions)),
    }

In [61]:
# Side-by-side comparison of the three recommenders on the same split (k = 10).

# CF-only
metrics_cf = evaluate_embeddings(user_emb_ngcf, item_emb_ngcf, user_train_items, user_test_items, k=10)
print("NGCF only:", metrics_cf)

# CBF-only
metrics_cbf = evaluate_cbf_cf_idx(item_feat_t, user_train_items, user_test_items, has_emb_t=has_emb_t, k=10)
print("CBF only:", metrics_cbf)

# Ensemble CF + CBF
metrics_ens = evaluate_cf_cbf_ensemble(
    user_emb_ngcf,
    item_emb_ngcf,
    item_feat_t,
    user_train_items,
    user_test_items,
    has_emb_t=has_emb_t,
    alpha_cf=0.7,  # other values worth trying: 0.6, 0.7, 0.8
    alpha_cb=0.3,
    k=10,
)
print("Ensemble CF+CBF:", metrics_ens)


NGCF only: {'Precision@10': 0.0020787746170678337, 'Recall@10': 0.003079139168671969, 'NDCG@10': 0.003288297209876962, 'HitRate@10': 0.019693654266958426, 'Users_eval': 1828}
CBF only: {'Precision@10': 0.0026258205689277904, 'Recall@10': 0.007053956137902374, 'NDCG@10': 0.005932015630429195, 'HitRate@10': 0.025711159737417943, 'Users_eval': 1828}
Ensemble CF+CBF: {'Precision@10': 0.0030634573304157554, 'Recall@10': 0.005868011641396695, 'NDCG@10': 0.004975186707981191, 'HitRate@10': 0.02899343544857768, 'Users_eval': 1828}


In [62]:
# Popularity: column sums of A_train (interactions per item), scaled by the maximum.
item_pop = np.asarray(A_train.sum(axis=0)).flatten().astype(np.float32)
item_pop_norm = (
    item_pop / (item_pop.max() + 1e-8)
    if item_pop.max() > 0
    else np.zeros_like(item_pop, dtype=np.float32)   # no interactions at all
)

item_pop_t = torch.from_numpy(item_pop_norm).float().to(device)

In [63]:
def recommend_popular(topk=10, exclude_items=None):
    """Return the ``topk`` most popular items as ``(item_id, score)`` tuples.

    Scores come from the notebook-level ``item_pop_t`` tensor, which is left
    untouched (a copy is masked). Items in ``exclude_items`` are pushed to
    the bottom of the ranking.
    """
    scores = item_pop_t.clone()
    if exclude_items:
        banned = torch.tensor(list(exclude_items), dtype=torch.long, device=item_pop_t.device)
        scores[banned] = -1e9
    best = torch.topk(scores, k=min(topk, scores.shape[0]))
    return list(zip(best.indices.tolist(), best.values.tolist()))

In [64]:
def recommend_for_user(
    u,
    user_train_items,
    user_emb_cf,
    item_emb_cf,
    item_feat_t,
    has_emb_t=None,
    topk=10,
    min_interactions_cf=3,
):
    """Hybrid CF + content recommendation with a popularity fallback.

    Args:
        u: encoded user id, used both as key of ``user_train_items`` and as
            row index into ``user_emb_cf``.
        user_train_items: dict user id -> list of train item ids.
        user_emb_cf / item_emb_cf: CF embedding matrices.
        item_feat_t: item content embeddings.
        has_emb_t: optional bool mask; items flagged False are never recommended.
        topk: number of recommendations.
        min_interactions_cf: history length from which CF gets the larger weight.

    Returns:
        list of ``(item_id, score)`` tuples, best first.
    """
    history = user_train_items.get(u, [])
    n_hist = len(history)

    # Fully cold user (rare offline, common online): fall back to popularity.
    if n_hist == 0:
        return recommend_popular(topk=topk, exclude_items=None)

    # Thin history -> lean on content; otherwise CF dominates.
    alpha_cf, alpha_cb = (0.3, 0.7) if n_hist < min_interactions_cf else (0.7, 0.3)

    # Cosine scores in both embedding spaces.
    users_cf = F.normalize(user_emb_cf, dim=1)
    items_cf = F.normalize(item_emb_cf, dim=1)
    items_cb = F.normalize(item_feat_t, p=2, dim=1)

    cf_part = items_cf @ users_cf[u]   # [num_items]

    profile = build_user_profile_cbf(u, user_train_items, items_cb, has_emb_t)
    cb_part = torch.zeros_like(cf_part) if profile is None else items_cb @ profile

    scores = alpha_cf * cf_part + alpha_cb * cb_part

    # Mask the items already seen (history is non-empty at this point).
    seen_idx = torch.tensor(list(set(history)), dtype=torch.long, device=item_feat_t.device)
    scores[seen_idx] = -1e9

    # Never recommend items without content when a mask is supplied.
    if has_emb_t is not None:
        scores[~has_emb_t] = -1e9

    # Very new users additionally get a popularity boost.
    if n_hist < 2:
        gamma = 0.2
        scores = scores + gamma * item_pop_t

    best = torch.topk(scores, k=min(topk, scores.shape[0]))
    return list(zip(best.indices.tolist(), best.values.tolist()))

In [78]:
import random

# Pick a random evaluation user. random.randint is inclusive on BOTH ends, so
# the former randint(1, 1828) could index one past the end of the user list
# (IndexError) and could never select index 0. Deriving the range from the
# list itself also removes the hard-coded user count.
test_user_ids = list(user_test_items.keys())
i = random.randrange(len(test_user_ids))
u_test = test_user_ids[i]

recs = recommend_for_user(
    u_test,
    user_train_items,
    user_emb_ngcf,
    item_emb_ngcf,
    item_feat_t,
    has_emb_t=has_emb_t,
    topk=10,
    min_interactions_cf=3,
)
print("Hybrid recs for user", u_test, ":", recs[:5])

Hybrid recs for user 524 : [(9230, 0.49658119678497314), (6189, 0.48777157068252563), (2181, 0.4794938564300537), (5430, 0.4740217924118042), (2944, 0.4693940579891205)]
