In [11]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
review = pd.read_csv("./data_full/final/home_life_reviews.csv")

# chỉ giữ CF columns
df_cf = review[['customer_id', 'product_id', 'rating']].dropna()
df_cf['customer_id'] = df_cf['customer_id'].astype(int)
df_cf['product_id'] = df_cf['product_id'].astype(int)


In [7]:
df_cf

Unnamed: 0,customer_id,product_id,rating
0,30416766,101995256,5
1,8366209,101995256,5
2,29332709,101995256,4
3,16704174,101995256,4
4,28449953,101995256,5
...,...,...,...
53004,20891998,137226640,5
53005,11103373,137226640,4
53006,17764292,137226640,4
53007,8917895,192850357,5


In [8]:
df_cf.info()

<class 'pandas.core.frame.DataFrame'>
Index: 52950 entries, 0 to 53008
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   customer_id  52950 non-null  int32
 1   product_id   52950 non-null  int32
 2   rating       52950 non-null  int64
dtypes: int32(2), int64(1)
memory usage: 1.2 MB


In [9]:
df_cf.isnull().sum()

customer_id    0
product_id     0
rating         0
dtype: int64

In [None]:
def split_by_user(df, min_items=3, seed=42):
    rng = np.random.default_rng(seed)
    train_parts, test_parts = [], []

    for u, g in df.groupby('customer_id'):
        if len(g) < min_items:
            continue
        test_idx = rng.choice(g.index, size=1, replace=False)
        test_parts.append(g.loc[test_idx])
        train_parts.append(g.drop(test_idx))

    return pd.concat(train_parts), pd.concat(test_parts)

train_df, test_df = split_by_user(df_cf)

In [None]:
train_ui = train_df.pivot_table(
    index='customer_id',
    columns='product_id',
    values='rating',
    aggfunc='mean'
)

# Mean-centering theo user
user_mean = train_ui.mean(axis=1)
train_centered = train_ui.sub(user_mean, axis=0).fillna(0)

In [14]:
# User similarity
user_sim = pd.DataFrame(
    cosine_similarity(train_centered),
    index=train_ui.index,
    columns=train_ui.index
)

# Item similarity
item_sim = pd.DataFrame(
    cosine_similarity(train_centered.T),
    index=train_ui.columns,
    columns=train_ui.columns
)

# USER-BASED CF

In [15]:
def predict_user_based(user_id, item_id, train_ui, user_sim, k=30):
    if user_id not in train_ui.index or item_id not in train_ui.columns:
        return np.nan

    raters = train_ui[item_id].dropna()
    if raters.empty:
        return np.nan

    sims = user_sim.loc[user_id, raters.index]
    top = sims.sort_values(ascending=False).head(k)

    denom = np.sum(np.abs(top.values))
    if denom == 0:
        return np.nan

    return float(np.dot(top.values, raters.loc[top.index].values) / denom)

In [16]:
def recommend_user_based(
    user_id, candidates, train_ui, user_sim, k_neighbors=30
):
    scores = []
    for item_id in candidates:
        pred = predict_user_based(
            user_id, item_id, train_ui, user_sim, k=k_neighbors
        )
        if not np.isnan(pred):
            scores.append((item_id, pred))

    scores.sort(key=lambda x: x[1], reverse=True)
    return [i for i, _ in scores]


# ITEM-BASED CF

In [17]:
def predict_item_based(user_id, item_id, train_ui, item_sim, k=30):
    if user_id not in train_ui.index or item_id not in train_ui.columns:
        return np.nan

    user_ratings = train_ui.loc[user_id].dropna()
    if user_ratings.empty:
        return np.nan

    sims = item_sim.loc[item_id, user_ratings.index]
    top = sims.sort_values(ascending=False).head(k)

    denom = np.sum(np.abs(top.values))
    if denom == 0:
        return np.nan

    return float(np.dot(top.values, user_ratings.loc[top.index].values) / denom)


In [18]:
def recommend_item_based(
    user_id, candidates, train_ui, item_sim, k_neighbors=30
):
    scores = []
    for item_id in candidates:
        pred = predict_item_based(
            user_id, item_id, train_ui, item_sim, k=k_neighbors
        )
        if not np.isnan(pred):
            scores.append((item_id, pred))

    scores.sort(key=lambda x: x[1], reverse=True)
    return [i for i, _ in scores]


In [19]:
def sample_candidates(all_items, rated_items, true_item, n_neg=99, seed=42):
    rng = np.random.default_rng(seed)
    pool = list(set(all_items) - set(rated_items) - {true_item})
    neg = rng.choice(pool, size=min(n_neg, len(pool)), replace=False).tolist()
    return [true_item] + neg


In [20]:
def precision_at_k(pred, true, k):
    return len(set(pred[:k]) & set(true)) / k

def recall_at_k(pred, true, k):
    return len(set(pred[:k]) & set(true)) / len(true) if len(true) else 0.0

def f1_at_k(pred, true, k):
    p, r = precision_at_k(pred,true,k), recall_at_k(pred,true,k)
    return 2*p*r/(p+r) if (p+r)>0 else 0.0

def mrr(pred, true):
    for i,x in enumerate(pred,1):
        if x in true:
            return 1/i
    return 0.0

def ndcg(pred, true, k):
    dcg = sum(1/np.log2(i+1) for i,x in enumerate(pred[:k],1) if x in true)
    idcg = sum(1/np.log2(i+1) for i in range(1,min(len(true),k)+1))
    return dcg/idcg if idcg>0 else 0.0


In [21]:
def evaluate_cf(
    mode, train_df, test_df, train_ui,
    user_sim, item_sim,
    k_neighbors=30, top_k=10, n_neg=99
):
    all_items = train_ui.columns.tolist()
    results = []

    for _, row in test_df.iterrows():
        u = row['customer_id']
        true_item = row['product_id']

        if u not in train_ui.index:
            continue

        rated_items = train_df[train_df['customer_id']==u]['product_id'].tolist()
        candidates = sample_candidates(all_items, rated_items, true_item, n_neg)

        if mode == "user":
            ranked = recommend_user_based(
                u, candidates, train_ui, user_sim, k_neighbors
            )
        else:
            ranked = recommend_item_based(
                u, candidates, train_ui, item_sim, k_neighbors
            )

        results.append({
            "Precision@K": precision_at_k(ranked,[true_item],top_k),
            "Recall@K": recall_at_k(ranked,[true_item],top_k),
            "F1@K": f1_at_k(ranked,[true_item],top_k),
            "MRR": mrr(ranked,[true_item]),
            "NDCG": ndcg(ranked,[true_item],top_k)
        })

    return pd.DataFrame(results)


In [22]:
df_user = evaluate_cf(
    "user", train_df, test_df, train_ui,
    user_sim, item_sim
)

df_item = evaluate_cf(
    "item", train_df, test_df, train_ui,
    user_sim, item_sim
)

print("USER-BASED CF")
print(df_user.mean())

print("\nITEM-BASED CF")
print(df_item.mean())


USER-BASED CF
Precision@K    0.009593
Recall@K       0.095933
F1@K           0.017442
MRR            0.089243
NDCG           0.090967
dtype: float64

ITEM-BASED CF
Precision@K    0.032361
Recall@K       0.323607
F1@K           0.058838
MRR            0.313675
NDCG           0.316242
dtype: float64


In [23]:
import numpy as np

def rmse_on_test(
    test_df: pd.DataFrame,
    train_ui: pd.DataFrame,
    user_sim: pd.DataFrame,
    item_sim: pd.DataFrame,
    mode: str = "item",
    k: int = 30
):
    se = []

    for _, row in test_df.iterrows():
        u = row['customer_id']
        i = row['product_id']
        r = row['rating']

        if mode == "user":
            pred = predict_user_based(
                u, i,
                train_ui, user_sim,
                k=k
            )
        else:
            pred = predict_item_based(
                u, i,
                train_ui, item_sim,
                k=k
            )

        if not np.isnan(pred):
            se.append((pred - r) ** 2)

    return float(np.sqrt(np.mean(se))) if se else np.nan


In [24]:
rmse_user = rmse_on_test(
    test_df,
    train_ui,
    user_sim,
    item_sim,
    mode="user",
    k=30
)

rmse_item = rmse_on_test(
    test_df,
    train_ui,
    user_sim,
    item_sim,
    mode="item",
    k=30
)

print("RMSE USER-BASED CF :", rmse_user)
print("RMSE ITEM-BASED CF :", rmse_item)


RMSE USER-BASED CF : 1.987075289942915
RMSE ITEM-BASED CF : 2.430458630512645
