In [None]:
import numpy as np
import pandas as pd
import ast
from collections import defaultdict

file_path = r"D:\Project\web_mining\notebooks\ratings_test_clean.csv"
df = pd.read_csv(file_path)

def parse_list(value):
    if pd.isna(value):
        return None
    if isinstance(value, str):
        try:
            parsed = ast.literal_eval(value)
            if isinstance(parsed, list):
                return parsed
        except:
            return None
    if isinstance(value, list):
        return value
    return None

unique_counts = {}

for col in df.columns:
    unique_values = set()

    for val in df[col]:
        lst = parse_list(val)
        if lst:
            for item in lst:
                # Tránh list lồng
                if isinstance(item, list):
                    for sub in item:
                        unique_values.add(str(sub))
                else:
                    unique_values.add(str(item))

    # Chỉ lưu cột có dữ liệu dạng list
    if unique_values:
        unique_counts[col] = len(unique_values)

# In kết quả
for col, cnt in unique_counts.items():
    print(f"{col}: {cnt}")

def parse_vector(data_str):
    data_array = np.fromstring(data_str.strip("[]"), sep=' ')
    data_list = data_array.tolist()
    return data_list


In [3]:
import math
from collections import Counter
import numpy as np

def safe_list(x):
    if isinstance(x, list):
        return set(x)
    return set()

def jaccard(a, b):
    if not a or not b:
        return 0.0
    return len(a & b) / len(a | b)

def cosine_sim(vec1, vec2, eps=1e-8):
    vec1 = np.asarray(vec1)
    vec2 = np.asarray(vec2)

    return np.dot(vec1, vec2) / (
        np.linalg.norm(vec1) * np.linalg.norm(vec2) + eps
    )

def compute_similarity(metric, v1, v2):
    if metric == "jaccard":
        return jaccard(safe_list(v1), safe_list(v2))
    elif metric == "cosine":
        return cosine_sim(v1, v2)
    else:
        raise ValueError(f"Unknown metric: {metric}")

def movie_similarity(m1, m2, sim_params):
    score = 0.0

    for field, cfg in sim_params.items():
        weight = cfg["weight"]
        metric = cfg["metric"]

        val1 = m1.get(field)
        val2 = m2.get(field)

        if val1 is None or val2 is None:
            continue
        
        sim = compute_similarity(metric, val1, val2)
        score += weight * sim

    return score

In [None]:
def recommend_for_user(user_id, movie_obj, train_df, sim_params, K=5):
    """
    user_id: int
    movie_obj: dict (1 movie)
    train_df: DataFrame (ratings_train merged metadata)
    sim_params: dict (similarity parameters)
    """
    user_movies = train_df[train_df["userId"] == user_id]
    if user_movies.empty:
        return 0.0

    scores = []

    for _, row in user_movies.iterrows():
        for field in ["keyword_names_bert", "overview_bert"]:
            row[field] = parse_vector(row[field])
        sim = movie_similarity(movie_obj, row, sim_params)
        scores.append((row["movieId"], row["rating"], sim))

    scores.sort(key=lambda x: x[2], reverse=True)

    top_k = scores[:K]

    avg_rating = sum(r[1] for r in top_k) / len(top_k)
    return avg_rating

In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.metrics import mean_squared_error
from functools import partial

try:
    train_df = pd.read_csv(r"D:\Project\web_mining\notebooks\ratings_train_clean_bert.csv")
    test_df = pd.read_csv(r"D:\Project\web_mining\notebooks\ratings_test_clean_bert.csv")
    
except FileNotFoundError:
    train_df = pd.DataFrame() 
    test_df = pd.DataFrame()


def objective(trial):
    weights = {
        {'genres': {'metric': 'jaccard', 'weight': 0.07695564868227772}, 'keyword_names': {'metric': 'jaccard', 'weight': 0.20469569241159363}, 'top_cast': {'metric': 'jaccard', 'weight': 0.8330251175685949}, 'writers': {'metric': 'jaccard', 'weight': 0.17213970919049904}, 'producers': {'metric': 'jaccard', 'weight': 0.3575349111578864}, 'belongs_to_collection': {'metric': 'jaccard', 'weight': 0.023259311007562167}, 'keyword_names_bert': {'metric': 'cosine', 'weight': 0.5896342359730261}, 'overview_bert': {'metric': 'cosine', 'weight': 0.8317850183322544}}
    }

    MOVIE_FIELDS = [
        "genres", "keyword_names", "top_cast", "writers", 
        "producers", "belongs_to_collection", "keyword_names_bert", "overview_bert"
    ]
    
    K = 5
    y_true = []
    y_pred = []

    for _, row in test_df.iterrows():
        movie = {f: row[f] for f in MOVIE_FIELDS}
        
        for field in ["keyword_names_bert", "overview_bert"]:
            movie[field] = parse_vector(movie[field])
            
        try:
            pred_rating = recommend_for_user(row["userId"], movie, train_df, weights, K)
        except NameError:
            pred_rating = 0.0 
        
        y_true.append(row["rating"])
        y_pred.append(pred_rating)

    mse = mean_squared_error(y_true, y_pred)
    
    return mse


"""if __name__ == "__main__":
    study = optuna.create_study(direction="minimize")
    print("Bắt đầu tìm kiếm tham số tối ưu...")
    study.optimize(objective, n_trials=50)

    print("\n" + "="*30)
    print(f"Best MSE: {study.best_value}")
    print("Best Params:")
    for key, value in study.best_params.items():
        print(f"  {key}: {value}")
    
    best_sim_params = {}
    metrics_map = { 
        "w_genres": ("genres", "jaccard"),
        "w_keyword": ("keyword_names", "jaccard"),
        "w_cast": ("top_cast", "jaccard"),
        "w_writers": ("writers", "jaccard"),
        "w_producers": ("producers", "jaccard"),
        "w_collection": ("belongs_to_collection", "jaccard"),
        "w_key_bert": ("keyword_names_bert", "cosine"),
        "w_overview_bert": ("overview_bert", "cosine")
    }
    
    for param_name, weight_val in study.best_params.items():
        field, metric = metrics_map[param_name]
        best_sim_params[field] = {"metric": metric, "weight": weight_val}
        
    print(best_sim_params)"""

In [7]:
import numpy as np
import pandas as pd

df_train = pd.read_csv(r"D:\Project\web_mining\notebooks\ratings_train_clean_bert.csv")
df_test  = pd.read_csv(r"D:\Project\web_mining\notebooks\ratings_test_clean_bert.csv")

all_users = pd.concat([df_train["userId"], df_test["userId"]]).unique()
all_items = pd.concat([df_train["movieId"], df_test["movieId"]]).unique()

user2idx = {u: i for i, u in enumerate(all_users)}
item2idx = {i: j for j, i in enumerate(all_items)}

idx2user = {i: u for u, i in user2idx.items()}
idx2item = {j: i for i, j in item2idx.items()}


from collections import defaultdict

train_interactions = defaultdict(set)

for _, row in df_train.iterrows():
    u = user2idx[row["userId"]]
    i = item2idx[row["movieId"]]
    train_interactions[u].add(i)

test_ratings = defaultdict(dict)


MOVIE_FIELDS = [
        "genres", "keyword_names", "top_cast", "writers", 
        "p"
        "roducers", "belongs_to_collection", "keyword_names_bert", "overview_bert"
    ]
weights = {'genres': {'metric': 'jaccard', 'weight': 0.07695564868227772}, 'keyword_names': {'metric': 'jaccard', 'weight': 0.20469569241159363}, 'top_cast': {'metric': 'jaccard', 'weight': 0.8330251175685949}, 'writers': {'metric': 'jaccard', 'weight': 0.17213970919049904}, 'producers': {'metric': 'jaccard', 'weight': 0.3575349111578864}, 'belongs_to_collection': {'metric': 'jaccard', 'weight': 0.023259311007562167}, 'keyword_names_bert': {'metric': 'cosine', 'weight': 0.5896342359730261}, 'overview_bert': {'metric': 'cosine', 'weight': 0.8317850183322544}}
num_users = len(user2idx)
num_items = len(item2idx)
ratings_pred = np.zeros((num_users, num_items))
for _, row in df_test.iterrows():
    u = user2idx[row["userId"]]
    i = item2idx[row["movieId"]]
    test_ratings[u][i] = float(row["rating"])

    movie_obj = {f: row[f] for f in MOVIE_FIELDS}
    for field in ["keyword_names_bert", "overview_bert"]:
        movie_obj[field] = parse_vector(movie_obj[field])
    ratings_pred[u][i] = recommend_for_user(row["userId"], movie_obj, df_train, weights, K=5)


In [None]:
def compute_mrr_at_k(
    ratings_pred,
    test_ratings,
    rating_threshold=3.5,
    k=5
):

    rr_scores = []

    for u in range(ratings_pred.shape[0]):
        if u not in test_ratings:
            continue

        # top-k theo rating dự đoán
        top_k_items = np.argsort(-ratings_pred[u])[:k]

        rr_u = 0.0
        for rank, item_id in enumerate(top_k_items, start=1):
            if (
                item_id in test_ratings[u]
                and test_ratings[u][item_id] >= rating_threshold
            ):
                rr_u = 1.0 / rank
                break   # chỉ lấy item relevant đầu tiên

        rr_scores.append(rr_u)

    return float(np.mean(rr_scores)) if rr_scores else 0.0


In [9]:
def compute_hr_at_k(
    ratings_pred,
    test_ratings,
    rating_threshold=3.5,
    k=5
):
    """
    ratings_pred: np.ndarray (num_users, num_items)
    test_ratings: dict {u: {item_id: rating}}
    """

    hits = []

    for u in range(ratings_pred.shape[0]):
        if u not in test_ratings:
            continue

        # top-k items
        top_k_items = np.argsort(-ratings_pred[u])[:k]

        hit_u = 0
        for item_id in top_k_items:
            if (
                item_id in test_ratings[u]
                and test_ratings[u][item_id] >= rating_threshold
            ):
                hit_u = 1
                break

        hits.append(hit_u)

    return float(np.mean(hits)) if hits else 0.0

In [10]:
mrr5 = compute_mrr_at_k(
    ratings_pred=ratings_pred,
    test_ratings=test_ratings,
    k=5
)

print("MRR@5 =", mrr5)

MRR@5 = 0.7678463855421687


In [11]:
hr = compute_hr_at_k(
    ratings_pred=ratings_pred,
    test_ratings=test_ratings,
    k=5
)

print("HR@5 =", hr)

HR@5 = 0.9367469879518072


In [None]:
user_counts = Counter(df_train["userId"])
sorted_users = sorted(user_counts.items(), key=lambda x: x[1])
least_5_users_raw = [u for u, _ in sorted_users[:5]]
most_5_users_raw = [u for u, _ in sorted_users[-5:]]
least_5_users = [user2idx[u] for u in least_5_users_raw if u in user2idx]
most_5_users  = [user2idx[u] for u in most_5_users_raw if u in user2idx]

In [13]:
def compute_user_mse(u, ratings_pred, test_ratings):
    if u not in test_ratings:
        return None

    errors = []
    for i, true_rating in test_ratings[u].items():
        pred = ratings_pred[u][i]
        errors.append((pred - true_rating) ** 2)

    return np.mean(errors) if errors else None

def compute_user_mrr_at_k(
    u,
    ratings_pred,
    test_ratings,
    rating_threshold=3.5,
    k=5
):
    if u not in test_ratings:
        return 0.0

    top_k_items = np.argsort(-ratings_pred[u])[:k]

    for rank, i in enumerate(top_k_items, start=1):
        if i in test_ratings[u] and test_ratings[u][i] >= rating_threshold:
            return 1.0 / rank

    return 0.0

def compute_user_hr_at_k(
    u,
    ratings_pred,
    test_ratings,
    rating_threshold=3.5,
    k=5
):
    if u not in test_ratings:
        return 0.0

    top_k_items = np.argsort(-ratings_pred[u])[:k]

    for i in top_k_items:
        if i in test_ratings[u] and test_ratings[u][i] >= rating_threshold:
            return 1.0 

    return 0.0     



In [15]:
def evaluate_user_group(
    users,
    ratings_pred,
    test_ratings
):
    mses = []
    hrs = []
    mrrs  = []

    for u in users:
        mse = compute_user_mse(u, ratings_pred, test_ratings)
        if mse is not None:
            mses.append(mse)

        hrs.append(
            compute_user_hr_at_k(u, ratings_pred, test_ratings)
        )

        mrrs.append(
            compute_user_mrr_at_k(u, ratings_pred, test_ratings)
        )

    return {
        "Mean MSE": float(np.mean(mses)) if mses else None,
        "Mean HR@5": float(np.mean(hrs)) if hrs else 0.0,
        "MRR@5": float(np.mean(mrrs)) if mrrs else 0.0
    }

In [16]:
results_most = evaluate_user_group(
    most_5_users,
    ratings_pred,
    test_ratings
)

results_least = evaluate_user_group(
    least_5_users,
    ratings_pred,
    test_ratings
)

print("===== 5 USERS NHIỀU DATA NHẤT =====")
print(results_most)

print("\n===== 5 USERS ÍT DATA NHẤT =====")
print(results_least)

===== 5 USERS NHIỀU DATA NHẤT =====
{'Mean MSE': 1.1941419687275943, 'Mean HR@5': 1.0, 'MRR@5': 0.5666666666666667}

===== 5 USERS ÍT DATA NHẤT =====
{'Mean MSE': 0.92578125, 'Mean HR@5': 0.6, 'MRR@5': 0.5}
