In [7]:
# =========================================
# MovieLens 100K — UserCF vs ItemCF vs SVD
# Path: /kaggle/input/movielens-100k-dataset/ml-100k
# Evaluate with Precision@K
# =========================================

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

# --------------------------
# 1) Load ratings & movies
# --------------------------
RATINGS_PATH = "/kaggle/input/movielens-100k-dataset/ml-100k/u.data"
MOVIES_PATH = "/kaggle/input/movielens-100k-dataset/ml-100k/u.item"

# u.data is tab-separated without a header row; column names are supplied
# here and the timestamp column is discarded immediately after reading.
ratings = pd.read_csv(
    RATINGS_PATH,
    sep="\t",
    names=["userId", "movieId", "rating", "timestamp"],
    engine="python",
).drop(columns="timestamp")

# u.item is pipe-separated and read as latin-1 with no header row, so its
# columns are numbered 0..N-1. Only the first two are kept: id and title.
movies_raw = pd.read_csv(
    MOVIES_PATH,
    sep="|",
    header=None,
    encoding="latin-1",
    engine="python",
)
movies = movies_raw.iloc[:, :2].copy()
movies.columns = ["movieId", "title"]
movies["movieId"] = movies["movieId"].astype(int)

# movieId -> title lookup, used only when printing recommendations.
movie_title = dict(zip(movies["movieId"], movies["title"]))

print(
    "Ratings shape:", ratings.shape,
    "Unique users:", ratings["userId"].nunique(),
    "Unique movies:", ratings["movieId"].nunique(),
)
print(ratings.head(), "\n")
print(movies.head(), "\n")

# --------------------------
# 2) Train/Test split (by interactions)
#    Ensure test users exist in train
# --------------------------
# Random 80/20 split over individual rating rows (fixed seed).
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Flag test rows whose user has no rating at all in the train partition.
users_in_train = set(train["userId"].unique())
orphan_mask = ~test["userId"].isin(users_in_train)

if orphan_mask.any():
    # Hand those rows back to train so every test user is known to the models,
    # then keep only the remaining rows as the test set.
    train = pd.concat([train, test.loc[orphan_mask]], ignore_index=True)
    test = test.loc[~orphan_mask].reset_index(drop=True)

# --------------------------
# 3) Build user-item matrices (TRAIN)
# --------------------------
# Rows are userIds, columns are movieIds; titles are looked up only for display.
# Cells without a train rating are NaN in `user_item_train` and 0.0 in the
# `_filled` copy.
user_item_train = train.pivot_table(values="rating", index="userId", columns="movieId")
user_item_train_filled = user_item_train.fillna(0.0)

# Row/column labels in matrix order, plus label -> position lookups.
users = user_item_train.index.tolist()
items = user_item_train.columns.tolist()
user_index = dict(zip(users, range(len(users))))
item_index = dict(zip(items, range(len(items))))

# Helper: items rated in train by a user
def seen_items_train(uid):
    """Return the set of movieIds that `uid` has a rating for in TRAIN.

    A user id that is not a row label of ``user_item_train`` yields an
    empty set.
    """
    if uid in user_item_train.index:
        row = user_item_train.loc[uid]
        # Non-NaN cells are exactly the movies this user rated in train.
        return set(row.index[~row.isna().values].tolist())
    return set()

# --------------------------
# 4) USER-BASED CF
# --------------------------
# Pairwise cosine similarity between the user rows of the zero-filled TRAIN
# matrix, wrapped in a DataFrame so it can be indexed by userId.
user_sim = cosine_similarity(user_item_train_filled.to_numpy())  # (U x U)
user_sim_df = pd.DataFrame(data=user_sim, index=users, columns=users)

def recommend_usercf(uid, top_k=10, n_neighbors=50, min_common=3):
    """Recommend up to `top_k` movieIds for `uid` with user-based CF.

    Each item is scored by the similarity-weighted mean of the TRAIN ratings
    given to it by the `n_neighbors` users most similar to `uid`. Items rated
    by fewer than `min_common` of those neighbors, and items `uid` already
    rated in TRAIN, are excluded.

    Returns a list of movieIds, best first; an empty list if `uid` is not in
    `users`.
    """
    if uid not in users:
        return []

    # Similarity of `uid` to every other user, strongest first.
    peer_sims = user_sim_df.loc[uid].drop(uid, errors="ignore")
    peer_sims = peer_sims.sort_values(ascending=False).head(n_neighbors)
    col_weights = peer_sims.values.reshape(-1, 1)  # (neighbors x 1)

    # Neighbor ratings (neighbors x items); NaN marks "not rated".
    peer_ratings = user_item_train.loc[peer_sims.index.tolist(), items]
    has_rating = ~peer_ratings.isna().values

    # Weighted mean per item: only neighbors who rated the item contribute
    # to the denominator.
    weighted_sum = (peer_ratings.fillna(0.0).values * col_weights).sum(axis=0)
    weight_total = (has_rating * col_weights).sum(axis=0)
    with np.errstate(divide="ignore", invalid="ignore"):
        raw_scores = np.where(weight_total > 0, weighted_sum / weight_total, 0.0)

    # Keep items with enough neighbor ratings, then remove what the user has
    # already rated in TRAIN.
    n_raters = has_rating.sum(axis=0)
    ranked = pd.Series(raw_scores, index=items)
    ranked = ranked[n_raters >= min_common]
    ranked = ranked.drop(labels=list(seen_items_train(uid)), errors="ignore")

    return ranked.sort_values(ascending=False).head(top_k).index.tolist()

# --------------------------
# 5) ITEM-BASED CF
# --------------------------
# Transpose the zero-filled TRAIN matrix so each row is one item, then take
# pairwise cosine similarity between item rows.
item_item_matrix = user_item_train_filled.T  # (I x U)
item_sim = cosine_similarity(item_item_matrix.to_numpy())  # (I x I)
item_sim_df = pd.DataFrame(data=item_sim, index=items, columns=items)

def recommend_itemcf(uid, top_k=10, n_neighbors=50, min_common=3):
    """Recommend up to `top_k` movieIds for `uid` with item-based CF.

    For every item j the user has not rated in TRAIN, the score is

        sum_i sim(j, i) * r(uid, i) / sum_i sim(j, i)

    where i ranges over the `n_neighbors` rated items most similar to j.
    An item is skipped when fewer than `min_common` of those similarities are
    positive or when they do not sum to a positive value.

    Returns a list of movieIds, best first; an empty list if `uid` is not in
    `users`.
    """
    if uid not in users:
        return []

    rated = user_item_train.loc[uid].dropna()  # ratings `uid` gave in TRAIN
    rated_ids = rated.index

    scores = {}
    for candidate in items:
        if candidate in rated_ids:
            continue

        # Similarities between the candidate and the user's rated items,
        # trimmed to the strongest `n_neighbors`.
        nearest = (
            item_sim_df.loc[candidate, rated_ids]
            .sort_values(ascending=False)
            .head(n_neighbors)
        )
        sim_vals = nearest.values
        sim_total = sim_vals.sum()
        n_positive = (sim_vals > 0).sum()
        if n_positive < min_common or sim_total <= 0:
            continue

        neighbor_ratings = rated.loc[nearest.index].values
        scores[candidate] = np.dot(sim_vals, neighbor_ratings) / sim_total

    # Highest score first; the stable sort keeps insertion order among ties.
    return sorted(scores, key=scores.get, reverse=True)[:top_k]

# --------------------------
# 6) SVD (Matrix Factorization)
# --------------------------
# Center each user's observed ratings by that user's mean, fit a rank-k
# truncated SVD on the centered matrix, reconstruct, then add the means back.
R = user_item_train_filled.values  # raw TRAIN ratings, zeros for missing

# Per-user mean over observed ratings only (NaN cells are skipped).
user_means = user_item_train.mean(axis=1).fillna(0.0).values
# After centering, unrated cells are filled with 0, i.e. treated as
# "equal to this user's mean".
R_centered = user_item_train.subtract(user_means, axis=0).fillna(0.0).values

# Low-rank approximation with TruncatedSVD
rank = 50  # you can tune (e.g., 20–100)
svd = TruncatedSVD(n_components=rank, random_state=42)
# NOTE: fit_transform returns the data projected onto the components,
# X @ V = U_left @ diag(Sigma), so the singular values are already folded
# into `U`.
U = svd.fit_transform(R_centered)          # (U x k), already scaled by Sigma
S = svd.singular_values_                   # (k,), kept for inspection only
VT = svd.components_                       # (k x I)

# Rank-k reconstruction of the centered matrix. Multiplying by diag(S) here
# would apply the singular values twice (U_left @ Sigma^2 @ VT) and distort
# every predicted score, so the projection is multiplied by VT directly.
R_hat_centered = U @ VT                    # (U x I)
R_hat = R_hat_centered + user_means.reshape(-1, 1)  # add back means

def recommend_svd(uid, top_k=10):
    """Recommend up to `top_k` movieIds for `uid` from the SVD reconstruction.

    Reads the user's row of the module-level `R_hat` matrix, removes items the
    user already rated in TRAIN, and returns the highest-scoring movieIds,
    best first. Returns an empty list if `uid` is not in `users`.
    """
    if uid not in users:
        return []

    predicted = pd.Series(R_hat[user_index[uid], :], index=items)
    unseen = predicted.drop(labels=list(seen_items_train(uid)), errors="ignore")
    ranked = unseen.sort_values(ascending=False)
    return ranked.index[:top_k].tolist()

# --------------------------
# 7) Evaluation: Precision@K
# --------------------------
def precision_at_k(method_func, uid, k=10, relevance_threshold=4.0):
    """Return Precision@k of one recommender for one user.

    Parameters:
        method_func: callable invoked as ``method_func(uid, top_k=k)`` that
            returns a list of recommended movieIds.
        uid: user id to evaluate.
        k: number of recommendations requested; also the divisor, even when
            fewer than `k` items come back.
        relevance_threshold: a TEST rating at or above this value marks the
            movie as relevant for the user.

    Returns:
        hits / k, where hits is the number of recommended movieIds that are
        relevant in TEST. Returns 0.0 when the user has no relevant TEST
        items or when the recommender returns nothing.
    """
    # Resolve the relevant set first: it is a cheap filter on `test`, and it
    # avoids running a costly recommender for users who cannot score above 0.
    user_test = test[test.userId == uid]
    relevant = set(
        user_test.loc[user_test.rating >= relevance_threshold, "movieId"].tolist()
    )
    if not relevant:
        return 0.0

    recs = method_func(uid, top_k=k)
    if not recs:
        return 0.0

    hits = sum(1 for m in recs if m in relevant)
    return hits / k

def evaluate_all(users_subset=None, k=10):
    """Average Precision@k of UserCF, ItemCF and SVD over a set of users.

    Parameters:
        users_subset: user ids to evaluate; defaults to the module-level
            `users` list when None.
        k: cutoff passed through to `precision_at_k`.

    Returns:
        dict with one mean-precision entry per method (keys of the form
        ``"<Method>_P@<k>"``) plus ``"Users_Evaluated"``, the number of user
        ids iterated. A method with no per-user values reports 0.0.
    """
    eval_users = users if users_subset is None else users_subset
    recommenders = (
        ("UserCF", recommend_usercf),
        ("ItemCF", recommend_itemcf),
        ("SVD", recommend_svd),
    )

    per_user = {label: [] for label, _ in recommenders}
    for uid in eval_users:
        for label, recommender in recommenders:
            per_user[label].append(precision_at_k(recommender, uid, k=k))

    def _average(values):
        # np.mean of an empty list would give NaN with a warning; report 0.0.
        return np.mean(values) if values else 0.0

    return {
        "UserCF_P@{}".format(k): _average(per_user["UserCF"]),
        "ItemCF_P@{}".format(k): _average(per_user["ItemCF"]),
        "SVD_P@{}".format(k): _average(per_user["SVD"]),
        "Users_Evaluated": len(eval_users),
    }

# --------------------------
# 8) Run: sample recommendations & overall metrics
# --------------------------
K = 10
sample_user = users[0]

# Show the top-10 titles from each recommender for one user.
print("Sample user:", sample_user)
for heading, recommender in (
    ("\nUserCF recommendations:", recommend_usercf),
    ("\nItemCF recommendations:", recommend_itemcf),
    ("\nSVD recommendations:", recommend_svd),
):
    print(heading)
    for mid in recommender(sample_user, top_k=10):
        # Fall back to the raw id when a title is missing from the lookup.
        print("-", movie_title.get(mid, str(mid)))

# Evaluate on all users (can be slow; you can subsample)
results = evaluate_all(users_subset=users, k=K)
print("\n=== Precision@{} (higher is better) ===".format(K))
for k_, v in results.items():
    # Precision values are floats and get 4 decimals; the user count is
    # printed as-is.
    if isinstance(v, float):
        print(f"{k_}: {v:.4f}")
    else:
        print(f"{k_}: {v}")


Ratings shape: (100000, 3) Unique users: 943 Unique movies: 1682
   userId  movieId  rating
0     196      242       3
1     186      302       3
2      22      377       1
3     244       51       2
4     166      346       1 

   movieId              title
0        1   Toy Story (1995)
1        2   GoldenEye (1995)
2        3  Four Rooms (1995)
3        4  Get Shorty (1995)
4        5     Copycat (1995) 

Sample user: 1

UserCF recommendations:
- Braindead (1992)
- Touch of Evil (1958)
- Waiting for Guffman (1996)
- Down by Law (1986)
- Titanic (1997)
- Secrets & Lies (1996)
- Casablanca (1942)
- Some Folks Call It a Sling Blade (1993)
- Shawshank Redemption, The (1994)
- Close Shave, A (1995)

ItemCF recommendations:
- Further Gesture, A (1996)
- Burnt By the Sun (1994)
- Looking for Richard (1996)
- C'est arrivé près de chez vous (1992)
- Wings of the Dove, The (1997)
- Strictly Ballroom (1992)
- Celluloid Closet, The (1995)
- Go Fish (1994)
- Crumb (1994)
- Sweet Hereafter, The (1