In [6]:
import os
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.utils.extmath import randomized_svd
from collections import defaultdict
import gradio as gr

In [7]:
# Directory that holds the raw MovieLens 100K files ("u.data" and "u.item" are
# joined onto this path when the data is loaded). The value is an absolute
# Kaggle input path, so it has to be edited when the notebook runs elsewhere.
DATA_DIR = "/kaggle/input/movielens-100k-dataset/ml-100k"

In [9]:
def load_movielens_100k(ratings_file, movies_file):
    """Read the ratings table and the movie-title table from disk.

    Parameters
    ----------
    ratings_file : str
        Path to a tab-separated, header-less file whose four columns are
        read as ``user_id``, ``item_id``, ``rating`` and ``timestamp``.
    movies_file : str
        Path to a pipe-separated, header-less, latin-1 encoded file that is
        read as 24 columns; only the first two are kept.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``ratings`` with all four columns, and ``movies`` restricted to
        ``item_id`` and ``title``.
    """
    rating_columns = ["user_id", "item_id", "rating", "timestamp"]
    ratings = pd.read_csv(ratings_file, sep="\t", names=rating_columns, engine="python")

    # The 22 trailing fields are given placeholder names and then discarded.
    kept_columns = ["item_id", "title"]
    placeholder_columns = [f"col{i}" for i in range(22)]
    movie_table = pd.read_csv(
        movies_file,
        sep="|",
        names=kept_columns + placeholder_columns,
        engine="python",
        encoding="latin-1",
    )
    movies = movie_table[kept_columns]
    return ratings, movies

# Resolve the two raw files inside DATA_DIR, then load them.
ratings_path = os.path.join(DATA_DIR, "u.data")
movies_path = os.path.join(DATA_DIR, "u.item")
ratings, movies = load_movielens_100k(ratings_path, movies_path)

# Sanity check: show the first rows of each table.
print(ratings.head())
print(movies.head())


   user_id  item_id  rating  timestamp
0      196      242       3  881250949
1      186      302       3  891717742
2       22      377       1  878887116
3      244       51       2  880606923
4      166      346       1  886397596
   item_id              title
0        1   Toy Story (1995)
1        2   GoldenEye (1995)
2        3  Four Rooms (1995)
3        4  Get Shorty (1995)
4        5     Copycat (1995)


In [11]:
def train_test_split_by_user(ratings, test_ratio=0.2, min_items_per_user=5, random_state=None):
    """Hold out a random share of every user's ratings as a test set.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain a ``user_id`` column; rows are selected by index label.
    test_ratio : float
        Fraction of each user's rows moved to the test set (at least one row
        per eligible user).
    min_items_per_user : int
        Users with fewer rows than this keep all of their rows in train.
    random_state : int or None
        Seed for a private ``numpy.random.RandomState``. ``None`` keeps the
        previous behaviour of drawing from NumPy's global generator, so
        ``np.random.seed(...)`` still controls the split.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``train`` and ``test``. Either one is an empty frame with the input's
        columns when no rows land in it (previously ``pd.concat`` raised).
    """
    # Both objects expose the same ``choice`` method.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    train_parts, test_parts = [], []
    for _, user_data in ratings.groupby("user_id"):
        if len(user_data) < min_items_per_user:
            train_parts.append(user_data)
            continue
        test_size = max(1, int(len(user_data) * test_ratio))
        test_idx = rng.choice(user_data.index, size=test_size, replace=False)
        train_parts.append(user_data.drop(test_idx))
        test_parts.append(user_data.loc[test_idx])

    # pd.concat([]) raises ValueError, so fall back to an empty slice that
    # still carries the input's columns and dtypes.
    empty = ratings.iloc[0:0]
    train = pd.concat(train_parts) if train_parts else empty.copy()
    test = pd.concat(test_parts) if test_parts else empty.copy()
    return train, test

# The split draws from NumPy's global random generator; seed it first so the
# hold-out set (and every metric computed from it) is identical on each
# Restart & Run All.
SPLIT_SEED = 42
np.random.seed(SPLIT_SEED)
train, test = train_test_split_by_user(ratings)


In [12]:
def build_user_item_matrix(ratings):
    """Pivot a ratings table into a dense user x item array.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Needs ``user_id``, ``item_id`` and ``rating`` columns.

    Returns
    -------
    (numpy.ndarray, dict, dict)
        ``R`` where ``R[row, col]`` holds the rating and 0 marks "no rating",
        a ``user_id -> row`` lookup, and an ``item_id -> column`` lookup.
        Rows and columns follow order of first appearance in ``ratings``.
    """
    distinct_users = ratings["user_id"].unique()
    distinct_items = ratings["item_id"].unique()
    user_id_map = dict(zip(distinct_users, range(len(distinct_users))))
    item_id_map = dict(zip(distinct_items, range(len(distinct_items))))

    R = np.zeros((len(distinct_users), len(distinct_items)))
    triples = zip(ratings["user_id"], ratings["item_id"], ratings["rating"])
    for user, item, value in triples:
        # A repeated (user, item) pair keeps the value that appears last.
        R[user_id_map[user], item_id_map[item]] = value
    return R, user_id_map, item_id_map

# Dense user x item matrix built from the training split only, together with
# the user_id -> row and item_id -> column lookups used by every later step.
R_train, user_id_map, item_id_map = build_user_item_matrix(train)


In [13]:
def user_similarity(R, mean_center=True):
    """Cosine similarity between the rows (users) of a rating matrix.

    Parameters
    ----------
    R : numpy.ndarray
        User x item ratings; entries ``<= 0`` are treated as "not rated".
    mean_center : bool
        When True, each user's mean over their rated items is subtracted from
        those items before the similarity is computed; unrated cells stay 0.

    Returns
    -------
    numpy.ndarray
        The user x user matrix returned by ``cosine_similarity``.
    """
    if mean_center:
        mask = R > 0
        counts = mask.sum(axis=1)
        # ``where=`` without ``out=`` leaves the skipped slots uninitialised,
        # so a user with no ratings could get a garbage (even NaN) mean.
        # Pre-filling with zeros gives such users a well-defined mean of 0.
        means = np.divide(
            R.sum(axis=1),
            counts,
            out=np.zeros(R.shape[0], dtype=float),
            where=counts > 0,
        )
        R = (R - means[:, None]) * mask
    return cosine_similarity(R)

def item_similarity(R):
    """Cosine similarity between the columns of ``R``.

    The matrix is transposed so that each column of ``R`` becomes one row of
    the input handed to ``cosine_similarity``; the result is returned as is.
    """
    column_vectors = R.T
    return cosine_similarity(column_vectors)

def predict_ratings_user_based(R, S, k_neighbors=30):
    """Predict missing ratings from each user's most similar users.

    For every user the ``k_neighbors`` most similar *other* users are taken
    from ``S``. Each cell the user has not rated (``R[u, i] == 0``) is filled
    with the similarity-weighted average of those neighbours' ratings for the
    item. Cells the user already rated, and cells no usable neighbour rated,
    stay 0.

    Parameters
    ----------
    R : numpy.ndarray
        User x item ratings, 0 meaning "not rated".
    S : numpy.ndarray
        User x user similarity matrix.
    k_neighbors : int
        Maximum number of neighbours considered per user.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``R`` holding the predictions.

    Notes
    -----
    * The user is removed from their own neighbour list explicitly instead of
      assuming the self-similarity is the largest entry of the row.
    * Only neighbours with positive similarity contribute. With signed
      weights the denominator can cancel to about zero and produce
      predictions far outside the rating scale.
    * The result is always float, so an integer ``R`` no longer truncates
      the predictions.
    """
    num_users = R.shape[0]
    P = np.zeros(R.shape, dtype=float)
    for u in range(num_users):
        ranked = np.argsort(S[u])[::-1]
        neighbors = ranked[ranked != u][:k_neighbors]
        weights = S[u, neighbors]

        usable = weights > 0
        neighbors, weights = neighbors[usable], weights[usable]
        if neighbors.size == 0:
            continue

        neighbor_ratings = R[neighbors]      # (neighbours, items)
        rated = neighbor_ratings > 0
        # All items at once: weighted rating sum and weight sum over the
        # neighbours that actually rated each item.
        numer = weights @ (neighbor_ratings * rated)
        denom = weights @ rated.astype(float)

        target = (R[u] == 0) & (denom > 0)
        P[u, target] = numer[target] / (denom[target] + 1e-8)
    return P

def predict_ratings_item_based(R, S, k_neighbors=50):
    """Predict missing ratings from the items most similar to each item.

    For every item the ``k_neighbors`` most similar *other* items are taken
    from ``S``. Each user who has not rated the item (``R[u, i] == 0``) gets
    the similarity-weighted average of their own ratings on those neighbour
    items. Cells the user already rated, and cells where the user rated no
    usable neighbour, stay 0.

    Parameters
    ----------
    R : numpy.ndarray
        User x item ratings, 0 meaning "not rated".
    S : numpy.ndarray
        Item x item similarity matrix.
    k_neighbors : int
        Maximum number of neighbour items considered per item.

    Returns
    -------
    numpy.ndarray
        Float array with the shape of ``R`` holding the predictions.

    Notes
    -----
    * An item's neighbour list does not depend on the user, so it is computed
      once per item instead of once per (user, item) pair.
    * The item is removed from its own neighbour list explicitly rather than
      by assuming its self-similarity is the largest entry of the row.
    * Only neighbours with positive similarity contribute, which keeps the
      weighted average inside the range of the user's own ratings.
    """
    num_items = R.shape[1]
    P = np.zeros(R.shape, dtype=float)
    for i in range(num_items):
        ranked = np.argsort(S[i])[::-1]
        neighbors = ranked[ranked != i][:k_neighbors]
        weights = S[i, neighbors]

        usable = weights > 0
        neighbors, weights = neighbors[usable], weights[usable]
        if neighbors.size == 0:
            continue

        user_ratings = R[:, neighbors]       # (users, neighbours)
        rated = user_ratings > 0
        # All users at once: weighted rating sum and weight sum over the
        # neighbour items each user actually rated.
        numer = (user_ratings * rated) @ weights
        denom = rated.astype(float) @ weights

        target = (R[:, i] == 0) & (denom > 0)
        P[target, i] = numer[target] / (denom[target] + 1e-8)
    return P

def predict_ratings_svd(R, n_factors=50):
    """Predict ratings from a truncated SVD of the mean-centred matrix.

    Each user's mean over their rated items (entries ``> 0``) is subtracted
    from those items, unrated cells are left at 0, the centred matrix is
    factorised with ``randomized_svd`` (``random_state=42``) and the low-rank
    reconstruction is shifted back by the user means.

    Parameters
    ----------
    R : numpy.ndarray
        User x item ratings, 0 meaning "not rated".
    n_factors : int
        Number of singular components kept.

    Returns
    -------
    numpy.ndarray
        Dense user x item array of predicted ratings.
    """
    mask = R > 0
    counts = mask.sum(axis=1)
    # ``where=`` without ``out=`` leaves skipped slots uninitialised, so a
    # user with no ratings could receive a garbage mean that then leaks into
    # every prediction of that row. Pre-fill with zeros instead.
    mean_user_rating = np.divide(
        R.sum(axis=1),
        counts,
        out=np.zeros(R.shape[0], dtype=float),
        where=counts > 0,
    )
    R_centered = (R - mean_user_rating[:, None]) * mask
    U, Sigma, VT = randomized_svd(R_centered, n_components=n_factors, random_state=42)
    # Scaling the columns of U by Sigma equals U @ diag(Sigma) without
    # materialising the diagonal matrix.
    return (U * Sigma) @ VT + mean_user_rating[:, None]


In [14]:
def _items_by_user(frame):
    """Group the ``item_id`` values of ``frame`` into one set per ``user_id``."""
    grouped = {}
    for user, item in zip(frame["user_id"].tolist(), frame["item_id"].tolist()):
        grouped.setdefault(user, set()).add(item)
    return grouped


def precision_at_k(P, train, test, user_id_map, item_id_map, k=10, threshold=4.0):
    """Average precision@k of the top-k unseen items over all test users.

    For every user that appears in ``test`` and in ``user_id_map``, the items
    of ``item_id_map`` the user did not rate in ``train`` are ranked by their
    score in ``P``. A recommendation counts as a hit when the user rated that
    item ``>= threshold`` in ``test``.

    Parameters
    ----------
    P : numpy.ndarray
        Prediction matrix indexed by the rows/columns of the two maps.
    train, test : pandas.DataFrame
        Frames with ``user_id`` and ``item_id`` columns; ``test`` also needs
        ``rating``.
    user_id_map, item_id_map : dict
        ``user_id -> row`` and ``item_id -> column`` lookups for ``P``.
    k : int
        Length of each recommendation list. Every evaluated user contributes
        ``k`` to the denominator, even if fewer candidates exist.
    threshold : float
        Minimum test rating for an item to count as relevant.

    Returns
    -------
    float
        ``hits / (k * evaluated users)``, or 0 when no user was evaluated.
    """
    # Build the per-user lookups once. Filtering both frames inside the user
    # loop and testing membership against NumPy arrays costs
    # O(users * rows + users * items * ratings) for the same result.
    seen_by_user = _items_by_user(train)
    liked_by_user = _items_by_user(test[test["rating"] >= threshold])
    no_items = frozenset()

    correct, total = 0, 0
    for u in test["user_id"].unique():
        if u not in user_id_map:
            continue
        user_preds = P[user_id_map[u]]
        seen_items = seen_by_user.get(u, no_items)
        pred_scores = [
            (item, user_preds[col])
            for item, col in item_id_map.items()
            if item not in seen_items
        ]
        # sorted() is stable, so ties keep item_id_map order as before.
        top_k = sorted(pred_scores, key=lambda pair: pair[1], reverse=True)[:k]
        true_items = liked_by_user.get(u, no_items)
        correct += sum(1 for item, _ in top_k if item in true_items)
        total += k
    return correct / total if total > 0 else 0

# Fit the three recommenders on the training matrix: first the two
# similarity matrices, then one prediction matrix per method.
S_user = user_similarity(R_train)
S_item = item_similarity(R_train)

P_user = predict_ratings_user_based(R_train, S_user)
P_item = predict_ratings_item_based(R_train, S_item)
P_svd = predict_ratings_svd(R_train)

# Report Precision@10 on the held-out ratings, one line per method.
labelled_predictions = (
    ("Precision@10 (User-CF):", P_user),
    ("Precision@10 (Item-CF):", P_item),
    ("Precision@10 (SVD):", P_svd),
)
for label, predictions in labelled_predictions:
    print(label, precision_at_k(predictions, train, test, user_id_map, item_id_map, k=10))


Precision@10 (User-CF): 0.00943796394485684
Precision@10 (Item-CF): 0.0046659597030752915
Precision@10 (SVD): 0.12576882290562036


In [15]:
def recommend_top_n_for_user(user_id, P, R, user_id_map, item_id_map, movies, n=10):
    """Return the ``n`` highest-scored items the user has not rated yet.

    Parameters
    ----------
    user_id : hashable
        Raw user id. Ids missing from ``user_id_map`` (including ``None``)
        produce an empty result.
    P : numpy.ndarray
        Prediction matrix indexed by the rows/columns of the two maps.
    R : numpy.ndarray
        Rating matrix; entries ``> 0`` mark items the user already rated.
    user_id_map, item_id_map : dict
        ``user_id -> row`` and ``item_id -> column`` lookups.
    movies : pandas.DataFrame
        Table with ``item_id`` and ``title`` columns.
    n : int or float
        Number of recommendations. UI widgets may deliver a float such as
        ``10.0``, so the value is coerced to a non-negative int before it is
        used as a slice bound (a float bound raises ``TypeError``).

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``predicted_rating``, best first. Items with no
        entry in ``movies`` are dropped by the inner merge.
    """
    if user_id not in user_id_map:
        return pd.DataFrame({"title": [], "predicted_rating": []})

    n = max(int(n), 0)
    u_idx = user_id_map[user_id]
    # A boolean row gives O(1) "already rated?" checks; testing membership
    # against an index array scans the whole array for every item.
    already_rated = R[u_idx] > 0
    candidates = [
        (item, P[u_idx, col])
        for item, col in item_id_map.items()
        if not already_rated[col]
    ]
    top_n = sorted(candidates, key=lambda pair: pair[1], reverse=True)[:n]
    df = pd.DataFrame(top_n, columns=["item_id", "predicted_rating"])
    return df.merge(movies, on="item_id")[["title", "predicted_rating"]]

In [17]:
def recommend_interface(user_id, method, n):
    """Return top-``n`` recommendations for ``user_id`` using ``method``.

    ``method`` selects which prediction matrix is used: "User-CF" picks
    ``P_user``, "Item-CF" picks ``P_item`` and any other value picks
    ``P_svd``. The lookup itself is delegated to
    ``recommend_top_n_for_user`` with the training matrix and id maps.
    """
    predictions = (
        P_user if method == "User-CF"
        else P_item if method == "Item-CF"
        else P_svd
    )
    return recommend_top_n_for_user(
        user_id, predictions, R_train, user_id_map, item_id_map, movies, n=n
    )

# Gradio front end: three inputs are forwarded to recommend_interface and the
# returned DataFrame is rendered as a table.
demo = gr.Interface(
    fn=recommend_interface,
    inputs=[
        # precision=0 rounds the entry and delivers it as an int, so the user
        # id reaches the lookup as a whole number rather than a float.
        gr.Number(label="User ID", value=1, precision=0),
        # Pre-select a method: with no default the form submits None, which
        # recommend_interface silently treats as "SVD". Making that default
        # visible keeps the shown choice and the method used in agreement.
        gr.Radio(["User-CF", "Item-CF", "SVD"], value="SVD", label="Method"),
        gr.Slider(5, 20, value=10, step=1, label="Top-N")
    ],
    outputs="dataframe",
    title="🎬 MovieLens Recommender",
    description="Choose a method (User-CF, Item-CF, or SVD) and get top-N movie recommendations for any user."
)

# share=True also publishes a temporary public gradio.live URL for the app,
# in addition to the local one; drop it if the demo should stay local.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://4230c046df04a573ad.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Created dataset file at: .gradio/flagged/dataset1.csv
