In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [3]:
md = pd.read_csv("./kaggle/ratings.csv")   # change filename if needed
md = md[['userId', 'movieId', 'rating']]  # keep relevant columns

In [4]:
md.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [5]:
md.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   userId   100836 non-null  int64  
 1   movieId  100836 non-null  int64  
 2   rating   100836 non-null  float64
dtypes: float64(1), int64(2)
memory usage: 2.3 MB


In [7]:
md.shape

(100836, 3)

In [6]:
def leave_one_out(md, seed=42):
    test_idx = md.groupby('userId').apply(lambda g: g.sample(n=1, random_state=seed)).index.get_level_values(1)
    test = md.loc[test_idx].reset_index(drop=True)
    train = md.drop(test_idx).reset_index(drop=True)
    return train, test

train_md, test_md = leave_one_out(md, seed=42)
print("Train size:", len(train_md))
print("Test size:", len(test_md))

Train size: 100226
Test size: 610


  test_idx = md.groupby('userId').apply(lambda g: g.sample(n=1, random_state=seed)).index.get_level_values(1)


In [8]:
# Map users and items to indices
user_mapper = {u: i for i, u in enumerate(train_md['userId'].unique())}
item_mapper = {m: i for i, m in enumerate(train_md['movieId'].unique())}
inv_user = {v: k for k, v in user_mapper.items()}
inv_item = {v: k for k, v in item_mapper.items()}

n_users = len(user_mapper)
n_items = len(item_mapper)

# Create user–item rating matrix
R = np.zeros((n_users, n_items), dtype=np.float32)
for _, row in train_md.iterrows():
    u = user_mapper[row['userId']]
    i = item_mapper[row['movieId']]
    R[u, i] = row['rating']

In [9]:
R.shape

(610, 9710)

In [10]:
user_sim = cosine_similarity(R)
np.fill_diagonal(user_sim, 0.0)  # ignore self-similarity

In [11]:
NEIGHBORS = 30
K_RECOMMEND = 10

def predict_for_user(u_idx, R, user_sim, k_neighbors=NEIGHBORS):
    sim_scores = user_sim[u_idx].copy()
    if k_neighbors < len(sim_scores):
        top_k_idx = np.argpartition(-sim_scores, k_neighbors)[:k_neighbors]
        mask = np.zeros_like(sim_scores, dtype=bool)
        mask[top_k_idx] = True
        sim = sim_scores * mask
    else:
        sim = sim_scores
    numer = sim.dot(R)
    denom = np.abs(sim).sum()
    if denom == 0:
        return np.zeros(R.shape[1])
    return numer / denom

def recommend_for_user(user_id, K=K_RECOMMEND):
    if user_id not in user_mapper:
        return []
    u_idx = user_mapper[user_id]
    preds = predict_for_user(u_idx, R, user_sim)
    preds[R[u_idx] > 0] = -np.inf  # mask seen movies
    top_idxs = np.argpartition(-preds, K)[:K]
    return [inv_item[i] for i in top_idxs[np.argsort(-preds[top_idxs])]]

In [12]:
def precision_at_k(test_md, K=K_RECOMMEND):
    hits, total = 0, 0
    for _, row in test_md.iterrows():
        u, true_item = row['userId'], row['movieId']
        if u not in user_mapper:  # skip users not in training
            continue
        recs = recommend_for_user(u, K=K)
        total += 1
        if true_item in recs:
            hits += 1
    return hits / total if total > 0 else 0.0

prec_k = precision_at_k(test_md, K=K_RECOMMEND)
print(f"Precision@{K_RECOMMEND}: {prec_k:.4f}")

Precision@10: 0.1918


In [13]:
some_user = train_md['userId'].iloc[0]
print("Sample User ID:", some_user)
print("Recommended Movies:", recommend_for_user(some_user, K=10))

Sample User ID: 1
Recommended Movies: [np.int64(589), np.int64(858), np.int64(1200), np.int64(2762), np.int64(1036), np.int64(32), np.int64(2918), np.int64(1387), np.int64(1968), np.int64(1259)]
