In [12]:
# Step 1: Imports
import os
import zipfile
import pandas as pd
import urllib.request
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Step 2: Download and extract MovieLens 100K
url = "https://files.grouplens.org/datasets/movielens/ml-100k.zip"
file_path = "ml-100k.zip"
if not os.path.exists(file_path):
    # Download under a temporary name and rename only on success, so an
    # interrupted download cannot leave a truncated archive that the
    # existence check above would mistake for a complete one on the next run.
    partial_path = file_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, file_path)

# Extracts into the current working directory (creates the 'ml-100k' folder).
with zipfile.ZipFile(file_path, 'r') as zip_ref:
    zip_ref.extractall()

# Step 3: Load 'u.data' (tab-separated, no header row)
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv('ml-100k/u.data', sep='\t', names=col_names)

# Map user/item IDs to zero-based continuous indices.
# Category codes are assigned in sorted order of the original ids, which makes
# them usable directly as embedding row indices.
df["user_id"] = df["user_id"].astype("category").cat.codes
df["item_id"] = df["item_id"].astype("category").cat.codes

n_users = df["user_id"].nunique()
n_items = df["item_id"].nunique()
print(f"Loaded MovieLens-100k: {len(df)} ratings, {n_users} users, {n_items} movies.")

# Step 4: Create PyTorch dataset & dataloader
class RatingDataset(Dataset):
    """Tensor-backed dataset yielding (user index, item index, rating) triples.

    Expects a dataframe with "user_id", "item_id" and "rating" columns; the
    two id columns are stored as int64 tensors and the rating as float32.
    """

    def __init__(self, df):
        def column_as_tensor(name, dtype):
            # torch.tensor copies, so the dataset does not alias the dataframe.
            return torch.tensor(df[name].values, dtype=dtype)

        self.users = column_as_tensor("user_id", torch.long)
        self.items = column_as_tensor("item_id", torch.long)
        self.ratings = column_as_tensor("rating", torch.float32)

    def __len__(self):
        return self.ratings.shape[0]

    def __getitem__(self, idx):
        sample = (self.users[idx], self.items[idx], self.ratings[idx])
        return sample

# Wrap the whole ratings table (no hold-out split) and serve it in
# reshuffled mini-batches of 512 ratings.
train_data = RatingDataset(df)
train_loader = DataLoader(
    dataset=train_data,
    batch_size=512,
    shuffle=True,
)

# Step 5: Define Matrix Factorization model
class MF(nn.Module):
    """Matrix-factorization rating model with a learned linear read-out.

    Each user and each item gets an `n_factors`-dimensional embedding. A
    rating is predicted by multiplying the two embeddings element-wise and
    passing the result through a single linear layer (a weighted sum plus
    bias) instead of a plain dot product.

    Args:
        n_users: number of rows in the user embedding table.
        n_items: number of rows in the item embedding table.
        n_factors: embedding dimensionality.
    """

    def __init__(self, n_users, n_items, n_factors=50):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, n_factors)
        self.item_emb = nn.Embedding(n_items, n_factors)
        self.fc = nn.Linear(n_factors, 1)

    def forward(self, u, i):
        """Predict ratings for paired user indices `u` and item indices `i`.

        Returns a tensor with the same shape as `u`/`i`.
        """
        x = self.user_emb(u) * self.item_emb(i)
        # Drop only the trailing size-1 output dimension. A bare squeeze()
        # would also drop the batch dimension when the batch holds a single
        # example, yielding a 0-d tensor that no longer matches the target.
        return self.fc(x).squeeze(-1)

# Seed PyTorch's global RNG before anything random happens, so the embedding
# initialisation (and the shuffling order drawn from the same generator)
# repeats on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

model = MF(n_users, n_items)
criterion = nn.MSELoss()  # squared error between predicted and actual rating
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Step 6: Train model
for epoch in range(50):
    # Running sum of the per-batch mean losses for this epoch.
    total_loss = 0
    for u, i, r in train_loader:
        # Clear gradients left over from the previous step before the new
        # forward/backward pass.
        optimizer.zero_grad()
        pred = model(u, i)
        loss = criterion(pred, r)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Reported value is the average of batch means, not a per-rating average.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader):.4f}")

# Step 7: Recommend for a given user
def recommend(user_id, top_n=5):
    """Return the internal indices of the `top_n` highest-scored items.

    Scores every item for `user_id` with the module-level `model` and ranks
    them from highest to lowest predicted rating. Items the user has already
    rated are not filtered out.

    Args:
        user_id: zero-based internal user index.
        top_n: how many items to return; values <= 0 yield an empty result
            and values above the item count return every item.

    Returns:
        NumPy array of item indices, best first.
    """
    user = torch.tensor([user_id] * n_items, dtype=torch.long)
    items = torch.arange(n_items)
    with torch.no_grad():
        preds = model(user, items).numpy()
    # Reverse first, then take a prefix: slicing with [-top_n:] would return
    # the whole array when top_n is 0, because -0 == 0.
    ranked = preds.argsort()[::-1]
    return ranked[:max(top_n, 0)]

# Demo: top-5 internal item indices for the user with internal index 0.
user0_items = recommend(0, top_n=5)
print("Recommendations for User 0:", user0_items)


Loaded MovieLens-100k: 100000 ratings, 943 users, 1682 movies.
Epoch 1, Loss: 8.7544
Epoch 2, Loss: 2.2049
Epoch 3, Loss: 1.0745
Epoch 4, Loss: 0.9334
Epoch 5, Loss: 0.8749
Epoch 6, Loss: 0.8134
Epoch 7, Loss: 0.7337
Epoch 8, Loss: 0.6487
Epoch 9, Loss: 0.5700
Epoch 10, Loss: 0.5004
Epoch 11, Loss: 0.4416
Epoch 12, Loss: 0.3927
Epoch 13, Loss: 0.3508
Epoch 14, Loss: 0.3171
Epoch 15, Loss: 0.2891
Epoch 16, Loss: 0.2666
Epoch 17, Loss: 0.2491
Epoch 18, Loss: 0.2321
Epoch 19, Loss: 0.2199
Epoch 20, Loss: 0.2091
Epoch 21, Loss: 0.1994
Epoch 22, Loss: 0.1917
Epoch 23, Loss: 0.1841
Epoch 24, Loss: 0.1781
Epoch 25, Loss: 0.1728
Epoch 26, Loss: 0.1677
Epoch 27, Loss: 0.1636
Epoch 28, Loss: 0.1602
Epoch 29, Loss: 0.1564
Epoch 30, Loss: 0.1529
Epoch 31, Loss: 0.1502
Epoch 32, Loss: 0.1470
Epoch 33, Loss: 0.1441
Epoch 34, Loss: 0.1419
Epoch 35, Loss: 0.1394
Epoch 36, Loss: 0.1369
Epoch 37, Loss: 0.1348
Epoch 38, Loss: 0.1329
Epoch 39, Loss: 0.1317
Epoch 40, Loss: 0.1308
Epoch 41, Loss: 0.1285
Epo

In [13]:
# Reload movies with proper columns ('u.item' is pipe-separated, latin-1 encoded)
movie_cols = ['movie_id', 'title', 'release_date', 'video_release_date', 'IMDb_URL',
              'unknown','Action','Adventure','Animation','Children','Comedy','Crime',
              'Documentary','Drama','Fantasy','Film-Noir','Horror','Musical','Mystery',
              'Romance','Sci-Fi','Thriller','War','Western']
movies = pd.read_csv('ml-100k/u.item', sep='|', names=movie_cols, encoding='latin-1')

# Re-code movie_id to match df's encoding
movies["item_id"] = movies["movie_id"].astype("category").cat.codes

# The codes above are derived independently from the ones in the ratings
# table, so they only line up if both files cover the same set of movies.
# Fail loudly instead of silently attaching wrong titles to item indices.
assert movies["item_id"].nunique() == n_items, (
    "u.item and u.data cover a different number of movies; "
    "item_id codes would not line up with the trained embeddings"
)

# Build lookup dictionary: internal item_id -> movie title
id_to_title = dict(zip(movies["item_id"], movies["title"]))

def recommend_with_titles(user_id, top_n=5):
    """Return the titles of the `top_n` best-scored movies for `user_id`.

    Ranking is delegated to `recommend` so the scoring logic lives in one
    place; this function only translates the returned internal item indices
    into titles through the module-level `id_to_title` lookup.

    Args:
        user_id: zero-based internal user index.
        top_n: number of titles to return.

    Returns:
        List of movie titles, best first.
    """
    top_items = recommend(user_id, top_n=top_n)   # best N items
    return [id_to_title[item] for item in top_items]

# Demo: same ranking as the index-based demo, shown as movie titles.
print("Top Recommendations for User 0:")
user0_titles = recommend_with_titles(0, top_n=5)
print(user0_titles)

Top Recommendations for User 0:
['Koyaanisqatsi (1983)', 'Amistad (1997)', 'Rob Roy (1995)', 'Blown Away (1994)', 'Rosencrantz and Guildenstern Are Dead (1990)']


In [14]:
def add_new_user_ratings(new_ratings, model, movies, top_n=5):
    """
    Recommend movies for a user who was not part of training.

    The user is represented by the mean of the embeddings of the movies they
    rated, each scaled by its rating. Every movie is then scored by the dot
    product between that vector and the movie's embedding, and the
    highest-scoring movies the user has not rated are returned.

    new_ratings: list of (movie_title, rating) given by the new user
    model: trained MF model (only its `item_emb` embedding table is used)
    movies: movie metadata dataframe with "title" and "item_id" columns
    top_n: number of recommendations to return

    Returns a one-column ("title") dataframe ordered best match first. It is
    empty when none of the titles is known or when `top_n` <= 0, and shorter
    than `top_n` when fewer unrated movies exist. Unknown titles are reported
    on stdout and otherwise ignored.
    """
    # Map titles to item_ids (if a title occurs more than once, the last row wins)
    title_to_id = dict(zip(movies["title"], movies["item_id"]))

    rated_ids = []
    rated_scores = []
    for title, rating in new_ratings:
        if title in title_to_id:
            rated_ids.append(int(title_to_id[title]))
            rated_scores.append(float(rating))
        else:
            print(f"Movie '{title}' not found in dataset.")

    # Take the catalogue size from the model itself rather than a notebook
    # global, so the function depends only on its arguments.
    n_items = model.item_emb.num_embeddings

    # Never ask for more items than there are unrated movies; otherwise topk
    # would raise or masked (already rated) movies would leak into the result.
    n_candidates = min(top_n, n_items - len(set(rated_ids)))
    if not rated_ids or n_candidates <= 0:
        # Averaging zero embeddings would give NaN scores; return no rows.
        return movies.iloc[0:0][["title"]]

    item_ids = torch.tensor(rated_ids, dtype=torch.long)
    ratings = torch.tensor(rated_scores, dtype=torch.float32)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # --- Step 1: Embeddings of the movies the new user rated ---
        item_vecs = model.item_emb(item_ids)

        # Rating-scaled mean of those embeddings stands in for the user vector
        user_vec = (item_vecs * ratings.unsqueeze(1)).mean(dim=0, keepdim=True)

        # --- Step 2: Score all movies ---
        items = torch.arange(n_items)
        preds = (user_vec * model.item_emb(items)).sum(dim=1)

        # --- Step 3: Exclude movies the user already rated ---
        preds[item_ids] = float("-inf")

        # --- Step 4: Top-N item ids, highest score first ---
        top_items = torch.topk(preds, n_candidates).indices.tolist()

    # Map to movie titles while keeping the score order. A plain isin() filter
    # would hand the rows back in dataframe order and lose the ranking.
    rank_of = {item: position for position, item in enumerate(top_items)}
    ranked = movies.assign(_rank=movies["item_id"].map(rank_of))
    ranked = ranked.dropna(subset=["_rank"]).sort_values("_rank", kind="stable")
    return ranked[["title"]]

# Example usage: three ratings from a hypothetical new user
new_user_ratings = [
    ("Braveheart (1995)", 5),
    ("Toy Story (1995)", 4),
    ("Star Wars (1977)", 5),
]

print("Recommendations for New User:")
new_user_recs = add_new_user_ratings(new_user_ratings, model, movies, top_n=5)
print(new_user_recs)


Recommendations for New User:
                                          title
171             Empire Strikes Back, The (1980)
173              Raiders of the Lost Ark (1981)
180                   Return of the Jedi (1983)
209   Indiana Jones and the Last Crusade (1989)
1128                   Chungking Express (1994)
