In [3]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn

In [None]:
# Neural Collaborative Filtering cell (to be added at index 1)

# paths (adjust if needed); both are relative to the notebook's working directory
ratings_path = "../ml-32m/ratings.csv"
movies_path = "../ml-32m/movies.csv"

# load data (pd is already imported in another cell)
# Only the three columns used below are parsed, so any other column in the
# ratings file is never materialised in memory. The trailing selection pins
# the column order to userId, movieId, rating regardless of the file layout.
rating_cols = ['userId', 'movieId', 'rating']
ratings = pd.read_csv(ratings_path, usecols=rating_cols)[rating_cols]
movies = pd.read_csv(movies_path)

# map ids to contiguous indices
# pd.factorize numbers the distinct values 0..n-1 in order of first appearance,
# which is the same numbering as enumerating .unique(), and it returns the
# per-row codes in the same pass (no separate unique() + map() round trip).
user_codes, user_ids = pd.factorize(ratings['userId'].to_numpy())
movie_codes, movie_ids = pd.factorize(ratings['movieId'].to_numpy())

# original id -> contiguous index lookups (position k of *_ids maps to k)
user2idx = {u: i for i, u in enumerate(user_ids)}
movie2idx = {m: i for i, m in enumerate(movie_ids)}
ratings['user_idx'] = user_codes
ratings['movie_idx'] = movie_codes

# normalize ratings to [0,1] (min-max scaling over the whole ratings table)
r_min, r_max = ratings['rating'].min(), ratings['rating'].max()
ratings['rating_norm'] = (ratings['rating'] - r_min) / (r_max - r_min)

# train / val split: 10% of the rows are held out; fixed random_state keeps the
# split identical between runs
train_df, val_df = train_test_split(ratings, test_size=0.1, random_state=42)

class RatingDataset(Dataset):
  """Map-style dataset yielding (user index, movie index, normalized rating).

  The three columns 'user_idx', 'movie_idx' and 'rating_norm' of `df` are
  copied into NumPy arrays once, so item access is plain array indexing.
  """

  def __init__(self, df):
    def column(name, dtype):
      # .values drops the pandas index; astype returns a new array of `dtype`
      return df[name].values.astype(dtype)

    self.u = column('user_idx', np.int64)
    self.i = column('movie_idx', np.int64)
    self.r = column('rating_norm', np.float32)

  def __len__(self):
    # all three arrays come from the same frame, so any of them gives the size
    return self.r.shape[0]

  def __getitem__(self, idx):
    # (user, item, rating) triple for position `idx`
    return tuple(values[idx] for values in (self.u, self.i, self.r))

# wrap the two splits so DataLoader can batch them
train_ds = RatingDataset(train_df)
val_ds = RatingDataset(val_df)

batch_size = 1024
# training batches are reshuffled every epoch; the final short batch is kept
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
# validation order does not matter, so it is left unshuffled
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

# Pick the fastest available backend: Apple MPS first (the original choice),
# then CUDA, then CPU. getattr guards torch builds that have no
# torch.backends.mps module at all.
if getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
  device = torch.device("mps")
elif torch.cuda.is_available():
  device = torch.device("cuda")
else:
  device = torch.device("cpu")

class NCF(nn.Module):
  """Neural collaborative filtering model: two embeddings feeding an MLP.

  A user embedding and an item embedding are concatenated and passed through
  a stack of Linear + ReLU layers, followed by a final Linear layer with a
  single output. No activation is applied to that output, so the returned
  score is unbounded.

  Args:
    n_users: number of rows in the user embedding table.
    n_items: number of rows in the item embedding table.
    emb_size: width of each embedding vector.
    hidden: sizes of the hidden Linear layers, in order. Any iterable of
      ints works; the default is a tuple so it cannot be mutated between
      instances.
  """

  def __init__(self, n_users, n_items, emb_size=64, hidden=(128, 64, 32)):
    super().__init__()
    self.user_emb = nn.Embedding(n_users, emb_size)
    self.item_emb = nn.Embedding(n_items, emb_size)
    layers = []
    # the MLP input is the user and item embeddings side by side
    input_dim = emb_size * 2
    for h in hidden:
      layers.append(nn.Linear(input_dim, h))
      layers.append(nn.ReLU(inplace=True))
      input_dim = h
    # final projection to one score per (user, item) pair
    layers.append(nn.Linear(input_dim, 1))
    self.mlp = nn.Sequential(*layers)
    self._init_weights()

  def _init_weights(self):
    """Initialise embeddings from N(0, 0.01^2) and Linear layers with Xavier-uniform weights and zero bias."""
    nn.init.normal_(self.user_emb.weight, std=0.01)
    nn.init.normal_(self.item_emb.weight, std=0.01)
    for m in self.mlp:
      if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)

  def forward(self, u, i):
    """Score each (user, item) pair.

    Args:
      u: 1-D integer tensor of user indices.
      i: 1-D integer tensor of item indices, same length as `u`.

    Returns:
      1-D tensor with one score per pair.
    """
    u_e = self.user_emb(u)
    i_e = self.item_emb(i)
    x = torch.cat([u_e, i_e], dim=1)
    # the MLP yields shape (batch, 1); drop the trailing singleton dimension
    out = self.mlp(x).squeeze(1)
    return out

# Seed torch's global RNG so the weight initialisation below (and later draws
# from the global generator, e.g. default DataLoader shuffling) repeat between
# runs on the same setup. 42 mirrors the random_state of the train/val split.
torch.manual_seed(42)

# embedding table sizes: one row per distinct user / movie id
n_users = len(user_ids)
n_items = len(movie_ids)
model = NCF(n_users, n_items, emb_size=64, hidden=[128,64,32]).to(device)

# Adam with a small L2 penalty (weight_decay); MSE against the normalized rating
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
criterion = nn.MSELoss()

def evaluate(loader):
  """Return the mean per-sample loss of the global `model` over `loader`.

  Uses the module-level `model`, `criterion` and `device`. The model is put
  in eval mode and left there; gradients are disabled during the pass.

  Args:
    loader: iterable of (user indices, item indices, ratings) tensor batches.

  Returns:
    Loss averaged over all samples as a Python float, or NaN when the loader
    yields no samples (instead of raising ZeroDivisionError).
  """
  model.eval()
  loss_sum = 0.0
  n = 0
  with torch.no_grad():
    for u, i, r in loader:
      u = u.to(device)
      i = i.to(device)
      r = r.to(device)
      batch_n = r.size(0)
      # criterion gives one value per batch; weighting it by the batch size
      # keeps a shorter final batch from being over-counted in the average
      loss_sum += criterion(model(u, i), r).item() * batch_n
      n += batch_n
  if n == 0:
    # nothing was evaluated, so the mean is undefined
    return float("nan")
  return loss_sum / n

# Training: a handful of full passes over train_loader, reporting the
# validation MSE after each pass (raise `epochs` for better embeddings).
epochs = 6
for epoch in range(1, epochs + 1):
  model.train()  # evaluate() leaves the model in eval mode, so switch back
  for batch in train_loader:
    # move the (user, item, rating) tensors of this batch to the device
    u, i, r = (t.to(device) for t in batch)
    # clear gradients left over from the previous step before the new pass
    optimizer.zero_grad()
    pred = model(u, i)
    loss = criterion(pred, r)
    loss.backward()
    optimizer.step()
  val_loss = evaluate(val_loader)
  print(f"Epoch {epoch}/{epochs}  val_mse={val_loss:.6f}")

# extract embeddings and map back to original ids
# .detach() reads the parameter values without autograd tracking; it replaces
# the legacy .data attribute and yields the same numbers.
user_embs = model.user_emb.weight.detach().cpu().numpy()
movie_embs = model.item_emb.weight.detach().cpu().numpy()


def _embeddings_frame(embs, ids, id_name):
  """Wrap an embedding matrix in a DataFrame indexed by the original ids.

  Row k of `embs` is labelled with ids[k] (cast to int64); the index is named
  `id_name` and the columns are the embedding dimensions 0..embs.shape[1]-1.
  """
  index = pd.Index(np.asarray(ids, dtype=np.int64), name=id_name)
  return pd.DataFrame(embs, index=index)


# create DataFrames with original ids
user_embeddings_df = _embeddings_frame(user_embs, user_ids, 'userId')
movie_embeddings_df = _embeddings_frame(movie_embs, movie_ids, 'movieId')


# save embeddings (optional); files are written to the current working directory
user_embeddings_df.to_pickle("user_embeddings.pkl")
movie_embeddings_df.to_pickle("movie_embeddings.pkl")

print("user embeddings shape:", user_embs.shape)
print("movie embeddings shape:", movie_embs.shape)

KeyboardInterrupt: 