In [14]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import torch.nn as nn

In [None]:
# Neural Collaborative Filtering cell (to be added at index 1)

import os

# paths (adjust if needed)
ratings_path = "../../ml-32m/ratings.csv"
movies_path = "../../ml-32m/movies.csv"
user_emb_pkl = "user_embeddings.pkl"
movie_emb_pkl = "movie_embeddings.pkl"

# load data
# we'll need this data later on even if not used here
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# Check if embeddings already exist
if os.path.exists(user_emb_pkl) and os.path.exists(movie_emb_pkl):
    print("Embedding files already exist. Skipping training and loading embeddings from pkl files.")
    user_embeddings_df = pd.read_pickle(user_emb_pkl)
    movie_embeddings_df = pd.read_pickle(movie_emb_pkl)
    user_embs = user_embeddings_df.values
    movie_embs = movie_embeddings_df.values
    print("user embeddings shape:", user_embeddings_df.shape)
    print("movie embeddings shape:", movie_embeddings_df.shape)
else:
    # keep only necessary columns
    ratings = ratings[['userId', 'movieId', 'rating']]

    # map ids to contiguous indices
    user_ids = ratings['userId'].unique()
    movie_ids = ratings['movieId'].unique()
    user2idx = {u: i for i, u in enumerate(user_ids)}
    movie2idx = {m: i for i, m in enumerate(movie_ids)}
    ratings['user_idx'] = ratings['userId'].map(user2idx)
    ratings['movie_idx'] = ratings['movieId'].map(movie2idx)

    # normalize ratings to [0,1]
    r_min, r_max = ratings['rating'].min(), ratings['rating'].max()
    ratings['rating_norm'] = (ratings['rating'] - r_min) / (r_max - r_min)

    # train / val split
    train_df, val_df = train_test_split(ratings, test_size=0.1, random_state=42)

    class RatingDataset(Dataset):
      def __init__(self, df):
        self.u = df['user_idx'].values.astype(np.int64)
        self.i = df['movie_idx'].values.astype(np.int64)
        self.r = df['rating_norm'].values.astype(np.float32)
      def __len__(self):
        return len(self.r)
      def __getitem__(self, idx):
        return self.u[idx], self.i[idx], self.r[idx]

    train_ds = RatingDataset(train_df)
    val_ds = RatingDataset(val_df)

    batch_size = 1024
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, drop_last=False)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    if torch.cuda.is_available():
      device = torch.device("cuda")
    elif torch.backends.mps.is_available():
      device = torch.device("mps")
    else:
      device = torch.device("cpu")

    class NCF(nn.Module):
      def __init__(self, n_users, n_items, emb_size=64, hidden=[128,64,32]):
        super().__init__()
        self.user_emb = nn.Embedding(n_users, emb_size)
        self.item_emb = nn.Embedding(n_items, emb_size)
        layers = []
        input_dim = emb_size * 2
        for h in hidden:
          layers.append(nn.Linear(input_dim, h))
          layers.append(nn.ReLU(inplace=True))
          input_dim = h
        layers.append(nn.Linear(input_dim, 1))
        self.mlp = nn.Sequential(*layers)
        self._init_weights()
      def _init_weights(self):
        nn.init.normal_(self.user_emb.weight, std=0.01)
        nn.init.normal_(self.item_emb.weight, std=0.01)
        for m in self.mlp:
          if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)
      def forward(self, u, i):
        u_e = self.user_emb(u)
        i_e = self.item_emb(i)
        x = torch.cat([u_e, i_e], dim=1)
        out = self.mlp(x).squeeze(1)
        return out

    n_users = len(user_ids)
    n_items = len(movie_ids)
    model = NCF(n_users, n_items, emb_size=64, hidden=[128,64,32]).to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-5)
    criterion = nn.MSELoss()

    def evaluate(loader):
      model.eval()
      loss_sum = 0.0
      n = 0
      with torch.no_grad():
        for u,i,r in loader:
          u = u.to(device); i = i.to(device); r = r.to(device)
          pred = model(u,i)
          loss = criterion(pred, r)
          loss_sum += loss.item() * r.size(0)
          n += r.size(0)
      return loss_sum / n

    # training loop (few epochs; increase epochs for better embeddings)
    epochs = 6
    for epoch in range(1, epochs+1):
      model.train()
      for u,i,r in train_loader:
        u = u.to(device); i = i.to(device); r = r.to(device)
        pred = model(u,i)
        loss = criterion(pred, r)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
      val_loss = evaluate(val_loader)
      print(f"Epoch {epoch}/{epochs}  val_mse={val_loss:.6f}")

    # extract embeddings and map back to original ids
    user_embs = model.user_emb.weight.data.cpu().numpy()
    movie_embs = model.item_emb.weight.data.cpu().numpy()

    # create DataFrames with original ids
    user_embeddings_df = pd.DataFrame(user_embs)
    user_embeddings_df['userId'] = [int(user_ids[i]) for i in range(n_users)]
    user_embeddings_df = user_embeddings_df.set_index('userId')

    movie_embeddings_df = pd.DataFrame(movie_embs)
    movie_embeddings_df['movieId'] = [int(movie_ids[i]) for i in range(n_items)]
    movie_embeddings_df = movie_embeddings_df.set_index('movieId')

    # save embeddings (optional)
    user_embeddings_df.to_pickle(user_emb_pkl)
    movie_embeddings_df.to_pickle(movie_emb_pkl)

    print("user embeddings shape:", user_embs.shape)
    print("movie embeddings shape:", movie_embs.shape)

Embedding files already exist. Skipping training and loading embeddings from pkl files.
user embeddings shape: (200948, 64)
movie embeddings shape: (84432, 64)


In [16]:
# Find top 20 most rated movies
rating_counts = ratings.groupby('movieId').size().reset_index(name='rating_count')
top_20_movies = rating_counts.nlargest(20, 'rating_count')
top_20_with_info = top_20_movies.merge(movies, on='movieId')
top_20_with_info = top_20_with_info.sort_values('rating_count', ascending=False).reset_index(drop=True)

# Display top 20 movies for user selection
print("=" * 80)
print("TOP 20 MOST RATED MOVIES ON THE PLATFORM")
print("=" * 80)
print()
for idx, row in top_20_with_info.iterrows():
    print(f"{idx + 1:2d}. {row['title']:50s} | {row['genres']:30s} | {row['rating_count']:,} ratings")
print()
print("=" * 80)

# Collect user's 3 favorite movies with validation
selected_movies = None
selected_movie_ids = None  # GLOBAL VARIABLE to store the selected movie IDs

while selected_movies is None:
    user_input = input("\nPlease select 3 of your favorite movies by entering their numbers (1-20), separated by commas: ")
    
    try:
        # Parse input
        selections = [s.strip() for s in user_input.split(',')]
        
        # Validate count
        if len(selections) != 3:
            print(f"❌ Error: Please select exactly 3 movies (you entered {len(selections)})")
            continue
        
        # Validate numeric and range
        numbers = []
        for s in selections:
            if not s.isdigit():
                print(f"❌ Error: '{s}' is not a valid number. Please enter only numbers.")
                break
            num = int(s)
            if num < 1 or num > 20:
                print(f"❌ Error: {num} is out of range. Please enter numbers between 1 and 20.")
                break
            numbers.append(num)
        else:
            # Check for duplicates
            if len(numbers) != len(set(numbers)):
                print("❌ Error: Please select 3 different movies (no duplicates)")
                continue
            
            # Convert to 0-indexed and extract movie info
            indices = [n - 1 for n in numbers]
            movie_ids_selected = top_20_with_info.iloc[indices]['movieId'].tolist()
            selected_movies = {
                'movieIds': movie_ids_selected,
                'titles': top_20_with_info.iloc[indices]['title'].tolist(),
                'genres': top_20_with_info.iloc[indices]['genres'].tolist(),
                'user_selections': numbers
            }
            
            # Store in global variable for later use (movie IDs only)
            selected_movie_ids = movie_ids_selected
            
            # Confirm selections
            print("\n" + "=" * 80)
            print("✓ You selected:")
            for i, (num, title) in enumerate(zip(numbers, selected_movies['titles']), 1):
                print(f"  {i}. {title}")
            print("=" * 80)
    
    except Exception as e:
        print(f"❌ Error: An unexpected error occurred: {str(e)}")
        continue


TOP 20 MOST RATED MOVIES ON THE PLATFORM

 1. Shawshank Redemption, The (1994)                   | Crime|Drama                    | 102,929 ratings
 2. Forrest Gump (1994)                                | Comedy|Drama|Romance|War       | 100,296 ratings
 3. Pulp Fiction (1994)                                | Comedy|Crime|Drama|Thriller    | 98,409 ratings
 4. Matrix, The (1999)                                 | Action|Sci-Fi|Thriller         | 93,808 ratings
 5. Silence of the Lambs, The (1991)                   | Crime|Horror|Thriller          | 90,330 ratings
 6. Star Wars: Episode IV - A New Hope (1977)          | Action|Adventure|Sci-Fi        | 85,010 ratings
 7. Fight Club (1999)                                  | Action|Crime|Drama|Thriller    | 77,332 ratings
 8. Jurassic Park (1993)                               | Action|Adventure|Sci-Fi|Thriller | 75,233 ratings
 9. Schindler's List (1993)                            | Drama|War                      | 73,849 ratings
10. Lord 

In [17]:
# Calculate the new user's embedding by averaging the embeddings of the selected movies
# Assume that 'movie_embeddings' is a DataFrame indexed by movieId with embedding columns (e.g., embedding_0, embedding_1, ..., embedding_n)
# If the embedding columns are not known, you may have to adjust the column selection

# Collect embeddings for the selected movies
selected_embeddings = movie_embeddings_df.loc[selected_movie_ids]

# Convert to numpy (if necessary) and compute the mean across the selected movies
user_embedding = selected_embeddings.mean(axis=0).values

# 'user_embedding' now represents the new user's embedding as the average of the three selected movies' embeddings


In [18]:
print(user_embedding)

[-1.63515303e-02  1.18975714e-02 -8.28197226e-05 -1.37931705e-02
 -2.23015863e-02  1.21638679e-03 -1.06335729e-02  7.95494858e-03
 -1.50458654e-02  2.25625955e-03  1.33865438e-02 -4.35068225e-03
 -1.36610949e-02 -8.87140632e-04 -2.57609732e-04 -9.61612910e-03
 -8.74719489e-03 -5.66151254e-02 -1.08718621e-02  1.19886147e-02
  2.04771105e-02  1.34230889e-02  3.26241180e-03 -3.71041261e-02
 -1.12212263e-03  7.42857484e-03  9.77769122e-03  2.05876511e-02
  3.88471596e-02  1.63469315e-01 -4.87460801e-03 -3.19530666e-02
  1.18800206e-02  1.02151604e-02 -1.20166875e-01  1.73435267e-02
  8.45378730e-03 -5.57838520e-03  1.84103921e-02 -1.31772682e-02
  1.27190128e-01  7.11310888e-03 -1.10981183e-03 -6.39945129e-03
 -1.61919277e-02 -5.35232341e-03 -9.53991956e-04  1.52504407e-02
 -9.12806671e-03 -5.47446683e-03  1.95870008e-02  1.00748604e-02
  1.25926584e-02 -9.11147147e-03 -1.12572610e-02  1.27506377e-02
 -6.56734221e-03 -2.56615551e-03 -5.77133335e-03 -1.21947750e-02
  5.74509613e-04  1.57906

In [25]:
import faiss
import numpy as np

# Fix for handling embedding columns named as ints (e.g., 0, 1, 2, ...)
# Use all columns except for obvious metadata, or check dtype if possible.
metadata_columns = {"movieId", "title", "genres"}
embedding_columns = [col for col in movie_embeddings_df.columns if col not in metadata_columns]

# If embedding columns are still not detected, attempt to check for numeric-only columns
if not embedding_columns:
    embedding_columns = [col for col in movie_embeddings_df.columns if isinstance(col, int) or (isinstance(col, str) and col.isdigit())]

if not embedding_columns:
    raise ValueError(f"No embedding columns found in movie_embeddings_df! Columns found: {list(movie_embeddings_df.columns)}")

movie_embedding_matrix = movie_embeddings_df[embedding_columns].to_numpy(dtype='float32')
if movie_embedding_matrix.shape[1] == 0:
    raise ValueError(f"movie_embedding_matrix has zero columns! Shape: {movie_embedding_matrix.shape}. Columns used: {embedding_columns}")

if user_embedding.shape[0] != movie_embedding_matrix.shape[1]:
    raise ValueError(
        f"Dimension mismatch: user_embedding.shape={user_embedding.shape}, "
        f"embedding_matrix.shape={movie_embedding_matrix.shape}, "
        f"embedding_columns={embedding_columns}"
    )

# Normalize embeddings for cosine similarity
movie_embedding_matrix_normalized = movie_embedding_matrix.copy()
faiss.normalize_L2(movie_embedding_matrix_normalized)
index = faiss.IndexFlatIP(movie_embedding_matrix_normalized.shape[1])
index.add(movie_embedding_matrix_normalized)

user_emb_norm = user_embedding.astype('float32').reshape(1, -1)
faiss.normalize_L2(user_emb_norm)

try:
    D, I = index.search(user_emb_norm, 10)
except Exception as e:
    print(f"❌ Error during FAISS search: {e}")
    print(f"user_emb_norm.shape={user_emb_norm.shape}, index dimension={index.d}")
    raise

# Map back to DataFrame indices
recommended_movies = movie_embeddings_df.iloc[I[0]]

print("Top 10 movie recommendations for the new user:")
for rank, (idx, score) in enumerate(zip(I[0], D[0]), 1):
    # idx is the integer positional index, get the corresponding row from the DataFrame
    movie_row = movie_embeddings_df.iloc[idx]
    movie_id = movie_row['movieId'] if 'movieId' in movie_row else movie_row.name
    # Lookup title from the `movies` DataFrame using movieId
    if 'movieId' in movie_row:
        lookup_id = movie_row['movieId']
    else:
        lookup_id = movie_row.name
    match = movies[movies['movieId'] == lookup_id]
    if not match.empty and 'title' in match.columns:
        real_title = match.iloc[0]['title']
    else:
        real_title = str(lookup_id)
    print(f"{rank}. {real_title} (movieId {movie_id}) - score: {score:.4f}")


Top 10 movie recommendations for the new user:
1. Shawshank Redemption, The (1994) (movieId 318) - score: 0.9527
2. Ip Man (2008) (movieId 65514) - score: 0.9295
3. Lovers of the Café Flore (2006) (movieId 156551) - score: 0.9260
4. I Am Mother (2019) (movieId 202103) - score: 0.9236
5. The Way (2010) (movieId 87194) - score: 0.9207
6. Apocalyptic (2013) (movieId 153516) - score: 0.9185
7. The Marquis (2011) (movieId 141446) - score: 0.9181
8. Loving Annabelle (2006) (movieId 50183) - score: 0.9177
9. A Glitch in the Matrix (2021) (movieId 238052) - score: 0.9176
10. Hacker (2016) (movieId 166755) - score: 0.9159
