In [None]:
!pip install faiss-cpu scikit-learn pandas


Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [None]:
import pandas as pd
import zipfile
import requests
import os

# Define dataset URL
dataset_url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
dataset_path = "ml-100k.zip"
extract_path = "ml-100k"

# Download and extract dataset if not already present
if not os.path.exists(extract_path):
    print("Downloading MovieLens 100K dataset...")
    response = requests.get(dataset_url, stream=True)
    with open(dataset_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=1024):
            f.write(chunk)

    # Extract the dataset
    print("Extracting dataset...")
    with zipfile.ZipFile(dataset_path, "r") as zip_ref:
        zip_ref.extractall(".")

# Load ratings file
ratings_path = os.path.join(extract_path, "u.data")
ratings = pd.read_csv(
    ratings_path,
    sep="\t",
    names=["userId", "movieId", "rating", "timestamp"]
)

# Load movies file
movies_path = os.path.join(extract_path, "u.item")
movies = pd.read_csv(
    movies_path,
    sep="|",
    encoding="latin-1",
    names=["movieId", "title", "release_date", "video_release_date", "IMDb_URL",
           "unknown", "Action", "Adventure", "Animation", "Children", "Comedy",
           "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
           "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"],
    usecols=["movieId", "title"]
)

# Merge ratings with movie titles
df = ratings.merge(movies, on="movieId", how="left")

# Preview the dataset
print(f"Loaded {len(df)} movie ratings.")
df.head()


Loaded 100000 movie ratings.


Unnamed: 0,userId,movieId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
import faiss

# Create user-movie interaction matrix
user_movie_matrix = df.pivot(index="userId", columns="movieId", values="rating").fillna(0)

# Normalize embeddings for better retrieval accuracy
scaler = StandardScaler()
user_embeddings = scaler.fit_transform(user_movie_matrix)  # Users (num_users, feature_dim)
movie_embeddings = scaler.fit_transform(user_movie_matrix.T)  # Movies (num_movies, feature_dim)

# Ensure correct embedding dimensions
movie_embeddings = movie_embeddings.T  # (num_movies, feature_dim)

print(f"User Embeddings Shape: {user_embeddings.shape}")
print(f"Movie Embeddings Shape: {movie_embeddings.shape}")


User Embeddings Shape: (943, 1682)
Movie Embeddings Shape: (943, 1682)


In [None]:
# Create FAISS Index
d = movie_embeddings.shape[1]  # Feature dimension
faiss_index = faiss.IndexFlatL2(d)  # L2 distance search
faiss_index.add(movie_embeddings.astype(np.float32))  # Ensure correct dtype

print(f"Stored {len(movie_embeddings)} movies in FAISS index.")

# 🔹 **Generate Candidate Movies for a User**
def generate_candidates(user_id, top_k=10):
    """Retrieve candidate movies for a user based on similarity in embedding space."""
    if user_id not in user_movie_matrix.index:
        return pd.DataFrame(columns=["movieId", "title"])

    # Extract user vector & reshape correctly
    user_vector = np.array(user_embeddings[user_id - 1]).astype(np.float32).reshape(1, -1)

    # Search for nearest neighbors in FAISS
    _, indices = faiss_index.search(user_vector, top_k)

    # Retrieve movie IDs
    movie_ids = [user_movie_matrix.columns[idx] for idx in indices[0]]

    return movies[movies["movieId"].isin(movie_ids)]

# Test candidate generation
user_id = 10
print(generate_candidates(user_id, top_k=10))


Stored 943 movies in FAISS index.
     movieId                                              title
5          6  Shanghai Triad (Yao a yao yao dao waipo qiao) ...
9         10                                 Richard III (1995)
233      234                                        Jaws (1975)
307      308                     FairyTale: A True Story (1997)
320      321                                      Mother (1996)
388      389                                Black Beauty (1994)
397      398                           Super Mario Bros. (1993)
425      426                Transformers: The Movie, The (1986)
473      474  Dr. Strangelove or: How I Learned to Stop Worr...
536      537                        My Own Private Idaho (1991)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Convert Data to Torch Tensors
user_ids = torch.tensor(df["userId"].values - 1, dtype=torch.long)  # User IDs (zero-indexed)
movie_ids = torch.tensor(df["movieId"].values - 1, dtype=torch.long)  # Movie IDs (zero-indexed)
ratings = torch.tensor(df["rating"].values / 5.0, dtype=torch.float32)  # Normalize Ratings (0 to 1)

# Create DataLoader
batch_size = 512
dataset = TensorDataset(user_ids, movie_ids, ratings)
train_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define NCF Model
class NCF(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=64):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)

        self.mlp = nn.Sequential(
            nn.Linear(embedding_dim * 2, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
            nn.Sigmoid()  # Output probability of liking the movie
        )

    def forward(self, user_ids, movie_ids):
        user_embeds = self.user_embedding(user_ids)
        movie_embeds = self.movie_embedding(movie_ids)
        interaction = torch.cat([user_embeds, movie_embeds], dim=-1)
        return self.mlp(interaction)

# Instantiate model
num_users = df["userId"].nunique()
num_movies = df["movieId"].nunique()
embedding_dim = 64

model = NCF(num_users, num_movies, embedding_dim).cuda()
print("Model initialized")


Model initialized


In [None]:
# Loss and Optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
def train_model(model, train_loader, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for users, movies, labels in train_loader:
            users, movies, labels = users.cuda(), movies.cuda(), labels.cuda()

            optimizer.zero_grad()
            predictions = model(users, movies).squeeze()
            loss = criterion(predictions, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss / len(train_loader):.4f}")

train_model(model, train_loader, epochs=10)


Epoch 1/10, Loss: 0.5987
Epoch 2/10, Loss: 0.5785
Epoch 3/10, Loss: 0.5704
Epoch 4/10, Loss: 0.5663
Epoch 5/10, Loss: 0.5634
Epoch 6/10, Loss: 0.5611
Epoch 7/10, Loss: 0.5592
Epoch 8/10, Loss: 0.5573
Epoch 9/10, Loss: 0.5553
Epoch 10/10, Loss: 0.5536


In [None]:
def rank_candidates(user_id, candidate_movies):
    """Score & rank candidate movies using NCF model."""
    user_tensor = torch.tensor([user_id - 1] * len(candidate_movies), dtype=torch.long).cuda()
    movie_tensor = torch.tensor(candidate_movies["movieId"].values - 1, dtype=torch.long).cuda()

    with torch.no_grad():
        scores = model(user_tensor, movie_tensor).cpu().numpy().flatten()

    candidate_movies["score"] = scores
    return candidate_movies.sort_values(by="score", ascending=False).head(10)

# Generate & Rank Candidates
candidates = generate_candidates(user_id=10, top_k=50)
ranked_movies = rank_candidates(user_id=10, candidate_movies=candidates)
print(ranked_movies)


     movieId                                              title     score
312      313                                     Titanic (1997)  0.935273
193      194                                  Sting, The (1973)  0.900938
658      659                        Arsenic and Old Lace (1944)  0.887978
693      694                                  Persuasion (1995)  0.883072
497      498                          African Queen, The (1951)  0.881598
523      524                         Great Dictator, The (1940)  0.870980
536      537                        My Own Private Idaho (1991)  0.869678
9         10                                 Richard III (1995)  0.868200
473      474  Dr. Strangelove or: How I Learned to Stop Worr...  0.866770
150      151       Willy Wonka and the Chocolate Factory (1971)  0.859268


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  candidate_movies["score"] = scores
