Import Libraries and Read Data

In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from collections import defaultdict

# Location of the ratings CSV and the columns the rest of the notebook reads.
# ('Title' is also expected by the original notes, but only the three columns
# below are accessed by the cells that follow.)
DATA_PATH = "/content/fina.csv"
REQUIRED_COLUMNS = ['UserID', 'MovieID', 'Rating']

# Read the data from your CSV file
df = pd.read_csv(DATA_PATH)

# Display the first few rows of the DataFrame
print(df.head())

# Ensure that the DataFrame has the required columns.
# Failing here gives a clear message instead of a bare KeyError later on.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing required column(s): {missing_columns}; "
        f"found columns: {list(df.columns)}"
    )


   UserID  MovieID  Rating                        Title
0       1        1     4.0             Toy Story (1995)
1       1        3     4.0      Grumpier Old Men (1995)
2       1        6     4.0                  Heat (1995)
3       1       47     5.0  Seven (a.k.a. Se7en) (1995)
4       1       50     5.0   Usual Suspects, The (1995)


Data Preprocessing

In [12]:
# ------------------- Data Preprocessing ------------------- #

# Distinct raw IDs, in order of first appearance in the DataFrame
user_ids = df['UserID'].unique()
movie_ids = df['MovieID'].unique()

# Lookup tables: raw ID -> contiguous 0-based index, and the inverse
user2idx = dict(zip(user_ids, range(len(user_ids))))
idx2user = dict(enumerate(user_ids))

movie2idx = dict(zip(movie_ids, range(len(movie_ids))))
idx2movie = dict(enumerate(movie_ids))

# Attach the contiguous indices to every rating row
df['user_idx'] = df['UserID'].map(user2idx)
df['movie_idx'] = df['MovieID'].map(movie2idx)

# Sizes of the two index spaces (used to size the embedding tables)
num_users = len(user2idx)
num_movies = len(movie2idx)

print(f'Number of users: {num_users}')
print(f'Number of movies: {num_movies}')

# Hold out 20% of the rating rows for testing; the fixed seed keeps the
# split reproducible across runs
train_df, test_df = train_test_split(df, random_state=42, test_size=0.2)


Number of users: 610
Number of movies: 9724


Model Implementation (Neural Collaborative Filtering - NCF)

In [13]:
# ------------------- Model Implementation ------------------- #

# Hyperparameters
embedding_size = 10           # Length of every latent embedding vector
mlp_layers = [64, 32, 16, 8]  # Width of each MLP hidden layer, in order
learning_rate = 0.001         # Step size for the per-sample gradient updates
reg = 0.0001                  # L2 regularization coefficient
num_epochs = 10               # Passes over the training data
K = 10                        # Cut-off for Precision@K, Recall@K, NDCG@K

# Seed NumPy's global RNG so parameter initialisation is reproducible
np.random.seed(42)


def _init_normal(*shape):
    """Draw an array of the given shape from N(0, 0.1^2) using the global RNG."""
    return np.random.normal(scale=0.1, size=shape)


# NOTE: the order of the draws below determines the initial values for a
# given seed, so it must not be rearranged.

# Embedding tables for the MF branch
user_embedding_mf = _init_normal(num_users, embedding_size)
item_embedding_mf = _init_normal(num_movies, embedding_size)

# Embedding tables for the MLP branch
user_embedding_mlp = _init_normal(num_users, embedding_size)
item_embedding_mlp = _init_normal(num_movies, embedding_size)

# MLP parameters: the first layer consumes the concatenated user+item
# embedding (2 * embedding_size); each later layer consumes the previous
# layer's output.
_layer_inputs = [embedding_size * 2] + mlp_layers[:-1]
mlp_weights = [
    _init_normal(fan_in, fan_out)
    for fan_in, fan_out in zip(_layer_inputs, mlp_layers)
]
mlp_biases = [np.zeros(fan_out) for fan_out in mlp_layers]

# Output layer maps [MF vector ; last MLP activation] to a single score
output_weight = _init_normal(embedding_size + mlp_layers[-1], 1)
output_bias = np.zeros(1)

# Activation functions
def relu(x):
    """Rectified linear unit: element-wise maximum of 0 and ``x``."""
    rectified = np.maximum(0, x)
    return rectified

def relu_derivative(x):
    """Derivative of ReLU: 1.0 where ``x`` is strictly positive, else 0.0.

    ``x`` must be a NumPy array (the comparison result is cast with
    ``.astype``); the returned array has the same shape and float dtype.
    """
    positive_mask = x > 0
    return positive_mask.astype(float)


Define Forward and Backward Pass Functions

In [14]:
# Forward pass function
def forward(user_idx, item_idx):
    """Score one (user, item) pair with the combined MF + MLP model.

    Reads the module-level embedding tables, MLP parameters and output layer.

    Args:
        user_idx: Row index into the user embedding tables.
        item_idx: Row index into the item embedding tables.

    Returns:
        A ``(prediction, cache)`` tuple. ``prediction`` is the scalar model
        output; ``cache`` is a dict of the intermediate values that
        ``backward`` needs (embeddings, per-layer pre-activations and
        activations, and the final concatenated vector).
    """
    # MF branch: element-wise product of the user and item embeddings
    user_vec_mf = user_embedding_mf[user_idx]
    item_vec_mf = item_embedding_mf[item_idx]
    mf_vector = user_vec_mf * item_vec_mf

    # MLP branch: concatenated embeddings pushed through the ReLU layers
    user_vec_mlp = user_embedding_mlp[user_idx]
    item_vec_mlp = item_embedding_mlp[item_idx]

    hidden = np.concatenate([user_vec_mlp, item_vec_mlp])
    activations = [hidden]   # activations[i] is the input to layer i
    pre_activations = []     # pre_activations[i] is layer i's output before ReLU
    for layer_w, layer_b in zip(mlp_weights, mlp_biases):
        z = np.dot(hidden, layer_w) + layer_b
        hidden = relu(z)
        pre_activations.append(z)
        activations.append(hidden)

    # Fuse both branches and apply the linear output layer
    final_vector = np.concatenate([mf_vector, hidden])
    score = np.dot(final_vector, output_weight) + output_bias

    # Intermediate values kept for backpropagation
    cache = dict(
        mf_user_emb=user_vec_mf,
        mf_item_emb=item_vec_mf,
        mlp_user_emb=user_vec_mlp,
        mlp_item_emb=item_vec_mlp,
        activations=activations,
        pre_activations=pre_activations,
        final_vector=final_vector,
    )

    return score.ravel()[0], cache

# Loss function (Mean Squared Error)
def compute_loss(prediction, target):
    """Return the halved squared error ``0.5 * (prediction - target) ** 2``.

    The 0.5 factor makes the derivative with respect to ``prediction``
    simply ``prediction - target``.
    """
    residual = prediction - target
    return 0.5 * residual ** 2

# Backward pass function
def backward(target, prediction, cache, user_idx, item_idx):
    """Run one SGD step for a single (user, item, rating) sample.

    Backpropagates the halved-squared-error loss through the output layer,
    the MF branch and the MLP branch, and updates the module-level
    parameters in place. Every gradient except the bias gradients includes
    an L2 term ``reg * parameter``.

    Args:
        target: Observed rating.
        prediction: Model output returned by ``forward`` for this sample.
        cache: Intermediate values returned by ``forward`` for this sample.
        user_idx: Row of the user embedding tables to update.
        item_idx: Row of the item embedding tables to update.
    """
    global user_embedding_mf, item_embedding_mf
    global user_embedding_mlp, item_embedding_mlp
    global mlp_weights, mlp_biases, output_weight, output_bias

    # dLoss/dPrediction for loss = 0.5 * (prediction - target)^2
    err = prediction - target

    # Output layer gradients (applied at the very end so that the
    # backpropagation below still sees the pre-update weights)
    grad_out_w = np.outer(cache['final_vector'], err) + reg * output_weight
    grad_out_b = err

    # Gradient w.r.t. the fused vector, split into its MF and MLP halves
    grad_final = output_weight.flatten() * err
    grad_mf = grad_final[:embedding_size]
    grad_hidden = grad_final[embedding_size:]

    # MF branch. Both gradients are computed before either update because
    # the cached embeddings come from indexing the embedding tables and
    # may alias the rows being updated.
    user_vec_mf = cache['mf_user_emb']
    item_vec_mf = cache['mf_item_emb']
    grad_user_mf = grad_mf * item_vec_mf + reg * user_vec_mf
    grad_item_mf = grad_mf * user_vec_mf + reg * item_vec_mf

    user_embedding_mf[user_idx] -= learning_rate * grad_user_mf
    item_embedding_mf[item_idx] -= learning_rate * grad_item_mf

    # MLP branch: walk the layers from last to first
    layer_inputs = cache['activations']
    layer_z = cache['pre_activations']
    for layer in range(len(mlp_layers) - 1, -1, -1):
        # Gradient at this layer's pre-activation (ReLU gate)
        delta = grad_hidden * relu_derivative(layer_z[layer])

        # Weight gradient, and the gradient flowing to the previous layer;
        # both use the weights as they were before this step's update
        grad_w = np.outer(layer_inputs[layer], delta) + reg * mlp_weights[layer]
        grad_hidden = np.dot(delta, mlp_weights[layer].T)

        mlp_weights[layer] -= learning_rate * grad_w
        mlp_biases[layer] -= learning_rate * delta

    # What reaches the MLP input is the gradient of the concatenated
    # [user ; item] embedding
    grad_user_mlp = grad_hidden[:embedding_size] + reg * cache['mlp_user_emb']
    grad_item_mlp = grad_hidden[embedding_size:] + reg * cache['mlp_item_emb']

    user_embedding_mlp[user_idx] -= learning_rate * grad_user_mlp
    item_embedding_mlp[item_idx] -= learning_rate * grad_item_mlp

    # Finally apply the output layer update
    output_weight -= learning_rate * grad_out_w
    output_bias -= learning_rate * grad_out_b


Training Loop

In [15]:
# ------------------- Training Loop ------------------- #

# Pull the three columns the loop needs into NumPy arrays once.
# Indexing `train_df.iloc[idx]` inside the loop builds a full row Series
# for every sample of every epoch, which dominates the runtime; positional
# array lookups return the same values far more cheaply.
train_user_arr = train_df['user_idx'].to_numpy()
train_item_arr = train_df['movie_idx'].to_numpy()
train_rating_arr = train_df['Rating'].to_numpy()
num_train = len(train_df)

# Training the model: one SGD step per rating, in a fresh random order
# each epoch
for epoch in range(num_epochs):
    total_loss = 0
    # Shuffle training data (positions into the arrays above)
    shuffled_indices = np.random.permutation(num_train)
    for idx in shuffled_indices:
        user_idx = int(train_user_arr[idx])
        item_idx = int(train_item_arr[idx])
        rating = float(train_rating_arr[idx])

        # Forward pass
        prediction, cache = forward(user_idx, item_idx)

        # Compute loss (accumulated only for reporting)
        loss = compute_loss(prediction, rating)
        total_loss += loss

        # Backward pass and update weights
        backward(rating, prediction, cache, user_idx, item_idx)

    # Mean per-sample loss over the epoch
    avg_loss = total_loss / num_train
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}')


Epoch 1/10, Loss: 0.5658
Epoch 2/10, Loss: 0.4859
Epoch 3/10, Loss: 0.4499
Epoch 4/10, Loss: 0.4353
Epoch 5/10, Loss: 0.4234
Epoch 6/10, Loss: 0.4122
Epoch 7/10, Loss: 0.4019
Epoch 8/10, Loss: 0.3925
Epoch 9/10, Loss: 0.3842
Epoch 10/10, Loss: 0.3767


Evaluation Metrics

In [16]:
# ------------------- Evaluation Metrics ------------------- #

# Extract the needed columns once instead of building a row Series with
# `test_df.iloc[idx]` for every test sample.
test_user_arr = test_df['user_idx'].to_numpy()
test_item_arr = test_df['movie_idx'].to_numpy()
test_rating_arr = test_df['Rating'].to_numpy()

# Predict on the test set
test_predictions = []
test_targets = []

for raw_user, raw_item, raw_rating in zip(test_user_arr, test_item_arr, test_rating_arr):
    user_idx = int(raw_user)
    item_idx = int(raw_item)
    rating = float(raw_rating)

    # Forward pass (the cache is only needed for training)
    prediction, _ = forward(user_idx, item_idx)

    test_predictions.append(prediction)
    test_targets.append(rating)

# Convert to numpy arrays
test_predictions = np.array(test_predictions)
test_targets = np.array(test_targets)

# Compute MAE and RMSE over all held-out ratings
mae = np.mean(np.abs(test_predictions - test_targets))
rmse = np.sqrt(np.mean((test_predictions - test_targets) ** 2))

print(f'\nTest MAE: {mae:.4f}')
print(f'Test RMSE: {rmse:.4f}')



Test MAE: 0.6875
Test RMSE: 0.9067


Compute Precision@K, Recall@K, NDCG@K

In [17]:
# Precision@K, Recall@K, NDCG@K
#
# For every user with at least one test rating, score all movies the user
# did not rate in the training split, take the K highest-scoring ones, and
# count a hit for each that appears among the user's test items.

# Build user-item interactions for test data
user_test_items = defaultdict(set)
for row in test_df.itertuples():
    user_test_items[int(row.user_idx)].add(int(row.movie_idx))

# Build user-item interactions for training data
user_train_items = defaultdict(set)
for row in train_df.itertuples():
    user_train_items[int(row.user_idx)].add(int(row.movie_idx))

precision_list = []
recall_list = []
ndcg_list = []

# All movie indices
all_movie_indices = np.arange(num_movies)

# Loop-invariant values, hoisted out of the per-user loop:
#  - the full candidate set (previously rebuilt from the array for every user)
#  - the DCG position discounts 1 / log2(position + 2); the +2 is because
#    positions are 0-based and log2(1) = 0
all_movie_set = set(all_movie_indices)
rank_discounts = [1.0 / np.log2(position + 2) for position in range(K)]

for user_idx in range(num_users):
    train_items = user_train_items[user_idx]
    test_items = user_test_items[user_idx]

    if not test_items:
        continue  # Skip users with no test data

    # Items to predict (exclude items in training data)
    items_to_predict = list(all_movie_set - train_items)
    predictions = [
        (item_idx, forward(user_idx, item_idx)[0])
        for item_idx in items_to_predict
    ]

    # Rank items by predicted score (stable sort, highest first)
    predictions.sort(key=lambda x: x[1], reverse=True)
    ranked_items = [item for item, score in predictions]

    # Top K items
    top_k_items = ranked_items[:K]

    # 1 where the recommended item is one of the user's test items
    hits = [1 if item in test_items else 0 for item in top_k_items]

    # Precision@K
    precision = np.sum(hits) / K
    precision_list.append(precision)

    # Recall@K
    recall = np.sum(hits) / len(test_items)
    recall_list.append(recall)

    # NDCG@K: DCG of the hit list divided by the DCG of an ideal list that
    # places min(len(test_items), K) hits at the top
    dcg = sum(hit * discount for hit, discount in zip(hits, rank_discounts))
    idcg = sum(rank_discounts[:min(len(test_items), K)])
    ndcg = dcg / idcg if idcg > 0 else 0
    ndcg_list.append(ndcg)

# Compute average metrics over the evaluated users
avg_precision = np.mean(precision_list)
avg_recall = np.mean(recall_list)
avg_ndcg = np.mean(ndcg_list)

print(f'\nAverage Precision@{K}: {avg_precision:.4f}')
print(f'Average Recall@{K}: {avg_recall:.4f}')
print(f'Average NDCG@{K}: {avg_ndcg:.4f}')



Average Precision@10: 0.0467
Average Recall@10: 0.0211
Average NDCG@10: 0.0542


Save the Trained Model

In [18]:
# ------------------- Save the Trained Model ------------------- #

# Embedding tables: one .npy file each, written to the working directory
embedding_files = {
    'user_embedding_mf.npy': user_embedding_mf,
    'item_embedding_mf.npy': item_embedding_mf,
    'user_embedding_mlp.npy': user_embedding_mlp,
    'item_embedding_mlp.npy': item_embedding_mlp,
}
for filename, table in embedding_files.items():
    np.save(filename, table)

# MLP layers: one archive for weights and one for biases; the arrays are
# passed positionally, so NumPy stores them as arr_0, arr_1, ... in layer order
np.savez('mlp_weights.npz', *mlp_weights)
np.savez('mlp_biases.npz', *mlp_biases)

# Output layer weights and bias
for filename, param in (
    ('output_weight.npy', output_weight),
    ('output_bias.npy', output_bias),
):
    np.save(filename, param)

print("Model parameters have been saved.")


Model parameters have been saved.
