In [1]:
# Cell 1: Import Necessary Libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import warnings
import os

# Deep Learning Libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

warnings.filterwarnings('ignore')


In [2]:
# Cell 2: Load and Preview the Dataset

# Read the ratings table from disk into a DataFrame
df = pd.read_csv(
    "C:/Users/anujp/OneDrive/Desktop/MovieRecommendations/data/Final_data/Final_data.csv"
)

# Show the leading rows as a quick sanity check
print("Data preview:")
print(df.head())

# Report the (rows, columns) size of the table
print("\nDataset shape:", df.shape)


Data preview:
   UserID  MovieID  Rating                 Title
0       1      122     5.0      Boomerang (1992)
1       1      185     5.0       Net, The (1995)
2       1      231     5.0  Dumb & Dumber (1994)
3       1      292     5.0       Outbreak (1995)
4       1      316     5.0       Stargate (1994)

Dataset shape: (10000054, 4)


In [3]:
# Cell 3: Data Preprocessing

from sklearn.preprocessing import LabelEncoder

# Min-max scale the ratings so they fall in [0, 1]
max_rating = df['Rating'].max()
min_rating = df['Rating'].min()
rating_span = max_rating - min_rating
df['Rating_normalized'] = (df['Rating'] - min_rating) / rating_span

# Map raw UserID / MovieID values onto contiguous indices starting at 0
user_encoder = LabelEncoder()
item_encoder = LabelEncoder()
df['user'] = user_encoder.fit_transform(df['UserID'])
df['item'] = item_encoder.fit_transform(df['MovieID'])

# Count the distinct encoded users and items
num_users = df['user'].nunique()
num_items = df['item'].nunique()
print(f"Number of users: {num_users}")
print(f"Number of items: {num_items}")

# Keep only the columns the model consumes
model_columns = ['user', 'item', 'Rating_normalized']
df_model = df[model_columns]

# Preview the modeling frame
print("\nPrepared data for modeling:")
print(df_model.head())


Number of users: 69878
Number of items: 10677

Prepared data for modeling:
   user  item  Rating_normalized
0     0   120                1.0
1     0   183                1.0
2     0   228                1.0
3     0   289                1.0
4     0   313                1.0


In [4]:
# Cell 4: Split the Data into Training, Validation, and Testing Sets

from sklearn.model_selection import train_test_split

# Stage 1: hold out 20% of the rows; the remaining 80% is the training set
train_data, temp_data = train_test_split(
    df_model,
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the held-out rows in half -> validation and testing sets
val_data, test_data = train_test_split(
    temp_data,
    test_size=0.5,
    random_state=42,
)

# Report the size of each partition
print(f"Training data shape: {train_data.shape}")
print(f"Validation data shape: {val_data.shape}")
print(f"Testing data shape: {test_data.shape}")


Training data shape: (8000043, 3)
Validation data shape: (1000005, 3)
Testing data shape: (1000006, 3)


In [5]:
# Cell 5: Create PyTorch Datasets and DataLoaders

class NCFDataset(Dataset):
    """Dataset of (user, item, target) triples backed by three parallel tensors.

    The tensors are stored as given (no copy); sample ``k`` is the ``k``-th
    entry of each tensor, and the dataset length is the first dimension of
    the user tensor.
    """

    def __init__(self, user_tensor, item_tensor, target_tensor):
        self.user_tensor, self.item_tensor, self.target_tensor = (
            user_tensor,
            item_tensor,
            target_tensor,
        )

    def __len__(self):
        # Number of samples == leading dimension of the user tensor
        return self.user_tensor.shape[0]

    def __getitem__(self, index):
        # Pull the same position out of every backing tensor
        columns = (self.user_tensor, self.item_tensor, self.target_tensor)
        return tuple(column[index] for column in columns)

def _frame_to_tensors(frame):
    """Return (users, items, ratings) tensors built from one split DataFrame."""
    return (
        torch.tensor(frame['user'].values, dtype=torch.long),
        torch.tensor(frame['item'].values, dtype=torch.long),
        torch.tensor(frame['Rating_normalized'].values, dtype=torch.float32),
    )

# Turn every split into index / target tensors
train_users, train_items, train_ratings = _frame_to_tensors(train_data)
val_users, val_items, val_ratings = _frame_to_tensors(val_data)
test_users, test_items, test_ratings = _frame_to_tensors(test_data)

# Wrap the tensors in datasets
train_dataset = NCFDataset(train_users, train_items, train_ratings)
val_dataset = NCFDataset(val_users, val_items, val_ratings)
test_dataset = NCFDataset(test_users, test_items, test_ratings)

# Build the loaders; only the training loader shuffles
batch_size = 2048  # Adjust based on memory constraints

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)


In [6]:
# Cell 6: Define the NCF Model with Improvements

class NCF(nn.Module):
    """Neural collaborative filtering scorer: embeddings -> MLP -> sigmoid.

    A user embedding and an item embedding are concatenated and passed through
    a stack of ``Linear -> BatchNorm1d -> LeakyReLU -> Dropout`` groups, then a
    single-unit linear layer whose output is squashed into (0, 1).

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        embedding_dim: Width of each embedding vector.
        hidden_layers: Sizes of the successive hidden Linear layers
            (any iterable of ints; the default is an immutable tuple).
        dropout_rate: Drop probability of the shared Dropout module.
    """

    def __init__(self, num_users, num_items, embedding_dim=128, hidden_layers=(256, 128, 64), dropout_rate=0.2):
        super(NCF, self).__init__()
        # User and Item embeddings
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.item_embedding = nn.Embedding(num_items, embedding_dim)

        # MLP layers. The order of modules appended here determines the
        # ``fc_layers.<idx>`` keys of the state dict, so it must stay stable
        # for previously saved checkpoints to keep loading.
        self.fc_layers = nn.ModuleList()
        self.dropout = nn.Dropout(p=dropout_rate)  # one parameter-free module reused after every group
        input_size = embedding_dim * 2  # user and item embeddings are concatenated
        for hidden_size in hidden_layers:
            self.fc_layers.append(nn.Linear(input_size, hidden_size))
            self.fc_layers.append(nn.BatchNorm1d(hidden_size))
            self.fc_layers.append(nn.LeakyReLU())
            self.fc_layers.append(self.dropout)
            input_size = hidden_size

        # Output layer
        self.output_layer = nn.Linear(input_size, 1)

    def forward(self, user_indices, item_indices):
        """Score (user, item) index pairs.

        Args:
            user_indices: Integer tensor of user indices.
            item_indices: Integer tensor of item indices, same shape as
                ``user_indices``.

        Returns:
            Tensor of sigmoid scores with the same shape as the index tensors
            (shape ``(batch,)`` for 1-D index tensors, including batch size 1).
        """
        # Get embeddings
        user_embedding = self.user_embedding(user_indices)
        item_embedding = self.item_embedding(item_indices)

        # Concatenate user and item embeddings along the feature axis
        vector = torch.cat([user_embedding, item_embedding], dim=-1)

        # Pass through MLP layers
        for layer in self.fc_layers:
            vector = layer(vector)

        # Output layer
        rating = self.output_layer(vector)
        rating = torch.sigmoid(rating)  # Ensure output is between 0 and 1
        # Drop only the trailing singleton feature axis. A bare squeeze() would
        # also remove the batch axis when the batch holds a single sample,
        # yielding a 0-dim tensor whose shape no longer matches the targets.
        return rating.squeeze(-1)


In [7]:
# Cell 7: Initialize the Model, Define Loss Function and Optimizer

# Prefer the GPU whenever PyTorch can see one
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Build the network with its default hyperparameters and move it to the device
model = NCF(num_users, num_items)
model = model.to(device)

# Binary cross-entropy pairs with the model's sigmoid output
loss_function = nn.BCELoss()
optimizer = optim.AdamW(
    model.parameters(),
    lr=0.0005,
    weight_decay=1e-5,
)

# Halve the learning rate when the monitored loss stalls for more than 2 epochs
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    factor=0.5,
    patience=2,
)


Using device: cuda


In [None]:
# Cell 8: Train the Model with Early Stopping

epochs = 25  # Upper bound on passes over the training set
model_save_path = 'ncf_model.pth'

best_val_loss = float('inf')
patience = 3  # Consecutive non-improving epochs tolerated before stopping
trigger_times = 0

for epoch in range(epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")
    for batch_users, batch_items, batch_ratings in progress:
        batch_users, batch_items, batch_ratings = (
            tensor.to(device) for tensor in (batch_users, batch_items, batch_ratings)
        )

        optimizer.zero_grad()
        predictions = model(batch_users, batch_items)
        loss = loss_function(predictions, batch_ratings)
        loss.backward()
        optimizer.step()

        # Weight the batch mean by its size so the epoch figure is a per-sample mean
        total_loss += loss.item() * batch_users.size(0)

    avg_loss = total_loss / len(train_dataset)

    # ---- Validation pass (no gradients, eval-mode layers) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for val_users_batch, val_items_batch, val_ratings_batch in val_loader:
            val_users_batch, val_items_batch, val_ratings_batch = (
                tensor.to(device)
                for tensor in (val_users_batch, val_items_batch, val_ratings_batch)
            )
            val_predictions = model(val_users_batch, val_items_batch)
            batch_loss = loss_function(val_predictions, val_ratings_batch)
            val_loss += batch_loss.item() * val_users_batch.size(0)
    avg_val_loss = val_loss / len(val_dataset)

    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    # Let the plateau scheduler see this epoch's validation loss
    scheduler.step(avg_val_loss)

    # ---- Checkpointing / early stopping ----
    if avg_val_loss < best_val_loss:
        # New best: checkpoint the weights and reset the patience counter
        best_val_loss = avg_val_loss
        torch.save(model.state_dict(), model_save_path)
        trigger_times = 0
        continue

    # No improvement this epoch
    trigger_times += 1
    if trigger_times >= patience:
        print("Early stopping triggered.")
        break


Epoch 1/25: 100%|██████████| 3907/3907 [04:33<00:00, 14.29it/s]


In [None]:
# Cell 9: Load the Best Model for Inference

# Rebuild the architecture with the same constructor arguments used for training
loaded_model = NCF(num_users, num_items).to(device)

# Load the saved state dictionary. map_location remaps the stored tensors onto
# the active device, so a checkpoint written during a CUDA run still loads when
# this cell is executed on a CPU-only machine.
loaded_model.load_state_dict(torch.load(model_save_path, map_location=device))

# Evaluation mode: dropout is disabled and batch norm uses its running statistics
loaded_model.eval()
print("Best model loaded and ready for inference.")


In [None]:
# Cell 10: Evaluate the Loaded Model on the Test Set

loaded_model.eval()  # make sure dropout is off and batch norm uses running statistics
with torch.no_grad():
    test_users = test_users.to(device)
    test_items = test_items.to(device)
    test_ratings = test_ratings.to(device)

    # Score the test set in chunks of `batch_size` rather than in one forward
    # pass: pushing every test row through the MLP at once materialises the
    # full activation tensors and can exhaust device memory.
    # reshape(-1) keeps each chunk 1-D even if the last chunk holds one row.
    predictions = torch.cat([
        loaded_model(user_chunk, item_chunk).reshape(-1)
        for user_chunk, item_chunk in zip(
            test_users.split(batch_size), test_items.split(batch_size)
        )
    ])

    # Errors on the normalized [0, 1] target scale
    mse_loss = nn.MSELoss()
    mse = mse_loss(predictions, test_ratings)
    rmse = torch.sqrt(mse)
    mae = torch.mean(torch.abs(predictions - test_ratings))

    print(f"Test MSE: {mse.item():.4f}")
    print(f"Test RMSE: {rmse.item():.4f}")
    print(f"Test MAE: {mae.item():.4f}")

    # The targets were min-max scaled, so multiplying by the rating span
    # converts RMSE / MAE back into original rating units.
    rating_span = float(max_rating - min_rating)
    print(f"Test RMSE (rating scale): {rmse.item() * rating_span:.4f}")
    print(f"Test MAE (rating scale): {mae.item() * rating_span:.4f}")


In [None]:
# Cell 11: Prepare Data for Ranking Metrics

def _items_by_user(interactions):
    """Map each user index to the set of item indices it appears with in `interactions`."""
    return interactions.groupby('user')['item'].apply(set).to_dict()

# Distinct user / item indices across the whole modeling frame
unique_users = df_model['user'].unique()
unique_items = df_model['item'].unique()

# Items each user rated in the test split
test_user_item_dict = _items_by_user(test_data)

# Items each user rated in the training and validation splits combined
train_val_data = pd.concat([train_data, val_data])
train_user_item_dict = _items_by_user(train_val_data)


In [None]:
# Cell 12: Save the Best Model to the Project Directory

model_save_path = 'C:/Users/anujp/OneDrive/Desktop/MovieRecommendations/models/neural_collaborative_filtering/ncf_model.pth'

# torch.save does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(model_save_path), exist_ok=True)

# Persist the weights restored from the best-validation checkpoint
# (`loaded_model`). `model` holds the weights of the last epoch that ran, which
# after early stopping is `patience` epochs past the best one.
torch.save(loaded_model.state_dict(), model_save_path)
print(f"Model saved to {model_save_path}")
