# BERT4Rec-based recommendation system

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertConfig
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Ensure compatibility with Jupyter Notebook
%matplotlib inline

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [2]:
# Function to load the MovieLens dataset
def load_data(filepath="ml-1m/ratings.dat"):
    df = pd.read_csv(filepath, sep="::", engine="python", 
                     names=["userId", "movieId", "rating", "timestamp"])
    df = df.sort_values(by=["userId", "timestamp"])  # Sort by user and timestamp
    user_movie_dict = df.groupby("userId")["movieId"].apply(list).to_dict()
    return user_movie_dict

# Load dataset
user_movie_dict = load_data()
print(f"Loaded {len(user_movie_dict)} users' movie interaction sequences")

# Print a sample of user-movie interactions for debugging
for user, movies in list(user_movie_dict.items())[:3]:
    print(f"User {user}: {movies}")

Loaded 6040 users' movie interaction sequences
User 1: [3186, 1270, 1721, 1022, 2340, 1836, 3408, 2804, 1207, 1193, 720, 260, 919, 608, 2692, 1961, 2028, 3105, 938, 1035, 1962, 2018, 150, 1028, 1097, 914, 1287, 2797, 2762, 1246, 661, 2918, 531, 3114, 2791, 2321, 1029, 1197, 594, 2398, 1545, 527, 595, 2687, 745, 588, 1, 2355, 2294, 783, 1566, 1907, 48]
User 2: [1198, 1210, 1217, 2717, 1293, 2943, 1225, 1193, 318, 3030, 2858, 1213, 1945, 1207, 593, 3095, 3468, 1873, 515, 1090, 2501, 3035, 110, 2067, 3147, 1247, 3105, 1357, 1196, 1957, 1953, 920, 1834, 1084, 1962, 3471, 3654, 3735, 1259, 1954, 1784, 2728, 1968, 1103, 902, 3451, 3578, 2852, 3334, 3068, 265, 2312, 590, 1253, 3071, 1244, 3699, 1955, 1245, 2236, 3678, 982, 2194, 2268, 1442, 3255, 647, 235, 1096, 1124, 498, 1246, 3893, 1537, 1188, 2396, 2359, 2321, 356, 3108, 1265, 3809, 589, 2028, 2571, 457, 2916, 1610, 480, 163, 380, 3418, 3256, 1408, 21, 349, 1527, 2353, 2006, 2278, 1370, 648, 2427, 1792, 1372, 1552, 2490, 1385, 780, 2881, 

In [3]:
# Function to split user interactions into train and test sets
def split_train_test(user_movie_dict, test_ratio=0.2, min_interactions=5):
    train_dict, test_dict = {}, {}

    for user, movies in user_movie_dict.items():
        if len(movies) >= min_interactions:  # Only split users with enough data
            split_idx = int(len(movies) * (1 - test_ratio))
            train_dict[user] = movies[:split_idx]
            test_dict[user] = movies[split_idx:]
        else:
            train_dict[user] = movies  # Assign all to train if only a few interactions
    
    return train_dict, test_dict

# Apply train-test split with filtering
train_dict, test_dict = split_train_test(user_movie_dict, test_ratio=0.2, min_interactions=5)

# Print updated user counts
print(f"Train users: {len(train_dict)}, Test users: {len(test_dict)}")

Train users: 6040, Test users: 6040


In [4]:
# Define Dataset for Training
class MovieDataset(Dataset):
    def __init__(self, user_movie_dict, max_len=50):
        self.users = list(user_movie_dict.keys())
        self.sequences = [user_movie_dict[user] for user in self.users]
        self.max_len = max_len

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_ids = sequence[:self.max_len] + [0] * (self.max_len - len(sequence))  # Padding
        target_ids = input_ids[1:] + [0]  # Next-movie prediction
        attention_mask = [1 if id != 0 else 0 for id in input_ids]  # Attention mask
        return torch.tensor(input_ids), torch.tensor(target_ids), torch.tensor(attention_mask)

# Create Train Dataset & DataLoader
train_dataset = MovieDataset(train_dict, max_len=50)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Create Test Dataset & DataLoader
test_dataset = MovieDataset(test_dict, max_len=50)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)

print(f"Train Dataset: {len(train_dataset)}, Test Dataset: {len(test_dataset)}")

Train Dataset: 6040, Test Dataset: 6040


In [5]:
# Transformer-based BERT4Rec Model
class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, hidden_size=256, num_layers=4, num_heads=4, max_len=50):
        super(BERT4Rec, self).__init__()
        config = BertConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_attention_heads=num_heads,
            num_hidden_layers=num_layers,
            max_position_embeddings=max_len,
        )
        self.bert = BertModel(config)
        self.output_layer = nn.Linear(hidden_size, vocab_size)  # Output layer to predict next movie ID

    def forward(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
        return self.output_layer(output)

# Initialize Model
vocab_size = max(max(seq) for seq in user_movie_dict.values()) + 1  # Get max movie ID as vocab size
model = BERT4Rec(vocab_size).to(device)

print(f"Initialized BERT4Rec model with vocab size {vocab_size}")

Initialized BERT4Rec model with vocab size 3953


In [6]:
# Define Training Function
def train_model(model, dataloader, epochs=3, lr=0.001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()
    model.train()
    
    for epoch in range(epochs):
        total_loss = 0
        for inputs, targets, attention_mask in dataloader:
            inputs, targets, attention_mask = inputs.to(device), targets.to(device), attention_mask.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        
        print(f"Epoch {epoch + 1}, Train Loss: {total_loss / len(dataloader)}")

# Train on Train DataLoader
train_model(model, train_dataloader)


Epoch 1, Train Loss: 6.129596157679482
Epoch 2, Train Loss: 5.767031051494457
Epoch 3, Train Loss: 4.2274885593898714


In [9]:
# Define Evaluation Function
def evaluate_model(model, dataloader, k=10):
    model.eval()
    total_loss = 0
    total_recall = 0
    total_ndcg = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, targets, attention_mask in dataloader:
            inputs, targets, attention_mask = inputs.to(device), targets.to(device), attention_mask.to(device)
            
            outputs = model(inputs, attention_mask)  # (batch_size, seq_len, vocab_size)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
            total_loss += loss.item()

            # Compute Recall@K & NDCG@K
            _, top_k_predictions = torch.topk(outputs, k, dim=-1)  # Get top K movie predictions
            recall = recall_at_k(top_k_predictions, targets, k)
            ndcg = ndcg_at_k(top_k_predictions, targets, k)

            total_recall += recall
            total_ndcg += ndcg

    avg_loss = total_loss / len(dataloader)
    avg_recall = total_recall / len(dataloader)
    avg_ndcg = total_ndcg / len(dataloader)

    print(f"Test Loss: {avg_loss:.4f}, Recall@{k}: {avg_recall:.4f}, NDCG@{k}: {avg_ndcg:.4f}")
    return avg_loss, avg_recall, avg_ndcg

# Compute Recall@K
def recall_at_k(top_k_predictions, targets, k):
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()  # Check if target is in top K
    recall = hits.sum(dim=-1).mean().item()  # Compute recall
    return recall

# Compute NDCG@K
def ndcg_at_k(top_k_predictions, targets, k):
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()
    log_positions = 1 / torch.log2(torch.arange(2, k + 2, device=targets.device).float())  # Discount factor
    dcg = (hits * log_positions).sum(dim=-1).mean().item()
    return dcg

In [10]:
# Evaluate on Test DataLoader
test_loss, recall_k, ndcg_k = evaluate_model(model, test_dataloader, k=10)

Test Loss: 2.4125, Recall@10: 0.7370, NDCG@10: 0.6553
