# BERT4Rec UP - V2

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertConfig
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Ensure compatibility with Jupyter Notebook
%matplotlib inline

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [17]:
# ✅ Load MovieLens-1M Data (Users, Ratings, Movies)
def load_ratings(filepath="ml-1m/ratings.dat"):
    df = pd.read_csv(filepath, sep="::", engine="python",
                     names=["userId", "movieId", "rating", "timestamp"],
                     encoding="utf-8")
    df = df.sort_values(by=["userId", "timestamp"])
    user_movie_dict = df.groupby("userId")["movieId"].apply(list).to_dict()
    return user_movie_dict, df

def load_users(filepath="ml-1m/users.dat"):
    user_df = pd.read_csv(filepath, sep="::", engine="python",
                           names=["userId", "gender", "age", "occupation", "zip-code"],
                           encoding="utf-8")

    gender_map = {"M": 0, "F": 1}
    user_df["gender"] = user_df["gender"].map(gender_map)

    age_groups = {1: 0, 18: 1, 25: 2, 35: 3, 45: 4, 50: 5, 56: 6}
    user_df["age"] = user_df["age"].map(age_groups)

    users_dict = user_df.set_index("userId")[["gender", "age", "occupation"]].to_dict("index")
    return users_dict

def load_movies(filepath="ml-1m/movies.dat"):
    movies = pd.read_csv(filepath, sep="::", engine="python",
                         names=["movieId", "title", "genres"], encoding="latin-1")

    genre_list = [
        "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    genre_dict = {genre: i + 1 for i, genre in enumerate(genre_list)}

    movies["genre_vector"] = movies["genres"].apply(lambda x: [genre_dict[g] for g in x.split("|") if g in genre_dict])
    movie_dict = movies.set_index("movieId")["genre_vector"].to_dict()

    return movie_dict, genre_dict

# ✅ Load datasets
user_movie_dict, ratings_df = load_ratings()
users_dict = load_users()
movie_dict, genre_dict = load_movies()

print(f"Loaded {len(user_movie_dict)} users' movie interaction sequences.")

# ✅ Strict Train/Test Split
def split_train_test_strict(user_movie_dict, test_user_ratio=0.2, min_interactions=5):
    users = list(user_movie_dict.keys())
    np.random.shuffle(users)
    split_idx = int(len(users) * (1 - test_user_ratio))
    train_users = users[:split_idx]
    test_users = users[split_idx:]

    train_dict = {user: user_movie_dict[user] for user in train_users if len(user_movie_dict[user]) >= min_interactions}
    test_dict = {user: user_movie_dict[user] for user in test_users if len(user_movie_dict[user]) >= min_interactions}

    return train_dict, test_dict

train_dict, test_dict = split_train_test_strict(user_movie_dict)
print(f"Train users: {len(train_dict)}, Test users: {len(test_dict)}")


# ✅ Negative Sampling
def negative_sampling(movie_list, vocab_size, num_neg=2, max_len=30):
    neg_samples = []
    for _ in range(max_len):
        neg = []
        while len(neg) < num_neg:
            sampled = np.random.randint(1, vocab_size)
            if sampled not in movie_list:
                neg.append(sampled)
        neg_samples.append(neg)
    return neg_samples  # Shape: (max_len, num_neg)

# ✅ Define vocab_size
vocab_size = max(max(seq) for seq in user_movie_dict.values()) + 1
    
class MovieDataset(Dataset):
    def __init__(self, user_movie_dict, users_dict, movie_dict, vocab_size, max_len=30, num_neg=2, max_genres=5):
        self.users = list(user_movie_dict.keys())
        self.sequences = [user_movie_dict[user] for user in self.users]
        self.user_profiles = [users_dict[user] for user in self.users]
        self.movie_dict = movie_dict
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.num_neg = num_neg
        self.max_genres = max_genres

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        user_profile = self.user_profiles[idx]

        gender, age, occupation = (
            user_profile["gender"], user_profile["age"], user_profile["occupation"]
        )

        input_ids = sequence[:self.max_len] + [0] * (self.max_len - len(sequence))
        target_ids = input_ids[1:] + [0]
        attention_mask = [1 if id != 0 else 0 for id in input_ids]

        # ✅ 外部调用negative_sampling
        neg_samples = negative_sampling(sequence, self.vocab_size, self.num_neg, self.max_len)
        neg_samples = torch.tensor(neg_samples, dtype=torch.long)

        genre_vectors = [self.movie_dict.get(movie, [0]) for movie in input_ids]
        padded_genre_vectors = [g[:self.max_genres] + [0] * (self.max_genres - len(g)) for g in genre_vectors]
        genre_tensor = torch.tensor(padded_genre_vectors, dtype=torch.long)

        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(target_ids, dtype=torch.long),
            neg_samples,
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(gender, dtype=torch.long),
            torch.tensor(age, dtype=torch.long),
            torch.tensor(occupation, dtype=torch.long),
            genre_tensor
        )

# ✅ Update DataLoaders with `genre_ids`
train_dataset = MovieDataset(train_dict, users_dict, movie_dict, vocab_size)
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_dataset = MovieDataset(test_dict, users_dict, movie_dict, vocab_size)
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False)


class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, num_genres, hidden_size=256, num_layers=4, num_heads=4, max_len=30, dropout_rate=0.2):
        super(BERT4Rec, self).__init__()
        config = BertConfig(
            vocab_size=vocab_size, hidden_size=hidden_size, num_attention_heads=num_heads,
            num_hidden_layers=num_layers, max_position_embeddings=max_len
        )
        self.bert = BertModel(config)
        split_size = hidden_size // 3  # Adjusted due to removing activity embedding
        self.gender_embedding = nn.Embedding(2, split_size)
        self.age_embedding = nn.Embedding(7, split_size)
        self.occupation_embedding = nn.Embedding(21, split_size)
        self.genre_embedding = nn.Embedding(num_genres, hidden_size)
        self.genre_fc = nn.Linear(hidden_size, hidden_size)
        self.user_fc = nn.Linear(split_size * 3, hidden_size)
        self.user_dropout = nn.Dropout(dropout_rate)
        self.final_dropout = nn.Dropout(dropout_rate)
        self.output_layer = nn.Linear(hidden_size, vocab_size)
    
    def forward(self, input_ids, attention_mask, gender, age, occupation, genre_ids):
        batch_size, seq_len = input_ids.shape
        causal_mask = torch.triu(torch.ones((seq_len, seq_len), dtype=torch.bool, device=input_ids.device), diagonal=1)
        transformer_output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=~causal_mask
        ).last_hidden_state
        genre_emb = self.genre_embedding(genre_ids).mean(dim=2)
        genre_emb = self.genre_fc(genre_emb)
        user_emb = self.user_fc(torch.cat([
            self.gender_embedding(gender).unsqueeze(1).expand(-1, seq_len, -1),
            self.age_embedding(age).unsqueeze(1).expand(-1, seq_len, -1),
            self.occupation_embedding(occupation).unsqueeze(1).expand(-1, seq_len, -1)
        ], dim=-1))
        user_emb = self.user_dropout(user_emb)
        output = transformer_output + user_emb + genre_emb
        return self.output_layer(self.final_dropout(output))

vocab_size = max(max(seq) for seq in user_movie_dict.values()) + 1
model = BERT4Rec(vocab_size, len(genre_dict) + 1, dropout_rate=0.4).to(device)
print(f"Model initialized with vocab size {vocab_size}")

from torch.optim.lr_scheduler import ReduceLROnPlateau
# ✅ Early Stopping Implementation
class EarlyStopping:
    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float("inf")
        self.counter = 0

    def step(self, val_loss):
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                print("Early stopping triggered.")
                return True
        return False

# ✅ Updated Training Function
def train_model(model, dataloader, epochs=10, lr=0.0001):
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion_ce = nn.CrossEntropyLoss(ignore_index=0)  # Used for positive samples
    criterion_bce = nn.BCEWithLogitsLoss()  # Used for negative sampling
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    early_stopping = EarlyStopping(patience=3)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, targets, neg_samples, attention_mask, gender, age, occupation, genre_ids in dataloader:
            inputs, targets, neg_samples, attention_mask, gender, age, occupation, genre_ids = (
                inputs.to(device), targets.to(device), neg_samples.to(device),
                attention_mask.to(device), gender.to(device), age.to(device),
                occupation.to(device),
                genre_ids.to(device)
            )

            optimizer.zero_grad()
            outputs = model(inputs, attention_mask, gender, age, occupation, genre_ids)  
            # ✅ outputs.shape = [batch_size, seq_len, vocab_size]

            # ✅ Compute Positive Sample Loss (CrossEntropy)
            pos_loss = criterion_ce(outputs.view(-1, outputs.shape[-1]), targets.view(-1))  # Targets must be long integers

            # ✅ Compute Negative Sample Loss (Binary Classification)
            neg_logits = outputs.gather(2, neg_samples).squeeze(-1)  # Extract negative logits
            neg_labels = torch.zeros_like(neg_logits)  # Label negative samples as "0"
            neg_loss = criterion_bce(neg_logits, neg_labels.float())  # BCE expects float targets

            # ✅ Compute Final Loss
            loss = pos_loss + neg_loss.mean()

            # ✅ Backpropagation
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

        # ✅ Learning Rate Scheduling
        scheduler.step(total_loss)
        print(f"Epoch {epoch + 1}, Train Loss: {total_loss / len(dataloader):.4f}")

        # ✅ Early Stopping
        if early_stopping.step(total_loss):
            break

# ✅ Train the Model
train_model(model, train_dataloader, epochs=10, lr=0.0001)

Loaded 6040 users' movie interaction sequences.
Train users: 4832, Test users: 1208
Model initialized with vocab size 3953
Epoch 1, Train Loss: 8.1018
Epoch 2, Train Loss: 7.3164
Epoch 3, Train Loss: 6.8969
Epoch 4, Train Loss: 6.4634
Epoch 5, Train Loss: 5.6522
Epoch 6, Train Loss: 4.0472
Epoch 7, Train Loss: 2.7484
Epoch 8, Train Loss: 1.8936
Epoch 9, Train Loss: 1.3391
Epoch 10, Train Loss: 0.9695


In [18]:
# Define Evaluation Function
def evaluate_model(model, dataloader, k=10):
    """
    Evaluate the BERT4Rec model on test data.

    Args:
        model: The trained BERT4Rec model.
        dataloader: Test DataLoader.
        k: Top-K predictions to consider for Recall and NDCG.

    Returns:
        None
    """
    model.eval()
    total_loss, total_recall, total_ndcg = 0, 0, 0
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    with torch.no_grad():
        for batch in dataloader:
            # Unpack batch
            inputs, targets, neg_samples, attention_mask, gender, age, occupation, genres = batch

            # Move to device
            inputs, targets, attention_mask = inputs.to(device), targets.to(device), attention_mask.to(device)
            gender, age, occupation, genres = (
                gender.to(device), age.to(device), occupation.to(device),
                genres.to(device)
            )

            # Forward pass
            outputs = model(inputs, attention_mask, gender, age, occupation, genres)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
            total_loss += loss.item()

            # Compute Recall@K & NDCG@K
            _, top_k_predictions = torch.topk(outputs, k, dim=-1)  # Get top K movie predictions
            recall = recall_at_k(top_k_predictions, targets, k)
            ndcg = ndcg_at_k(top_k_predictions, targets, k)

            total_recall += recall
            total_ndcg += ndcg

    avg_loss = total_loss / len(dataloader)
    avg_recall = total_recall / len(dataloader)
    avg_ndcg = total_ndcg / len(dataloader)

    print(f"Test Loss: {avg_loss:.4f}, Recall@{k}: {avg_recall:.4f}, NDCG@{k}: {avg_ndcg:.4f}")

# Compute Recall@K
def recall_at_k(top_k_predictions, targets, k):
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()  # Check if target is in top K
    recall = hits.sum(dim=-1).mean().item()  # Compute recall
    return recall

# Compute NDCG@K
def ndcg_at_k(top_k_predictions, targets, k):
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()
    log_positions = 1 / torch.log2(torch.arange(2, k + 2, device=targets.device).float())  # Discount factor
    dcg = (hits * log_positions).sum(dim=-1).mean().item()
    return dcg

# Evaluate Model
evaluate_model(model, test_dataloader, k=10)

Test Loss: 0.5159, Recall@10: 0.9222, NDCG@10: 0.9116
