In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertConfig
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from torch.optim.lr_scheduler import ReduceLROnPlateau

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# ✅ Load MovieLens-1M Data
def load_data():
    # Load ratings
    ratings = pd.read_csv("ml-1m/ratings.dat", sep="::", engine="python",
                          names=["userId", "movieId", "rating", "timestamp"],
                          encoding="utf-8")
    ratings = ratings.sort_values(by=["userId", "timestamp"])

    # Load movies
    movies = pd.read_csv("ml-1m/movies.dat", sep="::", engine="python",
                         names=["movieId", "title", "genres"],
                         encoding="latin-1")  # ML-1M uses latin-1 encoding

    # ✅ Define available genres
    genre_list = [
        "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
        "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
        "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"
    ]
    genre_dict = {genre: idx + 1 for idx, genre in enumerate(genre_list)}

    # ✅ Convert genres into numerical vectors
    movies["genre_vector"] = movies["genres"].apply(
        lambda x: [genre_dict[g] for g in x.split("|") if g in genre_dict]
    )
    movie_dict = movies.set_index("movieId")["genre_vector"].to_dict()

    # ✅ Create user-movie interaction dictionary
    user_movie_dict = ratings.groupby("userId")["movieId"].apply(list).to_dict()

    return user_movie_dict, movie_dict, genre_dict

user_movie_dict, movie_dict, genre_dict = load_data()
print(f"Loaded {len(user_movie_dict)} users' movie interaction sequences")

# ✅ （Negative Sampling）
def negative_sampling(movie_list, vocab_size, num_neg=5):
    neg_samples = []
    for movie in movie_list:
        neg = []
        while len(neg) < num_neg:
            sampled = np.random.randint(1, vocab_size)
            if sampled not in movie_list:
                neg.append(sampled)
        neg_samples.append(neg)
    return neg_samples

def split_train_test_strict(user_movie_dict, test_user_ratio=0.2, min_interactions=5):
    users = list(user_movie_dict.keys())
    np.random.shuffle(users)
    split_idx = int(len(users) * (1 - test_user_ratio))
    train_users = users[:split_idx]
    test_users = users[split_idx:]

    train_dict = {user: user_movie_dict[user] for user in train_users if len(user_movie_dict[user]) >= min_interactions}
    test_dict = {user: user_movie_dict[user] for user in test_users if len(user_movie_dict[user]) >= min_interactions}

    return train_dict, test_dict

train_dict, test_dict = split_train_test_strict(user_movie_dict)
print(f"Train users: {len(train_dict)}, Test users: {len(test_dict)}")

class MovieDataset(Dataset):
    def __init__(self, user_movie_dict, movie_dict, vocab_size, max_len=30, max_genres=5, num_neg=5):
        self.users = list(user_movie_dict.keys())
        self.sequences = [user_movie_dict[user] for user in self.users]
        self.movie_dict = movie_dict
        self.vocab_size = vocab_size
        self.max_len = max_len
        self.max_genres = max_genres
        self.num_neg = num_neg  # Fixed number of negative samples per target

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_ids = sequence[:self.max_len] + [0] * (self.max_len - len(sequence))
        target_ids = input_ids[1:] + [0]

        # ✅ Fixed-shape negative sampling (max_len, num_neg)
        neg_samples = []
        for _ in range(self.max_len):
            neg = []
            while len(neg) < self.num_neg:
                sampled = np.random.randint(1, self.vocab_size)
                if sampled not in sequence:
                    neg.append(sampled)
            neg_samples.append(neg)

        neg_samples = torch.tensor(neg_samples, dtype=torch.long)  # Shape: (max_len, num_neg)

        attention_mask = [1 if id != 0 else 0 for id in input_ids]

        genre_vectors = [self.movie_dict.get(movie, [0]) for movie in input_ids]
        padded_genre_vectors = [g[:self.max_genres] + [0] * (self.max_genres - len(g)) for g in genre_vectors]

        return (
            torch.tensor(input_ids, dtype=torch.long),
            torch.tensor(target_ids, dtype=torch.long),
            neg_samples,  # ✅ Fixed-size tensor (max_len, num_neg)
            torch.tensor(attention_mask, dtype=torch.long),
            torch.tensor(padded_genre_vectors, dtype=torch.long)
        )


vocab_size = max(max(seq) for seq in user_movie_dict.values()) + 1
train_dataset = MovieDataset(train_dict, movie_dict, vocab_size)
train_dataloader = DataLoader(train_dataset, batch_size=32, num_workers=0, shuffle=True)
test_dataset = MovieDataset(test_dict, movie_dict, vocab_size)
test_dataloader = DataLoader(test_dataset, batch_size=32, num_workers=0, shuffle=False)

# ✅ BERT4Rec
class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, genre_size, hidden_size=256, num_layers=4, num_heads=4, max_len=30, dropout_rate=0.2):
        super(BERT4Rec, self).__init__()
        config = BertConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_attention_heads=num_heads,
            num_hidden_layers=num_layers,
            max_position_embeddings=max_len,
        )
        self.bert = BertModel(config)
        self.output_layer = nn.Linear(hidden_size, vocab_size)

        self.genre_embedding = nn.Embedding(genre_size, hidden_size)
        self.genre_fc = nn.Linear(hidden_size, hidden_size)

        self.layernorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids, attention_mask, genre_ids):
        seq_len = input_ids.shape[1]
        causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=input_ids.device), diagonal=1)

        output = self.bert(input_ids=input_ids, attention_mask=attention_mask, encoder_attention_mask=~causal_mask).last_hidden_state
        genre_emb = self.genre_embedding(genre_ids).mean(dim=2)
        genre_emb = self.genre_fc(genre_emb)

        output = self.layernorm(output + genre_emb)
        output = self.dropout(output)

        return self.output_layer(output)

model = BERT4Rec(vocab_size, len(genre_dict) + 1, dropout_rate=0.3).to(device)

# ✅ Early Stopping Implementation
class EarlyStopping:
    def __init__(self, patience=3, delta=0):
        self.patience = patience
        self.delta = delta
        self.best_loss = float("inf")
        self.counter = 0

    def step(self, val_loss):
        if val_loss < self.best_loss - self.delta:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                print("Early stopping triggered.")
                return True
        return False

# ✅ Training with Early Stopping
def train_model(model, dataloader, epochs=10, lr=0.0001):
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
    criterion = nn.BCEWithLogitsLoss()
    scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=2)
    early_stopping = EarlyStopping(patience=3)

    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for inputs, targets, neg_samples, attention_mask, genres in dataloader:
            inputs, targets, neg_samples, attention_mask, genres = (
                inputs.to(device), targets.to(device), neg_samples.to(device),
                attention_mask.to(device), genres.to(device)
            )

            optimizer.zero_grad()
            outputs = model(inputs, attention_mask, genres)

            pos_logits = outputs.gather(2, targets.unsqueeze(-1)).squeeze(-1)
            neg_logits = outputs.gather(2, neg_samples).squeeze(-1)

            pos_loss = criterion(pos_logits, torch.ones_like(pos_logits))
            neg_loss = criterion(neg_logits, torch.zeros_like(neg_logits))
            loss = pos_loss + neg_loss.mean()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_loss += loss.item()

        scheduler.step(total_loss)
        print(f"Epoch {epoch + 1}, Train Loss: {total_loss / len(dataloader):.4f}")

        if early_stopping.step(total_loss):
            break
        

train_model(model, train_dataloader, epochs=10)

Using device: cpu
Loaded 6040 users' movie interaction sequences
Train users: 4832, Test users: 1208
Epoch 1, Train Loss: 1.0553
Epoch 2, Train Loss: 0.8319
Epoch 3, Train Loss: 0.7952
Epoch 4, Train Loss: 0.7655
Epoch 5, Train Loss: 0.7251
Epoch 6, Train Loss: 0.6756
Epoch 7, Train Loss: 0.6245
Epoch 8, Train Loss: 0.5653
Epoch 9, Train Loss: 0.4989
Epoch 10, Train Loss: 0.4092


In [8]:
def recall_at_k(top_k_predictions, targets, k):
    """
    Compute Recall@K: 
    - Measures how many of the relevant items (targets) are in the top K recommendations.

    Args:
        top_k_predictions: (batch_size, seq_len, k) - Top K predicted items.
        targets: (batch_size, seq_len) - True target items.
        k: The number of top items considered.
    
    Returns:
        Average Recall@K across all samples.
    """
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()  # Check if target is in top K
    recall = hits.sum(dim=-1).mean().item()  # Compute recall
    return recall

def ndcg_at_k(top_k_predictions, targets, k):
    """
    Compute NDCG@K:
    - Measures ranking quality of recommendations by discounting correct predictions at later ranks.

    Args:
        top_k_predictions: (batch_size, seq_len, k) - Top K predicted items.
        targets: (batch_size, seq_len) - True target items.
        k: The number of top items considered.

    Returns:
        Average NDCG@K across all samples.
    """
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()
    log_positions = 1 / torch.log2(torch.arange(2, k + 2, device=targets.device).float())  # Discount factor
    dcg = (hits * log_positions).sum(dim=-1).mean().item()
    return dcg

# Define Evaluation Function
def evaluate_model(model, dataloader, k=10):
    model.eval()
    total_loss, total_recall, total_ndcg = 0, 0, 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, targets, _, attention_mask, genres in dataloader:
            inputs, targets, attention_mask, genres = inputs.to(device), targets.to(device), attention_mask.to(device), genres.to(device)

            outputs = model(inputs, attention_mask, genres)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
            total_loss += loss.item()

            _, top_k_predictions = torch.topk(outputs, k, dim=-1)
            recall = recall_at_k(top_k_predictions, targets, k)
            ndcg = ndcg_at_k(top_k_predictions, targets, k)

            total_recall += recall
            total_ndcg += ndcg

    print(f"Test Loss: {total_loss / len(dataloader):.4f}, Recall@{k}: {total_recall / len(dataloader):.4f}, NDCG@{k}: {total_ndcg / len(dataloader):.4f}")

# Evaluate Model
evaluate_model(model, test_dataloader, k=10)

Test Loss: 5.6626, Recall@10: 0.2270, NDCG@10: 0.1253
