# BERT4Rec-based recommendation system

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertModel, BertConfig
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Ensure compatibility with Jupyter Notebook
%matplotlib inline

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [2]:
import os
from google.colab import drive
drive.mount('/content/drive')
folder_path='/content/drive/MyDrive/CS247/Final Project'
os.chdir(folder_path)
# Function to load the MovieLens dataset


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Data Processing


In [3]:
def load_data_bert(filepath="ml-1m/ratings.dat"):
    df = pd.read_csv(filepath, sep="::", engine="python",
                     names=["userId", "movieId", "rating", "timestamp"])
    df = df.sort_values(by=["userId", "timestamp"])  # Sort by user and timestamp
    user_movie_dict = df.groupby("userId")["movieId"].apply(list).to_dict()
    return user_movie_dict

def load_data_cf(filepath="ml-1m/ratings.dat"):
    df = pd.read_csv(filepath, sep="::", engine="python",
                     names=["userId", "movieId", "rating", "timestamp"])
    df = df.drop(columns=["timestamp"])  # Remove timestamp for CF
    return df


# Load dataset -CF
ratings_df = load_data_cf()
user_ids = {user: idx for idx, user in enumerate(ratings_df["userId"].unique())}
movie_ids = {movie: idx for idx, movie in enumerate(ratings_df["movieId"].unique())}


# Load dataset - BERT
user_movie_dict = load_data_bert()

# Apply the same user/movie mapping to BERT4Rec data vs CF

user_movie_dict = {
    user_ids[user]: [movie_ids[m] for m in movies if m in movie_ids]  # Ensure movies exist in mapping
    for user, movies in user_movie_dict.items() if user in user_ids  # Ensure users exist in mapping
}

print(f"Loaded {len(user_movie_dict)} users' movie interaction sequences")

# Print a sample of user-movie interactions for debugging
for user, movies in list(user_movie_dict.items())[:3]:
    print(f"User {user}: {movies}")

# Function to split user interactions into train and test sets
def split_train_test(user_movie_dict, test_ratio=0.2, min_interactions=5):
    train_dict, test_dict = {}, {}

    for user, movies in user_movie_dict.items():
        if len(movies) >= min_interactions:  # Only split users with enough data
            split_idx = int(len(movies) * (1 - test_ratio))
            train_dict[user] = movies[:split_idx]
            test_dict[user] = movies[split_idx:]
        else:
            train_dict[user] = movies  # Assign all to train if only a few interactions

    return train_dict, test_dict

# Apply train-test split with filtering
train_dict, test_dict = split_train_test(user_movie_dict, test_ratio=0.2, min_interactions=5)
# Define Dataset for Training
class MovieDataset(Dataset):
    def __init__(self, user_movie_dict, max_len=50):
        self.users = list(user_movie_dict.keys())
        self.sequences = [user_movie_dict[user] for user in self.users]
        self.max_len = max_len

    def __len__(self):
        return len(self.users)

    def __getitem__(self, idx):
        sequence = self.sequences[idx]
        input_ids = sequence[:self.max_len] + [0] * (self.max_len - len(sequence))  # Padding
        target_ids = input_ids[1:] + [0]  # Next-movie prediction
        attention_mask = [1 if id != 0 else 0 for id in input_ids]  # Attention mask
        return torch.tensor(input_ids), torch.tensor(target_ids), torch.tensor(attention_mask)

# Create Train Dataset & DataLoader
train_dataset = MovieDataset(train_dict, max_len=30)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# Create Test Dataset & DataLoader
test_dataset = MovieDataset(test_dict, max_len=30)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Train-test split (80% train, 20% test)
train_data_cf, test_data_cf = train_test_split(ratings_df, test_size=0.2, random_state=42)
print(f"Train size: {len(train_data_cf)}, Test size: {len(test_data_cf)}")

# Encode users and movies into unique indices
user_ids = {user: idx for idx, user in enumerate(ratings_df["userId"].unique())}
movie_ids = {movie: idx for idx, movie in enumerate(ratings_df["movieId"].unique())}

# Map users and movies to indices
ratings_df["userId"] = ratings_df["userId"].map(user_ids)
ratings_df["movieId"] = ratings_df["movieId"].map(movie_ids)

print(f"user id size: {len(user_ids)}; move id size {len(movie_ids)}")
# print(len(movie_ids))
# Train-test split (80% train, 20% test)
train_data_cf, test_data_cf = train_test_split(ratings_df, test_size=0.2, random_state=42)
print(f"Train size: {len(train_data_cf)}, Test size: {len(test_data_cf)}")


Loaded 6040 users' movie interaction sequences
User 0: [31, 22, 27, 37, 24, 36, 3, 7, 47, 0, 21, 44, 9, 51, 43, 41, 48, 18, 11, 14, 42, 17, 39, 45, 26, 2, 6, 19, 38, 52, 1, 13, 49, 50, 15, 20, 46, 5, 8, 12, 28, 23, 10, 16, 29, 33, 40, 4, 30, 35, 32, 34, 25]
User 1: [127, 64, 71, 131, 87, 165, 105, 0, 167, 70, 104, 67, 101, 47, 128, 159, 63, 103, 106, 147, 149, 76, 92, 115, 85, 174, 18, 53, 124, 143, 136, 153, 95, 111, 42, 96, 99, 135, 84, 139, 164, 162, 168, 61, 82, 158, 68, 100, 121, 54, 109, 108, 123, 77, 81, 170, 112, 140, 172, 80, 169, 102, 57, 59, 114, 89, 56, 141, 152, 142, 161, 52, 173, 55, 88, 134, 129, 20, 171, 75, 116, 138, 97, 48, 132, 151, 62, 78, 113, 144, 148, 156, 90, 110, 145, 150, 155, 120, 166, 93, 117, 58, 122, 65, 119, 125, 94, 157, 160, 69, 74, 83, 118, 146, 154, 107, 137, 60, 98, 91, 126, 163, 73, 79, 130, 66, 72, 86, 133]
User 2: [128, 104, 178, 168, 181, 41, 194, 196, 197, 203, 123, 44, 124, 204, 5, 127, 201, 64, 186, 188, 189, 192, 113, 185, 187, 195, 166, 200,

In [4]:

# Print updated user counts
# print(f"Train users: {len(train_dict)}, Test users: {len(test_dict)}")

In [5]:


# print(f"Train Dataset: {len(train_dataset)}, Test Dataset: {len(test_dataset)}")


In [6]:
#CF Data Processing
#data alignment ---> make sure the user id and test / train dataset are the same
import pandas as pd

# Create train_data_cf using train_dict
train_data_cf = []
for user, movies in train_dict.items():
    for movie in movies:
        train_data_cf.append((user, movie))  # No ratings, just user-item interactions

train_data_cf = pd.DataFrame(train_data_cf, columns=["userId", "movieId"])

# Create test_data_cf using test_dict
test_data_cf = []
for user, movies in test_dict.items():
    for movie in movies:
        test_data_cf.append((user, movie))

test_data_cf = pd.DataFrame(test_data_cf, columns=["userId", "movieId"])
# Merge ratings into CF train and test sets
train_data_cf = train_data_cf.merge(ratings_df, on=["userId", "movieId"], how="left")
test_data_cf = test_data_cf.merge(ratings_df, on=["userId", "movieId"], how="left")

# Drop any rows where ratings are missing
train_data_cf = train_data_cf.dropna().astype({"rating": "int"})
test_data_cf = test_data_cf.dropna().astype({"rating": "int"})

print(train_data_cf.head())  # Verify the train dataset
print(test_data_cf.head())   # Verify the test dataset

#---------------------
# verify index alignment
#---------------------
# Print first few users from BERT's train_dict
print("First 5 Users in BERT Train Set:")
for user, movies in list(train_dict.items())[:5]:
    print(f"User {user}: {movies}")

# Print first few users from CF train set
print("\nFirst 5 Users in CF Train Set:")
for user in train_data_cf["userId"].unique()[:5]:
    user_movies = train_data_cf[train_data_cf["userId"] == user]["movieId"].tolist()
    print(f"User {user}: {user_movies}")


   userId  movieId  rating
0       0       31       4
1       0       22       5
2       0       27       4
3       0       37       5
4       0       24       3
   userId  movieId  rating
0       0       10       5
1       0       16       3
2       0       29       3
3       0       33       4
4       0       40       5
First 5 Users in BERT Train Set:
User 0: [31, 22, 27, 37, 24, 36, 3, 7, 47, 0, 21, 44, 9, 51, 43, 41, 48, 18, 11, 14, 42, 17, 39, 45, 26, 2, 6, 19, 38, 52, 1, 13, 49, 50, 15, 20, 46, 5, 8, 12, 28, 23]
User 1: [127, 64, 71, 131, 87, 165, 105, 0, 167, 70, 104, 67, 101, 47, 128, 159, 63, 103, 106, 147, 149, 76, 92, 115, 85, 174, 18, 53, 124, 143, 136, 153, 95, 111, 42, 96, 99, 135, 84, 139, 164, 162, 168, 61, 82, 158, 68, 100, 121, 54, 109, 108, 123, 77, 81, 170, 112, 140, 172, 80, 169, 102, 57, 59, 114, 89, 56, 141, 152, 142, 161, 52, 173, 55, 88, 134, 129, 20, 171, 75, 116, 138, 97, 48, 132, 151, 62, 78, 113, 144, 148, 156, 90, 110, 145, 150, 155, 120, 166, 93, 117, 58

## Bert Model Training

In [7]:
# Transformer-based BERT4Rec Model
# class BERT4Rec(nn.Module):
#     def __init__(self, vocab_size, hidden_size=256, num_layers=4, num_heads=4, max_len=50):
#         super(BERT4Rec, self).__init__()
#         config = BertConfig(
#             vocab_size=vocab_size,
#             hidden_size=hidden_size,
#             num_attention_heads=num_heads,
#             num_hidden_layers=num_layers,
#             max_position_embeddings=max_len,
#         )
#         self.bert = BertModel(config)
#         self.output_layer = nn.Linear(hidden_size, vocab_size)  # Output layer to predict next movie ID

#     def forward(self, input_ids, attention_mask):
#         output = self.bert(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state
#         return self.output_layer(output)


# class BERT4Rec(nn.Module):
#     def __init__(self, vocab_size, hidden_size=256, num_layers=4, num_heads=4, max_len=50):
#         super(BERT4Rec, self).__init__()
#         config = BertConfig(
#             vocab_size=vocab_size,
#             hidden_size=hidden_size,
#             num_attention_heads=num_heads,
#             num_hidden_layers=num_layers,
#             max_position_embeddings=max_len,
#         )
#         self.max_len = max_len
#         self.bert = BertModel(config)
#         self.output_layer = nn.Linear(hidden_size, vocab_size)  # Output layer to predict next movie ID

#     def get_causal_mask(self, seq_len, device):
#        return torch.tril(torch.ones((seq_len, seq_len), device=device))

#     def forward(self, input_ids, attention_mask):
#         batch_size, seq_len = input_ids.shape
#         causal_mask = self.get_causal_mask(seq_len, input_ids.device)
#         extended_attention_mask = attention_mask.unsqueeze(1) * causal_mask
#         output = self.bert(input_ids=input_ids, attention_mask=extended_attention_mask).last_hidden_state
#         return self.output_layer(output)

# Transformer-based BERT4Rec Model
class BERT4Rec(nn.Module):
    def __init__(self, vocab_size, hidden_size=256, num_layers=4, num_heads=4, max_len=30, dropout_rate=0.2):
        super(BERT4Rec, self).__init__()
        config = BertConfig(
            vocab_size=vocab_size,
            hidden_size=hidden_size,
            num_attention_heads=num_heads,
            num_hidden_layers=num_layers,
            max_position_embeddings=max_len,
        )
        self.bert = BertModel(config)
        self.max_len= max_len
        self.layernorm = nn.LayerNorm(hidden_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.output_layer = nn.Linear(hidden_size, vocab_size)  # Output layer to predict next movie ID

    def forward(self, input_ids, attention_mask):
        batch_size, seq_len = input_ids.shape
        causal_mask = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool, device=input_ids.device), diagonal=1)
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            encoder_attention_mask=~causal_mask  # PyTorch requires masked positions to be False
        ).last_hidden_state
        output = self.layernorm(output)
        output = self.dropout(output)
        return self.output_layer(output)

# Initialize Model
vocab_size = max(max(seq) for seq in user_movie_dict.values()) + 1  # Get max movie ID as vocab size
bert_model = BERT4Rec(vocab_size).to(device)

print(f"Initialized BERT4Rec model with vocab size {vocab_size}")

Initialized BERT4Rec model with vocab size 3706


In [8]:
# Define Training Function
def train_model(model, dataloader, epochs=3, lr=0.0001):
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # criterion = nn.CrossEntropyLoss()
    criterion = nn.CrossEntropyLoss(ignore_index=0)
    model.train()

    for epoch in range(epochs):
        total_loss = 0
        for inputs, targets, attention_mask in dataloader:
            inputs, targets, attention_mask = inputs.to(device), targets.to(device), attention_mask.to(device)
            optimizer.zero_grad()
            outputs = model(inputs, attention_mask)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        print(f"Epoch {epoch + 1}, Train Loss: {total_loss / len(dataloader)}")

# Train on Train DataLoader
train_model(bert_model, train_dataloader, epochs=100, lr= 0.0001)

Epoch 1, Train Loss: 7.502762086767899
Epoch 2, Train Loss: 6.94595419231214
Epoch 3, Train Loss: 6.604208213404605
Epoch 4, Train Loss: 6.004741668701172
Epoch 5, Train Loss: 4.495345012765181
Epoch 6, Train Loss: 2.8925968747389943
Epoch 7, Train Loss: 1.9011440942161961
Epoch 8, Train Loss: 1.29327277447048
Epoch 9, Train Loss: 0.9136895462086326
Epoch 10, Train Loss: 0.6700733046782644
Epoch 11, Train Loss: 0.5056561818248347
Epoch 12, Train Loss: 0.3912417957657262
Epoch 13, Train Loss: 0.30873130214841743
Epoch 14, Train Loss: 0.24656713165734945
Epoch 15, Train Loss: 0.2011746086572346
Epoch 16, Train Loss: 0.1656410979597192
Epoch 17, Train Loss: 0.13772009383690983
Epoch 18, Train Loss: 0.11604685603003753
Epoch 19, Train Loss: 0.0985638273389716
Epoch 20, Train Loss: 0.08413806534127186
Epoch 21, Train Loss: 0.07300800053696883
Epoch 22, Train Loss: 0.06318211618222688
Epoch 23, Train Loss: 0.055078628306326115
Epoch 24, Train Loss: 0.04818023580469583
Epoch 25, Train Loss: 0

In [9]:
# Define Evaluation Function
def evaluate_model(model, dataloader, k=10):
    model.eval()
    total_loss = 0
    total_recall = 0
    total_ndcg = 0
    criterion = nn.CrossEntropyLoss()

    with torch.no_grad():
        for inputs, targets, attention_mask in dataloader:
            inputs, targets, attention_mask = inputs.to(device), targets.to(device), attention_mask.to(device)

            outputs = model(inputs, attention_mask)  # (batch_size, seq_len, vocab_size)
            loss = criterion(outputs.view(-1, outputs.shape[-1]), targets.view(-1))
            total_loss += loss.item()

            # Compute Recall@K & NDCG@K
            _, top_k_predictions = torch.topk(outputs, k, dim=-1)  # Get top K movie predictions
            recall = recall_at_k(top_k_predictions, targets, k)
            ndcg = ndcg_at_k(top_k_predictions, targets, k)

            total_recall += recall
            total_ndcg += ndcg

    avg_loss = total_loss / len(dataloader)
    avg_recall = total_recall / len(dataloader)
    avg_ndcg = total_ndcg / len(dataloader)

    print(f"Test Loss: {avg_loss:.4f}, Recall@{k}: {avg_recall:.4f}, NDCG@{k}: {avg_ndcg:.4f}")
    return avg_loss, avg_recall, avg_ndcg

# Compute Recall@K
def recall_at_k(top_k_predictions, targets, k):
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()  # Check if target is in top K
    recall = hits.sum(dim=-1).mean().item()  # Compute recall
    return recall

# Compute NDCG@K
def ndcg_at_k(top_k_predictions, targets, k):
    hits = (top_k_predictions == targets.unsqueeze(-1)).float()
    log_positions = 1 / torch.log2(torch.arange(2, k + 2, device=targets.device).float())  # Discount factor
    dcg = (hits * log_positions).sum(dim=-1).mean().item()
    return dcg

In [10]:
# Evaluate on Test DataLoader
test_loss, recall_k, ndcg_k = evaluate_model(bert_model, test_dataloader, k=10)

Test Loss: 6.6949, Recall@10: 0.6027, NDCG@10: 0.6027


In [11]:
torch.save(bert_model.state_dict(),"BERT4REC_model.pth")


## loading CF model

In [12]:
# Define CF-based Matrix Factorization Model
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_movies, embedding_dim=150):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_dim)
        self.movie_embedding = nn.Embedding(num_movies, embedding_dim)
        self.user_bias = nn.Embedding(num_users, 1)
        self.movie_bias = nn.Embedding(num_movies, 1)

    def forward(self, user_ids, movie_ids):
        user_vecs = self.user_embedding(user_ids)  # (batch_size, embedding_dim)
        movie_vecs = self.movie_embedding(movie_ids)  # (batch_size, embedding_dim)
        dot_product = (user_vecs * movie_vecs).sum(-1)  # Compute rating scores
        pred_ratings = dot_product + self.user_bias(user_ids).squeeze() + self.movie_bias(movie_ids).squeeze()
        return pred_ratings  # Shape should be (batch_size,) or (batch_size, num_movies)

cf_model = MatrixFactorization(6040, 3706).to(device)
cf_model.load_state_dict(torch.load('CF_model.pth', map_location=device))

  cf_model.load_state_dict(torch.load('CF_model.pth', map_location=device))


<All keys matched successfully>

## CF and Bert prediction and a linear hybrid score on single example
$ $hybrid_score$ =  α \times $cf_score$ + (1-α) \times $bert_score$ $

In [13]:
def predict_ratings_for_items_cf(cf_model, user_id, item_ids, device=device):
    cf_model.eval()

    user_tensor = torch.tensor([user_id]*len(item_ids), dtype=torch.long, device=device)
    movie_tensor = torch.tensor(item_ids, dtype=torch.long, device=device)
    with torch.no_grad():
        preds = cf_model(user_tensor, movie_tensor)
        # preds = preds.numpy()  # depends on the device
    return preds

In [14]:
# user_id_example = random.choice(test_data["userId"].unique().tolist())
user_id_example = 5411

all_mapped_movie_ids = list(movie_ids.values())

ratings  = predict_ratings_for_items_cf(cf_model, user_id_example, all_mapped_movie_ids, device=device)

item_rating_pairs = list(zip(all_mapped_movie_ids, ratings))
item_rating_pairs.sort(key=lambda x: x[1], reverse=True)
top_recommendations_cf = []
for i in range(5):
    print(f"UserId {user_id_example} on Item {item_rating_pairs[i][0]}, rating {item_rating_pairs[i][1]:.2f}")
    top_recommendations_cf.append(item_rating_pairs[i][0])

print("top_recommendations_cf = ",top_recommendations_cf)
actual_ratings = test_data_cf[test_data_cf["userId"] == user_id_example][["movieId", "rating"]]
# actual_ratings = test_data_cf[test_data_cf["userId"] == user_id_example][["movieId", "rating"]]


# Print all actual ratings for user 5411
# print(f"Actual Ratings for User {user_id_example}:")
# print(actual_ratings)

UserId 5411 on Item 29, rating 4.94
UserId 5411 on Item 259, rating 4.89
UserId 5411 on Item 167, rating 4.89
UserId 5411 on Item 611, rating 4.89
UserId 5411 on Item 535, rating 4.86
top_recommendations_cf =  [29, 259, 167, 611, 535]


In [15]:
# # Filter actual ratings to only include these movies
# filtered_actual_ratings = actual_ratings[actual_ratings["movieId"].isin(top_recommendations_cf)]

# # Display the actual ratings for the selected movies
# print(f"Actual Ratings for User {user_id_example} on Selected Movies:")
# print(filtered_actual_ratings)


## BERT_model prediction

In [16]:
import torch

def predict_next_movies_bert(bert_model, user_sequence, candidate_items, top_k=10, device='cuda'):
    bert_model.eval()
    max_len = bert_model.max_len
    input_sequence = user_sequence[-max_len:]
    input_tensor = torch.tensor([input_sequence], dtype=torch.long, device=device)
    attention_mask = (input_tensor != 0).long()
    with torch.no_grad():
        logits = bert_model(input_tensor, attention_mask)  # Shape: [1, seq_len, vocab_size]
        next_item_logits = logits[:, -1, :]  # Extract last token's prediction

    scores = {item: next_item_logits[0, item].item() for item in candidate_items}
    sorted_movies = sorted(scores.items(), key=lambda x: x[1], reverse=True)
    return [movie_id for movie_id, score in sorted_movies[:top_k]]


In [17]:
# device = 'cuda' if torch.cuda.is_available() else 'cpu'
bert_model.to(device)
user_sequence = user_movie_dict[user_id_example]  # Get this user's movie history

candidate_items = list(movie_ids.values())

top_recommendations_bert = predict_next_movies_bert(bert_model, user_sequence, candidate_items, top_k=5, device=device)
print("Top 5recommended movies by BERT4Rec:", top_recommendations_bert)
print("Top 5recommended movies by CF:", top_recommendations_cf)


Top 5recommended movies by BERT4Rec: [2000, 2556, 3144, 1275, 354]
Top 5recommended movies by CF: [29, 259, 167, 611, 535]


In [18]:
# print(user_sequence)
top_recommendations = predict_next_movies_bert(bert_model, user_sequence[:-1], candidate_items, top_k=5, device=device)[0]
print("Top predicted next movies by BERT4Rec input (user_sequence-1):", top_recommendations)
print("Actual movie (user_sequence[-1]):", user_sequence[-1])


Top predicted next movies by BERT4Rec input (user_sequence-1): 718
Actual movie (user_sequence[-1]): 891


In [19]:
# hybrid score
alpha = 0.5
cf_scores = predict_ratings_for_items_cf(cf_model, user_id_example, candidate_items, device=device)
bert_scores = {item: idx for idx, item in enumerate(top_recommendations_bert[::-1])}  # Reverse ranking score
cf_min, cf_max = min(cf_scores), max(cf_scores)
cf_scores = {item: (score - cf_min) / (cf_max - cf_min) for item, score in zip(candidate_items, cf_scores)}
bert_max = max(bert_scores.values()) if bert_scores else 1
bert_scores = {item: score / bert_max for item, score in bert_scores.items()}
# algo alpha * cf - (1-alpha) *bert -> linear
fusion_scores = {
    item: alpha * cf_scores.get(item, 0) + (1 - alpha) * bert_scores.get(item, 0)
    for item in candidate_items
}
top_fusion_recommendations = sorted(fusion_scores.items(), key=lambda x: x[1], reverse=True)[:5]
print("Top 5 recommended movies by Fusion Model:")
for movie, score in top_fusion_recommendations:
    print(f"Movie {movie}: Fusion Score {score:.2f}")

Top 5 recommended movies by Fusion Model:
Movie 2000: Fusion Score 0.80
Movie 2556: Fusion Score 0.65
Movie 29: Fusion Score 0.50
Movie 3144: Fusion Score 0.50
Movie 259: Fusion Score 0.49


##Modeling##

In [20]:
import torch
import torch.nn as nn
import torch.optim as optim

class FusionModel(nn.Module):
    def __init__(self, alpha=0.5):
        """
        A simple fusion model t-> linear
        """
        super(FusionModel, self).__init__()
        self.alpha = nn.Parameter(torch.tensor(alpha, dtype=torch.float32))


    def forward(self, cf_scores, bert_scores):
        alpha = torch.sigmoid(self.alpha)  # Keeps alpha in range [0,1]
        fusion_scores = alpha * cf_scores + (1 - alpha) * bert_scores
        return fusion_scores

In [21]:
def recall_at_k(top_k_predictions, targets, k):
    """
    Compute Recall@K.
    """
    targets = targets.view(-1, 1).expand(-1, k)  # Expand targets to (batch_size, K)
    hits = (top_k_predictions == targets).float()  # Check if target is in top-K
    recall = hits.sum(dim=-1).mean().item()
    return recall

def ndcg_at_k(top_k_predictions, targets, k):
    """
    Compute NDCG@K.
    """
    targets = targets.view(-1, 1).expand(-1, k)
    hits = (top_k_predictions == targets).float()
    log_positions = 1 / torch.log2(torch.arange(2, k + 2, device=targets.device).float())  # Discount factor
    dcg = (hits * log_positions).sum(dim=-1).mean().item()
    return dcg


In [22]:
# OPTION1 - have future records
# class FusionDataset(Dataset):
#     def __init__(self, cf_data, user_movie_dict, movie_ids, max_history=50):
#         """
#         Custom dataset for Fusion Model with sliding window.

#         Args:
#         - cf_data (DataFrame): CF training data with (userId, movieId, rating).
#         - user_movie_dict (dict): User's historical watched sequences for BERT4Rec.
#         - movie_ids (list): List of mapped movie IDs.
#         - max_history (int): Maximum number of past movies to consider.
#         """
#         self.cf_data = cf_data
#         self.user_movie_dict = user_movie_dict
#         self.movie_ids = movie_ids
#         self.max_history = max_history

#     def __len__(self):
#         return len(self.cf_data)

#     def __getitem__(self, idx):
#         row = self.cf_data.iloc[idx]
#         user_id = row["userId"]
#         movie_id = row["movieId"]
#         rating = row["rating"]

#         # Get full watched sequence
#         full_sequence = self.user_movie_dict.get(user_id, [])

#         # Get only past movies (before the current movie)
#         if movie_id in full_sequence:
#             movie_idx = full_sequence.index(movie_id)  # Find where this movie appears
#             user_sequence = full_sequence[:movie_idx]  # Take only past movies
#         else:
#             user_sequence = full_sequence  # If not found, return full sequence

#         # Truncate history to max length
#         user_sequence = user_sequence[-self.max_history:]

#         return torch.tensor(user_id, dtype=torch.long), \
#                torch.tensor(movie_id, dtype=torch.long), \
#                torch.tensor(rating, dtype=torch.float32), \
#                torch.tensor(user_sequence, dtype=torch.long)


# # Create updated FusionDataset
# train_dataset = FusionDataset(train_data_cf, user_movie_dict, movie_ids)
# test_dataset = FusionDataset(test_data_cf, user_movie_dict, movie_ids)

# OPTION2 - only keep the past
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

class FusionDataset(Dataset):
    def __init__(self, cf_data, user_movie_dict, movie_ids, max_history=30, pad_value=0):
        """
        Custom dataset for Fusion Model.

        Args:
        - cf_data (DataFrame): CF training data with (userId, movieId, rating).
        - user_movie_dict (dict): User's historical watched sequences for BERT4Rec.
        - movie_ids (list): List of mapped movie IDs.
        - max_history (int): Max past movies to keep for BERT.
        - pad_value (int): Value to pad sequences.
        """
        self.cf_data = cf_data
        self.user_movie_dict = user_movie_dict
        self.movie_ids = movie_ids
        self.max_history = max_history
        self.pad_value = pad_value

    def __len__(self):
        return len(self.cf_data)

    def __getitem__(self, idx):
        row = self.cf_data.iloc[idx]
        user_id = row["userId"]
        movie_id = row["movieId"]
        rating = row["rating"]

        # Get user's watched sequence for BERT
        user_sequence = self.user_movie_dict.get(user_id, [])
        user_sequence = user_sequence[-self.max_history:]  # Keep only last `max_history` items
        user_sequence = torch.tensor(user_sequence, dtype=torch.long)

        # Candidate items (all movies) for BERT
        candidate_items = torch.tensor(self.movie_ids, dtype=torch.long)

        return (
            torch.tensor(user_id, dtype=torch.long),
            torch.tensor(movie_id, dtype=torch.long),
            torch.tensor(rating, dtype=torch.float32),
            user_sequence,  # User's history for BERT
            candidate_items,  # Movies for BERT scoring
        )

# Function to pad sequences in DataLoader
def collate_fn(batch):
    users, movies, ratings, sequences, candidate_items = zip(*batch)

    users = torch.stack(users)
    movies = torch.stack(movies)
    ratings = torch.stack(ratings)

    # Pad user sequences
    sequences = pad_sequence(sequences, batch_first=True, padding_value=0)
    candidate_items = torch.stack(candidate_items)  # Should remain the same

    return users, movies, ratings, sequences, candidate_items

train_dataset = FusionDataset(train_data_cf, user_movie_dict, list(movie_ids.values()))
test_dataset = FusionDataset(test_data_cf, user_movie_dict, list(movie_ids.values()))

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, collate_fn=collate_fn)

# Print a sample batch
for batch in train_loader:
    users, movies, ratings, sequences, candidate_items = batch
    print(f"train_dataset shape: {len(train_dataset)}")
    print(f"test_dataset shape: {len(test_dataset)}")
    print(f"Users Shape: {users.shape}")  # (batch_size,)
    print(f"Movies Shape: {movies.shape}")  # (batch_size,)
    print(f"Ratings Shape: {ratings.shape}")  # (batch_size,)
    print(f"Sequences Shape: {sequences.shape}")  # (batch_size, max_history)
    print(f"Candidate Items Shape: {candidate_items.shape}")  # (batch_size, num_movies)

    # Print some values
    # print("Users:", users[:5])
    # print("Movies:", movies[:5])
    # print("Ratings:", ratings[:5])
    # print("Sequences:", sequences[:5])
    # print("Candidate Items:", candidate_items[:5])

    break  # Only print one batch


train_dataset shape: 797758
test_dataset shape: 202451
Users Shape: torch.Size([64])
Movies Shape: torch.Size([64])
Ratings Shape: torch.Size([64])
Sequences Shape: torch.Size([64, 30])
Candidate Items Shape: torch.Size([64, 3706])


In [23]:
#============================ dataset verifications
# Print first 5 samples from the train dataset
# Print first 5 samples from the train dataset
# Print first 5 samples from the train dataset
for i in range(3):
    user_id, movie_id, rating, user_sequence, candidate_items = train_dataset[i]
    print(f"Sample {i}:")
    print(f"  User ID: {user_id.item()}")
    print(f"  Movie ID: {movie_id.item()}")
    print(f"  Rating: {rating.item()}")
    print(f"  First 10 User Sequence: {user_sequence.tolist()[:10]}")  # Convert tensor to list
    print(f"  First 10 Candidate Items: {candidate_items.tolist()[:10]}")  # Convert tensor to list
    print("-" * 50)

Sample 0:
  User ID: 0
  Movie ID: 31
  Rating: 4.0
  First 10 User Sequence: [45, 26, 2, 6, 19, 38, 52, 1, 13, 49]
  First 10 Candidate Items: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
--------------------------------------------------
Sample 1:
  User ID: 0
  Movie ID: 22
  Rating: 5.0
  First 10 User Sequence: [45, 26, 2, 6, 19, 38, 52, 1, 13, 49]
  First 10 Candidate Items: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
--------------------------------------------------
Sample 2:
  User ID: 0
  Movie ID: 27
  Rating: 4.0
  First 10 User Sequence: [45, 26, 2, 6, 19, 38, 52, 1, 13, 49]
  First 10 Candidate Items: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
--------------------------------------------------


In [27]:
from tqdm import tqdm  # Progress bar for training

def train_fusion_model(fusion_model, train_loader, cf_model, bert_model, epochs=10, lr=0.01, print_every=int(12465/10)):
    fusion_model.train()
    optimizer = optim.Adam(fusion_model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        total_loss = 0
        batch_count = len(train_loader)

        print(f"\n Epoch {epoch+1}/{epochs} - Training Started...")

        progress_bar = tqdm(enumerate(train_loader), total=batch_count, desc=f"Epoch {epoch+1}", position=0, leave=True)

        for batch_idx, (users, movies, ratings, sequences, candidate_items) in progress_bar:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            sequences, candidate_items = sequences.to(device), candidate_items.to(device)

            # Get CF scores
            cf_scores = cf_model(users, movies)  # Shape: (batch_size,)

            valid_mask = (sequences != 0).float()
            with torch.no_grad():  # No gradient needed for inference
                bert_logits = bert_model(sequences, valid_mask)  # Shape: (batch_size, max_seq_len, vocab_size)
                bert_scores_all = bert_logits[:, -1, :]  # Shape: (batch_size, vocab_size)
                bert_scores = torch.gather(bert_scores_all, 1, candidate_items)  # Shape: (batch_size, num_candidates)

            # Normalize BERT scores
            if bert_scores.max() > 0:
                bert_scores = (bert_scores - bert_scores.min()) / (bert_scores.max() - bert_scores.min() + 1e-8)

            # Compute Fusion Scores
            fusion_scores = fusion_model(cf_scores, bert_scores.mean(dim=1))  # Mean candidate scores

            # Compute Loss
            loss = criterion(fusion_scores, ratings)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            # Update Progress Bar with Loss
            progress_bar.set_postfix({"Batch Loss": f"{loss.item():.4f}"})

            # Print loss every `print_every` batches
            if (batch_idx + 1) % print_every == 0:
                print(f"🔹 [Epoch {epoch+1} | Batch {batch_idx+1}/{batch_count}] Loss: {loss.item():.4f}")

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Completed! Average Loss: {avg_loss:.4f}\n")

    return fusion_model


In [28]:
# Initialize Fusion Model
fusion_model = FusionModel(alpha=0.2).to(device)
# Train Fusion Model
trained_fusion_model = train_fusion_model(fusion_model, train_loader, cf_model, bert_model , epochs=3, lr=0.002)


 Epoch 1/3 - Training Started...


Epoch 1:  10%|█         | 1249/12465 [00:40<05:59, 31.19it/s, Batch Loss=0.9778]

🔹 [Epoch 1 | Batch 1246/12465] Loss: 1.2715


Epoch 1:  20%|██        | 2496/12465 [01:21<05:23, 30.78it/s, Batch Loss=0.9478]

🔹 [Epoch 1 | Batch 2492/12465] Loss: 1.2224


Epoch 1:  30%|███       | 3741/12465 [02:01<04:44, 30.69it/s, Batch Loss=1.0214]

🔹 [Epoch 1 | Batch 3738/12465] Loss: 1.1972


Epoch 1:  40%|████      | 4989/12465 [02:42<04:07, 30.27it/s, Batch Loss=1.1135]

🔹 [Epoch 1 | Batch 4984/12465] Loss: 0.9038


Epoch 1:  50%|█████     | 6233/12465 [03:23<03:25, 30.29it/s, Batch Loss=1.0602]

🔹 [Epoch 1 | Batch 6230/12465] Loss: 0.9483


Epoch 1:  60%|██████    | 7480/12465 [04:03<02:42, 30.75it/s, Batch Loss=0.7035]

🔹 [Epoch 1 | Batch 7476/12465] Loss: 1.1989


Epoch 1:  70%|███████   | 8727/12465 [04:44<01:58, 31.51it/s, Batch Loss=0.9641]

🔹 [Epoch 1 | Batch 8722/12465] Loss: 0.5447


Epoch 1:  80%|████████  | 9973/12465 [05:24<01:20, 30.88it/s, Batch Loss=0.6374]

🔹 [Epoch 1 | Batch 9968/12465] Loss: 0.8594


Epoch 1:  90%|████████▉ | 11217/12465 [06:05<00:39, 31.23it/s, Batch Loss=0.8166]

🔹 [Epoch 1 | Batch 11214/12465] Loss: 0.6410


Epoch 1: 100%|██████████| 12465/12465 [06:45<00:00, 30.72it/s, Batch Loss=0.8455]


🔹 [Epoch 1 | Batch 12460/12465] Loss: 1.0209
Epoch 1 Completed! Average Loss: 0.9986


 Epoch 2/3 - Training Started...


Epoch 2:  10%|█         | 1250/12465 [00:40<06:00, 31.09it/s, Batch Loss=0.8198]

🔹 [Epoch 2 | Batch 1246/12465] Loss: 0.7200


Epoch 2:  20%|██        | 2495/12465 [01:21<05:25, 30.64it/s, Batch Loss=1.0667]

🔹 [Epoch 2 | Batch 2492/12465] Loss: 0.9208


Epoch 2:  30%|███       | 3743/12465 [02:02<04:41, 30.97it/s, Batch Loss=0.9416]

🔹 [Epoch 2 | Batch 3738/12465] Loss: 0.8106


Epoch 2:  40%|████      | 4989/12465 [02:42<04:03, 30.76it/s, Batch Loss=0.6680]

🔹 [Epoch 2 | Batch 4984/12465] Loss: 0.7558


Epoch 2:  50%|█████     | 6236/12465 [03:23<03:22, 30.75it/s, Batch Loss=1.3705]

🔹 [Epoch 2 | Batch 6230/12465] Loss: 0.8225


Epoch 2:  60%|██████    | 7481/12465 [04:04<02:45, 30.13it/s, Batch Loss=0.9425]

🔹 [Epoch 2 | Batch 7476/12465] Loss: 0.8911


Epoch 2:  70%|██████▉   | 8725/12465 [04:44<02:04, 30.04it/s, Batch Loss=0.7793]

🔹 [Epoch 2 | Batch 8722/12465] Loss: 1.0019


Epoch 2:  80%|████████  | 9972/12465 [05:25<01:19, 31.25it/s, Batch Loss=0.7426]

🔹 [Epoch 2 | Batch 9968/12465] Loss: 0.8196


Epoch 2:  90%|████████▉ | 11218/12465 [06:05<00:40, 30.50it/s, Batch Loss=0.9095]

🔹 [Epoch 2 | Batch 11214/12465] Loss: 0.8213


Epoch 2: 100%|█████████▉| 12462/12465 [06:46<00:00, 30.68it/s, Batch Loss=1.2708]

🔹 [Epoch 2 | Batch 12460/12465] Loss: 1.0017


Epoch 2: 100%|██████████| 12465/12465 [06:46<00:00, 30.69it/s, Batch Loss=0.9010]


Epoch 2 Completed! Average Loss: 0.8556


 Epoch 3/3 - Training Started...


Epoch 3:  10%|█         | 1252/12465 [00:40<06:05, 30.68it/s, Batch Loss=0.8012]

🔹 [Epoch 3 | Batch 1246/12465] Loss: 0.8069


Epoch 3:  20%|██        | 2495/12465 [01:21<05:24, 30.76it/s, Batch Loss=0.7102]

🔹 [Epoch 3 | Batch 2492/12465] Loss: 0.7825


Epoch 3:  30%|███       | 3742/12465 [02:01<04:39, 31.26it/s, Batch Loss=0.8890]

🔹 [Epoch 3 | Batch 3738/12465] Loss: 0.8832


Epoch 3:  40%|████      | 4986/12465 [02:42<04:08, 30.14it/s, Batch Loss=0.7889]

🔹 [Epoch 3 | Batch 4984/12465] Loss: 0.8052


Epoch 3:  50%|█████     | 6234/12465 [03:22<03:21, 30.87it/s, Batch Loss=0.8919]

🔹 [Epoch 3 | Batch 6230/12465] Loss: 0.8751


Epoch 3:  60%|██████    | 7482/12465 [04:03<02:39, 31.26it/s, Batch Loss=0.9890]

🔹 [Epoch 3 | Batch 7476/12465] Loss: 0.7203


Epoch 3:  70%|██████▉   | 8725/12465 [04:43<02:02, 30.43it/s, Batch Loss=1.0197]

🔹 [Epoch 3 | Batch 8722/12465] Loss: 1.0945


Epoch 3:  80%|███████▉  | 9971/12465 [05:24<01:20, 30.92it/s, Batch Loss=0.8562]

🔹 [Epoch 3 | Batch 9968/12465] Loss: 0.5069


Epoch 3:  90%|█████████ | 11220/12465 [06:04<00:40, 31.12it/s, Batch Loss=0.8000]

🔹 [Epoch 3 | Batch 11214/12465] Loss: 0.6128


Epoch 3: 100%|█████████▉| 12463/12465 [06:44<00:00, 30.35it/s, Batch Loss=0.7239]

🔹 [Epoch 3 | Batch 12460/12465] Loss: 0.6941


Epoch 3: 100%|██████████| 12465/12465 [06:44<00:00, 30.79it/s, Batch Loss=1.1582]

Epoch 3 Completed! Average Loss: 0.8554






In [29]:
def evaluate_fusion_model(fusion_model, test_loader, cf_model, bert_model, k=10, device='cuda'):
    fusion_model.eval()
    total_loss, total_recall, total_ndcg = 0, 0, 0
    criterion = nn.MSELoss()
    batch_count = len(test_loader)

    print("\n Evaluating Fusion Model...")
    progress_bar = tqdm(enumerate(test_loader), total=batch_count, desc="Evaluating", position=0, leave=True)

    with torch.no_grad():
        for batch_idx, (users, movies, ratings, sequences, candidate_items) in progress_bar:
            users, movies, ratings = users.to(device), movies.to(device), ratings.to(device)
            sequences, candidate_items = sequences.to(device), candidate_items.to(device)


            cf_scores = cf_model(users, movies)


            valid_mask = (sequences != 0).float()
            bert_logits = bert_model(sequences, valid_mask)  # Shape: (batch_size, max_seq_len, vocab_size)
            bert_scores_all = bert_logits[:, -1, :]  # Take last prediction


            bert_scores = torch.gather(bert_scores_all, 1, candidate_items)  # Shape: (batch_size, num_candidates)


            if bert_scores.max() > 0:
                bert_scores = (bert_scores - bert_scores.min()) / (bert_scores.max() - bert_scores.min() + 1e-8)


            fusion_scores = fusion_model(cf_scores, bert_scores.mean(dim=1))


            loss = criterion(fusion_scores, ratings)
            total_loss += loss.item()


            _, top_k_predictions = torch.topk(fusion_scores, k, dim=-1)


            recall = recall_at_k(top_k_predictions, movies, k)
            ndcg = ndcg_at_k(top_k_predictions, movies, k)

            total_recall += recall
            total_ndcg += ndcg


            progress_bar.set_postfix({"Batch Loss": f"{loss.item():.4f}"})

    rmse = (total_loss / batch_count) ** 0.5
    avg_recall = total_recall / batch_count
    avg_ndcg = total_ndcg / batch_count

    print(f"\n Evaluation Completed! Fusion Model - RMSE: {rmse:.4f}, Recall@{k}: {avg_recall:.4f}, NDCG@{k}: {avg_ndcg:.4f}")

    return rmse, avg_recall, avg_ndcg

In [30]:
# Evaluate the Fusion Model
rmse, recall, ndcg = evaluate_fusion_model(fusion_model, test_loader, cf_model, bert_model, k=10, device=device)

# Print Results
# print(f"Fusion Model Evaluation:")
# print(f" - RMSE: {rmse:.4f}")
# print(f" - Recall@10: {recall:.4f}")
# print(f" - NDCG@10: {ndcg:.4f}")



 Evaluating Fusion Model...


Evaluating: 100%|██████████| 3164/3164 [01:39<00:00, 31.83it/s, Batch Loss=0.3803]


 Evaluation Completed! Fusion Model - RMSE: 0.9503, Recall@10: 0.0084, NDCG@10: 0.0038



