In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from tqdm import tqdm

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from torch.amp import GradScaler, autocast

from transformers import DistilBertModel

import faiss

import warnings
warnings.filterwarnings("ignore", message=".*nested tensors.*")

In [2]:
movies = pd.read_csv(r"D:\movies_dataset\movies.csv")
ratings = pd.read_csv(r"D:\movies_dataset\filtered_ratings.csv")

In [3]:
movie_to_idx = {movie_id: idx + 2 for idx, movie_id in enumerate(movies['movieId'])}
idx_to_movie = {idx + 2: movie_id for idx, movie_id in enumerate(movies['movieId'])}

movie_to_idx['[PAD]'] = 0
idx_to_movie[0] = '[PAD]'

movie_to_idx['[MASK]'] = 1
idx_to_movie[1] = '[MASK]'

In [4]:
user_sequences = ratings.groupby('userId', sort=True)['movieId'].apply(list).reset_index()
user_sequences.columns = ['userId', 'sequence']

user_sequences = user_sequences[user_sequences['sequence'].apply(len) > 1]

user_sequences.loc[:, 'indexed_sequence'] = user_sequences['sequence'].apply(lambda seq: [movie_to_idx[movie_id] for movie_id in seq])

In [5]:
def create_training_examples(sequences, mask_token_id = 1, pad_token_id = 0, mask_prob = 0.1, num_mask = 5):
    train_data = []
    for seq in sequences:
        for _ in range(num_mask):
            input_seq = seq.copy()
            target_seq = [pad_token_id] * len(seq)
            mask_positions = []
            for i in range(len(seq)):
                if np.random.rand() < mask_prob and seq[i] != mask_token_id:
                    target_seq[i] = seq[i]
                    input_seq[i] = mask_token_id
                    mask_positions.append(i)
            if mask_positions:
                train_data.append((input_seq, target_seq, mask_positions))
    return train_data

In [6]:
train_user_sequences, val_user_sequences = train_test_split(
    user_sequences, test_size = 0.1, random_state = 42
)

train_user_sequences_4_examples = train_user_sequences['indexed_sequence'].tolist()
training_examples = create_training_examples(train_user_sequences_4_examples)

In [7]:
class BERT4RecDataset(Dataset):
    def __init__(self, training_examples, max_length = 10):
        self.examples = training_examples
        self.max_length = max_length
        self.pad_token_id = 0
        self.mask_token_id = 1
        
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        input_seq, target_seq, mask_positions = self.examples[idx]

        if len(input_seq) < self.max_length:
            pad_length = self.max_length - len(input_seq)
            input_seq.extend([self.pad_token_id] * pad_length)
            target_seq.extend([self.pad_token_id] * pad_length)

        attention_mask = [1 if x != self.pad_token_id else 0 for x in input_seq]
        labels_mask = [1 if i in mask_positions else 0 for i in range(len(target_seq))]
        
        return {
            'input_ids': torch.tensor(input_seq, dtype=torch.long),
            'target_ids': torch.tensor(target_seq, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels_mask': torch.tensor(labels_mask, dtype=torch.long)
        }

In [8]:
class BERT4RecValidationDataset(Dataset):
    def __init__(self, user_sequences_df, max_length = 10):
        self.user_sequences = user_sequences_df.reset_index(drop = True)
        self.max_length = max_length
        self.pad_token_id = 0
        self.mask_token_id = 1

    def __len__(self):
        return len(self.user_sequences)

    def __getitem__(self, idx):
        row = self.user_sequences.loc[idx]
        indexed_seq = row['indexed_sequence']

        input_seq_indexed = indexed_seq[:-1]
        target_movie_id = indexed_seq[-1]

        input_seq_indexed = input_seq_indexed + [self.mask_token_id]
        mask_position = len(input_seq_indexed) - 1

        if len(input_seq_indexed) < self.max_length:
            pad_length = self.max_length - len(input_seq_indexed)
            input_seq_indexed.extend([self.pad_token_id] * pad_length)

        attention_mask = [1 if x != self.pad_token_id else 0 for x in input_seq_indexed]
        
        labels_mask = [0] * self.max_length
        labels_mask[mask_position] = 1

        target_seq = [self.pad_token_id] * self.max_length
        target_seq[mask_position] = target_movie_id

        return {
            'input_ids': torch.tensor(input_seq_indexed, dtype=torch.long),
            'target_ids': torch.tensor(target_seq, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long),
            'labels_mask': torch.tensor(labels_mask, dtype=torch.long),
            'true_movie_id': target_movie_id
        }

In [9]:
dataset = BERT4RecDataset(training_examples)
dataloader = DataLoader(dataset, batch_size = 64, shuffle = True)
val_dataset = BERT4RecValidationDataset(val_user_sequences)
val_dataloader = DataLoader(val_dataset, batch_size = 64, shuffle = False)

In [10]:
def dcg_score(y_true, y_pred, k = 10):
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gains / discounts)

def ndcg_score(y_true, y_pred, k = 10):
    best = dcg_score(y_true, y_true, k)
    if best == 0:
        return 0.0
    actual = dcg_score(y_true, y_pred, k)
    return actual / best

def ndcg_at_k_batch(y_true_batch, y_pred_batch, k = 10):
    scores = []
    for y_true, y_pred in zip(y_true_batch, y_pred_batch):
        if not y_true or not y_pred:
             scores.append(0.0)
             continue

        relevance = [1 if movie_id in y_true else 0 for movie_id in y_pred[:k]]
        count_relevant = sum(relevance)
        ideal_relevance = [1] * min(count_relevant, k) + [0] * (k - min(count_relevant, k))

        y_pred_actual = np.arange(k, 0, -1)
        actual_dcg = dcg_score(relevance, y_pred_actual, k)

        best_dcg = dcg_score(ideal_relevance, ideal_relevance, k)

        scores.append(actual_dcg / best_dcg if best_dcg != 0 else 0.0)

    return np.mean(scores) if scores else 0.0


def recall_at_k_batch(y_true_batch, y_pred_batch, k = 10):
    recalls = []
    for y_true, y_pred in zip(y_true_batch, y_pred_batch):
        if len(y_true) == 0:
            recalls.append(1.0)
            continue
        y_pred_k = set(y_pred[:k])
        y_true_set = set(y_true)
        if len(y_true_set) == 0:
            recalls.append(1.0)
            continue
        hits = len(y_true_set & y_pred_k)
        recalls.append(hits / len(y_true_set))
    return np.mean(recalls) if recalls else 0.0

def precision_at_k_batch(y_true_batch, y_pred_batch, k = 10):
    precisions = []
    for y_true, y_pred in zip(y_true_batch, y_pred_batch):
        y_pred_k = set(y_pred[:k])
        y_true_set = set(y_true)
        if len(y_pred_k) == 0:
            precisions.append(1.0)
            continue
        hits = len(y_true_set & y_pred_k)
        precisions.append(hits / len(y_pred_k))
    return np.mean(precisions) if precisions else 0.0

def hit_rate_at_k_batch(y_true_batch, y_pred_batch, k = 10):
    hits = 0
    total = len(y_true_batch)
    for y_true, y_pred in zip(y_true_batch, y_pred_batch):
        y_pred_k = set(y_pred[:k])
        y_true_set = set(y_true)
        if y_true_set & y_pred_k:
            hits += 1
    return hits / total if total > 0 else 0.0

def average_precision_at_k(y_true, y_pred, k = 10):
    if len(y_true) == 0:
        return 0.0
    y_pred_k = y_pred[:k]
    score = 0.0
    num_hits = 0.0
    for i, p in enumerate(y_pred_k):
        if p in y_true and p not in y_pred_k[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)
    if num_hits == 0:
        return 0.0
    return score / min(len(y_true), k)

def map_at_k_batch(y_true_batch, y_pred_batch, k = 10):
    aps = []
    for y_true, y_pred in zip(y_true_batch, y_pred_batch):
        ap = average_precision_at_k(y_true, y_pred, k)
        aps.append(ap)
    return np.mean(aps) if aps else 0.0

In [11]:
def validate(model, val_dataloader, device, k = 10):
    model.eval()
    total_loss = 0
    num_batches = 0

    all_y_true_movie_ids = []
    all_y_pred_movie_ids = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc = "Validation"):
            input_ids = batch['input_ids'].to(device)
            target_ids = batch['target_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels_mask = batch['labels_mask'].to(device)

            outputs = model(input_ids = input_ids, attention_mask = attention_mask)

            loss = sampled_softmax_loss(
                outputs,
                target_ids,
                num_samples = num_negative_samples,
                vocab_size = len(movie_to_idx)
            )
            total_loss += loss.item()
            num_batches += 1

            batch_size, seq_length, vocab_size = outputs.shape
            mask_positions = torch.where(labels_mask == 1)
            masked_logits = outputs[mask_positions]

            top_k_logits, top_k_indices = torch.topk(masked_logits, k, dim = 1, largest = True, sorted = True)

            mask_token_idx = movie_to_idx['[MASK]']
            top_k_indices_filtered = torch.where(top_k_indices == mask_token_idx, torch.tensor(0, device = top_k_indices.device), top_k_indices)

            pred_movie_ids = []
            for i in range(top_k_indices_filtered.shape[0]):
                pred_ids_for_item = []
                for idx in top_k_indices_filtered[i]:
                    movie_id = idx_to_movie.get(idx.item(), 0)
                    if movie_id != 0 and movie_id != '[PAD]':
                        pred_ids_for_item.append(movie_id)
                pred_movie_ids.append(pred_ids_for_item)

            true_movie_ids_for_masks = target_ids[mask_positions].cpu().numpy()
            true_movie_ids_converted = [idx_to_movie.get(idx, 0) for idx in true_movie_ids_for_masks]

            y_true_batch = [[true_id] for true_id in true_movie_ids_converted]
            y_pred_batch = pred_movie_ids

            all_y_true_movie_ids.extend(y_true_batch)
            all_y_pred_movie_ids.extend(y_pred_batch)

    avg_loss = total_loss / num_batches

    recall_at_k = recall_at_k_batch(all_y_true_movie_ids, all_y_pred_movie_ids, k = k)
    precision_at_k = precision_at_k_batch(all_y_true_movie_ids, all_y_pred_movie_ids, k = k)
    map_at_k = map_at_k_batch(all_y_true_movie_ids, all_y_pred_movie_ids, k = k)
    ndcg_at_k = ndcg_at_k_batch(all_y_true_movie_ids, all_y_pred_movie_ids, k = k)
    hr_at_k = hit_rate_at_k_batch(all_y_true_movie_ids, all_y_pred_movie_ids, k = k)

    print(f"Validation Loss: {avg_loss:.4f}")
    print(f"Recall@{k}: {recall_at_k:.4f}")
    print(f"Precision@{k}: {precision_at_k:.4f}")
    print(f"MAP@{k}: {map_at_k:.4f}")
    print(f"NDCG@{k}: {ndcg_at_k:.4f}")
    print(f"HR@{k}: {hr_at_k:.4f}")

    return avg_loss, recall_at_k, precision_at_k, map_at_k, ndcg_at_k, hr_at_k

In [12]:
distil_bert = DistilBertModel.from_pretrained("distilbert-base-uncased").eval()

for param in distil_bert.parameters():
    param.requires_grad = False

In [13]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
class Bert4Rec(nn.Module):
    def __init__(self, distil_bert, nhead, dim_feedforward, num_layers, movie_len, max_len = 10, reduced_dim = 128):
        super().__init__()
        self.max_len = max_len
        self.reduced_dim = reduced_dim
        self.movie_len = movie_len

        self.distil_bert = distil_bert

        self.movie_adapter = nn.Embedding(movie_len, 768, padding_idx = 0)

        self.distilbert_to_reduced = nn.Linear(768, reduced_dim)

        self.position_embeddings = nn.Embedding(max_len, reduced_dim)

        self.embedding_layer_norm = nn.LayerNorm(reduced_dim)
        self.embedding_dropout = nn.Dropout(0.1)

        encoder_layers = TransformerEncoderLayer(
            reduced_dim,
            nhead,
            dim_feedforward,
            dropout = 0.1,
            activation = 'gelu',
            batch_first = True
        )
        self.transformer = TransformerEncoder(encoder_layers, num_layers)

        self.prediction_head = nn.Linear(reduced_dim, self.movie_len)

    def forward(self, input_ids, attention_mask):
        batch_size, seq_length = input_ids.shape

        movie_embeddings = self.movie_adapter(input_ids)  # [batch_size, seq_length, 768]

        with torch.no_grad():
            distilbert_output = self.distil_bert(
                inputs_embeds = movie_embeddings,
                attention_mask = attention_mask
            )
            token_embeddings = distilbert_output.last_hidden_state  # [batch_size, seq_length, 768]

        token_embeddings = self.distilbert_to_reduced(token_embeddings)  # [batch_size, seq_length, reduced_dim]

        position_ids = torch.arange(seq_length, device = input_ids.device).unsqueeze(0).expand(batch_size, -1)
        position_embeddings = self.position_embeddings(position_ids)  # [batch_size, seq_length, reduced_dim]

        hidden_states = token_embeddings + position_embeddings
        hidden_states = self.embedding_layer_norm(hidden_states)
        hidden_states = self.embedding_dropout(hidden_states)

        transformer_output = self.transformer(
            hidden_states,
            src_key_padding_mask = (attention_mask == 0)
        )

        logits = self.prediction_head(transformer_output)  # [batch_size, seq_length, movie_len]

        return logits

    def get_movie_embeddings(self):
        self.eval()
        with torch.no_grad():
            
            movie_indices = torch.arange(2, self.movie_len, device=device)
            movie_embeddings = self.movie_adapter(movie_indices)  # [num_movies, 768]
            reduced_embeddings = self.distilbert_to_reduced(movie_embeddings)  # [num_movies, reduced_dim]
            
            return reduced_embeddings.cpu().numpy(), movie_indices.cpu().numpy()

In [15]:
model = Bert4Rec(
    distil_bert, 
    nhead = 8,
    dim_feedforward = 512,
    num_layers = 3,
    movie_len = len(movie_to_idx),
    reduced_dim = 128
).to(device)

In [16]:
def sampled_softmax_loss(logits, targets, num_samples, vocab_size, padding_idx = 0, mask_idx = 1):

    mask = (targets != padding_idx)
    masked_logits = logits[mask]  # [num_masked, vocab_size]
    masked_targets = targets[mask]  # [num_masked]
    
    batch_size = masked_logits.size(0)
    
    weights = torch.ones(vocab_size, device = logits.device)
    weights[padding_idx] = 0
    weights[mask_idx] = 0

    neg_samples = torch.multinomial(
        weights, 
        batch_size * num_samples, 
        replacement = True
    ).view(batch_size, num_samples)
    
    mask = (neg_samples == masked_targets.unsqueeze(1))

    if mask.any():
        total_replacements = mask.sum().item()
        replacement = torch.multinomial(weights, total_replacements, replacement = True)

        neg_samples[mask] = replacement
    
    all_indices = torch.cat([masked_targets.unsqueeze(1), neg_samples], dim = 1)  # [batch_size, num_samples + 1]

    selected_logits = torch.gather(masked_logits, 1, all_indices)

    targets_for_loss = torch.zeros(batch_size, dtype=torch.long, device = logits.device)

    loss = F.cross_entropy(selected_logits, targets_for_loss)
    
    return loss

In [17]:
scaler = GradScaler(device.type)
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

num_negative_samples = 1000
epochs = 10
k_for_metrics = 10

In [18]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    num_batches = 0
    for batch in tqdm(dataloader, desc = "Training"):
        input_ids = batch['input_ids'].to(device)
        target_ids = batch['target_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels_mask = batch['labels_mask'].to(device)

        
        optimizer.zero_grad()
        
        with autocast(device.type):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            loss = sampled_softmax_loss(
                outputs,
                target_ids,
                num_samples = num_negative_samples,
                vocab_size = len(movie_to_idx),
            )

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
        num_batches += 1
        
    avg_loss = total_loss / num_batches
    print(f"Epoch {epoch+1} | Avg Loss: {avg_loss:.4f} | Total Loss: {total_loss:.4f}")
    
    val_loss, recall_k, precision_k, map_k, ndcg_k, hr_k = validate(model, val_dataloader, device, k=k_for_metrics)

Training: 100%|██████████| 8759/8759 [08:30<00:00, 17.17it/s]


Epoch 1 | Avg Loss: 3.4281 | Total Loss: 30026.6668


Validation: 100%|██████████| 369/369 [00:28<00:00, 13.15it/s]


Validation Loss: 3.3967
Recall@10: 0.0848
Precision@10: 0.0085
MAP@10: 0.0295
NDCG@10: 0.0422
HR@10: 0.0848


Training: 100%|██████████| 8759/8759 [08:34<00:00, 17.03it/s]


Epoch 2 | Avg Loss: 3.1724 | Total Loss: 27787.4799


Validation: 100%|██████████| 369/369 [00:28<00:00, 12.95it/s]


Validation Loss: 3.3341
Recall@10: 0.0941
Precision@10: 0.0094
MAP@10: 0.0346
NDCG@10: 0.0483
HR@10: 0.0941


Training: 100%|██████████| 8759/8759 [09:35<00:00, 15.22it/s]


Epoch 3 | Avg Loss: 3.0827 | Total Loss: 27001.1128


Validation: 100%|██████████| 369/369 [00:31<00:00, 11.86it/s]


Validation Loss: 3.2646
Recall@10: 0.1001
Precision@10: 0.0100
MAP@10: 0.0369
NDCG@10: 0.0515
HR@10: 0.1001


Training: 100%|██████████| 8759/8759 [09:48<00:00, 14.88it/s]


Epoch 4 | Avg Loss: 3.0145 | Total Loss: 26404.2613


Validation: 100%|██████████| 369/369 [00:32<00:00, 11.41it/s]


Validation Loss: 3.2548
Recall@10: 0.1028
Precision@10: 0.0103
MAP@10: 0.0387
NDCG@10: 0.0535
HR@10: 0.1028


Training: 100%|██████████| 8759/8759 [09:46<00:00, 14.93it/s]


Epoch 5 | Avg Loss: 2.9557 | Total Loss: 25889.1025


Validation: 100%|██████████| 369/369 [00:31<00:00, 11.68it/s]


Validation Loss: 3.2222
Recall@10: 0.1078
Precision@10: 0.0108
MAP@10: 0.0418
NDCG@10: 0.0570
HR@10: 0.1078


Training: 100%|██████████| 8759/8759 [09:32<00:00, 15.31it/s]


Epoch 6 | Avg Loss: 2.9071 | Total Loss: 25463.5317


Validation: 100%|██████████| 369/369 [00:28<00:00, 13.15it/s]


Validation Loss: 3.1971
Recall@10: 0.1129
Precision@10: 0.0113
MAP@10: 0.0436
NDCG@10: 0.0597
HR@10: 0.1129


Training: 100%|██████████| 8759/8759 [08:27<00:00, 17.26it/s]


Epoch 7 | Avg Loss: 2.8605 | Total Loss: 25055.2490


Validation: 100%|██████████| 369/369 [00:27<00:00, 13.29it/s]


Validation Loss: 3.2026
Recall@10: 0.1128
Precision@10: 0.0113
MAP@10: 0.0437
NDCG@10: 0.0597
HR@10: 0.1128


Training: 100%|██████████| 8759/8759 [09:05<00:00, 16.04it/s]


Epoch 8 | Avg Loss: 2.8201 | Total Loss: 24701.2040


Validation: 100%|██████████| 369/369 [00:30<00:00, 11.93it/s]


Validation Loss: 3.1825
Recall@10: 0.1174
Precision@10: 0.0117
MAP@10: 0.0451
NDCG@10: 0.0618
HR@10: 0.1174


Training: 100%|██████████| 8759/8759 [09:03<00:00, 16.13it/s]


Epoch 9 | Avg Loss: 2.7834 | Total Loss: 24379.4392


Validation: 100%|██████████| 369/369 [00:29<00:00, 12.56it/s]


Validation Loss: 3.1644
Recall@10: 0.1172
Precision@10: 0.0117
MAP@10: 0.0457
NDCG@10: 0.0623
HR@10: 0.1172


Training: 100%|██████████| 8759/8759 [08:26<00:00, 17.31it/s]


Epoch 10 | Avg Loss: 2.7472 | Total Loss: 24063.1060


Validation: 100%|██████████| 369/369 [00:27<00:00, 13.24it/s]


Validation Loss: 3.1664
Recall@10: 0.1202
Precision@10: 0.0120
MAP@10: 0.0452
NDCG@10: 0.0625
HR@10: 0.1202


In [19]:
class FaissRecommender:
    def __init__(self, model, device, movie_to_idx, idx_to_movie):
        self.model = model
        self.device = device
        self.movie_to_idx = movie_to_idx
        self.idx_to_movie = idx_to_movie
        self.index = None
        self.movie_vectors = None
        self.movie_indices = None

    def build_index(self):
        movie_embeddings, movie_indices = self.model.get_movie_embeddings()
        self.movie_vectors = movie_embeddings
        self.movie_indices = movie_indices
        faiss.normalize_L2(movie_embeddings)

        # HNSW
        dim = movie_embeddings.shape[1]
        self.index = faiss.IndexHNSWFlat(dim, 32)
        self.index.hnsw.efConstruction = 40
        self.index.hnsw.efSearch = 16

        self.index.add(movie_embeddings)

    def get_recommendations(self, watched_movies, k=10, exclude_watched = True):
        watched_indices = [
            self.movie_to_idx[mid] for mid in watched_movies if mid in self.movie_to_idx
        ]

        if not watched_indices:
            return []

        valid_indices = [idx - 2 for idx in watched_indices if idx >= 2]
        watched_vectors = self.movie_vectors[valid_indices]
        query_vector = np.mean(watched_vectors, axis=0, keepdims=True)
        faiss.normalize_L2(query_vector)

        total_k = k + len(watched_indices) if exclude_watched else k
        scores, indices = self.index.search(query_vector, total_k)

        recommendations = []
        watched_set = set(watched_movies) if exclude_watched else set()

        for idx, score in zip(indices[0], scores[0]):
            if idx >= len(self.movie_indices):
                continue

            movie_idx = self.movie_indices[idx]
            movie_id = self.idx_to_movie[movie_idx]

            if exclude_watched and movie_id in watched_set:
                continue

            recommendations.append((movie_id, float(score)))

            if len(recommendations) == k:
                break

        return recommendations

In [20]:
faiss_recommender = FaissRecommender(model, device, movie_to_idx, idx_to_movie)
faiss_recommender.build_index()  

In [44]:
k = 5 
user_history = [20, 31]
recommendations = faiss_recommender.get_recommendations(user_history, k = k)
titles = movies[movies['movieId'].isin(user_history)]['title'].tolist()

print(f"History: {titles}:")
for i, (movie_id, score) in enumerate(recommendations, 1):
    print(f"{i}. Film: {movies['title'][movies['movieId'] == movie_id].item()} (score: {score:.4f})")

History: ['Money Train (1995)', 'Dangerous Minds (1995)']:
1. Film: Akaton mies (1983) (score: 0.9087)
2. Film: The Black Shield Of Falworth (1954) (score: 0.9255)
3. Film: Baadasssss! (How to Get the Man's Foot Outta Your Ass) (2003) (score: 0.9289)
4. Film: For a Cop's Hide (1981) (score: 0.9362)
5. Film: Bullets or Ballots (1936) (score: 0.9477)
