# Preprocessing Script

In [None]:
import json
from collections import defaultdict

# Preprocessing: turn the raw review JSONL into a "<user_id> <item_id>" text
# file with dense 1-based integer ids, one interaction per line, ordered by
# timestamp within each user.
dataset_name = 'Beauty1'
input_path = '/kaggle/input/amazondataset/All_Beauty.jsonl'
output_path = f'{dataset_name}.txt'

# Users and items with fewer raw reviews than this are dropped. The counts are
# taken once over the unfiltered file (this is not an iterative k-core filter).
min_interactions = 5

countU = defaultdict(int)   # raw review count per user_id
countP = defaultdict(int)   # raw review count per parent_asin
User = defaultdict(list)    # dense user id -> [(timestamp, dense item id), ...]
usermap = {}                # raw user_id -> dense user id
itemmap = {}                # raw parent_asin -> dense item id
usernum = 0
itemnum = 0

# Pass 1: count reviews per user and per item.
# The encoding is explicit so decoding does not depend on the platform locale.
with open(input_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would make json.loads raise.
            continue
        record = json.loads(line)
        countU[record['user_id']] += 1
        countP[record['parent_asin']] += 1

# Pass 2: keep interactions whose user and item both pass the threshold and
# assign dense ids in order of first appearance.
with open(input_path, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            continue
        record = json.loads(line)
        rev, asin, time = record['user_id'], record['parent_asin'], record['timestamp']

        if countU[rev] < min_interactions or countP[asin] < min_interactions:
            continue

        if rev not in usermap:
            usernum += 1
            usermap[rev] = usernum

        if asin not in itemmap:
            itemnum += 1
            itemmap[asin] = itemnum

        User[usermap[rev]].append((time, itemmap[asin]))

# Order each user's interactions by (timestamp, item id).
for userid in User:
    User[userid].sort()

with open(output_path, 'w', encoding='utf-8') as f:
    for userid, interactions in User.items():
        for _, itemid in interactions:
            f.write(f'{userid} {itemid}\n')

print(f"Processed {usernum} users and {itemnum} items. Data saved to {output_path}.")


Processed 1603 users and 4090 items. Data saved to Beauty1.txt.


# LLM-Enhanced SASRec using PyTorch

In [None]:
import sys
import copy
import torch
import random
import numpy as np
from collections import defaultdict
from transformers import AutoTokenizer
from multiprocessing import Process, Queue

def get_device():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if torch.cuda.is_available():
        return torch.device("cuda")
    return torch.device("cpu")

# Module-level default device, resolved once when this cell runs.
device = get_device()

def build_index(dataset_name):
    """Build user->items and item->users adjacency lists from an interaction file.

    Args:
        dataset_name: Path to a whitespace-separated text file holding one
            integer ``user_id item_id`` pair per line.

    Returns:
        ``(u2i_index, i2u_index)``. ``u2i_index[u]`` lists the items of user
        ``u`` and ``i2u_index[i]`` lists the users of item ``i``, both in file
        order. Index 0 of each list is present and left empty when ids start at 1.

    Raises:
        ValueError: If the file contains no interactions.
    """
    # ndmin=2 keeps a one-line file as shape (1, 2); without it loadtxt returns
    # a 1-D array and the column indexing below raises IndexError.
    data = np.loadtxt(dataset_name, dtype=np.int32, ndmin=2)
    if data.size == 0:
        raise ValueError(f"No interactions found in {dataset_name!r}")

    usernum, itemnum = int(data[:, 0].max()), int(data[:, 1].max())

    u2i_index = [[] for _ in range(usernum + 1)]
    i2u_index = [[] for _ in range(itemnum + 1)]

    for u, i in data:
        u2i_index[u].append(i)
        i2u_index[i].append(u)

    return u2i_index, i2u_index

def random_neq(l, r, s):
    """Draw a random integer from ``[l, r)`` that is not contained in ``s``.

    Returns 0 when every integer in the range is excluded.
    """
    candidates = np.arange(l, r)
    excluded = list(s)
    pool = np.setdiff1d(candidates, excluded)
    if len(pool) == 0:
        return 0
    return np.random.choice(pool)

class WarpSampler:
    """Builds next-item training batches (input sequence, positive, negative).

    ``next_batch()`` assembles batches synchronously in the calling process.
    When ``n_workers > 1`` background processes running ``_sample_worker`` are
    also started; they push single examples onto ``self.queue``.

    NOTE(review): nothing in this class reads ``self.queue``, so the workers do
    not speed up ``next_batch()``. Confirm whether consuming the queue was
    intended before relying on ``n_workers``.

    Call ``close()`` when finished so worker processes are stopped.
    """

    def __init__(self, User, usernum, itemnum, batch_size=64, maxlen=10, n_workers=1):
        """
        Args:
            User: Mapping (or list) from user id to that user's item-id history.
            usernum: Largest user id; ids ``1..usernum`` are sampled.
            itemnum: Largest item id; negatives are drawn from ``1..itemnum``.
            batch_size: Number of examples per batch.
            maxlen: Length of every (left-padded) sequence.
            n_workers: When greater than 1, that many worker processes are started.
        """
        self.User = User
        self.usernum = usernum
        self.itemnum = itemnum
        self.batch_size = batch_size
        self.maxlen = maxlen
        self.uids = np.arange(1, usernum + 1, dtype=np.int32)
        self.counter = 0

        self.tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-mini')
        self.tokenizer.add_special_tokens({'additional_special_tokens': [f'[ITEM_{i}]' for i in range(1, itemnum + 1)]})

        # Buffers reused by next_batch(); it clears them at the start of each call.
        self.seq_buf = np.zeros((batch_size, maxlen), dtype=np.int32)
        self.pos_buf = np.zeros((batch_size, maxlen), dtype=np.int32)
        self.neg_buf = np.zeros((batch_size, maxlen), dtype=np.int32)
        self.bert_input_ids = np.zeros((batch_size, maxlen), dtype=np.int32)
        self.bert_attention_mask = np.zeros((batch_size, maxlen), dtype=np.int32)

        # Always defined so close() works whatever n_workers was.
        self.queue = None
        self.processes = []
        if n_workers > 1:
            self.queue = Queue(maxsize=n_workers*2)
            # daemon=True: a forgotten close() must not keep the interpreter alive.
            self.processes = [
                Process(target=self._sample_worker, daemon=True)
                for _ in range(n_workers)
            ]
            for p in self.processes:
                p.start()

    def _fill_example(self, hist, seq, pos, neg):
        """Write one right-aligned training example for ``hist`` into the rows.

        ``seq`` receives ``hist[:-1]``, ``pos`` the item following each input
        position and ``neg`` a random item not in ``hist``. When ``hist`` is
        longer than ``maxlen + 1`` only the most recent items are used.

        Returns:
            The number of positions written (counted from the right edge).
        """
        nxt = hist[-1]
        idx = self.maxlen - 1
        seen = set(hist)
        for item in reversed(hist[:-1]):
            seq[idx] = item
            pos[idx] = nxt
            if nxt != 0:
                neg[idx] = random_neq(1, self.itemnum + 1, seen)
            nxt = item
            idx -= 1
            if idx == -1:
                break
        return self.maxlen - 1 - idx

    def _empty_rows(self):
        """Return three fresh zero-filled rows of length ``maxlen`` (seq, pos, neg)."""
        return (np.zeros([self.maxlen], dtype=np.int32),
                np.zeros([self.maxlen], dtype=np.int32),
                np.zeros([self.maxlen], dtype=np.int32))

    def _sample_worker(self):
        """Worker loop: endlessly push ``(uid, seq, pos, neg)`` tuples onto ``self.queue``."""
        while True:
            uid = self.uids[self.counter % self.usernum]
            self.counter += 1
            if len(self.User[uid]) <= 1:
                continue

            seq, pos, neg = self._empty_rows()
            self._fill_example(self.User[uid], seq, pos, neg)
            self.queue.put((uid, seq, pos, neg))

    def sample(self, uid):
        """Build one example for ``uid``.

        If ``uid`` has at most one interaction, a random user id is drawn
        repeatedly until one with a longer history is found.

        Returns:
            ``(uid, seq, pos, neg)`` where ``uid`` is the user actually used.
        """
        while len(self.User[uid]) <= 1:
            uid = np.random.randint(1, self.usernum + 1)

        seq, pos, neg = self._empty_rows()
        self._fill_example(self.User[uid], seq, pos, neg)
        return uid, seq, pos, neg

    def next_batch(self):
        """Assemble the next batch of ``batch_size`` examples.

        Users are visited in a shuffled order that is re-shuffled at the start
        of every pass over ``self.uids``; users with at most one interaction
        are skipped.

        Returns:
            dict of arrays: ``'uids'`` with shape ``(batch_size,)`` holds the
            users of this batch; ``'seq'``, ``'pos'``, ``'neg'``,
            ``'bert_input_ids'`` and ``'bert_attention_mask'`` have shape
            ``(batch_size, maxlen)``. ``'bert_input_ids'`` carries the same raw
            item ids as ``'seq'``.

        Raises:
            ValueError: If no user has more than one interaction.
        """
        if self.usernum < 1:
            raise ValueError("WarpSampler needs at least one user")

        # Clear rows left over from the previous batch; otherwise a short
        # history inherits stale items from whoever used the row before.
        for buf in (self.seq_buf, self.pos_buf, self.neg_buf,
                    self.bert_input_ids, self.bert_attention_mask):
            buf.fill(0)
        batch_uids = np.zeros(self.batch_size, dtype=np.int32)

        filled = 0
        misses = 0
        while filled < self.batch_size:
            cursor = self.counter % self.usernum
            if cursor == 0:
                # Start of a new pass over all users.
                np.random.shuffle(self.uids)
            uid = self.uids[cursor]
            self.counter += 1

            hist = self.User[uid]
            if len(hist) <= 1:
                misses += 1
                # 2 * usernum consecutive misses always span one complete pass,
                # so no user can be sampled at all.
                if misses >= 2 * self.usernum:
                    raise ValueError("no user has more than one interaction to sample from")
                continue
            misses = 0

            written = self._fill_example(hist, self.seq_buf[filled],
                                         self.pos_buf[filled], self.neg_buf[filled])
            self.bert_input_ids[filled] = self.seq_buf[filled]
            self.bert_attention_mask[filled, self.maxlen - written:] = 1
            batch_uids[filled] = uid
            filled += 1

        return {
            'uids': batch_uids,
            'seq': self.seq_buf.copy(),
            'pos': self.pos_buf.copy(),
            'neg': self.neg_buf.copy(),
            'bert_input_ids': self.bert_input_ids.copy(),
            'bert_attention_mask': self.bert_attention_mask.copy()
        }

    def close(self):
        """Stop worker processes and release the queue. Safe to call repeatedly."""
        for p in self.processes:
            if p.is_alive():
                p.terminate()
            p.join()
        self.processes = []
        if self.queue is not None:
            self.queue.close()
            self.queue = None

def data_partition(fname):
    """Load interactions and split each user's history leave-one-out.

    Args:
        fname: Path to a whitespace-separated text file holding one integer
            ``user_id item_id`` pair per line. The order of a user's lines in
            the file is taken as that user's interaction order.

    Returns:
        ``[user_train, user_valid, user_test, usernum, itemnum]``. For a user
        with at least three interactions, the last one goes to ``user_test``,
        the second to last to ``user_valid`` and the rest to ``user_train``.
        Shorter histories go entirely to ``user_train`` with empty valid/test
        lists. ``usernum`` and ``itemnum`` are the largest ids seen.

    Raises:
        ValueError: If the file contains no interactions.
    """
    # ndmin=2 keeps a one-line file two-dimensional.
    data = np.loadtxt(fname, dtype=np.int32, ndmin=2)
    if data.size == 0:
        raise ValueError(f"No interactions found in {fname!r}")

    usernum, itemnum = int(data[:, 0].max()), int(data[:, 1].max())
    User = defaultdict(list)

    # The sort must be stable: NumPy's default sort kind gives no guarantee
    # about the relative order of equal keys, so it could scramble the order of
    # a user's interactions and corrupt the "last item" split below.
    data = data[data[:, 0].argsort(kind='stable')]

    for u, i in data:
        User[u].append(i)

    user_train, user_valid, user_test = {}, {}, {}
    for user in User:
        nfeedback = len(User[user])
        if nfeedback < 3:
            user_train[user] = User[user]
            user_valid[user] = []
            user_test[user] = []
        else:
            user_train[user] = User[user][:-2]
            user_valid[user] = [User[user][-2]]
            user_test[user] = [User[user][-1]]

    return [user_train, user_valid, user_test, usernum, itemnum]

@torch.no_grad()
def evaluate_bert(model, dataset, args):
    """Compute NDCG@10 and HR@10 of the held-out item against 100 sampled negatives.

    Args:
        model: Object exposing ``eval()`` and
            ``predict(user_ids, seq_input, item_indices)``; ``predict`` is
            given a dict with ``'input_ids'`` and ``'attention_mask'``.
        dataset: ``[train, valid, test, usernum, itemnum]``. The target of each
            user is ``test[u][0]``; ``valid`` is not read here.
        args: Provides ``maxlen`` and ``device``.

    Returns:
        ``(NDCG@10, HR@10)`` averaged over the evaluated users (at most 5000
        randomly chosen users), or ``(0.0, 0.0)`` when none can be evaluated.
    """
    # Nothing below mutates the dataset, so no copy is needed.
    train, valid, test, usernum, itemnum = dataset
    model.eval()

    users = random.sample(range(1, usernum + 1), min(5000, usernum))
    all_items = np.arange(1, itemnum + 1)
    NDCG, HT = 0.0, 0.0
    valid_user = 0

    for u in users:
        if len(train[u]) < 1 or len(test[u]) < 1:
            continue

        # Right-aligned, zero-padded window of the most recent items.
        # These must be raw item ids: model.predict() hands 'input_ids' to the
        # model's item embedding, so tokenizer vocabulary ids would be looked
        # up as the wrong items.
        recent = train[u][-args.maxlen:]
        seq = np.zeros(args.maxlen, dtype=np.int64)
        attention_mask = np.zeros(args.maxlen, dtype=np.int64)
        seq[args.maxlen - len(recent):] = recent
        attention_mask[args.maxlen - len(recent):] = 1

        target = test[u][0]
        rated = set(train[u])
        rated.add(0)
        rated.add(target)  # never draw the target as its own negative
        unrated = np.setdiff1d(all_items, list(rated))
        if unrated.size == 0:
            # No candidate negatives exist for this user.
            continue
        neg_samples = np.random.choice(unrated, size=100, replace=len(unrated) < 100)
        item_idx = np.concatenate([[target], neg_samples])

        bert_inputs = {
            'input_ids': torch.from_numpy(seq).unsqueeze(0).to(args.device),
            'attention_mask': torch.from_numpy(attention_mask).unsqueeze(0).to(args.device)
        }

        # Negate the scores so that "smaller is better"; index 0 is the target.
        predictions = -model.predict(
            torch.tensor([u], dtype=torch.long).to(args.device),
            bert_inputs,
            torch.tensor(item_idx, dtype=torch.long).to(args.device)
        ).squeeze()

        # Rank = number of candidates scored strictly better than the target.
        rank = (predictions < predictions[0]).sum().item()
        valid_user += 1
        if rank < 10:
            NDCG += 1 / np.log2(rank + 2)
            HT += 1

    if valid_user == 0:
        return 0.0, 0.0
    return NDCG / valid_user, HT / valid_user

def evaluate_valid(model, dataset, args):
    """Score the model on each user's validation item.

    Args:
        model: Model to evaluate; must expose ``parameters()``.
        dataset: ``[train, valid, test, usernum, itemnum]``.
        args: Passed through to ``evaluate_bert``.

    Returns:
        Whatever ``evaluate_bert`` returns (NDCG@10, HR@10).
    """
    print(f"Evaluation running on device: {next(model.parameters()).device}")
    train, valid, test, usernum, itemnum = dataset[0], dataset[1], dataset[2], dataset[3], dataset[4]
    # evaluate_bert ranks the item stored in the third slot of the dataset.
    # Forwarding the dataset unchanged therefore scores the *test* item, and
    # choosing checkpoints on that number leaks test data. Swapping the two
    # held-out slots makes it rank the validation item instead.
    return evaluate_bert(model, [train, test, valid, usernum, itemnum], args)

In [None]:
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoModel, AutoTokenizer

class PointWiseFeedForward(nn.Module):
    """Position-wise feed-forward block with a residual connection and LayerNorm.

    Two kernel-size-1 convolutions are applied along the last (feature) axis of
    the input; the result is added back to the input and layer-normalised.
    """

    def __init__(self, hidden_units, dropout_rate):
        """
        Args:
            hidden_units: Size of the feature axis (input and output).
            dropout_rate: Dropout probability used after each convolution.
        """
        super().__init__()
        self.conv1 = nn.Conv1d(hidden_units, hidden_units, kernel_size=1)
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.conv2 = nn.Conv1d(hidden_units, hidden_units, kernel_size=1)
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.layer_norm = nn.LayerNorm(hidden_units, eps=1e-8)

    def forward(self, inputs):
        """Return ``layer_norm(inputs + ffn(inputs))`` with the same shape as ``inputs``."""
        # Conv1d convolves over the last axis, so features are moved to the
        # second-to-last axis for the convolutions and moved back afterwards.
        hidden = inputs.transpose(-1, -2)
        hidden = self.conv1(hidden)
        hidden = self.dropout1(hidden)
        hidden = F.gelu(hidden)
        hidden = self.conv2(hidden)
        hidden = self.dropout2(hidden)
        restored = hidden.transpose(-1, -2)
        return self.layer_norm(restored + inputs)

class SASRecWithBERT(nn.Module):
    """Self-attentive sequential recommender fused with frozen BERT features.

    Every item ``i`` is registered with the BERT tokenizer as the special token
    ``[ITEM_i]``. For each position of a left-padded item sequence, the learned
    item embedding, a projection of the frozen BERT output for that position's
    token, and a learned position embedding are summed and passed through
    ``args.num_blocks`` causal self-attention blocks.
    """

    def __init__(self, num_users, num_items, args):
        """
        Args:
            num_users: Largest user id (stored as ``num_users + 1``; not used otherwise).
            num_items: Largest item id; id 0 is reserved for padding.
            args: Provides ``hidden_units``, ``maxlen``, ``device``,
                ``num_blocks``, ``num_heads`` and ``dropout_rate``.
        """
        super().__init__()
        self.num_items = num_items + 1
        self.num_users = num_users + 1
        self.hidden_units = args.hidden_units
        self.maxlen = args.maxlen
        self.device = args.device

        self.bert_model = AutoModel.from_pretrained('prajjwal1/bert-mini')
        self.bert_tokenizer = AutoTokenizer.from_pretrained('prajjwal1/bert-mini')
        item_tokens = [f'[ITEM_{i}]' for i in range(1, num_items + 1)]
        self.bert_tokenizer.add_special_tokens({'additional_special_tokens': item_tokens})
        self.bert_model.resize_token_embeddings(len(self.bert_tokenizer))

        for param in self.bert_model.parameters():
            param.requires_grad = False

        # item id -> BERT token id lookup (padding item 0 -> BERT pad token).
        # With it, BERT can be fed one token per sequence position, which keeps
        # its output aligned with the item sequence. Non-persistent so the
        # state_dict layout stays unchanged.
        token_ids = self.bert_tokenizer.convert_tokens_to_ids(item_tokens)
        lookup = torch.tensor([self.bert_tokenizer.pad_token_id] + list(token_ids), dtype=torch.long)
        self.register_buffer('item_token_ids', lookup, persistent=False)

        self.item_emb = nn.Embedding(self.num_items, args.hidden_units, padding_idx=0)
        self.pos_emb = nn.Embedding(args.maxlen, args.hidden_units)
        # Input width is read from the BERT config rather than hard-coded.
        self.bert_proj = nn.Linear(self.bert_model.config.hidden_size, args.hidden_units)

        self.attention_layernorms = nn.ModuleList()
        self.attention_layers = nn.ModuleList()
        self.forward_layernorms = nn.ModuleList()
        self.forward_layers = nn.ModuleList()

        for _ in range(args.num_blocks):
            self.attention_layernorms.append(nn.LayerNorm(args.hidden_units))
            self.attention_layers.append(nn.MultiheadAttention(args.hidden_units, args.num_heads, args.dropout_rate))
            self.forward_layernorms.append(nn.LayerNorm(args.hidden_units))
            self.forward_layers.append(nn.Sequential(
                nn.Linear(args.hidden_units, args.hidden_units * 4),
                nn.GELU(),
                nn.Linear(args.hidden_units * 4, args.hidden_units),
                nn.Dropout(args.dropout_rate)
            ))

        self.dropout = nn.Dropout(args.dropout_rate)
        self.final_layer = nn.Linear(args.hidden_units, args.hidden_units)
        self.to(self.device)
        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise the item embedding, BERT projection and FFN linears."""
        nn.init.xavier_uniform_(self.item_emb.weight)
        # xavier_uniform_ also fills the padding row; restore it to zero so the
        # padding item keeps a zero embedding.
        with torch.no_grad():
            self.item_emb.weight[0].zero_()
        nn.init.xavier_uniform_(self.bert_proj.weight)
        for layer in self.forward_layers:
            for module in layer:
                if isinstance(module, nn.Linear):
                    nn.init.xavier_uniform_(module.weight)

    def log2feats(self, log_seqs):
        """Encode item-id sequences into per-position feature vectors.

        Args:
            log_seqs: Integer tensor of shape ``(batch, length)`` holding item
                ids, with 0 marking padding. Ids outside ``[0, num_items]``
                are clamped into range.

        Returns:
            Float tensor of shape ``(batch, length, hidden_units)``. The
            feature at position ``t`` depends only on items at positions
            ``<= t``.
        """
        log_seqs = log_seqs.to(self.device).long()
        # Out-of-range ids are clamped (not rejected), as callers may rely on it.
        log_seqs = torch.clamp(log_seqs, 0, self.num_items - 1)

        seq_len = log_seqs.size(1)
        timeline_mask = log_seqs == 0               # True at padding positions
        keep = (~timeline_mask).unsqueeze(-1)       # zeroes padded positions

        seqs = self.item_emb(log_seqs) * math.sqrt(self.hidden_units)

        # Frozen BERT features, one token per position so they line up with seqs.
        bert_mask = (~timeline_mask).long()
        # A row made only of padding would leave BERT nothing to attend to;
        # its output is zeroed below anyway, so let it attend everywhere.
        bert_mask[bert_mask.sum(dim=1) == 0] = 1
        # BERT is a frozen feature extractor: keep its dropout off even while
        # the surrounding model is in training mode.
        self.bert_model.eval()
        with torch.no_grad():
            bert_out = self.bert_model(
                input_ids=self.item_token_ids[log_seqs],
                attention_mask=bert_mask
            ).last_hidden_state
        seqs = seqs + self.bert_proj(bert_out)

        positions = torch.arange(seq_len, dtype=torch.long, device=self.device).unsqueeze(0)
        seqs = seqs + self.pos_emb(positions)
        seqs = self.dropout(seqs)
        seqs = seqs * keep

        # Attention mask (True = blocked): a query may not look at later
        # positions (they contain its training label) nor at padded keys.
        # The diagonal is always left open so no query row is fully masked,
        # which would produce NaNs in the softmax.
        future = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=self.device), diagonal=1
        )
        blocked = future.unsqueeze(0) | timeline_mask.unsqueeze(1)
        blocked = blocked & ~torch.eye(seq_len, dtype=torch.bool, device=self.device)
        num_heads = self.attention_layers[0].num_heads if len(self.attention_layers) else 1
        # MultiheadAttention expects a 3-D mask of shape (batch * heads, L, S).
        attn_mask = blocked.repeat_interleave(num_heads, dim=0)

        for i in range(len(self.attention_layers)):
            Q = self.attention_layernorms[i](seqs)
            q_t = Q.transpose(0, 1)  # the attention layers are sequence-first
            mha_outputs, _ = self.attention_layers[i](q_t, q_t, q_t, attn_mask=attn_mask)
            seqs = Q + mha_outputs.transpose(0, 1)
            seqs = self.forward_layernorms[i](seqs)
            seqs = seqs + self.forward_layers[i](seqs)
            seqs = seqs * keep

        return self.final_layer(seqs)

    def forward(self, user_ids, log_seqs, pos_seqs, neg_seqs):
        """Return ``(pos_logits, neg_logits)``, each of shape ``(batch, length)``.

        Each logit is the dot product between the sequence feature at a
        position and the embedding of the positive / negative item for that
        position. ``user_ids`` is accepted but not used.
        """
        log_feats = self.log2feats(log_seqs)
        pos_logits = (log_feats * self.item_emb(pos_seqs)).sum(dim=-1)
        neg_logits = (log_feats * self.item_emb(neg_seqs)).sum(dim=-1)
        return pos_logits, neg_logits

    @torch.no_grad()
    def predict(self, user_ids, seq_input, item_indices):
        """Score candidate items against the feature of the last sequence position.

        Args:
            user_ids: Accepted but not used.
            seq_input: Item-id tensor ``(batch, length)``, or a dict whose
                ``'input_ids'`` entry is such a tensor.
            item_indices: Candidate item ids.

        Returns:
            One score per candidate item.
        """
        log_seqs = seq_input if isinstance(seq_input, torch.Tensor) else seq_input['input_ids']
        log_feats = self.log2feats(log_seqs)[:, -1, :]
        item_tensor = torch.as_tensor(item_indices, dtype=torch.long, device=self.device)
        item_embs = self.item_emb(item_tensor)
        return (log_feats.unsqueeze(1) @ item_embs.unsqueeze(-1)).squeeze(-1).squeeze(1)


In [None]:
import os
import logging
import time
import torch
from tqdm import tqdm
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import CosineAnnealingLR

# Resolve the compute device for this cell and print a short banner about it.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if device.type != 'cuda':
    print("\n=== USING CPU ===\n")
else:
    torch.cuda.set_device(0)
    print(
        "\n=== GPU ACTIVATED ===\n"
        f"Device: {torch.cuda.get_device_name(0)}\n"
        f"Memory: Allocated = {torch.cuda.memory_allocated()/1e9:.2f}GB, "
        f"Cached = {torch.cuda.memory_reserved()/1e9:.2f}GB\n"
    )

# Send INFO-and-above records to a stream handler. force=True replaces any
# handlers already installed on the root logger (e.g. by an earlier cell run).
logging.basicConfig(
    handlers=[logging.StreamHandler()],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)
logger = logging.getLogger(__name__)

class Args:
    """Static configuration for the training run (paths, model size, optimisation)."""

    # --- data and output locations ---
    dataset = "/kaggle/input/amazonbeauty/Beauty.txt"
    train_dir = "train_output"

    # --- run mode ---
    inference_only = False
    state_dict_path = None
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

    # --- model shape ---
    maxlen = 200
    hidden_units = 50
    num_blocks = 4
    num_heads = 2
    dropout_rate = 0.2

    # --- optimisation ---
    batch_size = 512
    num_epochs = 50
    lr = 0.001
    weight_decay = 0.01
    l2_emb = 0.0
    warmup_steps = 4000

if __name__ == '__main__':
    # Training entry point: load the data, build the model, train for
    # args.num_epochs epochs and checkpoint whenever validation NDCG@10 improves.

    args = Args()

    os.makedirs(args.train_dir, exist_ok=True)
    logger.info(f"Created output directory at: {os.path.abspath(args.train_dir)}")

    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    # Mixed precision is only meaningful on CUDA; on CPU it is switched off.
    use_amp = args.device.type == 'cuda'
    if use_amp:
        torch.cuda.set_device(0)

    dataset = data_partition(args.dataset)
    user_train, user_valid, user_test, usernum, itemnum = dataset

    model = SASRecWithBERT(usernum, itemnum, args).to(args.device)

    # Only trainable parameters go to the optimizer. The BERT group is selected
    # by module prefix: a plain "'bert' in name" test also matches the freshly
    # initialised `bert_proj` layer and would train it at the tiny BERT rate.
    head_params = [p for n, p in model.named_parameters()
                   if p.requires_grad and not n.startswith('bert_model.')]
    bert_params = [p for n, p in model.named_parameters()
                   if p.requires_grad and n.startswith('bert_model.')]
    param_groups = [{'params': head_params}]
    if bert_params:
        param_groups.append({'params': bert_params, 'lr': 1e-5})
    optimizer = torch.optim.AdamW(param_groups, lr=args.lr, weight_decay=args.weight_decay)

    logger.info(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    logger.info(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    logger.info("Creating sampler...")
    # n_workers=1: next_batch() builds batches in this process and never reads
    # the sampler's worker queue, so extra workers would only be idle forks.
    sampler = WarpSampler(user_train, usernum, itemnum,
                         batch_size=args.batch_size,
                         maxlen=args.maxlen,
                         n_workers=1)

    scheduler = CosineAnnealingLR(optimizer, args.num_epochs)
    scaler = GradScaler(enabled=use_amp)
    criterion = torch.nn.BCEWithLogitsLoss()
    steps_per_epoch = len(user_train) // args.batch_size + 1

    best_ndcg = 0
    try:
        for epoch in range(1, args.num_epochs + 1):
            model.train()
            epoch_loss = 0
            start_time = time.time()

            with tqdm(total=steps_per_epoch,
                     desc=f'Epoch {epoch}/{args.num_epochs}',
                     bar_format='{l_bar}{bar:20}{r_bar}',
                     mininterval=1) as pbar:

                for step in range(steps_per_epoch):
                    batch = sampler.next_batch()
                    u = torch.LongTensor(batch['uids']).to(args.device)
                    seq = torch.LongTensor(batch['seq']).to(args.device)
                    pos = torch.LongTensor(batch['pos']).to(args.device)
                    neg = torch.LongTensor(batch['neg']).to(args.device)

                    # Padding positions (pos == 0) carry no label.
                    labelled = pos != 0

                    with autocast(enabled=use_amp):
                        pos_logits, neg_logits = model(u, seq, pos, neg)
                        loss = criterion(
                            pos_logits[labelled],
                            torch.ones_like(pos_logits[labelled])
                        )
                        loss = loss + criterion(
                            neg_logits[labelled],
                            torch.zeros_like(neg_logits[labelled])
                        )

                    optimizer.zero_grad()
                    scaler.scale(loss).backward()
                    scaler.step(optimizer)
                    scaler.update()

                    epoch_loss += loss.item()
                    pbar.set_postfix({'loss': f'{loss.item():.4f}'})
                    pbar.update(1)

            # Advance the cosine schedule once per epoch (T_max = num_epochs);
            # without this call the learning rate never changes.
            scheduler.step()

            avg_loss = epoch_loss / steps_per_epoch
            logger.info(f"Epoch {epoch} complete - Loss: {avg_loss:.4f} - Time: {time.time()-start_time:.1f}s")

            # Evaluate on the first three epochs, then every fifth epoch.
            if epoch % 5 == 0 or epoch <= 3:
                val_ndcg, val_hr = evaluate_valid(model, dataset, args)
                logger.info(f"Validation - NDCG@10: {val_ndcg:.4f}, HR@10: {val_hr:.4f}")

                if val_ndcg > best_ndcg:
                    best_ndcg = val_ndcg
                    model_path = os.path.join(args.train_dir, f'best_model_epoch{epoch}.pth')
                    torch.save(model.state_dict(), model_path)
                    logger.info(f"Saved best model to: {model_path}")
    finally:
        # Release sampler resources even if training is interrupted. The
        # lookup is guarded so a sampler without close() cannot raise here.
        close_sampler = getattr(sampler, 'close', None)
        if callable(close_sampler):
            close_sampler()


2025-04-07 16:22:22,745 - INFO - Created output directory at: /kaggle/working/train_output
2025-04-07 16:22:22,747 - INFO - Loading dataset...



=== GPU ACTIVATED ===
Device: Tesla T4
Memory: Allocated = 0.20GB, Cached = 1.94GB



2025-04-07 16:22:23,746 - INFO - 
=== SYSTEM CHECK ===
2025-04-07 16:22:23,747 - INFO - PyTorch 2.5.1+cu121, CUDA: True
2025-04-07 16:22:23,747 - INFO - Dataset: /kaggle/input/amazonbeauty/Beauty.txt
2025-04-07 16:22:23,748 - INFO - Users: 52,204, Items: 57,289
2025-04-07 16:22:23,754 - INFO - Train sequences: 293,912
2025-04-07 16:22:23,766 - INFO - Example user items: [45056, 48406, 54945, 47484, 39988]
2025-04-07 16:22:23,767 - INFO - 
Initializing model...
2025-04-07 16:22:42,119 - INFO - Model parameters: 28,849,044
2025-04-07 16:22:42,121 - INFO - Trainable parameters: 3,012,500
2025-04-07 16:22:42,121 - INFO - Creating sampler...
  scaler = GradScaler()
2025-04-07 16:23:00,465 - INFO - 
=== TRAINING START ===
  with autocast():
Epoch 1/50: 100%|████████████████████| 102/102 [06:04<00:00,  3.58s/it, loss=0.5777]
2025-04-07 16:29:05,445 - INFO - Epoch 1 complete - Loss: 0.8176 - Time: 365.0s
2025-04-07 16:29:05,446 - INFO - Running evaluation...


Evaluation running on device: cuda:0


2025-04-07 16:30:11,677 - INFO - Validation - NDCG@10: 0.1314, HR@10: 0.2768
2025-04-07 16:30:12,021 - INFO - Saved best model to: train_output/best_model_epoch1.pth
Epoch 2/50: 100%|████████████████████| 102/102 [06:08<00:00,  3.61s/it, loss=0.3795]
2025-04-07 16:36:20,405 - INFO - Epoch 2 complete - Loss: 0.4569 - Time: 368.4s
2025-04-07 16:36:20,406 - INFO - Running evaluation...


Evaluation running on device: cuda:0


2025-04-07 16:37:26,753 - INFO - Validation - NDCG@10: 0.1540, HR@10: 0.3028
2025-04-07 16:37:27,152 - INFO - Saved best model to: train_output/best_model_epoch2.pth
Epoch 3/50: 100%|████████████████████| 102/102 [06:10<00:00,  3.63s/it, loss=0.3076]
2025-04-07 16:43:37,301 - INFO - Epoch 3 complete - Loss: 0.3436 - Time: 370.1s
2025-04-07 16:43:37,302 - INFO - Running evaluation...


Evaluation running on device: cuda:0


2025-04-07 16:44:43,354 - INFO - Validation - NDCG@10: 0.1861, HR@10: 0.3428
2025-04-07 16:44:43,581 - INFO - Saved best model to: train_output/best_model_epoch3.pth
Epoch 4/50: 100%|████████████████████| 102/102 [06:08<00:00,  3.61s/it, loss=0.2853]
2025-04-07 16:50:51,936 - INFO - Epoch 4 complete - Loss: 0.2905 - Time: 368.4s
Epoch 5/50: 100%|████████████████████| 102/102 [06:05<00:00,  3.58s/it, loss=0.2412]
2025-04-07 16:56:57,306 - INFO - Epoch 5 complete - Loss: 0.2595 - Time: 365.4s
2025-04-07 16:56:57,307 - INFO - Running evaluation...


Evaluation running on device: cuda:0


2025-04-07 16:58:02,300 - INFO - Validation - NDCG@10: 0.1869, HR@10: 0.3378
2025-04-07 16:58:02,528 - INFO - Saved best model to: train_output/best_model_epoch5.pth
Epoch 6/50: 100%|████████████████████| 102/102 [06:05<00:00,  3.59s/it, loss=0.2150]
2025-04-07 17:04:08,452 - INFO - Epoch 6 complete - Loss: 0.2344 - Time: 365.9s
Epoch 7/50: 100%|████████████████████| 102/102 [06:03<00:00,  3.57s/it, loss=0.1999]
2025-04-07 17:10:12,301 - INFO - Epoch 7 complete - Loss: 0.2108 - Time: 363.8s
Epoch 8/50: 100%|████████████████████| 102/102 [06:09<00:00,  3.62s/it, loss=0.2005]
2025-04-07 17:16:21,612 - INFO - Epoch 8 complete - Loss: 0.2002 - Time: 369.3s
Epoch 9/50: 100%|████████████████████| 102/102 [06:05<00:00,  3.58s/it, loss=0.1737]
2025-04-07 17:22:27,024 - INFO - Epoch 9 complete - Loss: 0.1850 - Time: 365.4s
Epoch 10/50: 100%|████████████████████| 102/102 [06:04<00:00,  3.57s/it, loss=0.1606]
2025-04-07 17:28:31,394 - INFO - Epoch 10 complete - Loss: 0.1712 - Time: 364.4s
2025-04

Evaluation running on device: cuda:0


2025-04-07 17:29:36,644 - INFO - Validation - NDCG@10: 0.1549, HR@10: 0.2914
Epoch 11/50:   6%|█▏                  | 6/102 [00:21<05:57,  3.72s/it, loss=0.1604]