In [46]:
DATA_DIRECTORY = "./dataset/yelp2018"
MAX_INTERACTIONS = 100_000
MAX_USERS = 6000
MAX_ITEMS = 20000
NEGATIVE_SAMPLING_RATIO = 6

SEED = 6969
TRAIN_SIZE = 0.9
VALIDATION_SIZE = 0.1
BATCH_SIZE = 1024
NUM_LATENT_FACTOR = 8
LEARNING_RATE = 0.001
WEIGHT_DACAY = 0.00001
PROJECT_NAME = f'RecSys-NeuMF-Yelp2018-Latent{NUM_LATENT_FACTOR}-NegRatio{NEGATIVE_SAMPLING_RATIO}-with-Metrics'

import random
import pandas as pd
import wandb

import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.loggers import WandbLogger
from metrics import MetronAtK

# NeuMF Model

In [47]:
class NCFModel(pl.LightningModule):
    def __init__(self, num_users, num_items, latent_factors, num_mlp_layers=[16, 64, 32, 16, 8], top_k=10):
        super().__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.top_k = top_k
        
        # GMF
        ## GMF USER EMBEDDING
        self.gmf_user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=latent_factors)
        nn.init.normal_(self.gmf_user_embedding.weight, mean=0, std=0.01)
        self.gmf_user_embedding.weight = nn.Parameter(self.gmf_user_embedding.weight, requires_grad=True)
        ## GMF ITEM EMBEDDING
        self.gmf_item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=latent_factors)
        nn.init.normal_(self.gmf_item_embedding.weight, mean=0, std=0.01)
        self.gmf_item_embedding.weight = nn.Parameter(self.gmf_item_embedding.weight, requires_grad=True)
        
        # MLP
        ## MLP USER EMBEDDING
        self.mlp_user_embedding = nn.Embedding(num_embeddings=num_users, embedding_dim=latent_factors)
        ## MLP ITEM EMBEDDING
        self.mlp_item_embedding = nn.Embedding(num_embeddings=num_items, embedding_dim=latent_factors)
        ## MLP LAYERS
        mlp_layers = []
        input_dim = num_mlp_layers[0]  # Concatenated embedding dimension
        for idx in range(1, len(num_mlp_layers)):
            mlp_layers.append(
                nn.Linear(input_dim, num_mlp_layers[idx])
            )
            mlp_layers.append(nn.ReLU())
            input_dim = num_mlp_layers[idx]
        self.mlp = nn.Sequential(*mlp_layers)
            
        # NeuMF
        self.neumf_layer = nn.Linear(in_features=num_mlp_layers[-1] + latent_factors, out_features=1)
        self.neumf_sigmoid = nn.Sigmoid()
        
        # Weight Initialize
        for sm in self.modules():
            if isinstance(sm, (nn.Embedding, nn.Linear)):
                nn.init.normal_(sm.weight.data, 0.0, 0.01)
                
    def forward(self, user_input, item_input):
        # GMF
        ## GMF EMBEDDING
        gmf_user_embedding = self.gmf_user_embedding(user_input)
        gmf_item_embedding = self.gmf_item_embedding(item_input)
        ## GMF ELEMENT-WISE PRODUCT
        gmf_element_wise_product = torch.mul(gmf_user_embedding, gmf_item_embedding)
        
        # MLP
        ## MLP EMBEDDING
        mlp_user_embedding = self.mlp_user_embedding(user_input)
        mlp_item_embedding = self.mlp_item_embedding(item_input)
        mlp_concat_embedding = torch.cat([mlp_user_embedding, mlp_item_embedding], dim=-1)

        ## MLP FORWARD
        mlp_vector = mlp_concat_embedding
        for idx, _ in enumerate(range(len(self.mlp))):
            mlp_vector = self.mlp[idx](mlp_vector)
            
        # NeuMF
        ## CONCAT GMF & MLP
        neumf_concat_embedding = torch.cat([gmf_element_wise_product, mlp_vector], dim=-1)
        prediction = self.neumf_sigmoid(self.neumf_layer(neumf_concat_embedding))
        
        return prediction

    def configure_optimizers(self):
        optimizer = optim.Adam(self.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DACAY)
        return optimizer
    
    def training_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predictions = self.forward(user_input, item_input).squeeze()
        loss = nn.BCELoss()(predictions, labels.float())
        self.log("train_loss", loss)
        return loss

    def ndcg_at_k(self, predictions, labels, k):
        """
        Compute NDCG@K metric.
        Args:
            predictions (torch.Tensor): Predicted scores.
            labels (torch.Tensor): Ground truth labels (binary).
            k (int): Top K items to consider.
        Returns:
            float: NDCG@K value.
        """
        _, indices = torch.topk(predictions, k, largest=True, sorted=True)
        top_k_labels = labels[indices]
        dcg = torch.sum((2 ** top_k_labels - 1) / torch.log2(torch.arange(2, k + 2).float()))
        ideal_dcg = torch.sum((2 ** torch.sort(labels, descending=True).values[:k] - 1) / torch.log2(torch.arange(2, k + 2).float()))
        return (dcg / ideal_dcg) if ideal_dcg > 0 else 0.0


    def hit_ratio_at_k(self, predictions, labels, k):
        """
        Compute HR@K metric.
        Args:
            predictions (torch.Tensor): Predicted scores.
            labels (torch.Tensor): Ground truth labels (binary).
            k (int): Top K items to consider.
        Returns:
            float: HR@K value.
        """
        _, indices = torch.topk(predictions, k, largest=True, sorted=True)
        top_k_labels = labels[indices]
        return torch.sum(top_k_labels).item() > 0

    def validation_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predictions = self.forward(user_input, item_input).squeeze()
        loss = nn.BCELoss()(predictions, labels.float())
        
        # Calculate metrics using self.top_k
        ndcg = self.ndcg_at_k(predictions, labels, self.top_k)
        hr = self.hit_ratio_at_k(predictions, labels, self.top_k)
        
        # Log metrics
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_ndcg", ndcg, prog_bar=True)
        self.log("val_hr", hr, prog_bar=True)
        
        return {"loss": loss, "ndcg": ndcg, "hr": hr}

    def test_step(self, batch, batch_idx):
        user_input, item_input, labels = batch
        predictions = self.forward(user_input, item_input).squeeze()
        loss = nn.BCELoss()(predictions, labels.float())
        self.log("test_loss", loss)
        return loss



# Data

In [48]:
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        return sample[0], sample[1], sample[2]

class Yelp2018DataModule(pl.LightningDataModule):
    def __init__(self, data_dir=DATA_DIRECTORY, batch_size=BATCH_SIZE):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        
    def process_data(self, file_path):
        with open(file_path, 'r') as file:
            data = file.readlines()
            
            user_item_dict = {}
            interaction_count = 0
            original_user_ids = []
            original_item_ids = set()
            
            for user_id in range(MAX_USERS):
                tokens = list(map(int, data[user_id].strip().split()))
                item_ids = tokens[1:]
                if len(item_ids) < 20 or max(item_ids) > MAX_ITEMS:
                    continue
                else:
                    original_user_ids.append(user_id)
                    user_item_dict[user_id] = item_ids
                    interaction_count += len(item_ids)
                    original_item_ids.update(item_ids)
            
            # Re-index user and item IDs
            user_id_map = {original_id: new_id for new_id, original_id in enumerate(original_user_ids)}
            item_id_map = {original_id: new_id for new_id, original_id in enumerate(sorted(original_item_ids))}
            
            self.num_users = len(user_id_map)
            self.num_items = len(item_id_map)
            print(f'Read data from {file_path}')
            print(f'Filtered {self.num_users:,d} unique users & {self.num_items:,d} unique items.')
            print(f'Interaction count: {interaction_count:,d} records.\n')
            
            data_processed = []
            for original_user_id, item_ids in user_item_dict.items():
                reindexed_user_id = user_id_map[original_user_id]
                reindexed_item_ids = [item_id_map[item_id] for item_id in item_ids]
                # POSITIVE
                for reindexed_item_id in reindexed_item_ids:
                    data_processed.append([reindexed_user_id, reindexed_item_id, 1])
                # NEGATIVE
                all_items = set(range(len(item_id_map)))
                non_interacted_items = all_items - set(reindexed_item_ids)
                negative_samples = random.sample(list(non_interacted_items), len(item_ids) * NEGATIVE_SAMPLING_RATIO)
                for reindexed_item_id in negative_samples:
                    data_processed.append([reindexed_user_id, reindexed_item_id, 0])
            return data_processed
    
    def setup(self, stage=None):
        self.test_data = self.process_data(f"{self.data_dir}/test.txt")
        full_data = self.process_data(f"{self.data_dir}/train.txt")
        self.train_data, self.val_data = random_split(full_data, [TRAIN_SIZE, VALIDATION_SIZE])
        
        self.train_dataset = CustomDataset(self.train_data)
        self.val_dataset = CustomDataset(self.val_data)
        self.test_dataset = CustomDataset(self.test_data)
          
    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)

# Main

In [49]:
# data_module = Yelp2018DataModule()
# data_module.setup()

# train_loader = data_module.train_dataloader()
# for batch in train_loader:
#     print("Batch sample:", batch)
#     break  # Only view the first batch

In [None]:
# Callbacks
checkpoint_callback = ModelCheckpoint(monitor="val_loss")
#early_stopping_callback = EarlyStopping(monitor="val_loss", patience=5, mode="min")

# Data and Model
datamodule = Yelp2018DataModule()
datamodule.setup()
model = NCFModel(num_users=datamodule.num_users, num_items=datamodule.num_items, latent_factors=NUM_LATENT_FACTOR)
# print(model)

# # Trainer
# logger = TensorBoardLogger("logs", name="ncf")

# initialise the wandb logger and name your wandb project
wandb_logger = WandbLogger(project=PROJECT_NAME)
wandb.init()
# add your batch size to the wandb config
wandb_logger.experiment.config["batch_size"] = BATCH_SIZE
trainer = Trainer(
    fast_dev_run=False,
    logger=wandb_logger,
    max_epochs=200,
    callbacks=[checkpoint_callback],
    log_every_n_steps=50,
)
trainer.fit(model, datamodule=datamodule)
wandb.finish()

Read data from ./dataset/yelp2018/test.txt
Filtered 48 unique users & 1,031 unique items.
Interaction count: 1,223 records.

Read data from ./dataset/yelp2018/train.txt
Filtered 1,970 unique users & 19,643 unique items.
Interaction count: 102,391 records.

                                                                           

  lambda data: self._console_raw_callback("stdout", data),


c:\ProgramData\anaconda3\envs\PytorchLightning_ENV\Lib\site-packages\pytorch_lightning\loggers\wandb.py:396: There is a wandb run already in progress and newly created instances of `WandbLogger` will reuse this run. If this is not desired, call `wandb.finish()` before instantiating `WandbLogger`.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


Read data from ./dataset/yelp2018/test.txt
Filtered 48 unique users & 1,031 unique items.
Interaction count: 1,223 records.

Read data from ./dataset/yelp2018/train.txt
Filtered 1,970 unique users & 19,643 unique items.
Interaction count: 102,391 records.




  | Name               | Type       | Params | Mode 
----------------------------------------------------------
0 | gmf_user_embedding | Embedding  | 15.8 K | train
1 | gmf_item_embedding | Embedding  | 157 K  | train
2 | mlp_user_embedding | Embedding  | 15.8 K | train
3 | mlp_item_embedding | Embedding  | 157 K  | train
4 | mlp                | Sequential | 3.8 K  | train
5 | neumf_layer        | Linear     | 17     | train
6 | neumf_sigmoid      | Sigmoid    | 0      | train
----------------------------------------------------------
349 K     Trainable params
0         Non-trainable params
349 K     Total params
1.399     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode


                                                                            

c:\ProgramData\anaconda3\envs\PytorchLightning_ENV\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\ProgramData\anaconda3\envs\PytorchLightning_ENV\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Epoch 5:  77%|███████▋  | 482/630 [00:28<00:08, 16.93it/s, v_num=lpap, val_loss=0.319, val_ndcg=0.872, val_hr=1.000]