In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import matplotlib.ticker as mticker
from matplotlib.ticker import MaxNLocator
from transformers import AutoTokenizer, AutoModel
from pathlib import Path
import os
import sys
import polars as pl # used to read the .parquet files so its important
import numpy as np
import warnings

# Suppress all warnings
warnings.filterwarnings("ignore")

# Add the parent directory (or specify path to 'utils' if it's higher)
dir = os.path.abspath(os.path.join(os.getcwd(), '..', '..')) 
sys.path.append(dir)

from dataloader.NRMSdataloader import NRMSDataLoader
from models.nrms import NRMSModelPytorch
from eval.metricEval import MetricEvaluator, AucScore, MrrScore, NdcgScore

# Now you can import from utils
from utils import add_known_user_column, add_prediction_scores
from utils import get_transformers_word_embeddings, concat_str_columns,convert_text2encoding_with_transformers, create_article_id_to_value_mapping
from utils import get_script_directory, slice_join_dataframes, truncate_history,sampling_strategy_wu2019, create_binary_labels_column

In [2]:
basic_path = get_script_directory()

DEFAULT_IS_BEYOND_ACCURACY_COL = "is_beyond_accuracy"


PATH = Path(basic_path+"/Data")
TRAIN_VAL_SPLIT = f"ebnerd_demo"  # [ebnerd_demo, ebnerd_small, ebnerd_large]
TEST_SPLIT = f"ebnerd_testset"  # "ebnerd_testset", "ebnerd_testset_gt"

#_____________________Training____________________________
# Reads the behaviors file from training data
df_behaviors_train = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "behaviors.parquet")
)
# Reads the history file from training data
df_history_train = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "train", "history.parquet")
)

#_____________________Validation____________________________
# Reads the behaviors file from Validation data
df_behaviors_val = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "behaviors.parquet")
)
# Reads the History file from Validation data
df_history_val = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TRAIN_VAL_SPLIT, "validation", "history.parquet")
)

#_____________________Test____________________________
# Reads the behaviors file from test data
df_behaviors_test = df_behaviors = (
    pl.scan_parquet(PATH.joinpath(TEST_SPLIT, "test", "behaviors.parquet"))
    .filter(~pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))
    .drop(DEFAULT_IS_BEYOND_ACCURACY_COL)
)
# Reads the History file from test data
df_history_test = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "history.parquet")
)

# ?? seem we already read this file ? but now without .drop(DEFAULT_IS_BEYOND_ACCURACY_COL)
df_behaviors_test_ba = df_behaviors = pl.scan_parquet(
    PATH.joinpath(TEST_SPLIT, "test", "behaviors.parquet")
).filter(pl.col(DEFAULT_IS_BEYOND_ACCURACY_COL))

#_____________________Reads Articles ??____________________________
df_articles = pl.scan_parquet(PATH.joinpath(TEST_SPLIT,"articles.parquet")).collect()

PLOT_PATH = Path("plot")

In [3]:

"""
from src.ebrec.utils._constants import (
    DEFAULT_HISTORY_ARTICLE_ID_COL, = f"{"article_id"}_fixed"
    DEFAULT_CLICKED_ARTICLES_COL, = "article_ids_clicked"
    DEFAULT_INVIEW_ARTICLES_COL, = "article_ids_inview"
    DEFAULT_IMPRESSION_ID_COL, = "impression_id"
    DEFAULT_SUBTITLE_COL, = "subtitle"
    DEFAULT_LABELS_COL, = "labels"
    DEFAULT_TITLE_COL, =  "title"
    DEFAULT_USER_COL, = "user_id"
)
"""

'\nfrom src.ebrec.utils._constants import (\n    DEFAULT_HISTORY_ARTICLE_ID_COL, = f"{"article_id"}_fixed"\n    DEFAULT_CLICKED_ARTICLES_COL, = "article_ids_clicked"\n    DEFAULT_INVIEW_ARTICLES_COL, = "article_ids_inview"\n    DEFAULT_IMPRESSION_ID_COL, = "impression_id"\n    DEFAULT_SUBTITLE_COL, = "subtitle"\n    DEFAULT_LABELS_COL, = "labels"\n    DEFAULT_TITLE_COL, =  "title"\n    DEFAULT_USER_COL, = "user_id"\n)\n'

In [3]:
def ebnerd_from_path(path: Path, history_size: int = 30) -> pl.DataFrame:
    """
    Load ebnerd - function
    """
    df_history = (
        pl.scan_parquet(path.joinpath("history.parquet"))
        .select("user_id", "article_id_fixed")
        .pipe(
            truncate_history,
            column="article_id_fixed",
            history_size=history_size,
            padding_value=0,
            enable_warning=False,
        )
    )
    df_behaviors = (
        pl.scan_parquet(path.joinpath("behaviors.parquet"))
        .collect()
        .pipe(
            slice_join_dataframes,
            df2=df_history.collect(),
            on="user_id",
            how="left",
        )
    )
    return df_behaviors

In [4]:
basic_path = get_script_directory()

PATH = Path(basic_path+"/Data")
DATASPLIT = f"ebnerd_demo"  # [ebnerd_demo, ebnerd_small, ebnerd_large]

In [5]:
COLUMNS = [
    "user_id",
    "article_id_fixed",
    "article_ids_inview",
    "article_ids_clicked",
    "impression_id",
]
HISTORY_SIZE = 10
FRACTION = 0.01

df_train = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "train"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(
        sampling_strategy_wu2019,
        npratio=4,
        shuffle=True,
        with_replacement=True,
        seed=123,
    )
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
# =>
df_validation = (
    ebnerd_from_path(PATH.joinpath(DATASPLIT, "validation"), history_size=HISTORY_SIZE)
    .select(COLUMNS)
    .pipe(create_binary_labels_column)
    .sample(fraction=FRACTION)
)
df_train.head(2)

user_id,article_id_fixed,article_ids_inview,article_ids_clicked,impression_id,labels
u32,list[i32],list[i64],list[i64],u32,list[i8]
287386,"[9768764, 9768802, … 9769197]","[9695098, 9776190, … 9775965]",[9775965],564824097,"[0, 0, … 1]"
463733,"[9769575, 9770594, … 9767697]","[9775733, 9775733, … 9775388]",[9775388],393207321,"[0, 0, … 1]"


In [6]:
df_articles = pl.read_parquet(PATH.joinpath(DATASPLIT,"articles.parquet"))
df_articles.head(2)

article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3037230,"""Ishockey-spiller: Jeg troede j…","""ISHOCKEY: Ishockey-spilleren S…",2023-06-29 06:20:57,False,"""Ambitionerne om at komme til U…",2003-08-28 08:55:00,,"""article_default""","""https://ekstrabladet.dk/sport/…",[],[],"[""Kriminalitet"", ""Kendt"", … ""Mindre ulykke""]",142,"[327, 334]","""sport""",,,,0.9752,"""Negative"""
3044020,"""Prins Harry tvunget til dna-te…","""Hoffet tvang Prins Harry til a…",2023-06-29 06:21:16,False,"""Den britiske tabloidavis The S…",2005-06-29 08:47:00,"[3097307, 3097197, 3104927]","""article_default""","""https://ekstrabladet.dk/underh…","[""Harry"", ""James Hewitt""]","[""PER"", ""PER""]","[""Kriminalitet"", ""Kendt"", … ""Personfarlig kriminalitet""]",414,[432],"""underholdning""",,,,0.7084,"""Negative"""


In [7]:
TRANSFORMER_MODEL_NAME = "FacebookAI/xlm-roberta-base"
TEXT_COLUMNS_TO_USE = ["subtitle", "title"]
MAX_TITLE_LENGTH = 30

# LOAD HUGGINGFACE:
transformer_model = AutoModel.from_pretrained(TRANSFORMER_MODEL_NAME)
transformer_tokenizer = AutoTokenizer.from_pretrained(TRANSFORMER_MODEL_NAME)

# We'll init the word embeddings using the
word2vec_embedding = get_transformers_word_embeddings(transformer_model)
#
df_articles, cat_cal = concat_str_columns(df_articles, columns=TEXT_COLUMNS_TO_USE)
df_articles, token_col_title = convert_text2encoding_with_transformers(
    df_articles, transformer_tokenizer, cat_cal, max_length=MAX_TITLE_LENGTH
)
# =>
article_mapping = create_article_id_to_value_mapping(
    df=df_articles, value_col=token_col_title
)

## Dataloader Implementation

In [8]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import tensorflow as tf
from typing import Any

# Assuming `NRMSDataLoader` is the class you provided above

class PyTorchNRMSDataLoader(Dataset):
    def __init__(self, tf_dataloader: NRMSDataLoader):
        """
        Wraps the TensorFlow DataLoader to work with PyTorch.
        
        Args:
        - tf_dataloader: An instance of the NRMSDataLoader.
        """
        self.tf_dataloader = tf_dataloader
    
    def __len__(self):
        # The length of the PyTorch Dataset is the same as the number of batches in TensorFlow's DataLoader
        return len(self.tf_dataloader)
    
    def __getitem__(self, idx: int) -> tuple:
        """
        Fetches a batch of data from the TensorFlow DataLoader and converts it to PyTorch format.
        
        Args:
        - idx: The index of the batch.
        
        Returns:
        - A tuple of (his_input_title, pred_input_title) and batch_y as PyTorch tensors.
        """
        # Get the batch from the TensorFlow DataLoader
        (his_input_title, pred_input_title), batch_y = self.tf_dataloader[idx]
        
        # Convert numpy arrays to PyTorch tensors
        his_input_title = torch.tensor(his_input_title, dtype=torch.float32)
        pred_input_title = torch.tensor(pred_input_title, dtype=torch.float32)
        batch_y = torch.tensor(batch_y, dtype=torch.float32)
        
        return (his_input_title, pred_input_title), batch_y


# Initialize your TensorFlow-based NRMSDataLoader (as shown in your example)
train_dataloader = NRMSDataLoader(
    behaviors=df_train,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column="article_id_fixed",
    eval_mode=False,
    batch_size=64,
)

val_dataloader = NRMSDataLoader(
    behaviors=df_validation,
    article_dict=article_mapping,
    unknown_representation="zeros",
    history_column="article_id_fixed",
    eval_mode=True,
    batch_size=64,
)

# Wrap the TensorFlow DataLoader into a PyTorch Dataset
train_pytorch_dataset = PyTorchNRMSDataLoader(train_dataloader)
val_pytorch_dataset = PyTorchNRMSDataLoader(val_dataloader)



In [9]:
# Now you can use these DataLoader objects with your PyTorch model
for (his_input_title, pred_input_title), batch_y in train_pytorch_dataset:
    print(his_input_title.shape)
    print(pred_input_title.shape)

for (his_input_title, pred_input_title), batch_y in val_pytorch_dataset:
    print(his_input_title.shape)

print(len(train_pytorch_dataset))

torch.Size([64, 10, 30])
torch.Size([64, 5, 30])
torch.Size([64, 10, 30])
torch.Size([64, 5, 30])
torch.Size([64, 10, 30])
torch.Size([64, 5, 30])
torch.Size([56, 10, 30])
torch.Size([56, 5, 30])
torch.Size([717, 10, 30])
torch.Size([838, 10, 30])
torch.Size([655, 10, 30])
torch.Size([770, 10, 30])
4


## Classes

In [9]:
import math
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    print(f"Attention: query shape: {query.shape}, key shape: {key.shape}, value shape: {value.shape}")
    scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
    print(f"Attention: scores shape: {scores.shape}")
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -math.inf)
    p_attn = F.softmax(scores, dim = -1)
    print(f"Attention: p_attn shape: {p_attn.shape}")
    if dropout is not None:
        p_attn = dropout(p_attn)
    output = torch.matmul(p_attn, value)
    print(f"Attention: output shape: {output.shape}")
    return output, p_attn

class MultiHeadedAttention(nn.Module):
    """A simple Multi-head attention layer."""
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model*h
        print("d_k", self.d_k)
        print("d_model",d_model)
        print("h",h)
        self.h = h
        self.linears = nn.ModuleList([nn.Linear(768, self.d_k) for _ in range(4)])
        self.attn = None # store the attention maps
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        nbatches = query.size(0)
        print(f"MultiHeadedAttention: input query shape: {query.shape}, key shape: {key.shape}, value shape: {value.shape}")
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)

        # 1) Do all the linear projections in batch from d_model => h x d_k 
        
        query, key, value = [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2) for l, x in zip(self.linears, (query, key, value))]
        print(f"MultiHeadedAttention: reshaped query shape: {query.shape}, key shape: {key.shape}, value shape: {value.shape}")

        # 2) Apply attention on all the projected vectors in batch. 
        x, self.attn = attention(query, key, value, mask=mask, dropout=self.dropout)
        print(f"MultiHeadedAttention: attention output shape: {x.shape}")

        # 3) "Concat" using a view and apply a final linear. 
        x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_k)
        print(f"MultiHeadedAttention: concatenated output shape: {x.shape}")
        return self.linears[-1](x)

class AttLayer2_torch(nn.Module):
    """Soft alignment attention implementation in PyTorch."""
    
    def __init__(self, dim=200, seed=0):
        super(AttLayer2_torch, self).__init__()
        self.dim = dim
        torch.manual_seed(seed)

        # Initialize W, b, and q but do not specify input dimension yet
        self.W = None
        self.b = nn.Parameter(torch.zeros(dim))
        self.q = nn.Parameter(torch.empty(dim, 1))
        
        # Initialize q using Xavier initialization
        nn.init.xavier_uniform_(self.q)

    def forward(self, inputs):
        # Dynamically initialize W based on the input's feature size
        if self.W is None:
            input_dim = inputs.size(-1)  # Get the feature dimension of the input
            self.W = nn.Parameter(torch.empty(input_dim, self.dim))
            nn.init.xavier_uniform_(self.W)  # Xavier initialization for W
        
        # Apply soft attention mechanism
        attention_scores = torch.tanh(inputs @ self.W + self.b) @ self.q
        print(f"AttLayer2_torch: attention_scores shape: {attention_scores.shape}")
        attention_weights = F.softmax(attention_scores.squeeze(-1), dim=-1).unsqueeze(-1)
        print(f"AttLayer2_torch: attention_weights shape: {attention_weights.shape}")
        weighted_inputs = inputs * attention_weights
        output = torch.sum(weighted_inputs, dim=1)
        print(f"AttLayer2_torch: output shape: {output.shape}")
        
        return output


class NRMSModelPytorch_2(nn.Module):
    def __init__(self, hparams, word2vec_embedding=None, word_emb_dim=300, vocab_size=32000, seed=None):
        super(NRMSModelPytorch_2, self).__init__()
        self.hparams = hparams
        self.seed = seed
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Initialize word embeddings
        if word2vec_embedding is None:
            self.word2vec_embedding = torch.randn(vocab_size, word_emb_dim)
        else:
            self.word2vec_embedding = torch.from_numpy(word2vec_embedding).float()
        self.embedding_layer = nn.Embedding.from_pretrained(self.word2vec_embedding, freeze=False)

        # Build model components
        self.news_encoder = self._build_newsencoder()
        self.user_encoder = self._build_userencoder(self.news_encoder)

        # Define optimizer and loss
        self.criterion = self._get_loss(hparams.loss)
        self.optimizer = self._get_opt(hparams.optimizer, hparams.learning_rate)

    def _get_loss(self, loss):
        if loss == "cross_entropy_loss":
            return nn.CrossEntropyLoss()
        elif loss == "log_loss":
            return nn.BCELoss()
        else:
            raise ValueError(f"this loss not defined {loss}")

    def _get_opt(self, optimizer, lr):
        if optimizer == "adam":
            return optim.Adam(self.parameters(), lr=lr)
        else:
            raise ValueError(f"this optimizer not defined {optimizer}")

    def _build_userencoder(self, titleencoder):
        # Define user encoder using SelfAttention and AttLayer2 modules
        class UserEncoder(nn.Module):
            def __init__(self, hparams, titleencoder):
                super(UserEncoder, self).__init__()
                self.titleencoder = titleencoder
                self.attention = MultiHeadedAttention(hparams.head_num, hparams.head_num)
                self.att_layer = AttLayer2_torch(hparams.attention_hidden_dim)

            def forward(self, his_input_title):
                # Encode each news in the history
                click_title_presents = torch.stack([self.titleencoder(title) for title in his_input_title], dim=1)
                y = self.attention(click_title_presents,click_title_presents,click_title_presents)
                user_present = self.att_layer(y)
                return user_present

        return UserEncoder(self.hparams, titleencoder)

    def _build_newsencoder(self):
        # Define news encoder using embedding and attention layers
        class NewsEncoder(nn.Module):
            def __init__(self, embedding_layer, hparams, seed):
                super(NewsEncoder, self).__init__()
                self.embedding = embedding_layer
                self.dropout1 = nn.Dropout(hparams.dropout)  # Use attribute access here
                self.attention = MultiHeadedAttention(hparams.head_num, hparams.head_dim)
                self.dropout2 = nn.Dropout(hparams.dropout)  # Use attribute access here
                self.att_layer = AttLayer2_torch(hparams.attention_hidden_dim, seed=seed)


            def forward(self, sequences_input_title):
                # Convert input to LongTensor
                sequences_input_title = sequences_input_title.long()
                embedded_sequences_title = self.embedding(sequences_input_title)
                y = self.dropout1(embedded_sequences_title)

                y = self.attention(y,y,y)
                y = self.dropout2(y)
                pred_title = self.att_layer(y)
                return pred_title

        return NewsEncoder(self.embedding_layer, self.hparams, self.seed)

    def forward(self, his_input_title, pred_input_title):
        print(f"NRMSModelPytorch_2: his_input_title shape: {his_input_title.shape}, pred_input_title shape: {pred_input_title.shape}")
        
        user_vector = self.user_encoder(his_input_title)
        print(f"NRMSModelPytorch_2: user_vector shape: {user_vector.shape}")
        
        news_vectors = torch.stack([self.news_encoder(news) for news in pred_input_title], dim=1)
        print(f"NRMSModelPytorch_2: news_vectors shape: {news_vectors.shape}")
        
        scores = torch.bmm(news_vectors, user_vector.unsqueeze(-1)).squeeze(-1)
        print(f"NRMSModelPytorch_2: scores shape: {scores.shape}")
        return torch.softmax(scores, dim=-1)

    def predict(self, his_input_title, pred_input_title_one):
        print(f"NRMSModelPytorch_2 (predict): his_input_title shape: {his_input_title.shape}, pred_input_title_one shape: {pred_input_title_one.shape}")
        
        user_vector = self.user_encoder(his_input_title)
        print(f"NRMSModelPytorch_2 (predict): user_vector shape: {user_vector.shape}")
        
        news_vector = self.news_encoder(pred_input_title_one)
        print(f"NRMSModelPytorch_2 (predict): news_vector shape: {news_vector.shape}")
        
        prediction = torch.sigmoid(torch.dot(news_vector, user_vector))
        print(f"NRMSModelPytorch_2 (predict): prediction shape: {prediction.shape}")
        return prediction






## Attention Classes

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import torch.nn.functional as F

class AttLayer2_torch(nn.Module):
    """Soft alignment attention implementation in PyTorch."""
    
    def __init__(self, dim=200, seed=0):
        super(AttLayer2_torch, self).__init__()
        self.dim = dim
        torch.manual_seed(seed)

        # Initialize W, b, and q but do not specify input dimension yet
        self.W = None
        self.b = nn.Parameter(torch.zeros(dim))
        self.q = nn.Parameter(torch.empty(dim, 1))
        
        # Initialize q using Xavier initialization
        nn.init.xavier_uniform_(self.q)

    def forward(self, inputs):
        # Dynamically initialize W based on the input's feature size
        if self.W is None:
            input_dim = inputs.size(-1)  # Get the feature dimension of the input
            self.W = nn.Parameter(torch.empty(input_dim, self.dim))
            nn.init.xavier_uniform_(self.W)  # Xavier initialization for W
        
        # Apply soft attention mechanism
        #print("input", inputs.shape)
        attention = torch.tanh(inputs @ self.W + self.b)
        attention = attention @ self.q
        #print("attention", attention.shape)
        attention = torch.squeeze(attention, dim=-1)
        #print("attention2", attention.shape)
        
        attention_weights = F.softmax(attention, dim=-1)
        attention_weights = attention_weights.unsqueeze(-1)
        #print("attention_weights", attention_weights.shape)

        weighted_input = inputs * attention_weights
        return torch.sum(weighted_input, dim=1)


class SelfAttention_torch(nn.Module):
    """Multi-head self-attention implementation in PyTorch."""
    
    def __init__(self, multiheads, head_dim, seed=0, mask_right=False):
        super(SelfAttention_torch, self).__init__()
        self.multiheads = multiheads
        self.head_dim = head_dim
        self.output_dim = multiheads * head_dim
        self.mask_right = mask_right
        torch.manual_seed(seed)

        # Initially set input_dim as None
        self.input_dim = None

        # Placeholder for the Linear layers for Q, K, V
        self.WQ = None
        self.WK = None
        self.WV = None

    def _mask(self, inputs, seq_len, mode="add"):
        """Apply masking operation to inputs based on sequence length."""
        if seq_len is None:
            return inputs
        mask = (torch.arange(inputs.size(1)) < seq_len.unsqueeze(1)).float()
        if mode == "mul":
            return inputs * mask
        elif mode == "add":
            return inputs - (1 - mask) * 1e12

    def forward(self, Q_seq, K_seq, V_seq, Q_len=None, V_len=None):
        # Print input shapes before passing them to attention
        
        # Set input_dim dynamically based on the input shape
        if self.input_dim is None:
            self.input_dim = Q_seq.size(-1)  # Set input_dim from the last dimension of Q_seq

            # Initialize the Linear layers with the correct input dimension
            self.WQ = nn.Linear(self.input_dim, self.output_dim)
            self.WK = nn.Linear(self.input_dim, self.output_dim)
            self.WV = nn.Linear(self.input_dim, self.output_dim)

            # Get input and output shapes
            input_features = self.WQ.in_features
            output_features = self.WQ.out_features

            print(f"Input features: {input_features}, Output features: {output_features}")

        # Linear transformations for Q, K, and V
        Q = self.WQ(Q_seq).view(-1, Q_seq.size(1), self.multiheads, self.head_dim).permute(0, 2, 1, 3)
        K = self.WK(K_seq).view(-1, K_seq.size(1), self.multiheads, self.head_dim).permute(0, 2, 1, 3)
        V = self.WV(V_seq).view(-1, V_seq.size(1), self.multiheads, self.head_dim).permute(0, 2, 1, 3)

        # Scaled dot-product attention
        A = (Q @ K.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        if self.mask_right:
            ones = torch.ones_like(A[0, 0])
            mask = torch.tril(ones) * 1e12
            A = A - mask

        # Apply softmax to attention scores
        A = F.softmax(A, dim=-1)

        # Weighted sum of values
        O = (A @ V).permute(0, 2, 1, 3).contiguous().view(-1, Q_seq.size(1), self.output_dim)
        #print("atention shape: ", O.shape)
        # Apply the mask (if applicable)
        return self._mask(O, Q_len, "mul") if Q_len is not None else O


## NRMS model:

In [14]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

class NRMSModelPytorch(nn.Module):
    def __init__(self, hparams, word2vec_embedding=None, word_emb_dim=300, vocab_size=32000, seed=None):
        super(NRMSModelPytorch, self).__init__()
        self.hparams = hparams
        self.seed = seed
        torch.manual_seed(seed)
        np.random.seed(seed)

        # Initialize word embeddings
        if word2vec_embedding is None:
            self.word2vec_embedding = torch.randn(vocab_size, word_emb_dim)
        else:
            self.word2vec_embedding = torch.from_numpy(word2vec_embedding).float()
        self.embedding_layer = nn.Embedding.from_pretrained(self.word2vec_embedding, freeze=False)

        # Build model components
        self.news_encoder = self._build_newsencoder()
        self.user_encoder = self._build_userencoder(self.news_encoder)

        # Define optimizer and loss
        self.criterion = self._get_loss(hparams.loss)
        self.optimizer = self._get_opt(hparams.optimizer, hparams.learning_rate)

    def _get_loss(self, loss):
        if loss == "cross_entropy_loss":
            return nn.CrossEntropyLoss()
        elif loss == "log_loss":
            print("BCE is used")
            return nn.BCELoss()
        else:
            raise ValueError(f"this loss not defined {loss}")

    def _get_opt(self, optimizer, lr):
        if optimizer == "adam":
            return optim.Adam(self.parameters(), lr=lr)
        else:
            raise ValueError(f"this optimizer not defined {optimizer}")

    def _build_userencoder(self, titleencoder):
        # Define user encoder using SelfAttention and AttLayer2 modules
        class UserEncoder(nn.Module):
            def __init__(self, hparams, titleencoder):
                super(UserEncoder, self).__init__()
                self.titleencoder = titleencoder
                self.attention = SelfAttention_torch(hparams.head_num, hparams.head_num)
                self.att_layer = AttLayer2_torch(hparams.attention_hidden_dim)

            def forward(self, his_input_title):
                #print("input:", his_input_title.shape)
                # Encode each news in the history
                click_title_presents = torch.stack([self.titleencoder(title) for title in his_input_title], dim=0)
                #print("Vlick",click_title_presents.shape)
                y = self.attention(click_title_presents,click_title_presents,click_title_presents)
                #print("y.shape",y.shape)
                user_present = self.att_layer(y)
                return user_present

        return UserEncoder(self.hparams, titleencoder)

    def _build_newsencoder(self):
        # Define news encoder using embedding and attention layers
        class NewsEncoder(nn.Module):
            def __init__(self, embedding_layer, hparams, seed):
                super(NewsEncoder, self).__init__()
                self.embedding = embedding_layer
                self.dropout1 = nn.Dropout(hparams.dropout)  # Use attribute access here
                self.attention = SelfAttention_torch(hparams.head_num, hparams.head_dim, seed=seed)
                self.dropout2 = nn.Dropout(hparams.dropout)  # Use attribute access here
                self.att_layer = AttLayer2_torch(hparams.attention_hidden_dim, seed=seed)


            def forward(self, sequences_input_title):
                # Convert input to LongTensor
                #print("sequence",sequences_input_title.shape)
                sequences_input_title = sequences_input_title.long()
                embedded_sequences_title = self.embedding(sequences_input_title)
                #print("embedded",embedded_sequences_title.shape)
                y = self.dropout1(embedded_sequences_title)

                y = self.attention(y,y,y)
                y = self.dropout2(y)
                pred_title = self.att_layer(y)
                #print("pred",pred_title.shape)
                return pred_title

        return NewsEncoder(self.embedding_layer, self.hparams, self.seed)

    def forward(self, his_input_title, pred_input_title):
        #print(his_input_title.shape)
       # print(pred_input_title.shape)
        user_present = self.user_encoder(his_input_title)
        #print("user: ", user_present.shape)
        news_present = torch.stack([self.news_encoder(news) for news in pred_input_title], dim=0)
        #print("news: ", news_present.shape)
        #print("new user: ", user_present.unsqueeze(-1).shape)
        preds = torch.bmm(news_present, user_present.unsqueeze(-1)).squeeze(-1)
        #print("npreds: ", preds.shape)
        return torch.sigmoid(preds)

    def predict(self, his_input_title, pred_input_title_one):
        
        user_present = self.user_encoder(his_input_title)
        news_present_one = self.news_encoder(pred_input_title_one)
        pred_one = torch.sigmoid(torch.dot(news_present_one, user_present))
        return pred_one


 


## NRMS Model Implementation:

In [15]:
import warnings
warnings.filterwarnings("ignore")
import tensorflow as tf

class hparams_nrms:
    # INPUT DIMENTIONS:
    title_size: int = 30
    history_size: int = 50
    # MODEL ARCHITECTURE
    head_num: int = 20
    head_dim: int = 20
    attention_hidden_dim: int = 200
    # MODEL OPTIMIZER:
    optimizer: str = "adam"
    loss: str = "log_loss"
    dropout: float = 0.2
    learning_rate: float = 0.0001

MODEL_NAME = "NRMS"
LOG_DIR = f"downloads/runs/{MODEL_NAME}"
MODEL_WEIGHTS = "downloads/data/state_dict/NRMS/weights.weights.h5"

# CALLBACKS
#tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
#early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)
#modelcheckpoint = tf.keras.callbacks.ModelCheckpoint(
    #filepath=MODEL_WEIGHTS, save_best_only=False, save_weights_only=True, verbose=1
#)

hparams_nrms.history_size = HISTORY_SIZE
model = NRMSModelPytorch(
    hparams=hparams_nrms,
    word2vec_embedding=word2vec_embedding,
    seed=42,
)


BCE is used


## Training the model:

In [17]:
# Move the model to the appropriate device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define your dataloaders (ensure they are instances of PyTorch DataLoader)
train_dataloader = train_pytorch_dataset  # Created using NRMSDataLoader
val_dataloader =val_pytorch_dataset      # Created using NRMSDataLoader

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()  # Set model to training mode
    total_loss = 0
    k = 0
    for (his_input_title, pred_input_title), labels in train_dataloader:
        # Convert numpy arrays to PyTorch tensors and move to the appropriate device
        his_input_title = torch.tensor(his_input_title).to(device)
        pred_input_title = torch.tensor(pred_input_title).to(device)
        labels = torch.tensor(labels).to(device)
        #print("______________________________________________________")
        #print("User history input: ", his_input_title.shape)
        #print("News Items: ", pred_input_title.shape)
        #print("Target: ", labels.shape)
        
        # Zero the gradients
        model.optimizer.zero_grad()

        # Forward pass
        predictions = model(his_input_title, pred_input_title)
        #print("Predictions: ", predictions.shape)

        # Reshape labels to match predictions batch size
        labels = labels.view(-1)  # Flatten the labels to match batch size (flatten into a vector)
        
        # Ensure predictions have the shape [batch_size, num_classes]
        predictions = predictions.view(-1)  # Flatten predictions as necessary
        #print("Predictions: ", predictions[0:5])
        #print("Labels     : ",labels[0:5])
        print("Predictions: ", predictions[0:5])
        print("Labels     : ",labels[0:5])
        # Compute loss
        loss = model.criterion(predictions, labels)
        if epoch+1 == num_epochs:
            print("Predictions: ", predictions[0:5])
            print("Labels     : ",labels[0:5])
            print("Loss: ", loss.item())
        total_loss += loss.item()
        k = k+1
        # Backward pass and optimize
        loss.backward()
        model.optimizer.step()

    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss/k:.4f}")

    # Validation loop
    model.eval()
    with torch.no_grad():
        val_loss = 0
        k = 0
        for (his_input_title, pred_input_title), labels in val_dataloader:
            # Move tensors to the appropriate device
            # Convert numpy arrays to PyTorch tensors and move to the appropriate device
            his_input_title = torch.tensor(his_input_title).to(device)
            pred_input_title = torch.tensor(pred_input_title).to(device)
            labels = torch.tensor(labels).to(device)

            # Forward pass
            predictions = model(his_input_title, pred_input_title)

            labels = labels.view(-1)  # Flatten the labels to match batch size (flatten into a vector)
        
            # Ensure predictions have the shape [batch_size, num_classes]
            predictions = predictions.view(-1)  # Flatten predictions as necessary
            print("Predictions: ", predictions[0:5])
            print("Labels     : ",labels[0:5])
            # Compute loss
            loss = model.criterion(predictions, labels)
            if epoch+1 == num_epochs:
                print("Predictions: ", predictions[0:5])
                print("Labels     : ",labels[0:5])
                print("Loss: ", loss.item())
            val_loss += loss.item()
            k = k+1

        print(f"Validation Loss: {val_loss/k:.4f}")



Predictions:  tensor([0.5062, 0.5069, 0.5044, 0.5129, 0.4996], grad_fn=<SliceBackward0>)
Labels     :  tensor([0., 0., 0., 0., 1.])
Predictions:  tensor([0.4999, 0.5155, 0.4947, 0.5068, 0.5040], grad_fn=<SliceBackward0>)
Labels     :  tensor([0., 0., 1., 0., 0.])
Predictions:  tensor([0.5127, 0.5126, 0.5075, 0.5144, 0.5048], grad_fn=<SliceBackward0>)
Labels     :  tensor([0., 0., 0., 0., 1.])
Predictions:  tensor([0.5047, 0.5024, 0.4949, 0.5090, 0.5037], grad_fn=<SliceBackward0>)
Labels     :  tensor([0., 0., 0., 0., 1.])
Epoch 1/10, Loss: 0.7011
Predictions:  tensor([0.5041, 0.5038, 0.5041, 0.5085, 0.5074])
Labels     :  tensor([1., 0., 0., 0., 0.])
Predictions:  tensor([0.5041, 0.5010, 0.4992, 0.5084, 0.5047])
Labels     :  tensor([0., 0., 0., 0., 0.])
Predictions:  tensor([0.5088, 0.5081, 0.5156, 0.5068, 0.4976])
Labels     :  tensor([0., 0., 0., 0., 0.])
Predictions:  tensor([0.5087, 0.5042, 0.5035, 0.5116, 0.5142])
Labels     :  tensor([0., 0., 0., 0., 0.])
Validation Loss: 0.7048

In [39]:
auc_metric = AucScore()

model.eval()
with torch.no_grad():
    y_true_total = []
    y_pred_total = []
    acc = []
    sum = []
    sum_1 = []
    for (his_input_title, pred_input_title), labels in train_dataloader:
        # Move tensors to the appropriate device
        # Convert numpy arrays to PyTorch tensors and move to the appropriate device
        his_input_title = torch.tensor(his_input_title).to(device)
        pred_input_title = torch.tensor(pred_input_title).to(device)
        labels = torch.tensor(labels).to(device)

        # Forward pass
        predictions = model(his_input_title, pred_input_title)

        labels = labels.view(-1)  # Flatten the labels to match batch size (flatten into a vector)
        sum.append(len(labels.numpy()))   
        sum_1.append(np.sum(labels.numpy()))
        # Ensure predictions have the shape [batch_size, num_classes]
        predictions = predictions.view(-1)
        
        y_pred_total.append( predictions.numpy() )
        y_true_total.append( labels.numpy() )
        
        acc.append(np.sum( (predictions.numpy() > 0.5) == labels.numpy()) / len(labels.numpy()) )
   
    print(sum)    
    print(sum_1)
    auc_value = auc_metric.calculate(y_true_total, y_pred_total)
    print("Average AUC Score:", auc_value)
    print("Average Accuracy:", np.round(np.mean(acc)*100,2))



[320, 320, 320, 280]
[64.0, 64.0, 64.0, 56.0]
Average AUC Score: 0.6648587596659759
Average Accuracy: 77.67


In [40]:
a = 320+ 320+ 320+ 280
b = 64.0+ 64.0+ 64.0+ 56.0
num_of_zero = a-b
print(num_of_zero/a)

0.8
