In [1]:
from tensorflow.keras.layers import Embedding, Input, Dropout, Dense, BatchNormalization, TimeDistributed, Dot, Activation, Reshape
from tensorflow.keras.initializers import GlorotUniform

import numpy as np
import tensorflow as tf
from tensorflow.keras.regularizers import l2

import tensorflow.keras as keras
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import pandas as pd
import pyarrow

import random
import gc

import gc
import tensorflow as tf

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

from tensorflow.keras.mixed_precision import set_global_policy
set_global_policy('mixed_float16')

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 2070 SUPER, compute capability 7.5


In [2]:
train_history = pd.read_parquet(r"C:\Users\bilba\Downloads\DL_Small\history.parquet", engine='pyarrow')
train_behaviors = pd.read_parquet(r"C:\Users\bilba\Downloads\DL_Small\behaviors.parquet", engine='pyarrow').dropna(subset=["article_id"])

val_history = pd.read_parquet(r"C:\Users\bilba\Downloads\DL_Small\validation\history.parquet", engine='pyarrow')
val_behaviors = pd.read_parquet(r"C:\Users\bilba\Downloads\DL_Small\validation\behaviors.parquet", engine='pyarrow').dropna(subset=["article_id"])

articles_ = pd.read_parquet(r"C:\Users\bilba\Downloads\DL_Small\articles.parquet", engine='pyarrow')

word2vec_file = pd.read_parquet(r"C:\Users\bilba\Downloads\Ekstra_Bladet_word2vec\document_vector.parquet", engine='pyarrow')

In [3]:
class NRMSModel:
    """NRMS model(Neural News Recommendation with Multi-Head Self-Attention)
    Chuhan Wu, Fangzhao Wu, Suyu Ge, Tao Qi, Yongfeng Huang,and Xing Xie, "Neural News
    Recommendation with Multi-Head Self-Attention" in Proceedings of the 2019 Conference
    on Empirical Methods in Natural Language Processing and the 9th International Joint Conference
    on Natural Language Processing (EMNLP-IJCNLP)
    """

    def __init__(
        self,
        hparams: dict,
        word2vec_embedding: np.ndarray = None,
        word_emb_dim: int = 300,
        vocab_size: int = 32000,
        seed: int = None,
    ):
        """Initialization steps for NRMS."""
        self.hparams = hparams
        self.seed = seed

        # Set seed for reproducibility
        tf.random.set_seed(seed)
        np.random.seed(seed)

        # Initialize the word embeddings
        if word2vec_embedding is None:
            initializer = GlorotUniform(seed=self.seed)
            self.word2vec_embedding = initializer(shape=(vocab_size, word_emb_dim))
        else:
            self.word2vec_embedding = word2vec_embedding

        # Build and compile model
        self.model, self.scorer = self._build_graph()
        data_loss = self._get_loss(self.hparams['loss'])
        train_optimizer = self._get_opt(optimizer=self.hparams['optimizer'], lr=self.hparams['learning_rate'])
        self.model.compile(loss=data_loss, optimizer=train_optimizer)

    def _get_loss(self, loss: str):
        """Define loss function"""
        if loss == "cross_entropy_loss":
            data_loss = "categorical_crossentropy"
        elif loss == "log_loss":
            data_loss = "binary_crossentropy"
        else:
            raise ValueError(f"This loss is not defined: {loss}")
        return data_loss

    def _get_opt(self, optimizer: str, lr: float):
        """Define optimizer"""
        if optimizer == "adam":
            train_opt = tf.keras.optimizers.Adam(learning_rate=lr)
        else:
            raise ValueError(f"This optimizer is not defined: {optimizer}")
        return train_opt

    def _build_graph(self):
        """Build NRMS model and scorer."""
        model, scorer = self._build_nrms()
        return model, scorer

    def _build_userencoder(self, titleencoder):
        """Create user encoder for NRMS with timestamps."""
        his_input_title = Input(shape=(self.hparams['history_size'], self.hparams['title_size']), dtype="int32")
        timestamp_input = Input(shape=(self.hparams['history_size'],), dtype="float16")

        # Process titles
        click_title_presents = TimeDistributed(titleencoder)(his_input_title)

        # Concatenate timestamps with title representations
        timestamp_reshaped = tf.expand_dims(timestamp_input, axis=-1)  # Shape: (batch_size, history_size, 1)
        click_title_with_timestamps = tf.concat([click_title_presents, timestamp_reshaped], axis=-1)

        # Apply attention
        y = SelfAttention(self.hparams['head_num'], self.hparams['head_dim'], seed=self.seed)([click_title_with_timestamps] * 3)
        user_present = AttLayer2(self.hparams['attention_hidden_dim'], seed=self.seed)(y)

        model = tf.keras.Model([his_input_title, timestamp_input], user_present, name="user_encoder")
        return model


    def _build_newsencoder(self):
        """Create news encoder for NRMS."""
        embedding_layer = Embedding(
            input_dim=self.word2vec_embedding.shape[0],
            output_dim=self.word2vec_embedding.shape[1],
            weights=[self.word2vec_embedding],
            trainable=True,
        )
        sequences_input_title = Input(shape=(self.hparams['title_size'],), dtype="int32")
        embedded_sequences_title = embedding_layer(sequences_input_title)

        y = Dropout(self.hparams['dropout'])(embedded_sequences_title)
        y = SelfAttention(self.hparams['head_num'], self.hparams['head_dim'], seed=self.seed)([y, y, y])

        for layer_size in self.hparams['layers']:  # Adjust layer sizes to match user encoder output
            y = Dense(units=layer_size, activation="relu")(y)
            y = BatchNormalization()(y)
            y = Dropout(self.hparams['dropout'])(y)

        y = Dropout(self.hparams['dropout'])(y)
        pred_title = AttLayer2(self.hparams['attention_hidden_dim'], seed=self.seed)(y)
        model = tf.keras.Model(sequences_input_title, pred_title, name="news_encoder")
        return model

    def _build_nrms(self):
        """Create NRMS model."""
        his_input_title = Input(shape=(self.hparams['history_size'], self.hparams['title_size']), dtype="int32")
        timestamp_input = Input(shape=(self.hparams['history_size'],), dtype="float16")
        pred_input_title = Input(shape=(None, self.hparams['title_size']), dtype="int32")
        pred_input_title_one = Input(shape=(1, self.hparams['title_size']), dtype="int32")
        pred_title_one_reshape = Reshape((self.hparams['title_size'],))(pred_input_title_one)

        titleencoder = self._build_newsencoder()
        self.userencoder = self._build_userencoder(titleencoder)
        self.newsencoder = titleencoder

        #user_present = self.userencoder(his_input_title)
        user_present = self.userencoder([his_input_title, timestamp_input]) #newly added for the timestamps
        news_present = TimeDistributed(self.newsencoder)(pred_input_title)
        news_present_one = self.newsencoder(pred_title_one_reshape)

        preds = Dot(axes=-1)([news_present, user_present])
        preds = Activation(activation="softmax")(preds)

        pred_one = Dot(axes=-1)([news_present_one, user_present])
        pred_one = Activation(activation="sigmoid")(pred_one)

        #model = tf.keras.Model([his_input_title, pred_input_title], preds)
        #scorer = tf.keras.Model([his_input_title, pred_input_title_one], pred_one)
        model = tf.keras.Model([his_input_title, timestamp_input, pred_input_title], preds)
        scorer = tf.keras.Model([his_input_title, timestamp_input, pred_input_title_one], pred_one)

        return model, scorer


class AttLayer2(layers.Layer):
    """Soft alignment attention implement.

    Attributes:
        dim (int): attention hidden dim
    """

    def __init__(self, dim=200, seed=0, **kwargs):
        """Initialization steps for AttLayer2.

        Args:
            dim (int): attention hidden dim
        """

        self.dim = dim
        self.seed = seed
        super(AttLayer2, self).__init__(**kwargs)

    def build(self, input_shape):
        """Initialization for variables in AttLayer2
        There are there variables in AttLayer2, i.e. W, b and q.

        Args:
            input_shape (object): shape of input tensor.
        """

        assert len(input_shape) == 3
        dim = self.dim
        self.W = self.add_weight(
            name="W",
            shape=(int(input_shape[-1]), dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        self.b = self.add_weight(
            name="b",
            shape=(dim,),
            initializer=keras.initializers.Zeros(),
            trainable=True,
        )
        self.q = self.add_weight(
            name="q",
            shape=(dim, 1),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        super(AttLayer2, self).build(input_shape)  # be sure you call this somewhere!

    def call(self, inputs, mask=None, **kwargs):
        """Core implemention of soft attention

        Args:
            inputs (object): input tensor.

        Returns:
            object: weighted sum of input tensors.
        """
        print("Inside AttLayer2 call")
        
        attention = K.tanh(K.dot(inputs, self.W) + self.b)
        attention = K.dot(attention, self.q)

        attention = K.squeeze(attention, axis=2)

        if mask == None:
            attention = K.exp(attention)
        else:
            attention = K.exp(attention) * K.cast(mask, dtype=inputs.dtype)

        attention_weight = attention / (
            K.sum(attention, axis=-1, keepdims=True) + K.epsilon()
        )

        attention_weight = K.expand_dims(attention_weight)
        weighted_input = inputs * attention_weight
        return K.sum(weighted_input, axis=1)

    def compute_mask(self, input, input_mask=None):
        """Compte output mask value

        Args:
            input (object): input tensor.
            input_mask: input mask

        Returns:
            object: output mask.
        """
        return None

    def compute_output_shape(self, input_shape):
        """Compute shape of output tensor

        Args:
            input_shape (tuple): shape of input tensor.

        Returns:
            tuple: shape of output tensor.
        """
        return input_shape[0], input_shape[-1]
    
    def get_config(self):
        """Override get_config to enable saving/loading."""
        config = super(AttLayer2, self).get_config()
        config.update({
            "dim": self.dim,
            "seed": self.seed,
        })
        return config


class SelfAttention(layers.Layer):
    """Multi-head self attention implement.

    Args:
        multiheads (int): The number of heads.
        head_dim (object): Dimention of each head.
        mask_right (boolean): whether to mask right words.

    Returns:
        object: Weighted sum after attention.
    """

    def __init__(self, multiheads, head_dim, seed=0, mask_right=False, **kwargs):
        """Initialization steps for AttLayer2.

        Args:
            multiheads (int): The number of heads.
            head_dim (object): Dimention of each head.
            mask_right (boolean): whether to mask right words.
        """

        self.multiheads = multiheads
        self.head_dim = head_dim
        self.output_dim = multiheads * head_dim
        self.mask_right = mask_right
        self.seed = seed
        super(SelfAttention, self).__init__(**kwargs)

    def compute_output_shape(self, input_shape):
        """Compute shape of output tensor.

        Returns:
            tuple: output shape tuple.
        """

        return (input_shape[0][0], input_shape[0][1], self.output_dim)

    def build(self, input_shape):
        """Initialization for variables in SelfAttention.
        There are three variables in SelfAttention, i.e. WQ, WK ans WV.
        WQ is used for linear transformation of query.
        WK is used for linear transformation of key.
        WV is used for linear transformation of value.

        Args:
            input_shape (object): shape of input tensor.
        """

        self.WQ = self.add_weight(
            name="WQ",
            shape=(int(input_shape[0][-1]), self.output_dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        self.WK = self.add_weight(
            name="WK",
            shape=(int(input_shape[1][-1]), self.output_dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        self.WV = self.add_weight(
            name="WV",
            shape=(int(input_shape[2][-1]), self.output_dim),
            initializer=keras.initializers.glorot_uniform(seed=self.seed),
            trainable=True,
        )
        super(SelfAttention, self).build(input_shape)

    def Mask(self, inputs, seq_len, mode="add"):
        """Mask operation used in multi-head self attention

        Args:
            seq_len (object): sequence length of inputs.
            mode (str): mode of mask.

        Returns:
            object: tensors after masking.
        """

        if seq_len is None:
            return inputs
        else:
            mask = K.one_hot(indices=seq_len[:, 0], num_classes=K.shape(inputs)[1])
            mask = 1 - K.cumsum(mask, axis=1)

            for _ in range(len(inputs.shape) - 2):
                mask = K.expand_dims(mask, 2)

            if mode == "mul":
                return inputs * mask
            elif mode == "add":
                return inputs - (1 - mask) * 1e12

    def call(self, QKVs):
        """Core logic of multi-head self attention.

        Args:
            QKVs (list): inputs of multi-head self attention i.e. qeury, key and value.

        Returns:
            object: ouput tensors.
        """
        if len(QKVs) == 3:
            Q_seq, K_seq, V_seq = QKVs
            Q_len, V_len = None, None
        elif len(QKVs) == 5:
            Q_seq, K_seq, V_seq, Q_len, V_len = QKVs
        Q_seq = K.dot(Q_seq, self.WQ)
        Q_seq = K.reshape(
            Q_seq, shape=(-1, K.shape(Q_seq)[1], self.multiheads, self.head_dim)
        )
        Q_seq = K.permute_dimensions(Q_seq, pattern=(0, 2, 1, 3))

        K_seq = K.dot(K_seq, self.WK)
        K_seq = K.reshape(
            K_seq, shape=(-1, K.shape(K_seq)[1], self.multiheads, self.head_dim)
        )
        K_seq = K.permute_dimensions(K_seq, pattern=(0, 2, 1, 3))

        V_seq = K.dot(V_seq, self.WV)
        V_seq = K.reshape(
            V_seq, shape=(-1, K.shape(V_seq)[1], self.multiheads, self.head_dim)
        )
        V_seq = K.permute_dimensions(V_seq, pattern=(0, 2, 1, 3))
        A = tf.matmul(Q_seq, K_seq, adjoint_a=False, adjoint_b=True) / K.sqrt(
            K.cast(self.head_dim, dtype=Q_seq.dtype)
        )

        A = K.permute_dimensions(
            A, pattern=(0, 3, 2, 1)
        )  # A.shape=[batch_size,K_sequence_length,Q_sequence_length,self.multiheads]

        A = self.Mask(A, V_len, "add")
        A = K.permute_dimensions(A, pattern=(0, 3, 2, 1))

        if self.mask_right:
            ones = K.ones_like(A[:1, :1])
            lower_triangular = K.tf.matrix_band_part(ones, num_lower=-1, num_upper=0)
            mask = (ones - lower_triangular) * 1e12
            A = A - mask
        A = K.softmax(A)

        O_seq = tf.matmul(A, V_seq, adjoint_a=True, adjoint_b=False)
        O_seq = K.permute_dimensions(O_seq, pattern=(0, 2, 1, 3))

        O_seq = K.reshape(O_seq, shape=(-1, K.shape(O_seq)[1], self.output_dim))
        O_seq = self.Mask(O_seq, Q_len, "mul")
        return O_seq

    def get_config(self):
        """add multiheads, multiheads and mask_right into layer config.

        Returns:
            dict: config of SelfAttention layer.
        """
        config = super(SelfAttention, self).get_config()
        config.update(
            {
                "multiheads": self.multiheads,
                "head_dim": self.head_dim,
                "mask_right": self.mask_right,
            }
        )
        return config

In [4]:
embedding_matrix = np.stack(word2vec_file['document_vector'].values)
word2vec_file["article_length"] = word2vec_file["document_vector"].apply(len)
max_length = word2vec_file["article_length"].max()

In [5]:
hparams = {
    "wordEmb_file": word2vec_file,  # Pre-trained word embeddings
    'layers': [max_length, max_length, max_length],
    "title_size": max_length,                          # Size of news titles
    "npratio": 4,                              # Negative sampling ratio
    "word_emb_dim": max_length,                       # Word embedding dimension
    "head_num": 4,                            # Number of attention heads
    "head_dim": int(max_length/4),                            # Dimension of each attention head
    "attention_hidden_dim": 50,               # Hidden layer size for attention mechanism
    "dropout": 0.2,                            # Dropout rate
    "learning_rate": 0.01,                    # Learning rate
    "epochs": 10,                              # Number of epochs
    "batch_size": 32,                          # Batch size
    "loss": "log_loss",              # Loss function
    "optimizer": "adam",                       # Optimizer
    "history_size": 5,     #20                    # Number of past news items
    "num_candidate_news": 10,                 # Number of candidate news items
    "vocab_size": 500, #125541
    "metrics": ["AUC", "accuracy"]

}

In [6]:
nrms_model = NRMSModel(hparams,word2vec_embedding=embedding_matrix,word_emb_dim=hparams["word_emb_dim"],vocab_size=hparams["vocab_size"])

Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call


In [7]:
# Explode all list columns into individual rows
expanded_history = train_history.explode(
    ["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"],
    ignore_index=True,
)

# Ensure impression_time_fixed is treated as datetime
expanded_history["impression_time_fixed"] = pd.to_datetime(expanded_history["impression_time_fixed"])

# Sort by user_id and impression_time_fixed (most recent first)
expanded_history = expanded_history.sort_values(by=["user_id", "impression_time_fixed"], ascending=[True, False])

# Truncate to the most recent hparams["history_size"] articles for each user
truncated_history = (
    expanded_history.groupby("user_id")
    .head(hparams["history_size"])  # Keep only the top N most recent articles
    .groupby("user_id")["article_id_fixed"]
    .apply(list)
    .reset_index()
)

# Pad histories with 0 for users with fewer than history_size articles
truncated_history["article_id_fixed"] = truncated_history["article_id_fixed"].apply(
    lambda x: x + [0] * (hparams["history_size"] - len(x))
)


behaviors_df = train_behaviors[["user_id","article_ids_inview","article_ids_clicked"]]

articles_df = articles_[["article_id","title"]]

In [8]:
"""
#### starting off with behaviors_df only and then we add the rest soon

def generate_samples(row, k=hparams["num_candidate_news"]):
   user_id = row['user_id']
   clicked = row['article_ids_clicked']
   inview = list(set(row['article_ids_inview']) - set(clicked))  # Remove clicked from inview
   samples = []
   
   # Positive samples (clicked)
   for article_id in clicked:
       samples.append((user_id, article_id, 1))
   
   # Negative samples (random from inview)
   if inview:  # Only if there are inview articles left
       for article_id in clicked:
           n_samples = min(k, len(inview))
           negatives = random.sample(inview, n_samples)
           samples.extend((user_id, neg_id, 0) for neg_id in negatives)
           
   return samples

# Sample 1000 rows
sample_size = 500
sample_df = behaviors_df.sample(n=sample_size, random_state=42)
"""

'\n#### starting off with behaviors_df only and then we add the rest soon\n\ndef generate_samples(row, k=hparams["num_candidate_news"]):\n   user_id = row[\'user_id\']\n   clicked = row[\'article_ids_clicked\']\n   inview = list(set(row[\'article_ids_inview\']) - set(clicked))  # Remove clicked from inview\n   samples = []\n   \n   # Positive samples (clicked)\n   for article_id in clicked:\n       samples.append((user_id, article_id, 1))\n   \n   # Negative samples (random from inview)\n   if inview:  # Only if there are inview articles left\n       for article_id in clicked:\n           n_samples = min(k, len(inview))\n           negatives = random.sample(inview, n_samples)\n           samples.extend((user_id, neg_id, 0) for neg_id in negatives)\n           \n   return samples\n\n# Sample 1000 rows\nsample_size = 500\nsample_df = behaviors_df.sample(n=sample_size, random_state=42)\n'

In [9]:
import random

def generate_samples(row, k=hparams["num_candidate_news"]):
    """Generate samples with labels based on article_ids_inview and article_ids_clicked."""
    user_id = row['user_id']
    clicked = row['article_ids_clicked']
    inview = list(set(row['article_ids_inview']) - set(clicked))  # Remove clicked from inview
    samples = []
    
    # Positive samples (clicked)
    for article_id in clicked:
        samples.append((user_id, article_id, 1))  # Label 1 for clicked articles
    
    # Negative samples (random from inview)
    if inview:  # Only if there are inview articles left
        for _ in range(len(clicked)):  # Ensure we generate the same number of negative samples as clicked
            n_samples = min(k, len(inview))  # Take k or fewer negative samples
            negatives = random.sample(inview, n_samples)
            samples.extend((user_id, neg_id, 0) for neg_id in negatives)  # Label 0 for non-clicked (negative)
            
    return samples

# Sample a subset of behaviors_df (for example, 500 samples)
sample_size = 500
sample_df = behaviors_df.sample(n=sample_size, random_state=42)

# Apply generate_samples to each row and create the final DataFrame
sample_df['samples'] = sample_df.apply(generate_samples, axis=1)
samples = [item for sublist in sample_df['samples'] for item in sublist]

# Convert to a DataFrame
train_df = pd.DataFrame(samples, columns=['user_id', 'article_id', 'label'])

# Shuffle the DataFrame to ensure random order
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the result
print(f"Generated and shuffled samples:\n{train_df.head()}")


Generated and shuffled samples:
   user_id  article_id  label
0  1014610     9776234      0
1  1939489     9772805      1
2  2590015     9407487      0
3  2452795     9773887      0
4   164957     9773873      0


In [10]:
# Check results
print(f"Sample size: {len(sample_df)}")
print(f"First few samples: {sample_df['samples'].iloc[0][:10]}")

Sample size: 500
First few samples: [(956287, 9769432, 1), (956287, 9770283, 0), (956287, 9771224, 0), (956287, 9771168, 0), (956287, 9120051, 0), (956287, 9771626, 0), (956287, 9771151, 0), (956287, 9765759, 0), (956287, 9771113, 0), (956287, 9176912, 0)]


In [11]:
article_to_index = {article_id: idx for idx, article_id in enumerate(word2vec_file['article_id'])}

train_df['title_features'] = train_df['article_id'].apply(
    lambda article_id: article_to_index.get(article_id, 0)  # Map single article_id to index
)

In [12]:
def map_article_features_with_word2vec(articles_df, word2vec_file, hparams):
    """
    Map article IDs to word embeddings using word2vec_file.

    Args:
        articles_df (DataFrame): DataFrame with article information (including article_id).
        word2vec_file (DataFrame): DataFrame containing article_id and document_vector.
        hparams (dict): Hyperparameters with 'title_size' for padding.

    Returns:
        dict: Mapping of article_id to padded document vectors.
    """
    # Create a mapping from article_id to its embedding
    article_embeddings = {
        row['article_id']: np.array(row['document_vector'])
        for _, row in word2vec_file.iterrows()
    }
    
    article_features = {}
    for _, row in articles_df.iterrows():
        article_id = row['article_id']
        embedding = article_embeddings.get(article_id, np.zeros(hparams['word_emb_dim']))
        
        # If the embedding is shorter than title_size, pad it
        if len(embedding) < hparams['title_size']:
            padded_embedding = np.pad(
                embedding,
                (0, hparams['title_size'] - len(embedding)),
                mode='constant',
                constant_values=0
            )
        else:
            padded_embedding = embedding[:hparams['title_size']]  # Truncate if too long

        article_features[article_id] = padded_embedding

    return article_features

# Example usage
article_features = map_article_features_with_word2vec(articles_df, word2vec_file, hparams)

# Map the features to the train_df
train_df['title_features'] = train_df['article_id'].map(article_features)

In [13]:
train_df

Unnamed: 0,user_id,article_id,label,title_features
0,1014610,9776234,0,"[0.039682284, -0.024288755, 0.09457365, 0.0297..."
1,1939489,9772805,1,"[0.061793916, -0.0006896765, -0.0156888, 0.026..."
2,2590015,9407487,0,"[0.038196277, -0.034157332, 0.051786672, 0.058..."
3,2452795,9773887,0,"[0.034705974, -0.0034392518, 0.0070585194, 0.0..."
4,164957,9773873,0,"[-0.044300657, 0.019420944, 0.00011168638, 0.0..."
...,...,...,...,...
4287,632021,9779996,0,"[0.004843264, 0.026948638, 0.010931431, 0.0127..."
4288,39221,9775518,0,"[-0.017093746, -0.047642525, 0.09617593, 0.050..."
4289,1293434,9777505,0,"[0.009022846, -0.0091448715, 0.0056488942, 0.0..."
4290,96059,9773962,0,"[0.04382465, -0.026772723, -0.02088443, 0.0247..."


In [14]:
def transform_to_user_groups(df, max_items=hparams["num_candidate_news"]):
    """Transform to user-level groups while preserving user_id column."""
    transformed_data = []
    
    for user_id, group in df.groupby('user_id'):
        articles = group['article_id'].tolist()[:max_items]
        labels = group['label'].tolist()[:max_items]
        features = group['title_features'].tolist()[:max_items]
        
        # Pad if necessary
        if len(articles) < max_items:
            padding_length = max_items - len(articles)
            articles.extend([0] * padding_length)
            labels.extend([0] * padding_length)
            features.extend([[0] * len(features[0])] * padding_length)
            
        transformed_data.append({
            'user_id': user_id,
            'article_ids': articles,
            'labels': labels,
            'title_features': features
        })
    
    return pd.DataFrame(transformed_data)

# Usage
grouped_df = transform_to_user_groups(train_df)

merged_df = pd.merge(grouped_df, truncated_history, on="user_id", how="inner")

In [15]:
merged_df

Unnamed: 0,user_id,article_ids,labels,title_features,article_id_fixed
0,11132,"[9746360, 9767751, 9770882, 9770594, 9770798, ...","[0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[[0.035727814, 0.012963935, 0.015877515, 0.030...","[9756202, 9758464, 9759080, 9758464, 9756202]"
1,19915,"[9775518, 9775402, 9774733, 9775347, 9758424, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[[-0.017093746, -0.047642525, 0.09617593, 0.05...","[9769679, 9769572, 9768566, 9769572, 9768328]"
2,26391,"[9773350, 9769531, 9773857, 9649632, 9774297, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.002001684, -0.010715722, 0.067146085, 0.01...","[9770989, 9769553, 9770741, 9769306, 9763284]"
3,34912,"[9775352, 9775277, 9774557, 9482380, 9773877, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.031933676, 0.0013623529, 0.006322594, 0.00...","[9770882, 9770829, 9769650, 9770798, 9770867]"
4,39221,"[9754160, 9770028, 9775419, 9769679, 9775500, ...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[[0.0190506, -0.021137215, -0.006801234, 0.023...","[9759345, 9737023, 9767344, 9763445, 9764640]"
...,...,...,...,...,...
457,2585224,"[9726469, 9775964, 9775855, 9776184, 9776278, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[[-0.011301067, -0.028080167, 0.021872781, 0.0...","[9770425, 9770178, 9768820, 9768802, 9760758]"
458,2587444,"[9779648, 9779860, 9779427, 9779777, 9779430, ...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[[0.034681696, -0.023107076, 0.014389009, 0.04...","[9770037, 9770030, 9766136, 9768177, 9767507]"
459,2588527,"[9774120, 9770709, 9770491, 9775184, 9775562, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[[0.011999497, -0.015975023, -0.05593742, -0.0...","[9769380, 9766949, 9759345, 9766264, 9724944]"
460,2588560,"[9774074, 9774120, 9774163, 9767665, 9772453, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0]","[[0.015317978, -0.004446184, 0.017638985, 0.07...","[9652291, 9676621, 9767697, 9769459, 9768793]"


In [16]:
def add_history_embeddings(df, word2vec_file, hparams):
    # Create a mapping from article_id to embedding vector
    article_features = {
        row['article_id']: row['document_vector']  # Directly use the numpy array
        for _, row in word2vec_file.iterrows()
    }
    
    history_embeddings = []
    
    for history in df['article_id_fixed']:
        # Get embeddings for each article in history
        embeddings = [
            article_features.get(article_id, np.zeros(hparams['word_emb_dim']))  # Default to zero vector
            for article_id in history[:hparams['history_size']]
        ]
        # Pad the embeddings if history is shorter than history_size
        if len(embeddings) < hparams['history_size']:
            padding = [np.zeros(hparams['word_emb_dim'])] * (hparams['history_size'] - len(embeddings))
            embeddings = padding + embeddings
        history_embeddings.append(embeddings)
    
    # Add history_embeddings as a new column
    df['history_embeddings'] = history_embeddings
    return df

# Example usage
tmp = add_history_embeddings(merged_df, word2vec_file, hparams)

In [17]:
def add_timestamps(df, articles_df, hparams):
    # Create a mapping from article_id to article_timestamp
    article_timestamps = {
        row['article_id']: row['article_timestamp']
        for _, row in articles_df.iterrows()
    }

    # Initialize history_timestamps
    history_timestamps = []

    for history in df['article_id_fixed']:
        # Get timestamps for each article in history
        timestamps = [
            article_timestamps.get(article_id, 0)  # Default to 0 if timestamp is missing
            for article_id in history[:hparams['history_size']]
        ]

        # Pad the timestamps if history is shorter than history_size
        if len(timestamps) < hparams['history_size']:
            timestamps = [0] * (hparams['history_size'] - len(timestamps)) + timestamps

        history_timestamps.append(timestamps)

    # Add history_timestamps as a new column
    df['history_timestamps'] = history_timestamps
    return df

# Example Usage
articles_["article_timestamp"] = articles_["published_time"].dt.day
tmp = add_timestamps(tmp, articles_, hparams)

In [18]:
def prepare_data(df, history_size=hparams["history_size"], embedding_dim=hparams["word_emb_dim"]):
    # Initialize arrays
    history_data = []
    timestamp_data = []
    target_data = []
    label_data = []

    for _, row in df.iterrows():
        # Prepare history embeddings
        histories = row['history_embeddings']
        if len(histories) > history_size:
            histories = histories[-history_size:]
        elif len(histories) < history_size:
            padding = [np.zeros(embedding_dim) for _ in range(history_size - len(histories))]
            histories = padding + histories

        history_data.append(histories)  # Shape: (history_size, embedding_dim)

        # Prepare timestamps
        timestamps = row['history_timestamps']  # Ensure this column exists
        if len(timestamps) > history_size:
            timestamps = timestamps[-history_size:]
        elif len(timestamps) < history_size:
            padding = [0] * (history_size - len(timestamps))  # Pad with zeros
            timestamps = padding + timestamps

        timestamp_data.append(timestamps)  # Shape: (history_size,)

        # Prepare target embeddings
        targets = row['title_features']
        target_data.append(targets)  # Shape: (num_candidates, embedding_dim)

        # Prepare labels
        labels = row['labels']
        label_data.append(labels)  # Shape: (num_candidates,)

    # Convert lists to numpy arrays
    history_data = np.array(history_data)  # Shape: (num_samples, history_size, embedding_dim)
    timestamp_data = np.array(timestamp_data)  # Shape: (num_samples, history_size)
    target_data = np.array(target_data)  # Shape: (num_samples, num_candidates, embedding_dim)
    label_data = np.array(label_data)  # Shape: (num_samples, num_candidates)

    return history_data, timestamp_data, target_data, label_data

In [19]:
# Example usage
history_input, timestamp_input, title_input, label_input = prepare_data(tmp, history_size=hparams["history_size"], embedding_dim=hparams["word_emb_dim"])

In [20]:
print("History Input Shape:", history_input.shape)  # (num_samples, history_size, embedding_dim)
print("timestamp Input Shape:", timestamp_input.shape)
print("Title Input Shape:", title_input.shape)  # (num_samples, num_candidates, embedding_dim)
print("Labels Shape:", label_input.shape)  # (num_samples, num_candidates)

History Input Shape: (462, 5, 300)
timestamp Input Shape: (462, 5)
Title Input Shape: (462, 10, 300)
Labels Shape: (462, 10)


In [21]:
# Split the training data (90% training, 10% validation)
#train_df, val_df = train_test_split(tmp, test_size=0.1, random_state=42)

# Prepare training data
history_input, timestamp_input,title_input, label_input = prepare_data(tmp, history_size=hparams["history_size"], embedding_dim=hparams["word_emb_dim"])

# Prepare validation data
val_history_input, val_timestamp_input, val_title_input, val_label_input = prepare_data(val_df, history_size=hparams["history_size"], embedding_dim=hparams["word_emb_dim"])

In [22]:
nrms_model.model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=hparams["learning_rate"]),
    loss="categorical_crossentropy",
    metrics=["AUC", "accuracy"],  # Include both AUC and accuracy here
)


from tensorflow.keras.callbacks import TensorBoard

log_dir = "logs"  # Directory to save the logs
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Define EarlyStopping
from tensorflow.keras.callbacks import EarlyStopping

early_stopping = EarlyStopping(
    monitor="val_loss",
    patience=3,  # Stop training after 3 epochs with no improvement
    restore_best_weights=True,  # Restore the best weights after stopping
)


nrms_model.model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, None, 300)]  0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 5, 300)]     0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 5)]          0           []                               
                                                                                                  
 time_distributed_1 (TimeDistri  (None, None, 300)   38221900    ['input_3[0][0]']                
 buted)                                                                                       

In [28]:
# Compile the model once
nrms_model.model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hparams["learning_rate"]),
                          loss='binary_crossentropy',
                          metrics=["AUC", "accuracy"])

In [29]:
# Create TensorFlow datasets
dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_1": history_input, "input_2": timestamp_input, "input_3": title_input}, label_input)
).shuffle(buffer_size=1000).batch(hparams["batch_size"]).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_tensor_slices(
    ({"input_1": val_history_input, "input_2": val_timestamp_input, "input_3": val_title_input}, val_label_input)
).batch(hparams["batch_size"]).prefetch(tf.data.AUTOTUNE)

# Train the model with validation
history = nrms_model.model.fit(
    dataset,
    epochs=hparams["epochs"],
    verbose=1,
    validation_data=val_dataset #,
    #callbacks=[early_stopping, tensorboard_callback]
)

Epoch 1/10
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Inside AttLayer2 call
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
