In [None]:
# Step 1: Install necessary libraries
!pip install tensorflow pandas numpy scikit-learn



In [80]:
# Step 2: Import required libraries
import tensorflow as tf
from tensorflow.keras import layers, Model
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
import os
from tensorflow.keras.preprocessing.text import Tokenizer

In [81]:
# Step 3: Mount Google Drive if using datasets stored there
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
# Step 4: Set file paths (modify these paths based on where your files are stored)
train_path = '/content/drive/MyDrive/deep_learning_project/data/train'  # Adjust to the correct path
test_path = '/content/drive/MyDrive/deep_learning_project/data/validation'  # Adjust to the correct path

In [83]:
# Step 5: Load the train and test datasets
df_train_history = pd.read_parquet(os.path.join(train_path, 'history.parquet'))
df_train_behaviors = pd.read_parquet(os.path.join(train_path, 'behaviors.parquet'))
df_train_articles = pd.read_parquet(os.path.join(train_path, 'articles.parquet'))

df_test_history = pd.read_parquet(os.path.join(test_path, 'history.parquet'))
df_test_behaviors = pd.read_parquet(os.path.join(test_path, 'behaviors.parquet'))
df_test_articles = pd.read_parquet(os.path.join(test_path, 'articles.parquet'))

In [84]:
# Step 6: Ensure consistent data types for merging in both train and test datasets
df_train_behaviors['article_id'] = df_train_behaviors['article_id'].fillna(-1).astype(int).astype(str)
df_train_articles['article_id'] = df_train_articles['article_id'].astype(str)
df_test_behaviors['article_id'] = df_test_behaviors['article_id'].fillna(-1).astype(int).astype(str)
df_test_articles['article_id'] = df_test_articles['article_id'].astype(str)

In [85]:
# Step 7: Merge behaviors with articles for both train and test datasets
df_train = pd.merge(df_train_behaviors, df_train_articles, left_on='article_id', right_on='article_id', how='left')
df_test = pd.merge(df_test_behaviors, df_test_articles, left_on='article_id', right_on='article_id', how='left')

In [86]:
print(df_train.columns)

Index(['impression_id', 'article_id', 'impression_time', 'read_time',
       'scroll_percentage', 'device_type', 'article_ids_inview',
       'article_ids_clicked', 'user_id', 'is_sso_user', 'gender', 'postcode',
       'age', 'is_subscriber', 'session_id', 'next_read_time',
       'next_scroll_percentage', 'title', 'subtitle', 'last_modified_time',
       'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url',
       'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory',
       'category_str', 'total_inviews', 'total_pageviews', 'total_read_time',
       'sentiment_score', 'sentiment_label'],
      dtype='object')


In [95]:
print(df_test.columns)

Index(['impression_id', 'article_id', 'impression_time', 'read_time',
       'scroll_percentage', 'device_type', 'article_ids_inview',
       'article_ids_clicked', 'user_id', 'is_sso_user', 'gender', 'postcode',
       'age', 'is_subscriber', 'session_id', 'next_read_time',
       'next_scroll_percentage', 'title', 'subtitle', 'last_modified_time',
       'premium', 'body', 'published_time', 'image_ids', 'article_type', 'url',
       'ner_clusters', 'entity_groups', 'topics', 'category', 'subcategory',
       'category_str', 'total_inviews', 'total_pageviews', 'total_read_time',
       'sentiment_score', 'sentiment_label'],
      dtype='object')


In [87]:
# Step 8: Define the process_data function to extract input data for NRMS model

def process_data(data, max_history_len=10, max_candidate_len=5):
    """
    Process dataset to extract user history, candidate articles, and labels for model training.

    Args:
        data (DataFrame): The dataset containing merged user, behavior, and article information.
        max_history_len (int): Maximum number of articles in user history.
        max_candidate_len (int): Maximum number of candidate articles shown in an impression.

    Returns:
        tuple: Arrays of user history, candidate articles, and labels.
    """
    user_history_col = 'article_ids_clicked'  # Kolumna z historią artykułów klikniętych
    candidate_articles_col = 'article_ids_inview'  # Kolumna z artykułami-kandydatami

    # Mapowanie `article_id` na indeksy w embedding_matrix
    data[user_history_col] = data[user_history_col].apply(
        lambda x: [article_to_index[article] for article in x[:max_history_len] if article in article_to_index] + [0] * (max_history_len - len(x))
    )
    data[candidate_articles_col] = data[candidate_articles_col].apply(
        lambda x: [article_to_index[article] for article in x[:max_candidate_len] if article in article_to_index] + [0] * (max_candidate_len - len(x))
    )

    # Przygotowanie `user_history` i `candidate_articles` w wymaganych kształtach
    user_history = np.array(data[user_history_col].tolist(), dtype=np.int32)  # Kształt (None, 10)
    candidate_articles = np.array(data[candidate_articles_col].tolist(), dtype=np.int32)  # Kształt (None, 5)

    # Generowanie etykiet na podstawie tego, czy artykuł był kliknięty
    labels = [
        [(1 if candidate in clicked else 0) for candidate in candidates[:max_candidate_len]]
        for clicked, candidates in zip(data[user_history_col], data[candidate_articles_col])
    ]
    labels = np.array(labels, dtype=np.int32)

    return user_history, candidate_articles, labels


In [88]:
# Step 9: Process train and test data
X_train_user_history, X_train_candidates, y_train = process_data(df_train)
X_test_user_history, X_test_candidates, y_test = process_data(df_test)

In [89]:
# Step 10: Import the embedding fle provided by the competition organizers
embedding_df = pd.read_parquet('/content/drive/MyDrive/deep_learning_project/data/embeddings/document_vector.parquet')

# Check the embedding vectors dimension
embedding_dim = len(embedding_df['document_vector'].iloc[0])

# Mapping article_id -> embedding index
article_to_index = {article_id: idx for idx, article_id in enumerate(embedding_df['article_id'])}

# Initialisation of embedding matrix
num_articles = len(article_to_index)
embedding_matrix = np.zeros((num_articles, embedding_dim))

# Puopulate the embedding matrix
for idx, row in embedding_df.iterrows():
    article_id = row['article_id']
    vector = np.array(row['document_vector'])
    if article_id in article_to_index:
        index = article_to_index[article_id]
        embedding_matrix[index] = vector

In [90]:
# Step 11: Updated NRMS Model with Improved Architecture and Metadata

class ImprovedNRMSModel:
    """Improved NRMS model with deeper architecture, metadata features, and tunable embeddings."""

    def __init__(self, word_embedding_dim=embedding_dim, head_num=16, head_dim=25, attention_hidden_dim=80, dropout_rate=0.35):
        self.word_embedding_dim = word_embedding_dim
        self.head_num = head_num
        self.head_dim = head_dim
        self.attention_hidden_dim = attention_hidden_dim
        self.dropout_rate = dropout_rate

    def build_news_encoder(self):
        """Build the news encoder with article embeddings."""
        input_layer = layers.Input(shape=(), dtype="int32")  # Pojedynczy indeks artykułu

        # Embedding layer with custom embedding matrix
        embedding_layer = layers.Embedding(input_dim=embedding_matrix.shape[0],
                                          output_dim=embedding_matrix.shape[1],
                                          embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
                                          trainable=True)(input_layer)

        # Dodaj inne warstwy, jeśli potrzebne (np. Dropout, LayerNormalization)
        output = layers.Dense(self.word_embedding_dim, activation="relu")(embedding_layer)

        model = Model(inputs=input_layer, outputs=output, name="news_encoder")
        return model

    def build_user_encoder(self, news_encoder):
        """Build user encoder to capture user interests."""
        input_layer = layers.Input(shape=(10,), dtype="int32")  # Sekwencja 10 artykułów dla użytkownika

        # Przetwarzanie sekwencji za pomocą `news_encoder` bez `TimeDistributed`
        clicked_news_embeddings = layers.Lambda(lambda x: tf.stack([news_encoder(article) for article in tf.unstack(x, axis=1)], axis=1))(input_layer)

        # Użycie Self-Attention
        y = layers.MultiHeadAttention(num_heads=self.head_num, key_dim=self.head_dim)(clicked_news_embeddings, clicked_news_embeddings)
        y = layers.Dropout(self.dropout_rate)(y)
        y = layers.LayerNormalization()(y)

        # Dodanie warstwy uwagi
        attention_output = layers.Attention()([y, y])
        output = layers.GlobalAveragePooling1D()(attention_output)

        model = Model(inputs=input_layer, outputs=output, name="user_encoder")
        return model

    def build_model(self, metadata_dim=None):
        """Build the full NRMS model with metadata concatenation."""
        user_history = layers.Input(shape=(10,), dtype="int32", name="user_history")  # Użytkownik - 10 klikniętych artykułów
        candidate_news = layers.Input(shape=(5,), dtype="int32", name="candidate_news")

        news_encoder = self.build_news_encoder()
        user_encoder = self.build_user_encoder(news_encoder)

        # Encode user history and candidate articles
        user_repr = user_encoder(user_history)
        candidate_repr = layers.Lambda(lambda x: tf.stack([news_encoder(article) for article in tf.unstack(x, axis=1)], axis=1))(candidate_news)

        # Dot product for similarity scores and softmax for probabilities
        scores = layers.Dot(axes=-1)([candidate_repr, user_repr])
        probs = layers.Activation("softmax")(scores)

        # Compile the model with custom metrics or additional regularization if needed
        model = Model(inputs=[user_history, candidate_news], outputs=probs)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4), loss="categorical_crossentropy", metrics=["AUC"])

        return model

In [91]:
# Step 12: Instantiate and compile the Improved NRMS model
improved_model = ImprovedNRMSModel()
model = improved_model.build_model()

In [92]:
# Step 13: Define callbacks for training (Learning Rate Scheduler)
from tensorflow.keras.callbacks import ReduceLROnPlateau

lr_scheduler = ReduceLROnPlateau(monitor='val_auc', factor=0.5, patience=2, min_lr=1e-6)


In [93]:
# Step 14: Train the model with updated parameters
history = model.fit([X_train_user_history, X_train_candidates], y_train,
                    validation_data=([X_test_user_history, X_test_candidates], y_test),
                    epochs=20, batch_size=64, callbacks=[lr_scheduler])

Epoch 1/20
[1m3639/3639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 7ms/step - AUC: 0.5390 - loss: 0.9894 - val_AUC: 0.7744 - val_loss: 0.7222 - learning_rate: 1.0000e-04
Epoch 2/20
[1m  59/3639[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m9s[0m 3ms/step - AUC: 0.7938 - loss: 0.7650

  callback.on_epoch_end(epoch, logs)


[1m3639/3639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - AUC: 0.8144 - loss: 0.7028 - val_AUC: 0.8454 - val_loss: 0.5928 - learning_rate: 1.0000e-04
Epoch 3/20
[1m3639/3639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - AUC: 0.8724 - loss: 0.5898 - val_AUC: 0.8631 - val_loss: 0.5419 - learning_rate: 1.0000e-04
Epoch 4/20
[1m3639/3639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - AUC: 0.8882 - loss: 0.5368 - val_AUC: 0.8605 - val_loss: 0.5371 - learning_rate: 1.0000e-04
Epoch 5/20
[1m3639/3639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 5ms/step - AUC: 0.8944 - loss: 0.5022 - val_AUC: 0.8581 - val_loss: 0.5373 - learning_rate: 1.0000e-04
Epoch 6/20
[1m3639/3639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 5ms/step - AUC: 0.8977 - loss: 0.4792 - val_AUC: 0.8646 - val_loss: 0.5184 - learning_rate: 1.0000e-04
Epoch 7/20
[1m3639/3639[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 6ms/step - AUC: 

In [94]:
# Step 15: Evaluate the model on test data
test_auc = model.evaluate([X_test_user_history, X_test_candidates], y_test, return_dict=True)['AUC']
print(f"AUC Score on Test Data: {test_auc}")

[1m7646/7646[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 3ms/step - AUC: 0.8541 - loss: 0.7161
AUC Score on Test Data: 0.8527636528015137
