In [1]:
from helpers import *

In [2]:
# Load the three raw inputs, unpacked (going by their names) as article
# metadata, the user click log, and the article embedding matrix.
# `load_dataset` is not defined in this notebook; presumably it arrives through
# the `helpers` star import — TODO confirm.
df_articles, df_clicks, article_embeddings = load_dataset()

In [3]:
# Preprocess the raw frames and wrap the embedding matrix in a DataFrame so it
# can be filtered with the same label-based indexing as the article metadata.
# NOTE(review): `df_articles` / `df_clicks` are overwritten here, so re-running
# this cell feeds already-preprocessed frames back into the helpers.
df_articles = preprocessing_articles(df_articles)
df_clicks = preprocessing_clicks(df_clicks)
article_embeddings_df = pd.DataFrame(article_embeddings)

# Both frames are indexed below with the same article labels, which only makes
# sense when there is exactly one embedding row per article row; fail fast
# instead of silently selecting mismatched rows.
assert len(df_articles) == len(article_embeddings_df), (
    "article metadata and embeddings are misaligned: "
    f"{len(df_articles)} articles vs {len(article_embeddings_df)} embedding rows"
)

print("df_articles shape", df_articles.shape)
print("article_embeddings shape", article_embeddings_df.shape)

df_articles shape (364047, 5)
article_embeddings shape (364047, 250)


In [4]:
# Restrict the article metadata and the embeddings to articles that appear in
# the click log. `value_counts()` lists each clicked id once (most-clicked
# first by default), so both frames end up in that row order.
articles_clicked = df_clicks["click_article_id"].value_counts().index
df_articles = df_articles.loc[articles_clicked]
article_embeddings_df = article_embeddings_df.loc[articles_clicked]

In [5]:
# Re-check the shapes after filtering: both frames were indexed with the same
# clicked-article labels, so their row counts should match.
print("df_articles shape", df_articles.shape)
print("article_embeddings shape", article_embeddings_df.shape)

df_articles shape (46033, 5)
article_embeddings shape (46033, 250)


In [7]:

# Encode the raw user / article ids as dense integer category codes so they
# can be used directly as embedding row indices.
user_ids = df_clicks['user_id'].astype('category').cat.codes.to_numpy()
article_ids = df_clicks['click_article_id'].astype('category').cat.codes.to_numpy()

# Implicit feedback: every row of the click log is labelled 1.0.
# NOTE(review): there are no 0-labelled (negative) examples at this point —
# confirm whether negatives are sampled elsewhere before training.
interactions = np.ones(df_clicks.shape[0])


In [10]:

# Embedding table sizes: one row per distinct user / article in the click log.
num_users = df_clicks["user_id"].nunique()
num_articles = df_clicks["click_article_id"].nunique()

# Assemble an (n_clicks, 3) matrix with columns
# [user code, article code, interaction label]. The float label column makes
# the whole matrix floating point.
interaction_data = np.column_stack((user_ids, article_ids, interactions))

# Hold out 10% of the interactions for validation; the seed is fixed so the
# split is repeatable.
train_data, val_data = train_test_split(
    interaction_data,
    test_size=0.1,
    random_state=42,
)


In [11]:
# Step 2: Build the Model
class CollaborativeFilteringModel(tf.keras.Model):
    """Dot-product collaborative filtering model.

    Each user and each article gets a learned embedding vector; the predicted
    interaction score is the sigmoid of the dot product of the two vectors.
    Both embedding tables carry a small L2 penalty (1e-6).
    """

    def __init__(self, num_users, num_articles, embedding_dim):
        """Create the embedding tables.

        Args:
            num_users: number of rows in the user embedding table.
            num_articles: number of rows in the article embedding table.
            embedding_dim: length of every embedding vector.
        """
        super().__init__()
        self.user_embedding = layers.Embedding(
            num_users,
            embedding_dim,
            embeddings_regularizer=tf.keras.regularizers.l2(1e-6),
        )
        self.article_embedding = layers.Embedding(
            num_articles,
            embedding_dim,
            embeddings_regularizer=tf.keras.regularizers.l2(1e-6),
        )
        self.dot = layers.Dot(axes=1)

    def call(self, inputs):
        """Score a batch of (user, article) pairs.

        Args:
            inputs: 2-D batch whose column 0 holds user indices and column 1
                holds article indices.

        Returns:
            Sigmoid of the user/article embedding dot product, one score per
            input row.
        """
        user_vectors = self.user_embedding(inputs[:, 0])
        article_vectors = self.article_embedding(inputs[:, 1])
        affinity = self.dot([user_vectors, article_vectors])
        return tf.nn.sigmoid(affinity)

def create_interaction_instances(data):
    """Split an interaction matrix into model inputs and labels.

    Args:
        data: 2-D NumPy array whose columns are
            ``[user index, article index, interaction label]``.

    Returns:
        Tuple ``(X, y)``: ``X`` holds the first two columns cast to ``int32``
        and ``y`` is the third column, left in its original dtype.
    """
    # The matrix is floating point because the ids were stacked next to the
    # float labels; restore an integer dtype so the embedding layers receive
    # exact indices instead of depending on an implicit float -> int cast.
    X = data[:, :2].astype("int32")
    y = data[:, 2]
    return X, y

In [12]:
# Size of each user / article embedding vector.
embedding_dim = 50

# Instantiate the model and attach the optimizer, loss and ranking metrics.
# The metric callables are not defined in this notebook; presumably they come
# from the `helpers` star import — TODO confirm.
model = CollaborativeFilteringModel(
    num_users=num_users,
    num_articles=num_articles,
    embedding_dim=embedding_dim,
)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=[ndcg_5, ndcg_10, mean_mrr, g_auc],
)

In [None]:
# Split the train and validation matrices (in that order) into model inputs
# (X) and labels (y).
(X_train, y_train), (X_val, y_val) = map(
    create_interaction_instances, (train_data, val_data)
)