In [1]:
from helpers import *

In [2]:
# Load the three raw inputs returned by load_dataset() (expected to come from the
# star-import of `helpers`): article metadata, click logs and the article embeddings.
df_articles, df_clicks, article_embeddings = load_dataset()

In [3]:
# Preprocess the raw frames with the helper routines (their exact transformations are
# not visible in this notebook).
# NOTE(review): both names are rebound to their processed versions, so re-running this
# cell applies the preprocessing a second time to already-processed data.
df_articles = preprocessing_articles(df_articles)
df_clicks = preprocessing_clicks(df_clicks)
# Wrap the embeddings in a DataFrame so rows can later be selected by index label.
article_embeddings_df = pd.DataFrame(article_embeddings)

# Sanity check: both frames are expected to have one row per article.
print("df_articles shape", df_articles.shape)
print("article_embeddings shape", article_embeddings_df.shape)

df_articles shape (364047, 5)
article_embeddings shape (364047, 250)


In [4]:
# Restrict metadata and embeddings to articles that appear at least once in the click log.
# value_counts().index yields the distinct clicked article ids, most-clicked first.
articles_clicked = df_clicks.click_article_id.value_counts().index
# NOTE(review): both lookups below select rows by index *label*, so they assume each
# frame's index is the article id (for the embeddings: row i belongs to article id i)
# — confirm against helpers.
df_articles = df_articles.loc[articles_clicked]
article_embeddings_df = article_embeddings_df.loc[articles_clicked]

In [5]:
# Shapes after filtering: both frames should now have the same number of rows.
print("df_articles shape", df_articles.shape)
print("article_embeddings shape", article_embeddings_df.shape)

df_articles shape (46033, 5)
article_embeddings shape (46033, 250)


In [6]:
def train_test_split_sessions(clicks_df, test_size=0.1, val_size=0.1, random_state=42):
    """Split click rows into train / validation / test subsets by session.

    Unique ``session_id`` values are split first into (remaining, test) and the
    remaining sessions are then split into (train, validation). ``val_size`` is
    therefore a fraction of the sessions left after the test split, not of all
    sessions. All rows of a session land in the same subset.

    Args:
        clicks_df: DataFrame with a ``session_id`` column.
        test_size: Passed as ``test_size`` to the first ``train_test_split`` call.
        val_size: Passed as ``test_size`` to the second ``train_test_split`` call.
        random_state: Seed forwarded to both ``train_test_split`` calls.

    Returns:
        Tuple ``(train_df, val_df, test_df, all_df)``; ``all_df`` holds the rows
        whose session id is among the unique session ids.
    """
    session_column = clicks_df['session_id']
    session_ids = session_column.unique()

    remaining_sessions, test_sessions = train_test_split(
        session_ids, test_size=test_size, random_state=random_state
    )
    train_sessions, val_sessions = train_test_split(
        remaining_sessions, test_size=val_size, random_state=random_state
    )

    def _rows_for(sessions):
        # Select every click row that belongs to one of the given sessions.
        return clicks_df[session_column.isin(sessions)]

    return (
        _rows_for(train_sessions),
        _rows_for(val_sessions),
        _rows_for(test_sessions),
        _rows_for(session_ids),
    )
 

# Split the clicks dataframe by session using the default fractions and seed;
# all_clicks_df is the unsplit set, used later to build profiles for every user.
train_clicks_df, val_clicks_df, test_clicks_df, all_clicks_df = train_test_split_sessions(df_clicks)

# Report the number of click rows (and columns) in each subset.
print(f"Training clicks shape: {train_clicks_df.shape}")
print(f"Validation clicks shape: {val_clicks_df.shape}")
print(f"Testing clicks shape: {test_clicks_df.shape}")
print(f"All clicks shape: {all_clicks_df.shape}")

Training clicks shape: (2419742, 14)
Validation clicks shape: (269559, 14)
Testing clicks shape: (298880, 14)
All clicks shape: (2988181, 14)


In [7]:
#### Merging Articles Embeddings with Articles Metadata

# Merging with articles_metadata
# articles_merged_df = pd.merge(df_articles, article_embeddings_df, on='article_id')

In [8]:
# Register tqdm's pandas integration so Series.progress_apply (used in
# create_user_profiles) is available.
tqdm.pandas()

def create_user_profiles(clicks_df, article_embeddings_df):
    """Build one profile row per user: clicked article ids plus their mean embedding.

    Args:
        clicks_df: DataFrame with ``user_id`` and ``click_article_id`` columns.
        article_embeddings_df: DataFrame whose index labels are article ids and
            whose rows are the embedding vectors.

    Returns:
        DataFrame with columns ``user_id``, ``click_article_id`` (list of the
        user's clicked ids) and ``user_embedding`` (element-wise mean of the
        embeddings of the clicked articles that have an embedding).

    Note:
        Relies on ``Series.progress_apply``, i.e. ``tqdm.pandas()`` must have
        been called beforehand.
    """
    # article id -> embedding as a plain Python list
    vectors_by_article = article_embeddings_df.T.to_dict('list')

    def _mean_embedding(article_ids):
        # Average only over clicked articles for which an embedding exists.
        known_vectors = [
            vectors_by_article[article]
            for article in article_ids
            if article in vectors_by_article
        ]
        return np.mean(known_vectors, axis=0)

    clicks_per_user = clicks_df.groupby('user_id')['click_article_id'].apply(list)
    user_profiles = clicks_per_user.reset_index()
    user_profiles['user_embedding'] = user_profiles['click_article_id'].progress_apply(_mean_embedding)

    return user_profiles

# Build user profiles separately for each split and for the full click log.
# NOTE(review): each profile embedding is the mean of the same clicks that later serve
# as positive labels for that split, so the target articles leak into the user features.
user_profiles_df_train = create_user_profiles(train_clicks_df, article_embeddings_df)
user_profiles_df_test = create_user_profiles(test_clicks_df, article_embeddings_df)
user_profiles_df_val = create_user_profiles(val_clicks_df, article_embeddings_df)
user_profiles_df_all = create_user_profiles(all_clicks_df, article_embeddings_df)

100%|██████████| 294662/294662 [00:16<00:00, 18079.43it/s]
100%|██████████| 80449/80449 [00:02<00:00, 32987.88it/s]
100%|██████████| 74242/74242 [00:02<00:00, 34546.04it/s]
100%|██████████| 322897/322897 [00:19<00:00, 16409.14it/s]


In [9]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.optimizers import Adam

def create_content_based_model(input_dim):
    """Build and compile the feed-forward relevance model.

    Architecture: Input(input_dim) -> Dense(128, relu) -> Dropout(0.2)
    -> Dense(64, relu) -> Dropout(0.2) -> Dense(1, sigmoid).

    Args:
        input_dim: Length of the input feature vector.

    Returns:
        A compiled Keras ``Sequential`` model using the Adam optimizer, binary
        cross-entropy loss and the ``ndcg_5``, ``ndcg_10``, ``mean_mrr`` and
        ``g_auc`` metric callables (defined outside this cell).
    """
    network = models.Sequential([
        layers.Input(shape=(input_dim,)),
        # Hidden stack with light dropout after each dense layer
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.2),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),
        # Single sigmoid unit: predicted relevance score in [0, 1]
        layers.Dense(1, activation='sigmoid'),
    ])

    network.compile(
        optimizer=Adam(),
        loss='binary_crossentropy',
        metrics=[ndcg_5, ndcg_10, mean_mrr, g_auc],
    )
    return network


In [10]:
# Prepare data
def prepare_data(user_profiles_df_train, articles_df, articles_embeddings_df, max_users=500, random_state=None):
    """Build (features, label) pairs for the relevance model.

    For each user, every clicked article with a known embedding yields one
    positive row (label 1). The same number of articles the user did not click
    is then drawn at random, and each one with a known embedding yields a
    negative row (label 0). A feature row is the user embedding concatenated
    with the article embedding.

    Args:
        user_profiles_df_train: DataFrame with ``user_embedding`` and
            ``click_article_id`` columns (one row per user).
        articles_df: DataFrame with an ``article_id`` column; the pool from
            which negatives are drawn.
        articles_embeddings_df: DataFrame whose index labels are article ids
            and whose rows are the embedding vectors.
        max_users: Only the first ``max_users`` rows of the profile frame are
            used. ``None`` uses all rows. Defaults to 500, the cap that was
            previously hard-coded.
        random_state: Optional seed for negative sampling. ``None`` (default)
            keeps the previous behaviour of using numpy's global random state.

    Returns:
        Tuple ``(X, y)`` of numpy arrays: feature rows and 0/1 labels.
    """
    embeddings_dict = articles_embeddings_df.T.to_dict('list')

    # Hoisted out of the loop: the candidate id column is the same for every user.
    all_article_ids = articles_df['article_id']

    # One shared generator so successive users get different (but reproducible) draws.
    rng = None if random_state is None else np.random.RandomState(random_state)

    # Limit by row position rather than by index label, so frames with a
    # non-default index are capped correctly as well.
    if max_users is None:
        profiles = user_profiles_df_train
    else:
        profiles = user_profiles_df_train.head(max_users)

    X = []
    y = []

    for _, user in tqdm(profiles.iterrows(), total=len(profiles)):
        user_embedding = user['user_embedding']
        clicked_articles = user['click_article_id']

        for article_id in clicked_articles:
            if article_id in embeddings_dict:
                article_embedding = embeddings_dict[article_id]
                X.append(np.concatenate((user_embedding, article_embedding)))
                y.append(1)  # Positive sample

        # Draw as many negatives as there are clicks, capped by the number of
        # unclicked candidates so sample() cannot raise on a small article pool.
        candidates = all_article_ids[~all_article_ids.isin(clicked_articles)]
        n_negatives = min(len(clicked_articles), len(candidates))
        negative_samples = candidates.sample(n=n_negatives, random_state=rng)

        for article_id in negative_samples:
            if article_id in embeddings_dict:
                article_embedding = embeddings_dict[article_id]
                X.append(np.concatenate((user_embedding, article_embedding)))
                y.append(0)  # Negative sample

    return np.array(X), np.array(y)

In [11]:
# Build the training feature matrix and labels from the training-split user profiles.
X_train, y_train = prepare_data(user_profiles_df_train, df_articles, article_embeddings_df)


  0%|          | 500/294662 [00:00<09:38, 508.84it/s]


In [12]:
# Build the validation feature matrix and labels from the validation-split user profiles.
X_val, y_val = prepare_data(user_profiles_df_val, df_articles, article_embeddings_df)


  1%|          | 500/74242 [00:00<01:58, 622.96it/s]


In [13]:
# import tensorflow.keras.backend as K
# import numpy as np
# from sklearn.metrics import roc_auc_score
# from tqdm import tqdm
# import numpy as np

# def precision_at_k(true_labels, pred_scores, k=5):
#     top_k_indices = np.argsort(pred_scores)[-k:]
#     top_k_true_labels = true_labels[top_k_indices]
#     return np.sum(top_k_true_labels) / k

# def recall_at_k(true_labels, pred_scores, k=5):
#     top_k_indices = np.argsort(pred_scores)[-k:]
#     top_k_true_labels = true_labels[top_k_indices]
#     return np.sum(top_k_true_labels) / np.sum(true_labels)

def mrr(labels, predictions):
    """Reciprocal rank of the first relevant item in a single ranked list.

    Items are ranked by prediction score in descending order; ties keep their
    input order.

    Args:
        labels: Sequence of relevance labels; an item is relevant when its
            label equals 1.
        predictions: Sequence of scores, same length as ``labels``.

    Returns:
        ``1 / rank`` of the highest-ranked relevant item (rank starts at 1),
        or ``0.0`` when no item is relevant.

    Raises:
        ValueError: If ``labels`` and ``predictions`` differ in length.
    """
    if len(labels) != len(predictions):
        raise ValueError("Length of labels and predictions must be equal")

    # Labels reordered from best-scored to worst-scored item.
    ranked_pairs = sorted(zip(labels, predictions), key=lambda pair: pair[1], reverse=True)
    ranked_labels = [label for label, _score in ranked_pairs]

    # 1-based position of the first relevant label, if any.
    first_hit_rank = next(
        (rank for rank, label in enumerate(ranked_labels, start=1) if label == 1),
        None,
    )

    if first_hit_rank is None:
        return 0.0
    return 1.0 / first_hit_rank


# def ndcg_at_k(y_true, y_pred, k=5):
#     def compute_dcg(y_true, y_pred, k):
#         order = np.argsort(y_pred)[::-1]
#         y_true = np.take(y_true, order[:k])
#         gains = 2 ** y_true - 1
#         discounts = np.log2(np.arange(len(y_true)) + 2)
#         return np.sum(gains / discounts)

#     def compute_ndcg(y_true, y_pred, k):
#         dcg = compute_dcg(y_true, y_pred, k)
#         ideal_dcg = compute_dcg(y_true, y_true, k)  # Ideal sorted DCG
#         return dcg / ideal_dcg if ideal_dcg > 0 else 0

#     return tf.py_function(compute_ndcg, (y_true, y_pred, k), tf.double)

# def g_auc(y_true, y_pred, user_ids):
#     def compute_auc(y_true, y_pred, user_ids):
#         users = np.unique(user_ids)
#         aucs = []
#         for user in users:
#             user_indices = np.where(user_ids == user)[0]
#             user_indices = tf.constant(user_indices, dtype=tf.int32)
            
#             user_true = tf.gather(y_true, user_indices)
#             user_pred = tf.gather(y_pred, user_indices)
            
#             user_true_np = user_true.numpy()
#             user_pred_np = user_pred.numpy()

#             if len(np.unique(user_true_np)) > 1:  # Avoid cases where true labels are all the same
#                 auc = roc_auc_score(user_true_np, user_pred_np)
#                 aucs.append(auc)
#         return np.mean(aucs) if aucs else 0.

#     return tf.py_function(compute_auc, (y_true, y_pred, user_ids), tf.double)

In [14]:
# The model input is the user embedding concatenated with the article embedding
# (see prepare_data), so input_dim is taken from the feature matrix itself rather
# than from the embedding width alone.
input_dim = X_train.shape[1]
content_based_model = create_content_based_model(input_dim)

# Print the layer-by-layer architecture and parameter counts.
content_based_model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               64128     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 1)                 65        
                                                                 
Total params: 72449 (283.00 KB)
Trainable params: 72449 (283.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [15]:
# Inspect the training feature matrix (numpy abbreviates the repr for large arrays).
X_train

array([[-0.21033628, -0.96357318, -0.19693483, ..., -0.4184863 ,
         0.1679776 ,  0.27869353],
       [-0.21033628, -0.96357318, -0.19693483, ..., -0.39606935,
         0.30193529,  0.48606798],
       [-0.21033628, -0.96357318, -0.19693483, ..., -0.0688789 ,
         0.24662791, -0.00772025],
       ...,
       [-0.46035395, -0.97221979,  0.08281855, ...,  0.27636528,
         0.85097331,  0.57291663],
       [-0.46035395, -0.97221979,  0.08281855, ..., -0.60704958,
        -0.03966475, -0.49296063],
       [-0.46035395, -0.97221979,  0.08281855, ...,  0.70272529,
        -0.16361353,  0.03004179]])

In [16]:
class CustomMetricsCallback(tf.keras.callbacks.Callback):
    """Keras callback that prints every logged metric at the end of each epoch."""

    def on_epoch_end(self, epoch, logs=None):
        """Print the 1-based epoch number followed by one ``- name: value`` line per metric.

        Args:
            epoch: Zero-based epoch index supplied by Keras.
            logs: Mapping of metric name to value; treated as empty when falsy.
        """
        metrics = logs if logs else {}
        print(f"\n\nEpoch {epoch+1}:", end=" ")
        for name in metrics:
            # Each metric goes on its own line, formatted to four decimals.
            print(f"\n- {name}: {metrics[name]:.4f}", end=", ")
        print("\n")

# Using the custom callback to print all logged metrics after every epoch
custom_metrics_callback = CustomMetricsCallback()
# Train the model on the prepared training pairs, validating on the validation pairs
# after each epoch; `history` keeps the object returned by fit().
history = content_based_model.fit(
    X_train, 
    y_train, 
    epochs=10, 
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[custom_metrics_callback]
)

Epoch 1/10

Epoch 1: 
- loss: 0.5084, 
- ndcg_5: 0.4753, 
- ndcg_10: 0.4753, 
- mean_mrr: 0.0604, 
- g_auc: 0.8267, 
- val_loss: 0.4117, 
- val_ndcg_5: 0.5318, 
- val_ndcg_10: 0.5318, 
- val_mean_mrr: 0.0674, 
- val_g_auc: 0.8877, 

Epoch 2/10

Epoch 2: 
- loss: 0.4403, 
- ndcg_5: 0.5134, 
- ndcg_10: 0.5134, 
- mean_mrr: 0.0651, 
- g_auc: 0.8782, 
- val_loss: 0.3798, 
- val_ndcg_5: 0.5318, 
- val_ndcg_10: 0.5318, 
- val_mean_mrr: 0.0674, 
- val_g_auc: 0.9018, 

Epoch 3/10

Epoch 3: 
- loss: 0.4023, 
- ndcg_5: 0.5106, 
- ndcg_10: 0.5106, 
- mean_mrr: 0.0648, 
- g_auc: 0.8990, 
- val_loss: 0.3628, 
- val_ndcg_5: 0.5318, 
- val_ndcg_10: 0.5318, 
- val_mean_mrr: 0.0674, 
- val_g_auc: 0.9122, 

Epoch 4/10

Epoch 4: 
- loss: 0.3742, 
- ndcg_5: 0.4965, 
- ndcg_10: 0.4965, 
- mean_mrr: 0.0630, 
- g_auc: 0.9141, 
- val_loss: 0.3354, 
- val_ndcg_5: 0.5318, 
- val_ndcg_10: 0.5318, 
- val_mean_mrr: 0.0674, 
- val_g_auc: 0.9224, 

Epoch 5/10

Epoch 5: 
- loss: 0.3504, 
- ndcg_5: 0.5289, 
- ndcg_10:

In [17]:
def evaluate_model_optimized(model, user_profiles_df_train, articles_df, articles_embeddings_df, k=10, num_users=2000):
    """Rank every article for a sample of users and report ranking metrics.

    For each sampled user the model scores all articles of ``articles_df`` that
    have an embedding; an article is labelled relevant (1) when the user
    clicked it. Per-user precision@k, recall@k, MRR and NDCG@k are averaged
    over the sampled users.

    Args:
        model: Object with a Keras-style ``predict(features, verbose=0)``.
        user_profiles_df_train: DataFrame with ``user_embedding`` and
            ``click_article_id`` columns; users are sampled from it.
        articles_df: DataFrame with an ``article_id`` column.
        articles_embeddings_df: DataFrame whose index labels are article ids
            and whose rows are the embedding vectors.
        k: Cut-off for precision, recall and NDCG.
        num_users: Number of users to sample (seed 42). Capped at the number
            of available profiles instead of raising.

    Returns:
        Tuple ``(avg_ndcg, avg_mrr, avg_precision, avg_recall, g_auc)``.
        ``g_auc`` is a single ROC AUC computed over the pooled labels and
        scores of all sampled users (not an average of per-user AUCs).

    Raises:
        ValueError: If no article of ``articles_df`` has an embedding.
    """
    embeddings_dict = articles_embeddings_df.T.to_dict('list')

    # The candidate articles and their embedding matrix do not depend on the
    # user, so build them once instead of once per user.
    article_ids = [
        article_id
        for article_id in articles_df['article_id']
        if article_id in embeddings_dict
    ]
    if not article_ids:
        raise ValueError("No article in articles_df has an embedding.")
    article_matrix = np.array([embeddings_dict[article_id] for article_id in article_ids], dtype=float)

    # sample() raises when asked for more rows than exist; cap instead.
    num_users = min(num_users, len(user_profiles_df_train))
    sampled_user_profiles_df = user_profiles_df_train.sample(n=num_users, random_state=42)

    precisions = []
    recalls = []
    mrrs = []
    ndcgs = []

    all_true_labels = []
    all_scores = []

    for _, user in tqdm(sampled_user_profiles_df.iterrows(), total=num_users, desc="Evaluating", ncols=100):
        user_embedding = np.asarray(user['user_embedding'], dtype=float)
        clicked_articles = set(user['click_article_id'])

        # One feature row per article: [user embedding | article embedding].
        user_block = np.tile(user_embedding, (len(article_ids), 1))
        all_embeddings = np.hstack([user_block, article_matrix])

        scores = model.predict(all_embeddings, verbose=0).flatten()  # verbose=0 suppresses model output
        true_labels = np.array([1 if article_id in clicked_articles else 0 for article_id in article_ids])

        precisions.append(precision_at_k(true_labels, scores, k))
        recalls.append(recall_at_k(true_labels, scores, k))
        mrrs.append(mrr(true_labels, scores))
        ndcgs.append(ndcg_at_k(true_labels, scores, k))

        all_true_labels.extend(true_labels)
        all_scores.extend(scores)

    avg_precision = np.mean(precisions)
    avg_recall = np.mean(recalls)
    avg_mrr = np.mean(mrrs)
    avg_ndcg = np.mean(ndcgs)
    g_auc = roc_auc_score(all_true_labels, all_scores)

    return avg_ndcg, avg_mrr, avg_precision, avg_recall, g_auc

In [18]:
# X_test, y_test = prepare_training_data(user_test_profiles_df, df_articles, article_embeddings_df)

In [19]:
# ndcg_score, mrr_score, auc_score, y_pred = evaluate_model_on_test_data(content_based_model, X_test, y_test)
# print(f"NDCG@10: {ndcg_score:.4f}, MRR: {mrr_score:.4f}, AUC: {auc_score:.4f}")
# Evaluation
# ndcg_score, mrr_score, g_auc_score, y_true, y_pred, user_ids = evaluate_model_optimized(content_based_model, user_profiles_df_test, df_articles, article_embeddings_df, k=10, num_users=len(user_profiles_df_test))
# Evaluate on 10 sampled users from the test-split profiles and print the averaged metrics.
# NOTE(review): the test profiles' embeddings are the mean of the very clicks used as
# relevance labels here, so these scores are likely optimistic.
avg_ndcg, avg_mrr, avg_precision, avg_recall, g_auc = evaluate_model_optimized(content_based_model, user_profiles_df_test, df_articles, article_embeddings_df, k=10, num_users=10)
print(f"NDCG@10: {avg_ndcg:.4f}, MRR: {avg_mrr:.4f}, precision: {avg_precision:.4f}, recall: {avg_recall:.4f}, g_auc: {g_auc:.4f}")


Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:09<00:00,  1.10it/s]


NDCG@10: 0.0693, MRR: 0.1359, precision: 0.0200, recall: 0.0625, g_auc: 0.9147


In [20]:
# Evaluating: 100%|███████████████████████████████████████████████████| 10/10 [00:08<00:00,  1.13it/s]
# NDCG@10: 0.3124, MRR: 0.3869, precision: 0.1400, recall: 0.4244, g_auc: 0.9683

In [21]:
def infer_top_k_articles(user_id, user_profiles_df, df_articles, article_embeddings_df, model, k=5):
    """Score all articles for one user and return the best and worst unclicked ones.

    Args:
        user_id: Id looked up in the ``user_id`` column of ``user_profiles_df``.
        user_profiles_df: DataFrame with ``user_id``, ``click_article_id`` and
            ``user_embedding`` columns.
        df_articles: DataFrame with ``article_id`` and ``category_id`` columns.
        article_embeddings_df: DataFrame whose index labels are article ids
            and whose rows are the embedding vectors.
        model: Object with a Keras-style ``predict(features, verbose=0)``.
        k: Number of articles to return in each of the top and bottom lists.

    Returns:
        Tuple ``(top_k_articles, bottom_k_articles, user_article_clicked)`` of
        DataFrames with columns ``article_id``, ``category_id``, ``score``:
        the ``k`` highest-scored unclicked articles (best first), the ``k``
        lowest-scored unclicked articles (worst first), and the articles the
        user already clicked (best first).

    Raises:
        ValueError: If ``user_id`` is not present in ``user_profiles_df``.
    """
    # Check for a match *before* indexing; .iloc[0] on an empty frame would
    # raise IndexError and the intended ValueError would never be reached.
    matching_profiles = user_profiles_df[user_profiles_df['user_id'] == user_id]
    if matching_profiles.empty:
        raise ValueError("User ID not found in the user profiles.")
    user_profile = matching_profiles.iloc[0]

    user_embedding = np.asarray(user_profile['user_embedding'], dtype=float)
    clicked_ids = user_profile["click_article_id"]

    # One feature row per article: [user embedding | article embedding].
    article_ids = article_embeddings_df.index.to_numpy()
    article_matrix = article_embeddings_df.to_numpy(dtype=float)
    user_block = np.tile(user_embedding, (len(article_ids), 1))
    all_embeddings = np.hstack([user_block, article_matrix])

    # Predict relevance scores using the trained model
    scores = np.asarray(model.predict(all_embeddings, verbose=0)).flatten()

    print(user_profile["click_article_id"])

    # Attach scores to the article metadata and rank best-first.
    scored_articles = df_articles.copy()
    scored_articles['score'] = scored_articles['article_id'].map(dict(zip(article_ids, scores)))
    scored_articles = scored_articles.sort_values(by='score', ascending=False)[["article_id", "category_id", "score"]]

    already_clicked = scored_articles['article_id'].isin(clicked_ids)
    user_article_clicked = scored_articles[already_clicked].reset_index(drop=True)

    # Candidates are unclicked articles that actually received a score.
    # Taking head/tail of this ranking always yields k rows when available,
    # even if clicked articles occupy the global top or bottom positions.
    candidates = scored_articles[~already_clicked].dropna(subset=['score'])

    top_k_articles = candidates.head(k).reset_index(drop=True)
    bottom_k_articles = (
        candidates.tail(k)
        .reset_index(drop=True)
        .sort_values(by='score', ascending=True)
    )

    return top_k_articles, bottom_k_articles, user_article_clicked

In [22]:
# Inspect the training-split user profiles (ids, clicked articles, mean embedding).
user_profiles_df_train

Unnamed: 0,user_id,click_article_id,user_embedding
0,0,"[157541, 68866, 96755, 313996, 160158, 233470,...","[-0.2103362800553441, -0.9635731801390648, -0...."
1,1,"[235840, 96663, 59758, 160474, 36162, 234481, ...","[-0.20247302064672112, -0.9621158763766289, -0..."
2,3,"[236444, 234318, 236065, 236294, 234686, 23376...","[-0.5879472325054499, -0.9644155089671795, -0...."
3,4,"[336499, 271261, 48915, 44488]","[-0.010430984199047089, -0.9642642736434937, -..."
4,5,"[124228, 283776, 286310, 237257, 156619, 27155...","[-0.09797036762548876, -0.9608012786933354, -0..."
...,...,...,...
294657,322890,"[62464, 10023]","[-0.19085068255662918, -0.9595281183719635, 0...."
294658,322892,"[42567, 39894]","[-0.19202575460076332, -0.9561611711978912, -0..."
294659,322893,"[50644, 36162]","[0.29815271496772766, -0.9459012746810913, -0...."
294660,322895,"[289197, 63746]","[-0.15992705523967743, -0.9621853232383728, 0...."


In [29]:
# User to generate recommendations for. The earlier `user_id=3` assignment was
# immediately overwritten and has been dropped.
user_id = 4
# Profiles built from the full click log are used so the user's whole history is known.
top_k_articles, bottom_k_article_ids, user_article_clicked = infer_top_k_articles(user_id, user_profiles_df_all, df_articles, article_embeddings_df, content_based_model, k=5)

[336499, 271261, 48915, 44488, 195887, 195084, 63307]


In [24]:
# Articles the user already clicked, with the score the model assigns to each.
user_article_clicked

Unnamed: 0,article_id,category_id,score
0,195887,317,0.999075
1,195084,317,0.999071
2,63307,132,0.986042


In [25]:
# Highest-scored articles the user has not clicked yet.
top_k_articles

Unnamed: 0,article_id,category_id,score
0,194617,317,0.999978
1,192943,317,0.999973
2,195125,317,0.99995
3,199197,323,0.999947
4,195643,317,0.999937


In [26]:
# Lowest-scored unclicked articles (despite the name, this is a DataFrame, not a list of ids).
bottom_k_article_ids

Unnamed: 0,article_id,category_id,score
4,323476,434,1e-05
3,340621,438,1.2e-05
2,254178,389,1.2e-05
1,323508,434,1.7e-05
0,259554,395,2.5e-05


In [28]:
# Persist the trained model to an .h5 file under ./input.
# NOTE(review): the model was compiled with custom metric functions, so reloading it
# presumably requires custom_objects (or compile=False) — confirm when loading.
content_based_model.save('./input/content-based.h5')

  saving_api.save_model(


In [30]:
# Save DataFrames to disk as pickles: the all-clicks user profiles plus the article
# metadata and embeddings restricted to clicked articles.
user_profiles_df_all.to_pickle("./input/user_profiles_df_all.pkl")
df_articles.to_pickle("./input/df_articles.pkl")
article_embeddings_df.to_pickle("./input/article_embeddings_df.pkl")

In [31]:
# Final check of the embeddings frame that was just saved: (rows, columns).
article_embeddings_df.shape

(46033, 250)