In [1]:
from helpers import *
from tensorflow.keras.models import load_model

In [2]:
model = load_model('./input/content-based-reduced.h5', custom_objects={'ndcg_5': ndcg_5, 'ndcg_10': ndcg_10, 'mean_mrr': mean_mrr, 'g_auc': g_auc})
# model = load_model('./remote_save/content-based-reduced.h5', custom_objects={'ndcg_5': ndcg_5, 'ndcg_10': ndcg_10, 'mean_mrr': mean_mrr, 'g_auc': g_auc})

In [3]:
with open('./input/test_model', 'wb') as file:
    pickle.dump(model, file)

In [4]:
with open('./input/embeddings_dict.pkl', 'rb') as f:
   embeddings_dict = pickle.load(f)

In [5]:
# Load DataFrames from disk
user_profiles_df_all = pd.read_pickle("./input/user_profiles_df_all-reduced.pkl")
df_articles = pd.read_pickle("./input/df_articles-reduced.pkl")
article_embeddings_df = pd.read_pickle("./input/article_embeddings_df-reduced.pkl")
# user_profiles_df_all = pd.read_pickle("./input/user_profiles_df_all.pkl")
# df_articles = pd.read_pickle("./input/df_articles.pkl")
# article_embeddings_df = pd.read_pickle("./input/article_embeddings_df.pkl")
len(user_profiles_df_all["user_id"].unique())

322897

In [6]:
def infer_all_articles_scores(user_id, df, df_articles, article_embeddings_df, model):
    # Retrieve the user's embedding
    user_profile = df[df['user_id'] == user_id].iloc[0]
    
    if user_profile.empty:
        raise ValueError("User ID not found in the user profiles.")

    user_embedding = user_profile['user_embedding']

    # Get all articles embeddings
    embeddings_dict = article_embeddings_df.T.to_dict('list')
    
    article_ids = list(embeddings_dict.keys())
    combined_features_list = [np.concatenate((user_embedding, article_embedding)).reshape(1, -1) 
                              for article_embedding in embeddings_dict.values()]

    all_embeddings = np.vstack(combined_features_list)
    print("all_embeddings", all_embeddings.shape)
    
    # Predict relevance scores using the trained model
    scores = model.predict(all_embeddings, verbose=0).flatten()

    # Create a dataframe with article IDs, category IDs, and scores
    article_scores_df = df_articles[['article_id', 'category_id']].copy()
    article_scores_df['score'] = article_scores_df['article_id'].map(dict(zip(article_ids, scores)))
    
    # Remove any unwanted header rows if present
    # article_scores_df.columns = article_scores_df.columns.droplevel(0)
    article_scores_df.reset_index(drop=True, inplace=True)
    return article_scores_df

In [7]:
user_id=15587
articles_scores = infer_all_articles_scores(user_id, user_profiles_df_all, df_articles, article_embeddings_df, model)
articles_scores

all_embeddings (46033, 150)


Unnamed: 0,article_id,category_id,score
0,160974,281,9.995703e-01
1,272143,399,8.838707e-01
2,336221,437,9.999989e-01
3,234698,375,9.998107e-01
4,123909,250,2.258775e-01
...,...,...,...
46028,283269,412,7.818530e-03
46029,329065,436,5.289585e-07
46030,38473,51,1.266307e-06
46031,289316,421,6.379398e-04


In [8]:
user_profiles_df_all[user_profiles_df_all["user_id"] == 15958]

Unnamed: 0,user_id,click_article_id,user_embedding
15958,15958,"[343112, 284463, 95680, 207122, 261680, 273397...","[0.7785356119275093, 0.06490754832824071, -0.5..."


In [9]:
articles_scores[articles_scores["article_id"].isin([95680, 300470, 261680, 273397])]

Unnamed: 0,article_id,category_id,score
55,300470,428,0.209598
651,261680,396,8e-06
864,273397,399,0.000373
3754,95680,209,0.001313


In [10]:
def compute_dcg(y_true, y_pred, k):
    order = np.argsort(y_pred)[::-1]
    y_true = np.take(y_true, order[:k])
    gains = 2 ** y_true - 1
    discounts = np.log2(np.arange(1, len(y_true) + 1) + 1)
    return np.sum(gains / discounts)

def compute_ndcg(y_true, y_pred, k):
    if len(np.unique(y_true)) < 2:
        return 0.0
    dcg = compute_dcg(y_true, y_pred, k)
    idcg = compute_dcg(y_true, sorted(y_true, reverse=True), k)
    return dcg / idcg if idcg > 0 else 0.0

In [11]:
sampled_user_profiles_df = user_profiles_df_all
articles_df = df_articles

In [12]:
def show_detailed_ndcg_for_user(user_id, k=10):
    user = sampled_user_profiles_df.loc[sampled_user_profiles_df['user_id'] == user_id].iloc[0]
    user_embedding = user['user_embedding']
    clicked_articles = set(user['click_article_id'])

    all_embeddings = []
    article_ids = [article_id for article_id in articles_df['article_id'] if article_id in embeddings_dict]
    all_embeddings = [np.concatenate((user_embedding, embeddings_dict[article_id])).reshape(1, -1) for article_id in article_ids]
    
    all_embeddings = np.vstack(all_embeddings)
    print("all_embeddings", all_embeddings.shape)
    scores = model.predict(all_embeddings, verbose=0).flatten()
    
    true_labels = np.array([1 if article_id in clicked_articles else 0 for article_id in article_ids])
    
    order = np.argsort(scores)[::-1][:k]
    ranked_article_ids = np.array(article_ids)[order]
    ranked_scores = scores[order]
    ranked_true_labels = true_labels[order]
    
    dcg_score = compute_dcg(ranked_true_labels, ranked_scores, k)
    idcg_score = compute_dcg(ranked_true_labels, sorted(ranked_true_labels, reverse=True), k)
    ndcg_score = dcg_score / idcg_score if idcg_score > 0 else 0.0
    
    print(f"User ID: {user_id}")

    print("\nGround Truth Relevance:")
    for article_id, label in zip(article_ids, true_labels):
        if label > 0:
            print(f"  Article {article_id}: Relevance {label}")
    
    print("\nTop-{0} Predicted Ranking:".format(k))
    for i, (article_id, score, true_label) in enumerate(zip(ranked_article_ids, ranked_scores, ranked_true_labels)):
        print(f"  Rank {i+1}: Article {article_id} | Predicted Score: {score:.4f} | True Relevance: {true_label}")
    
    print(f"\nDCG@{k}: {dcg_score:.4f}")
    print(f"IDCG@{k}: {idcg_score:.4f}")
    print(f"NDCG@{k}: {ndcg_score:.4f}")
show_detailed_ndcg_for_user(15587, k=10)

all_embeddings (46033, 150)
User ID: 15587

Ground Truth Relevance:
  Article 336221: Relevance 1
  Article 234698: Relevance 1
  Article 233605: Relevance 1
  Article 161586: Relevance 1

Top-10 Predicted Ranking:
  Rank 1: Article 336221 | Predicted Score: 1.0000 | True Relevance: 1
  Rank 2: Article 236338 | Predicted Score: 0.9998 | True Relevance: 0
  Rank 3: Article 234698 | Predicted Score: 0.9998 | True Relevance: 1
  Rank 4: Article 336223 | Predicted Score: 0.9998 | True Relevance: 0
  Rank 5: Article 236552 | Predicted Score: 0.9996 | True Relevance: 0
  Rank 6: Article 160974 | Predicted Score: 0.9996 | True Relevance: 0
  Rank 7: Article 160132 | Predicted Score: 0.9996 | True Relevance: 0
  Rank 8: Article 336220 | Predicted Score: 0.9995 | True Relevance: 0
  Rank 9: Article 336254 | Predicted Score: 0.9995 | True Relevance: 0
  Rank 10: Article 236688 | Predicted Score: 0.9994 | True Relevance: 0

DCG@10: 1.5000
IDCG@10: 0.9200
NDCG@10: 1.6304


In [13]:
show_ndcg_for_user(15587)

NameError: name 'show_ndcg_for_user' is not defined