**1) Build Article Embeddings**

In [16]:
import os
import json
import numpy as np
import pandas as pd
import pickle

# 1) Load the news data
def load_news_data(news_file):
    """
    Loads the MIND 'news.tsv' file into a DataFrame with columns:
      [news_id, category, subcategory, title, abstract, url, title_entities, abstract_entities]
    Returns a DataFrame indexed by news_id.
    """
    cols = ['news_id', 'category', 'subcategory', 'title', 'abstract',
            'url', 'title_entities', 'abstract_entities']
    news_df = pd.read_csv(news_file, sep='\t', header=None, names=cols)
    news_df.set_index('news_id', inplace=True)
    return news_df

# 2) Parse the JSON-like entity columns

def parse_entity_list(entity_str):
    """
    Given a JSON-like string of entity objects (from 'title_entities' or 'abstract_entities'),
    extract the 'WikidataId' fields into a list of IDs.
    Example of entity_str:
        '[{"Label":"Skin tag","WikidataId":"Q12345","Type":"C"}, ...]'
    If parsing fails or empty, returns [].
    """
    try:
        data = json.loads(entity_str)
        wikidata_ids = [obj['WikidataId'] for obj in data if 'WikidataId' in obj]
        return wikidata_ids
    except (json.JSONDecodeError, TypeError):
        return []

# 3) Load entity embeddings

def load_entity_embeddings(embedding_file):
    """
    Loads 'entity_embedding.vec' into a dict: {WikidataId -> embedding (np.array)}.
    """
    embeddings = {}
    with open(embedding_file, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            entity_id = parts[0]            # e.g. 'Q12345'
            vector_vals = [float(x) for x in parts[1:]]
            embeddings[entity_id] = np.array(vector_vals, dtype=np.float32)
    return embeddings

# 4) Build per-article embeddings

def build_article_embeddings(news_df, embeddings, use_abstract=False):
    """
    For each article in 'news_df', gather all WikidataIds from 'title_entities'
    (and optionally 'abstract_entities'), look them up in 'embeddings',
    and average them to create a single (d-dimensional) vector.
    Returns a dict: { news_id -> np.array (d,) }.
    """
    article_vectors = {}

    for news_id, row in news_df.iterrows():
        # Parse title entities
        title_ids = parse_entity_list(row['title_entities'])
        # Optional: parse abstract entities for more coverage
        if use_abstract:
            abstract_ids = parse_entity_list(row['abstract_entities'])
            entity_ids = title_ids + abstract_ids
        else:
            entity_ids = title_ids

        # Gather embeddings
        valid_vectors = [embeddings[eid] for eid in entity_ids if eid in embeddings]

        if valid_vectors:
            article_vectors[news_id] = np.mean(valid_vectors, axis=0)
        # else, skip or handle articles with no entities

    return article_vectors

# Example usage in Jupyter:
# (Adjust the file paths as needed for your environment)

dataset_dir = '../MINDsmall_train'
news_file = os.path.join(dataset_dir, 'news.tsv')
embedding_file = os.path.join(dataset_dir, 'entity_embedding.vec')
news_df = load_news_data(news_file)
embeddings = load_entity_embeddings(embedding_file)
article_vectors = build_article_embeddings(news_df, embeddings, use_abstract=True)

print(f"Loaded news_df with shape: {news_df.shape}")
print(f"Loaded {len(embeddings)} entity embeddings.")
print(f"Created embeddings for {len(article_vectors)} articles out of {len(news_df)}.")

# (Optional) Save article_vectors to pickle
output_file = 'article_vectors.pkl'
with open(output_file, 'wb') as f:
 pickle.dump(article_vectors, f)
print(f"Saved article embeddings to: {output_file}")

Loaded news_df with shape: (51282, 7)
Loaded 26904 entity embeddings.
Created embeddings for 44262 articles out of 51282.
Saved article embeddings to: article_vectors.pkl


**2) Cluster Articles**

In [17]:
import pickle
import numpy as np
from sklearn.cluster import KMeans

# Load the article embeddings from pickle
with open('article_vectors.pkl', 'rb') as f:
    article_vectors = pickle.load(f)

print("Number of articles with embeddings:", len(article_vectors))

# Prepare data for clustering: shape (num_articles, embedding_dim)
article_ids = list(article_vectors.keys())
X = np.stack([article_vectors[a] for a in article_ids], axis=0)
print("Embedding array shape:", X.shape)

# Choose your K (number of clusters)
num_clusters = 50

# Fit K-Means (or another clustering algorithm)
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
kmeans.fit(X)

# The cluster label for each article
labels = kmeans.labels_  # shape: (num_articles,)
print("Cluster label array shape:", labels.shape)

# Build a dictionary: article_id -> cluster_label
cluster_assignments = {
    article_id: int(cluster_label)
    for article_id, cluster_label in zip(article_ids, labels)
}

# (Optional) Save the cluster assignments and the KMeans model
with open('cluster_assignments.pkl', 'wb') as f:
    pickle.dump(cluster_assignments, f)

with open('kmeans_model.pkl', 'wb') as f:
    pickle.dump(kmeans, f)

print(f"Saved cluster assignments for {len(cluster_assignments)} articles.")
print("K-Means model also saved to 'kmeans_model.pkl'.")


Number of articles with embeddings: 44262
Embedding array shape: (44262, 100)
Cluster label array shape: (44262,)
Saved cluster assignments for 44262 articles.
K-Means model also saved to 'kmeans_model.pkl'.


**3)Build User Profiles (Train Set)**

In [18]:
import pandas as pd
import numpy as np
import pickle
from collections import defaultdict

cols = ['impression_id', 'user_id', 'time', 'history', 'impressions']
train_behaviors_df = pd.read_csv('../MINDsmall_train/behaviors.tsv', sep='\\t', header=None, names=cols)
test_behaviors_df = pd.read_csv('../MINDsmall_dev/behaviors.tsv', sep='\\t', header=None, names=cols)

# Load cluster assignments (article_id -> cluster_label)
with open('cluster_assignments.pkl', 'rb') as f:
    cluster_assignments = pickle.load(f)



def build_user_profiles_embedding_based(behaviors_df, article_vectors):
    user_profiles = {}
    for _, row in behaviors_df.iterrows():
        user_id = row["user_id"]
        if not isinstance(row["history"], str):
            continue
        clicked_ids = row["history"].split()
        vectors = [article_vectors[aid] for aid in clicked_ids if aid in article_vectors]
        if vectors:
            user_profiles[user_id] = np.mean(vectors, axis=0)
    return user_profiles

# Actually build the profiles
user_profiles = build_user_profiles_embedding_based(train_behaviors_df, article_vectors)

print(f"Built profiles for {len(user_profiles)} users.")

# Save profiles to a pickle
with open('user_profiles.pkl', 'wb') as f:
    pickle.dump(user_profiles, f)

print("Saved user profiles to 'user_profiles.pkl'.")



  train_behaviors_df = pd.read_csv('../MINDsmall_train/behaviors.tsv', sep='\\t', header=None, names=cols)
  test_behaviors_df = pd.read_csv('../MINDsmall_dev/behaviors.tsv', sep='\\t', header=None, names=cols)


Built profiles for 48887 users.
Saved user profiles to 'user_profiles.pkl'.


**Generate Recommendations**

In [19]:
import pickle
import numpy as np
from collections import defaultdict
# from sklearn.metrics.pairwise import cosine_similarity  # optional if you want similarity-based ranking

# 1) Load required data
with open('user_profiles.pkl', 'rb') as f:
    user_profiles = pickle.load(f)  # { user_id -> np.array of shape (num_clusters,) }

with open('cluster_assignments.pkl', 'rb') as f:
    cluster_assignments = pickle.load(f)  # { article_id -> cluster_label }

# (Optional) load article embeddings if you want advanced ranking:
with open('article_vectors.pkl', 'rb') as f:
    article_vectors = pickle.load(f)  # { article_id -> np.array(dim,) }

# 2) Build a reverse map: cluster -> all articles in that cluster
cluster_to_articles = defaultdict(list)
for art_id, cluster_label in cluster_assignments.items():
    cluster_to_articles[cluster_label].append(art_id)

def recommend_for_user(
    user_id,
    user_profiles,
    cluster_to_articles,
    article_vectors=None,
    top_clusters=2,
    top_k=5
):
    """
    Recommend top_k articles for the given user by:
      1) Selecting the top_clusters with the highest probability in user_profiles.
      2) Gathering all articles from those clusters.
      3) (Optional) Ranking them by similarity to user's embedding (or any heuristic).
    """
    if user_id not in user_profiles:
        # Cold start or invalid user
        return []

    # 1) Identify top clusters for this user
    distribution = user_profiles[user_id]  # shape: (num_clusters,)
    # sort clusters by descending probability
    cluster_indices = np.argsort(distribution)[::-1]
    selected_clusters = cluster_indices[:top_clusters]

    # 2) Gather candidate articles from these clusters
    candidate_articles = []
    for c_idx in selected_clusters:
        candidate_articles.extend(cluster_to_articles[c_idx])
    candidate_articles = list(set(candidate_articles))  # remove duplicates if clusters overlap

    # 3) (Optional) Rank candidates by similarity to user’s embedding
    # One approach is to build a "user embedding" by weighting cluster centroids or
    # by averaging the embeddings of the user’s clicked articles. For now, let's skip advanced ranking.

    # Quick naive approach: just pick the first top_k
    # If you have an advanced approach, you'd do something like:
    """
    user_embed = ... # e.g., an average of clicked article vectors
    sims = []
    for art_id in candidate_articles:
        if art_id in article_vectors:
            art_vec = article_vectors[art_id]
            # compute cosine similarity
            similarity_val = np.dot(user_embed, art_vec) / (np.linalg.norm(user_embed)*np.linalg.norm(art_vec))
            sims.append((art_id, similarity_val))
    # sort by similarity desc
    sims.sort(key=lambda x: x[1], reverse=True)
    recommended = [x[0] for x in sims[:top_k]]
    """
    # But for this minimal example, we do no ranking
    recommended = candidate_articles[:top_k]

    return recommended


# Example usage:
test_user_id = 'U13740'  # or pick any user from user_profiles
recommendations = recommend_for_user(test_user_id, user_profiles, cluster_to_articles, article_vectors, top_clusters=2, top_k=5)
print(f"Recommendations for user {test_user_id}:", recommendations)


Recommendations for user U13740: ['N14637', 'N62254', 'N29180', 'N50637', 'N20904']


In [20]:
from numpy.linalg import norm
import numpy as np

def get_article_title(news_df, article_id):
    """Return the title for a given article_id, or a fallback message if not found."""
    if article_id in news_df.index:
        return news_df.loc[article_id, 'title']
    return "[Title not found]"

def recommend_for_user(
    user_id,
    user_profiles,
    cluster_to_articles,
    news_df,
    train_behaviors_df,
    article_vectors,
    top_clusters=2,
    top_k=5
):
    """
    Recommend top_k articles for the given user by:
      1) Selecting the top_clusters with the highest probability in user_profiles.
      2) Gathering all articles from those clusters.
      3) Building a user embedding by averaging the embeddings of the user's clicked articles.
      4) Ranking those candidate articles by cosine similarity to the user embedding.
      5) Returning the top_k.
    Also prints some user history for context, plus recommended article titles.
    """
    if user_id not in user_profiles:
        print(f"No profile found for user {user_id}. Returning empty list.")
        return []


     # ----------------------------------
    # Print some of the user's history
    # ----------------------------------
    user_rows = train_behaviors_df[train_behaviors_df['user_id'] == user_id]
    if user_rows.empty:
        print(f"No behavior records found for user {user_id}.")
    else:
        first_row = user_rows.iloc[0]
        history_str = first_row['history']
        if isinstance(history_str, str):
            clicked_articles = history_str.split()
            #print(f"User {user_id} clicked {len(clicked_articles)} articles. Showing up to 15 titles:")
            for art_id in clicked_articles[:15]:
                title = news_df.loc[art_id, 'title'] if art_id in news_df.index else '[Title not found]'
                #print(f" - {art_id}: {title}")
        else:
            print(f"User {user_id} has no clicked articles in the first row record.")
    
    # ----------------------------------
    # Identify top clusters for this user
    # ----------------------------------
    distribution = user_profiles[user_id]  # shape: (num_clusters,)
    cluster_indices = np.argsort(distribution)[::-1]
    selected_clusters = cluster_indices[:top_clusters]

    # Gather candidate articles from these clusters
    candidate_articles = []
    for c_idx in selected_clusters:
        candidate_articles.extend(cluster_to_articles[c_idx])
    candidate_articles = list(set(candidate_articles))  # remove duplicates

    # ----------------------------------
    # Build the user's embedding
    # ----------------------------------
    # For a better user embedding, gather *all* the user's clicked articles from train_behaviors_df
    full_clicked = []
    user_rows = train_behaviors_df[train_behaviors_df['user_id'] == user_id]
    for row_idx, row in user_rows.iterrows():
        history_str = row['history']
        if isinstance(history_str, str):
            full_clicked.extend(history_str.split())

    # Filter only the articles that have an embedding
    valid_clicked_embeddings = [
        article_vectors[a] for a in full_clicked if a in article_vectors
    ]

    if not valid_clicked_embeddings:
        #print(f"User {user_id} has no article embeddings in clicked history. Cannot rank by similarity.")
        # fallback: just return naive
        recommended = candidate_articles[:top_k]
        return recommended

    user_embedding = np.mean(valid_clicked_embeddings, axis=0)

    # ----------------------------------
    # Rank candidates by cosine similarity
    # ----------------------------------
    scored_candidates = []
    for art_id in candidate_articles:
        # skip if article has no embedding
        if art_id not in article_vectors:
            continue

        art_vec = article_vectors[art_id]
        # Cosine similarity
        sim = np.dot(user_embedding, art_vec) / (norm(user_embedding)*norm(art_vec))
        scored_candidates.append((art_id, sim))

    # Sort by similarity desc
    scored_candidates.sort(key=lambda x: x[1], reverse=True)

    # pick top_k
    recommended = [x[0] for x in scored_candidates[:top_k]]


    """
        # ----------------------------------
        # Print recommended article titles
        # ----------------------------------
        print(f"\nRecommended articles for user {user_id}:")
        for art_id in recommended:
            if art_id in news_df.index:
                print(f" - {art_id}: {news_df.loc[art_id, 'title']}")
            else:
                print(f" - {art_id}: [Title not found]")
    """
    return recommended

# Example usage:
# We'll assume:
#  - user_profiles, cluster_to_articles loaded from pickle
#  - train_behaviors_df is training set DataFrame
#  - news_df is articles DataFrame, indexed by news_id

test_user_id = 'U72339'
#test_user_id = 'U13740'  # example user
recommendations = recommend_for_user(
    test_user_id,
    user_profiles,
    cluster_to_articles,
    news_df,
    train_behaviors_df,
    article_vectors,
    top_clusters=2,
    top_k=5
)


**Evaluation and Performance Metrics**

In [21]:
import numpy as np

def ndcg_k(ranked_list, ground_truth, k=5):
    """
    ranked_list: list of article IDs in recommended order
    ground_truth: set or list of the clicked (relevant) article IDs
    k: rank cutoff
    
    Returns NDCG at K
    """
    # DCG
    dcg = 0.0
    for i, art_id in enumerate(ranked_list[:k]):
        if art_id in ground_truth:
            # relevance is 1 if clicked, 0 otherwise
            dcg += 1.0 / np.log2(i + 2)  # index i is 0-based, so position is i+1; we use i+2 for the log
    
    # IDCG (Ideal DCG) if all relevant items are at the top
    # if the user clicked m items, the best possible DCG at k is the sum of 1/log2 of positions, for min(m, k) times
    ideal_count = min(len(ground_truth), k)
    idcg = 0.0
    for i in range(ideal_count):
        idcg += 1.0 / np.log2(i + 2)
    
    return dcg / idcg if idcg > 0 else 0.0

def mrr_k(ranked_list, ground_truth, k=5):
    """
    Mean Reciprocal Rank @ K
    Returns 0 if no relevant article is found in the top k.
    """
    for i, art_id in enumerate(ranked_list[:k]):
        if art_id in ground_truth:
            return 1.0 / (i + 1)  # i is 0-based
    return 0.0

def auc_score(ranked_list, ground_truth):
    """
    A simple AUC approach: for each pair (clicked vs. not-clicked) in the ranking,
    check ordering. We'll treat the index in 'ranked_list' as the predicted rank
    (lower index => higher predicted relevance).
    
    This is O(n^2) for the length of ranked_list. For large lists, a more efficient approach is recommended.
    """
    # Convert ground_truth to a set for quick membership test
    ground_truth_set = set(ground_truth)

    # Build a list of (article_id, label, rank)
    labeled_ranked = []
    for rank, art_id in enumerate(ranked_list):
        label = 1 if art_id in ground_truth_set else 0
        labeled_ranked.append((art_id, label, rank))
    
    # We'll count pairwise (clicked vs not clicked)
    n_pairs = 0
    n_correct = 0
    for i in range(len(labeled_ranked)):
        for j in range(i+1, len(labeled_ranked)):
            label_i = labeled_ranked[i][1]
            label_j = labeled_ranked[j][1]
            if label_i != label_j:
                n_pairs += 1
                # if item i is relevant (label=1) and item j is not (label=0), that is correct if i < j
                if label_i == 1 and i < j:
                    n_correct += 1
                # if item j is relevant and i is not, correct if j < i
                if label_j == 1 and j < i:
                    n_correct += 1
    if n_pairs == 0:
        return 0.0
    return n_correct / n_pairs


In [22]:
train_users = set(train_behaviors_df['user_id'].unique())
test_users = set(test_behaviors_df['user_id'].unique())

eval_users = train_users & test_users
print(f"Users in both train and test: {len(eval_users)}")


Users in both train and test: 5943


In [26]:
def filter_users_with_min_clicks(behaviors_file, min_clicks=10):
    """
    Filters users with at least `min_clicks` in their click history.
    Returns a DataFrame of filtered user interactions.
    """
    # Load behaviors file
    cols = ['impression_id', 'user_id', 'time', 'history', 'impressions']
    behaviors_df = pd.read_csv(behaviors_file, sep='\t', header=None, names=cols)

    # Count clicks in the 'history' column (split by space)
    behaviors_df['click_count'] = behaviors_df['history'].apply(lambda x: len(x.split()) if pd.notnull(x) else 0)

    # Filter users with at least `min_clicks`
    filtered_users = behaviors_df[behaviors_df['click_count'] >= min_clicks]
    return filtered_users



def parse_impressions(impressions_str):
    """
    Given something like 'N123-1 N456-0 N789-1', 
    return (all_article_ids, clicked_articles).
    """
    items = impressions_str.split()
    all_ids = []
    clicked = []
    for x in items:
        article_id, label_str = x.split('-')
        all_ids.append(article_id)
        if label_str == '1':
            clicked.append(article_id)
    return all_ids, clicked



behaviors_file = os.path.join(dataset_dir, 'behaviors.tsv')
filtered_users_df = filter_users_with_min_clicks(behaviors_file, min_clicks=5)

# Filter test behaviors to include only users with sufficient history
test_behaviors_df = test_behaviors_df[test_behaviors_df['user_id'].isin(filtered_users_df['user_id'])]


# Evaluate on each impression
all_ndcg = []
all_mrr = []
all_auc = []

k_eval = 10  # We'll evaluate at k=5 for NDCG & MRR

for row in test_behaviors_df.itertuples(index=False):
    user_id = row.user_id
    if user_id not in eval_users:
        continue  # skip users not in train-test overlap
    
    impressions_str = row.impressions
    all_articles, clicked_articles = parse_impressions(impressions_str)

    # Generate a ranked list from your recommender
    # e.g. use the same function from step 4 (with ranking)
    ranked_list = recommend_for_user(
        user_id,
        user_profiles,
        cluster_to_articles,
        news_df,
        train_behaviors_df,
        article_vectors,
        top_clusters=2,
        top_k=len(all_articles)  # We can rank at least the same size as impressions
    )

    # Evaluate vs ground truth
    ndcg_val = ndcg_k(ranked_list, clicked_articles, k=k_eval)
    mrr_val = mrr_k(ranked_list, clicked_articles, k=k_eval)
    auc_val = auc_score(ranked_list, clicked_articles)
    
    all_ndcg.append(ndcg_val)
    all_mrr.append(mrr_val)
    all_auc.append(auc_val)

# Final average
mean_ndcg = np.mean(all_ndcg) if all_ndcg else 0.0
mean_mrr = np.mean(all_mrr) if all_mrr else 0.0
mean_auc = np.mean(all_auc) if all_auc else 0.0

print(f"Evaluation results on test (over users in both train & test):")
print(f"NDCG@{k_eval}: {mean_ndcg:.8f}")
print(f"MRR@{k_eval}:  {mean_mrr:.8f}")
print(f"AUC:           {mean_auc:.8f}")

No profile found for user U17949. Returning empty list.
No profile found for user U4780. Returning empty list.
No profile found for user U17949. Returning empty list.
Evaluation results on test (over users in both train & test):
NDCG@10: 0.00013462
MRR@10:  0.00009511
AUC:           0.00053390


In [24]:
from utils.evaluation import evaluate_model


class ContentBasedRecommender:
    def __init__(self, article_vectors, user_profiles):
        self.article_vectors = article_vectors
        self.user_profiles = user_profiles

        # Use your good logic from recommend_for_user
        self.recommend = lambda user_id, N: recommend_for_user(
            user_id,
            self.user_profiles,
            cluster_to_articles,
            news_df,
            train_behaviors_df,
            self.article_vectors,
            top_clusters=1,
            top_k=N
        )

# Initialize the recommender
recommender = ContentBasedRecommender(article_vectors, user_profiles)

# Use the filtered test behaviors for evaluation
K = 5  # Set the value of K for evaluation
avg_ndcg, avg_auc, avg_mrr = evaluate_model(recommender, test_behaviors_df, K)

print(f"Evaluation Results:")
print(f"NDCG@{K}: {avg_ndcg:.8f}")
print(f"AUC@{K}: {avg_auc:.8f}")
print(f"MRR@{K}: {avg_mrr:.8f}")

ModuleNotFoundError: No module named 'utils'