**1) Build Article Embeddings**

In [27]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
from sklearn.decomposition import PCA
from numpy.linalg import norm
import json
from recommenders.utils.timer import Timer

import sys
print(sys.executable)
from codecarbon import EmissionsTracker





tracker = EmissionsTracker()


def filter_users_with_min_clicks(df, min_clicks=5):
    df['click_count'] = df['history'].apply(lambda x: len(str(x).split()) if pd.notnull(x) else 0)
    filtered_users = df[df['click_count'] >= min_clicks].drop(columns=['click_count'])
    print(len(filtered_users))
    return df[df['click_count'] >= min_clicks].drop(columns=['click_count'])

# ----------------------------------------------------
# Step 1: Load and Preprocess Article Embeddings
# ----------------------------------------------------
def parse_entity_list(entity_str):
    try:
        data = json.loads(entity_str)
        return [obj['WikidataId'] for obj in data if 'WikidataId' in obj]
    except (json.JSONDecodeError, TypeError):
        return []

def build_article_embeddings(news_df, entity_embeddings, use_abstract=True):
    article_vectors = {}
    for news_id, row in news_df.iterrows():
        title_ids = parse_entity_list(row['title_entities'])
        abstract_ids = parse_entity_list(row['abstract_entities']) if use_abstract else []
        entity_ids = title_ids + abstract_ids
        vectors = [entity_embeddings[eid] for eid in entity_ids if eid in entity_embeddings]
        if vectors:
            article_vectors[news_id] = np.mean(vectors, axis=0)
    return article_vectors

# ----------------------------------------------------
# Step 2: Fit PCA on Article Embeddings
# ----------------------------------------------------
def apply_pca_to_article_vectors(article_vectors, n_components=50):
    article_ids = list(article_vectors.keys())
    matrix = np.stack([article_vectors[aid] for aid in article_ids])
    pca = PCA(n_components=n_components)
    reduced_matrix = pca.fit_transform(matrix)
    reduced_article_vectors = {aid: reduced_matrix[i] for i, aid in enumerate(article_ids)}
    return reduced_article_vectors, pca

# ----------------------------------------------------
# Step 3: Build User Profiles (from article embeddings)
# ----------------------------------------------------
def build_user_profiles_embedding_based(behaviors_df, article_vectors):
    user_profiles = {}
    for _, row in behaviors_df.iterrows():
        user_id = row['user_id']
        if not isinstance(row['history'], str):
            continue
        clicked_ids = row['history'].split()
        vectors = [article_vectors[aid] for aid in clicked_ids if aid in article_vectors]
        if vectors:
            user_profiles[user_id] = np.mean(vectors, axis=0)
    return user_profiles

# ----------------------------------------------------
# Step 4: Recommender Function Using PCA
# ----------------------------------------------------
def recommend_for_user(user_id, user_profiles, article_vectors, candidate_articles=None, top_k=5):
    if user_id not in user_profiles:
        return []

    user_embedding = user_profiles[user_id]

    # Filter candidates
    if candidate_articles is None:
        candidate_articles = list(article_vectors.keys())
    else:
        candidate_articles = [aid for aid in candidate_articles if aid in article_vectors]

    if not candidate_articles:
        return []

    scored = []
    for aid in candidate_articles:
        art_vec = article_vectors[aid]
        sim = np.dot(user_embedding, art_vec) / (norm(user_embedding) * norm(art_vec))
        scored.append((aid, sim))

    scored.sort(key=lambda x: x[1], reverse=True)
    return [x[0] for x in scored[:top_k]]


# ----------------------------------------------------
# Step 5: Evaluation Functions
# ----------------------------------------------------
def parse_impressions(impressions_str):
    items = impressions_str.split()
    all_ids = []
    clicked = []
    for x in items:
        article_id, label_str = x.split('-')
        all_ids.append(article_id)
        if label_str == '1':
            clicked.append(article_id)
    return all_ids, clicked

"""def dcg_at_k(recommended, actual_clicked, K=5):
    return sum(1 / np.log2(i + 2) for i, item in enumerate(recommended[:K]) if item in actual_clicked)

def idcg_at_k(actual_clicked, K=5):
    num_relevant = min(len(actual_clicked), K)
    return sum(1 / np.log2(i + 2) for i in range(num_relevant)) or 1

def ndcg_at_k(recommended, actual_clicked, K=5):
    return dcg_at_k(recommended, actual_clicked, K) / idcg_at_k(actual_clicked, K)

def auc_at_k(recommended, actual_clicked, K=5):
    recommended = recommended[:K]
    relevance = [1 if item in actual_clicked else 0 for item in recommended]
    num_pos = sum(relevance)
    num_neg = len(relevance) - num_pos
    if num_pos == 0: return 0.0
    if num_neg == 0: return 1.0
    correct_pairs = sum(1 for i in range(len(relevance)) if relevance[i] == 1
                        for j in range(i+1, len(relevance)) if relevance[j] == 0)
    return correct_pairs / (num_pos * num_neg)

def mrr_at_k(recommended, actual_clicked, K=5):
    recommended = recommended[:K]
    for i, item in enumerate(recommended):
        if item in actual_clicked:
            return 1.0 / (i + 1)
    return 0.0

def evaluate_model_self(user_profiles, cluster_to_articles, train_behaviors_df,
                   test_behaviors_df, article_vectors, pca, K=5):
    ndcg_scores, mrr_scores, auc_scores = [], [], []
    for _, row in test_behaviors_df.iterrows():
        user_id = row['user_id']
        if pd.isna(row['impressions']):
            continue
        all_ids, clicked = parse_impressions(row['impressions'])
        if not clicked:
            continue

        recommended = recommend_for_user(user_id, user_profiles, cluster_to_articles,
                                         train_behaviors_df, article_vectors, pca,
                                         top_clusters=5, top_k=len(all_ids))
        ndcg_scores.append(ndcg_at_k(recommended, clicked, K))
        mrr_scores.append(mrr_at_k(recommended, clicked, K))
        auc_scores.append(auc_at_k(recommended, clicked, K))

    return np.mean(ndcg_scores), np.mean(auc_scores), np.mean(mrr_scores)

    """



[codecarbon INFO @ 00:05:17] [setup] RAM Tracking...
[codecarbon INFO @ 00:05:17] [setup] CPU Tracking...
 Windows OS detected: Please install Intel Power Gadget to measure CPU



C:\Users\Bex\AppData\Local\Programs\Python\Python312\python.exe


[codecarbon INFO @ 00:05:19] CPU Model on constant consumption mode: AMD Ryzen 7 7800X3D 8-Core Processor
[codecarbon INFO @ 00:05:19] [setup] GPU Tracking...
[codecarbon INFO @ 00:05:19] No GPU found.
[codecarbon INFO @ 00:05:19] >>> Tracker's metadata:
[codecarbon INFO @ 00:05:19]   Platform system: Windows-11-10.0.26100-SP0
[codecarbon INFO @ 00:05:19]   Python version: 3.12.4
[codecarbon INFO @ 00:05:19]   CodeCarbon version: 2.8.3
[codecarbon INFO @ 00:05:19]   Available RAM : 63.213 GB
[codecarbon INFO @ 00:05:19]   CPU count: 16
[codecarbon INFO @ 00:05:19]   CPU model: AMD Ryzen 7 7800X3D 8-Core Processor
[codecarbon INFO @ 00:05:19]   GPU count: None
[codecarbon INFO @ 00:05:19]   GPU model: None
[codecarbon INFO @ 00:05:19] Saving emissions data to file C:\Users\Bex\OneDrive - NTNU\NTNU\4 år\Recommender systems\Project\content based\emissions.csv


"def dcg_at_k(recommended, actual_clicked, K=5):\n    return sum(1 / np.log2(i + 2) for i, item in enumerate(recommended[:K]) if item in actual_clicked)\n\ndef idcg_at_k(actual_clicked, K=5):\n    num_relevant = min(len(actual_clicked), K)\n    return sum(1 / np.log2(i + 2) for i in range(num_relevant)) or 1\n\ndef ndcg_at_k(recommended, actual_clicked, K=5):\n    return dcg_at_k(recommended, actual_clicked, K) / idcg_at_k(actual_clicked, K)\n\ndef auc_at_k(recommended, actual_clicked, K=5):\n    recommended = recommended[:K]\n    relevance = [1 if item in actual_clicked else 0 for item in recommended]\n    num_pos = sum(relevance)\n    num_neg = len(relevance) - num_pos\n    if num_pos == 0: return 0.0\n    if num_neg == 0: return 1.0\n    correct_pairs = sum(1 for i in range(len(relevance)) if relevance[i] == 1\n                        for j in range(i+1, len(relevance)) if relevance[j] == 0)\n    return correct_pairs / (num_pos * num_neg)\n\ndef mrr_at_k(recommended, actual_clicked,

In [28]:
import os
import time
# ----- File Paths -----
dataset_dir = "../MINDsmall_train"
news_file = os.path.join(dataset_dir, "news.tsv")
embedding_file = os.path.join(dataset_dir, "entity_embedding.vec")
train_behaviors_file = os.path.join(dataset_dir, "behaviors.tsv")
test_behaviors_file = "../MINDsmall_dev/behaviors.tsv"

# ----- Load data -----
news_cols = ['news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities']
news_df = pd.read_csv(news_file, sep='\t', header=None, names=news_cols)
news_df.set_index('news_id', inplace=True)

train_behaviors_df = pd.read_csv(train_behaviors_file, sep='\t', header=None,
    names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

train_behaviors_df = filter_users_with_min_clicks(train_behaviors_df, min_clicks=1)

test_behaviors_df = pd.read_csv(test_behaviors_file, sep='\t', header=None,
    names=['impression_id', 'user_id', 'time', 'history', 'impressions'])

test_behaviors_df = filter_users_with_min_clicks(test_behaviors_df, min_clicks=1)

# Filter test behaviors to only include users present in train behaviors
valid_users = set(train_behaviors_df['user_id'].unique())
test_behaviors_df = test_behaviors_df[test_behaviors_df['user_id'].isin(valid_users)]


# ----- Load entity embeddings -----
entity_embeddings = {}
with open(embedding_file, 'r', encoding='utf-8') as f:
    for line in f:
        parts = line.strip().split()
        entity_id = parts[0]
        vector = np.array([float(x) for x in parts[1:]], dtype=np.float32)
        entity_embeddings[entity_id] = vector



# ----- Step 1: Build article embeddings -----
tracker.start()
with Timer() as train_time:
    
    article_vectors = build_article_embeddings(news_df, entity_embeddings, use_abstract=True)

    # ----- Step 2: Apply PCA -----
    reduced_article_vectors, pca = apply_pca_to_article_vectors(article_vectors, n_components=50)

 
rows = news_df.shape[0]
print(f"Time per article (embedding + PCA): {train_time.interval / rows:.6f} seconds")
total_kwh = tracker.stop()


print(f"Total energy for embedding + PCA: {total_kwh:.6f} kWh")
print(f"Energy per user profile: {total_kwh} /, {rows:.8f} = {total_kwh / rows:.8f} kWh") 

# ----- Step 3: Cluster articles (e.g. using KMeans) -----
from sklearn.cluster import KMeans
X = np.stack(list(reduced_article_vectors.values()))
article_ids = list(reduced_article_vectors.keys())
kmeans = KMeans(n_clusters=20, random_state=42, n_init=10)
kmeans.fit(X)
cluster_assignments = {aid: int(label) for aid, label in zip(article_ids, kmeans.labels_)}
cluster_to_articles = defaultdict(list)
for aid, label in cluster_assignments.items():
    cluster_to_articles[label].append(aid)

# ----- Step 4: Build user cluster profiles -----
def build_cluster_distribution_profiles(behaviors_df, cluster_assignments, num_clusters=50):
    user_profiles = defaultdict(lambda: np.zeros(num_clusters, dtype=np.float32))
    for _, row in behaviors_df.iterrows():
        if not isinstance(row['history'], str):
            continue
        for aid in row['history'].split():
            if aid in cluster_assignments:
                user_profiles[row['user_id']][cluster_assignments[aid]] += 1
    return {u: v / (v.sum() or 1.0) for u, v in user_profiles.items()}



rows = train_behaviors_df['user_id'].nunique()

tracker.start()
with Timer() as profile_time:
    user_profiles = build_user_profiles_embedding_based(train_behaviors_df, reduced_article_vectors)
    
print(f"Time per user profile: {profile_time.interval / rows:.6f} seconds")
total_kwh = tracker.stop()


print(f"Total energy consumed: {total_kwh:.6f} kWh")
print(f"Energy per user profile: {total_kwh} /, {rows:.8f} = {total_kwh / rows:.8f} kWh")    

"""
# ----- Step 5: Evaluate -----
avg_ndcg, avg_auc, avg_mrr = evaluate_model_self(
    user_profiles=user_profiles,
    cluster_to_articles=cluster_to_articles,
    train_behaviors_df=train_behaviors_df,
    test_behaviors_df=test_behaviors_df,
    article_vectors=reduced_article_vectors,
    pca=pca,
    K=5
)

# ----- Print Results -----
print(f"Evaluation Results:")
print(f"NDCG@5: {avg_ndcg:.4f}")
print(f"AUC@5:  {avg_auc:.4f}")
print(f"MRR@5:  {avg_mrr:.4f}")
"""

153727
70938


[codecarbon INFO @ 00:05:22] Energy consumed for RAM : 0.000010 kWh. RAM Power : 23.70497703552246 W
[codecarbon INFO @ 00:05:22] Energy consumed for all CPUs : 0.000018 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 00:05:22] 0.000028 kWh of electricity used since the beginning.


Time per article (embedding + PCA): 0.000029 seconds
Total energy for embedding + PCA: 0.000001 kWh
Energy per user profile: 8.352661947815002e-07 /, 51282.00000000 = 0.00000000 kWh


[codecarbon INFO @ 00:05:29] Energy consumed for RAM : 0.000051 kWh. RAM Power : 23.70497703552246 W
[codecarbon INFO @ 00:05:29] Energy consumed for all CPUs : 0.000091 kWh. Total CPU Power : 42.5 W
[codecarbon INFO @ 00:05:29] 0.000142 kWh of electricity used since the beginning.


Time per user profile: 0.000102 seconds
Total energy consumed: 0.000004 kWh
Energy per user profile: 4.260266339042304e-06 /, 49108.00000000 = 0.00000000 kWh


'\n# ----- Step 5: Evaluate -----\navg_ndcg, avg_auc, avg_mrr = evaluate_model_self(\n    user_profiles=user_profiles,\n    cluster_to_articles=cluster_to_articles,\n    train_behaviors_df=train_behaviors_df,\n    test_behaviors_df=test_behaviors_df,\n    article_vectors=reduced_article_vectors,\n    pca=pca,\n    K=5\n)\n\n# ----- Print Results -----\nprint(f"Evaluation Results:")\nprint(f"NDCG@5: {avg_ndcg:.4f}")\nprint(f"AUC@5:  {avg_auc:.4f}")\nprint(f"MRR@5:  {avg_mrr:.4f}")\n'

In [None]:
import sys
sys.path.append("..")
from utils.evaluation import evaluate_model

class ClusteredContentRecommender:
    def __init__(self, user_profiles, article_vectors):
        self.user_profiles = user_profiles
        self.article_vectors = article_vectors

    def recommend(self, user_id, candidate_articles=None, N=5):
        # If candidate articles are passed (during evaluation), only use those.
        if candidate_articles is not None:
            filtered_articles = {aid: vec for aid, vec in self.article_vectors.items() if aid in candidate_articles}
        else:
            filtered_articles = self.article_vectors
    
        return recommend_for_user(
            user_id=user_id,
            user_profiles=self.user_profiles,
            article_vectors=filtered_articles,
            candidate_articles=list(filtered_articles.keys()),  # ensure it's passed
            top_k=N
        )


# Instantiate the recommender
recommender = ClusteredContentRecommender(
    user_profiles=user_profiles,
    article_vectors=reduced_article_vectors
)
rows = test_behaviors_df.shape[0]

# Evaluate the model
with Timer() as eval_time:
    ndcg, auc, mrr = evaluate_model(recommender, test_behaviors_df, K=5)

print("\n **Evaluation Results:**")
print(f" NDCG@5: {ndcg:.4f}")
print(f" AUC@5:  {auc:.4f}")
print(f" MRR@5:  {mrr:.4f}")
print(f"Time per user impression: {eval_time.interval / rows:.6f} seconds")


