In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss
import pickle
from collections import defaultdict, deque
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import csr_matrix
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
# ----------------------------------------------------------------------
# Configuration
# ----------------------------------------------------------------------
# Length of the rolling window, in seconds (2 days).
TIME_WINDOW = 2 * 24 * 60 * 60
# Weight of the time-aware CTR when it is blended with the global CTR
# (the global CTR receives 1 - ALPHA).
ALPHA = 0.9

# NOTE(review): absolute, machine-specific locations; edit these two lines
# (or derive them from an environment variable) to run elsewhere.
MIND_TRAIN_DIR = "/Users/anuj/Downloads/MINDsmall_train"
MIND_DEV_DIR = "/Users/anuj/Downloads/MINDsmall_dev"

# The TSV files carry no header row, so column names are supplied here.
NEWS_COLUMNS = ['NewsID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL',
                'TitleEntities', 'AbstractEntities']
BEHAVIOR_COLUMNS = ['ImpressionID', 'UserID', 'Time', 'History', 'Impressions']

# Load data
print("Loading datasets...")
news_train = pd.read_csv(f"{MIND_TRAIN_DIR}/news.tsv", sep='\t', header=None, names=NEWS_COLUMNS)
behaviors_train = pd.read_csv(f"{MIND_TRAIN_DIR}/behaviors.tsv", sep='\t', header=None,
                              names=BEHAVIOR_COLUMNS)
news_dev = pd.read_csv(f"{MIND_DEV_DIR}/news.tsv", sep='\t', header=None, names=NEWS_COLUMNS)
valid_behaviors = pd.read_csv(f"{MIND_DEV_DIR}/behaviors.tsv", sep='\t', header=None,
                              names=BEHAVIOR_COLUMNS)

print("Converting timestamps...")
# Convert the 'Time' strings to whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second is independent of the
# datetime resolution pandas picks and of the platform's default integer width,
# unlike `.astype(int) // 10**9`, which assumes nanoseconds and a 64-bit int.
UNIX_EPOCH = pd.Timestamp("1970-01-01")
ONE_SECOND = pd.Timedelta(seconds=1)
behaviors_train['Timestamp'] = (pd.to_datetime(behaviors_train['Time']) - UNIX_EPOCH) // ONE_SECOND
valid_behaviors['Timestamp'] = (pd.to_datetime(valid_behaviors['Time']) - UNIX_EPOCH) // ONE_SECOND

# Create user and news indices mapping
print("Creating user and news mappings...")
# De-duplicated IDs across the train and dev splits (set order is arbitrary).
users = list(set(behaviors_train['UserID'].tolist() + valid_behaviors['UserID'].tolist()))
news_items = list(set(news_train['NewsID'].tolist() + news_dev['NewsID'].tolist()))

# NOTE(review): this is the timestamp of the FIRST ROW of the dev file, not
# necessarily the earliest dev timestamp -- confirm the file is time-ordered,
# or switch to valid_behaviors['Timestamp'].min().
first_valid_time = valid_behaviors.iloc[0]['Timestamp']
# Training impressions that fall in the TIME_WINDOW seconds right before it.
train_behaviors = behaviors_train[(behaviors_train['Timestamp'] >= first_valid_time - TIME_WINDOW) &
                                  (behaviors_train['Timestamp'] < first_valid_time)]


Loading datasets...
Converting timestamps...
Creating user and news mappings...


In [2]:

# Load the precomputed artifacts produced by the baseline, content-based and
# collaborative-filtering stages.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted source.
def _load_pickle(path):
    """Return the object deserialized from the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Baseline: per-article CTR lookups (queried with .get(news_id, 0.0) during evaluation).
ctr_global = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Baseline/ctr_global.pkl')
time_aware_ctr = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Baseline/time_aware_ctr.pkl')

# Content-based: fitted vectorizer, per-user profile vectors, the article
# TF-IDF matrix and the news-id -> matrix-row lookup.
tfidf_vectorizer = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Content_based/tfidf_vectorizer.pkl')
user_profiles = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Content_based/user_profiles.pkl')
tfidf_matrix = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Content_based/tfidf_matrix.pkl')
news_id_to_index = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Content_based/news_id_to_index.pkl')

# Collaborative filtering: the factor model plus the id -> factor-row lookups.
model_cf = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Collaborative/model_bpr.pkl')
user2idx = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Collaborative/user2idx.pkl')
news2idx = _load_pickle('/Users/anuj/Desktop/Recommender_G3P2/Collaborative/news2idx.pkl')


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the precomputed training set for the hybrid blender: X is the feature
# matrix and y the targets that the logistic regression is fitted on.
data = np.load("hybrid_data.npz")
X = data["X"]
y = data["y"]
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
# NOTE(review): `scaler` is loaded but not applied anywhere in the visible
# cells -- confirm whether evaluation features should be passed through it.
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)

# Per-article history: timestamps of every impression and every click.
# Missing articles get a fresh pair of empty deques on first access.
news_stats = defaultdict(lambda: {'clicks': deque(), 'impressions': deque()})

# NOTE(review): timestamps are appended in file order; the later rolling-window
# pruning pops from the left, which only works as intended if rows are in
# chronological order -- confirm, or sort by 'Timestamp' first.
for row in tqdm(behaviors_train.itertuples(), total=len(behaviors_train), desc="Preparing training data"):
    uid = row.UserID
    # Only users known to the collaborative-filtering index contribute.
    if uid not in user2idx:
        continue

    for imp in row.Impressions.split():
        # Each impression token has the form "<news_id>-<label>".
        if '-' not in imp:
            continue

        nid, label = imp.split('-')
        clicked = int(label) == 1
        article_stats = news_stats[nid]
        article_stats['impressions'].append(row.Timestamp)
        # Bug fix: the original converted `label` to int and then compared it
        # with the string '1', which is never equal, so no click was recorded.
        if clicked:
            article_stats['clicks'].append(row.Timestamp)
       


Preparing training data: 100%|██████████| 156965/156965 [00:01<00:00, 90365.47it/s]


In [4]:
# Fit the logistic-regression blender on the precomputed hybrid features.
# class_weight='balanced' reweights the classes inversely to their frequency.
print("Fitting logistic regression...")
logreg = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X, y)

# One coefficient per column of X, followed by the intercept.
print("Trained Logistic Regression Coefficients:", logreg.coef_, logreg.intercept_)


Fitting logistic regression...
Trained Logistic Regression Coefficients: [[ 0.21275516  0.33858284 -0.0796488 ]] [-0.08261626]


In [5]:

# ============================
# 5. Evaluate Hybrid Model
# ============================
print("Evaluating hybrid model...")
# Accumulators filled by the evaluation loop: one list of labels and one list
# of predicted scores per retained impression row.
all_labels = []
all_scores = []

def update_rolling_stats(current_time):
    """Prune the global `news_stats` to the window ending at `current_time`.

    For every article, timestamps older than `current_time - TIME_WINDOW` are
    popped from the left of its 'clicks' and 'impressions' deques. Articles
    left with both deques empty are removed from `news_stats` altogether.

    Pruning stops at the first in-window timestamp on the left, so it relies
    on each deque being in non-decreasing time order.
    """
    cutoff = current_time - TIME_WINDOW
    # Snapshot the items so entries can be deleted while iterating.
    for nid, entry in list(news_stats.items()):
        clicks = entry['clicks']
        impressions = entry['impressions']
        while clicks and clicks[0] < cutoff:
            clicks.popleft()
        while impressions and impressions[0] < cutoff:
            impressions.popleft()
        if not (clicks or impressions):
            del news_stats[nid]


# Score every candidate article of every dev impression row with the blender.
# NOTE(review): `scaler` is loaded earlier but never applied to `features`;
# if X was scaled before fitting, these raw features are on a different
# scale -- confirm.
# NOTE(review): `news_stats` is updated below but none of the three scores
# reads it (the CTR terms come from the pickled lookups).
for row in tqdm(valid_behaviors.itertuples(), total=len(valid_behaviors), desc="Evaluating"):
    uid = row.UserID
    # Users without a collaborative-filtering index are skipped entirely.
    if uid not in user2idx:
        continue
    uid_idx = user2idx[uid]
    current_time = row.Timestamp
    update_rolling_stats(current_time)

    labels = []
    scores = []
    for imp in row.Impressions.split():
        # Each impression token has the form "<news_id>-<label>".
        if '-' not in imp:
            continue

        nid, label = imp.split('-')
        labels.append(int(label))

        # Popularity signal: blend of time-aware and global CTR, 0.0 when unknown.
        ctr_overall = ctr_global.get(nid, 0.0)
        ctr_recent = time_aware_ctr.get(nid, 0.0)
        score_ctr = ALPHA * ctr_recent + (1 - ALPHA) * ctr_overall

        # Content signal: cosine similarity between user profile and article TF-IDF row.
        if uid in user_profiles and nid in news_id_to_index:
            profile_row = user_profiles[uid].reshape(1, -1)
            article_row = tfidf_matrix[news_id_to_index[nid]]
            score_cbf = cosine_similarity(profile_row, article_row)[0][0]
        else:
            score_cbf = 0.0

        # Collaborative signal: dot product of user and item factors.
        score_cf = (
            np.dot(model_cf.user_factors[uid_idx], model_cf.item_factors[news2idx[nid]])
            if nid in news2idx
            else 0.0
        )

        features = np.array([[score_ctr, score_cbf, score_cf]])
        # Column 1 of predict_proba is the probability of the click class.
        click_probability = logreg.predict_proba(features)[0][1]
        scores.append(click_probability)

        # Record this impression (and click) in the rolling history.
        news_stats[nid]['impressions'].append(current_time)
        # `label` is still the raw string here, so compare against '1'.
        if label == '1':
            news_stats[nid]['clicks'].append(current_time)

    # Keep only rows that contain both label values.
    if scores and len(set(labels)) > 1:
        all_labels.append(labels)
        all_scores.append(scores)


Evaluating hybrid model...


Evaluating: 100%|██████████| 73152/73152 [01:26<00:00, 842.58it/s] 


In [6]:

# Define evaluation metrics
def mrr_score(labels, scores):
    """Reciprocal rank of the first item labelled 1, ranking by descending score.

    Args:
        labels: sequence of labels, one per item.
        scores: sequence of scores aligned with `labels`.

    Returns:
        1 / rank (1-based) of the highest-ranked item whose label equals 1,
        or 0.0 when no item is labelled 1.
    """
    ranked_labels = np.array(labels)[np.argsort(scores)[::-1]]
    hit_positions = np.flatnonzero(ranked_labels == 1)
    if hit_positions.size == 0:
        return 0.0
    return 1.0 / (int(hit_positions[0]) + 1)

def dcg_score(labels, scores, k):
    """Discounted cumulative gain of the top-`k` items ranked by descending score.

    Args:
        labels: sequence of gains (labels), one per item.
        scores: sequence of scores aligned with `labels`.
        k: number of top-ranked items to include.

    Returns:
        Sum over the top-k ranks r (1-based) of label_r / log2(r + 1).
    """
    top_k = np.argsort(scores)[::-1][:k]
    gains = np.array(labels)[top_k]
    ranks = np.arange(1, len(gains) + 1)
    return np.sum(gains / np.log2(ranks + 1))

def ndcg_score(labels, scores, k):
    """Normalized DCG at `k`: the DCG of `scores` divided by the ideal DCG.

    The ideal DCG is obtained by ranking the items by their own labels.
    Returns 0.0 when the ideal DCG is not positive.
    """
    achieved = dcg_score(labels, scores, k)
    best_possible = dcg_score(labels, labels, k)
    if best_possible > 0:
        return achieved / best_possible
    return 0.0

# Compute final metrics: one value per retained impression row, averaged below.
impression_pairs = list(zip(all_labels, all_scores))

# AUC is only defined when a row contains both label values.
auc = [
    roc_auc_score(row_labels, row_scores)
    for row_labels, row_scores in impression_pairs
    if len(set(row_labels)) > 1
]
mrr = [mrr_score(row_labels, row_scores) for row_labels, row_scores in impression_pairs]
ndcg5 = [ndcg_score(row_labels, row_scores, 5) for row_labels, row_scores in impression_pairs]
ndcg10 = [ndcg_score(row_labels, row_scores, 10) for row_labels, row_scores in impression_pairs]

print("\n--- Hybrid Model Evaluation ---")
print(f"AUC: {np.mean(auc):.4f}")
print(f"MRR: {np.mean(mrr):.4f}")
print(f"nDCG@5: {np.mean(ndcg5):.4f}")
print(f"nDCG@10: {np.mean(ndcg10):.4f}")



--- Hybrid Model Evaluation ---
AUC: 0.6829
MRR: 0.3900
nDCG@5: 0.3755
nDCG@10: 0.4329


In [20]:

# ============================
# 6. Save trained components for future use
# ============================
print("Saving model components...")

# Persist the fitted logistic-regression blender next to the notebook.
with open('hybrid_logreg_model_1.pkl', 'wb') as model_file:
    pickle.dump(logreg, model_file)

print("Done! All components saved successfully.")

Saving model components...
Done! All components saved successfully.
