In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss
import pickle
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from codecarbon import EmissionsTracker
from datetime import datetime
import os
### Start Carbon Tracker
# Samples power every second and writes the measurements to hybrid.csv.
tracker = EmissionsTracker(measure_power_secs=1, save_to_file=True, output_file="hybrid.csv")
tracker.start()

# Blend weight: score_ctr = ALPHA * time-aware CTR + (1 - ALPHA) * global CTR.
ALPHA = 0.9

# Data locations. The defaults are the original hard-coded paths; set the
# environment variables to run the notebook on another machine.
MIND_DEV_DIR = os.environ.get("MIND_DEV_DIR", "/Users/anuj/Downloads/MINDsmall_dev")
PROJECT_DIR = os.environ.get("RECOMMENDER_G3P2_DIR", "/Users/anuj/Desktop/Recommender_G3P2")


def load_pickle(*relative_parts):
    """Unpickle and return the object stored at PROJECT_DIR/<relative_parts...>.

    NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    """
    with open(os.path.join(PROJECT_DIR, *relative_parts), 'rb') as f:
        return pickle.load(f)


# Validation impressions (tab-separated, no header row)
valid_behaviors = pd.read_csv(os.path.join(MIND_DEV_DIR, "behaviors.tsv"), sep='\t', header=None,
                              names=['ImpressionID', 'UserID', 'Time', 'History', 'Impressions'])

# Load baseline CTR tables (looked up by news id)
ctr_global = load_pickle("Baseline", "ctr_global.pkl")
time_aware_ctr = load_pickle("Baseline", "time_aware_ctr.pkl")

# Load content-based artifacts: vectorizer, user profiles, TF-IDF matrix and
# the news-id -> matrix-row lookup
tfidf_vectorizer = load_pickle("Content_based", "tfidf_vectorizer.pkl")
user_profiles = load_pickle("Content_based", "user_profiles.pkl")
tfidf_matrix = load_pickle("Content_based", "tfidf_matrix.pkl")
news_id_to_index = load_pickle("Content_based", "news_id_to_index.pkl")

# Load collaborative-filtering artifacts: model plus user/news index lookups
model_fm = load_pickle("Collaborative", "model_fm.pkl")
user2idx = load_pickle("Collaborative", "user2idx.pkl")
news2idx = load_pickle("Collaborative", "news2idx.pkl")


[codecarbon INFO @ 23:46:14] Energy consumed for RAM : 0.000053 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:46:14] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:46:14] Energy consumed for All CPU : 0.000746 kWh
[codecarbon INFO @ 23:46:14] 0.000798 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:46:15] Energy consumed for RAM : 0.000053 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:46:15] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:46:15] Energy consumed for All CPU : 0.000758 kWh
[codecarbon INFO @ 23:46:15] 0.000811 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:46:15] 0.000380 g.CO2eq/s mean an estimation of 11.971525275111478 kg.CO2eq/year
[codecarbon INFO @ 23:46:16] Energy consumed for RAM : 0.000054 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:46:16] Delta energy consumed for CPU with constant : 0.000013 kWh, power : 42.5 W
[codecarbon IN

[codecarbon INFO @ 23:46:17] Energy consumed for RAM : 0.000055 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:46:17] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:46:17] Energy consumed for All CPU : 0.000783 kWh
[codecarbon INFO @ 23:46:17] 0.000838 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:46:18] Energy consumed for RAM : 0.000056 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:46:18] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:46:18] Energy consumed for All CPU : 0.000795 kWh
[codecarbon INFO @ 23:46:18] 0.000851 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:46:19] Energy consumed for RAM : 0.000057 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:46:19] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:46:19] Energy consumed for All CPU : 0.000807 kWh
[codecarbon INFO @ 23:46:19] 0.000863 kWh of 

In [None]:
# Load the feature matrix X and labels y used to fit the logistic regression,
# plus the pickled `scaler` object.
# np.load on an .npz returns a lazy NpzFile that keeps the archive open, so it
# is used as a context manager; indexing reads each array into memory before
# the archive is closed.
with np.load("hybrid_data.npz") as data:
    X = data["X"]
    y = data["y"]
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open("scaler.pkl", "rb") as f:
    scaler = pickle.load(f)


Fitting logistic regression...


[codecarbon INFO @ 23:05:12] Energy consumed for RAM : 0.000003 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:05:12] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:05:12] Energy consumed for All CPU : 0.000036 kWh
[codecarbon INFO @ 23:05:12] 0.000038 kWh of electricity used since the beginning.


Trained Logistic Regression Coefficients: [[ 0.21275516  0.33858284 -0.0796488 ]] [-0.08261626]


In [None]:

# Fit the logistic-regression blender on the pre-computed features X / labels y
print("Fitting logistic regression...")
logreg = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X, y)

# Show the learned per-feature weights and the intercept
print("Trained Logistic Regression Coefficients:", logreg.coef_, logreg.intercept_)

In [None]:

# ============================
# 5. Evaluate Hybrid Model
# ============================
# For every validation impression of a user known to the collaborative model,
# build three features per candidate article -- [CTR blend, content-based
# similarity, collaborative score] -- and score them with the trained logistic
# regression. Only impressions that contain both label values are kept, since
# the ranking metrics need at least one positive and one negative.
#
# NOTE(review): `scaler` is loaded earlier in the notebook but is not applied
# to the features here. If X was scaled before `logreg` was fitted, the same
# transform must be applied before predict_proba -- confirm.
print("Evaluating hybrid model...")
all_labels, all_scores = [], []

# Loop-invariant: fetch the model representations once instead of recomputing
# them for every candidate article.
# NOTE(review): element [0] of each returned tuple is used, exactly as before.
# If model_fm is a LightFM model, [0] is the bias vector and [1] holds the
# embeddings -- confirm this matches how hybrid_data.npz was built.
user_repr = model_fm.get_user_representations()[0]
item_repr = model_fm.get_item_representations()[0]

for row in tqdm(valid_behaviors.itertuples(), total=len(valid_behaviors), desc="Evaluating"):
    uid = row.UserID
    if uid not in user2idx:
        continue
    user_cf = user_repr[user2idx[uid]]
    # The content profile only depends on the user, so reshape it once per row.
    user_vector = user_profiles[uid].reshape(1, -1) if uid in user_profiles else None

    labels, features = [], []
    for imp in row.Impressions.split():
        if '-' not in imp:
            continue

        # Split on the last '-' only, so an ID that itself contains '-' still parses.
        nid, label = imp.rsplit('-', 1)
        labels.append(int(label))

        global_ctr = ctr_global.get(nid, 0.0)
        time_ctr = time_aware_ctr.get(nid, 0.0)
        score_ctr = ALPHA * time_ctr + (1 - ALPHA) * global_ctr

        if user_vector is not None and nid in news_id_to_index:
            news_vector = tfidf_matrix[news_id_to_index[nid]]
            score_cbf = cosine_similarity(user_vector, news_vector)[0][0]
        else:
            score_cbf = 0.0

        if nid in news2idx:
            score_cf = np.dot(user_cf, item_repr[news2idx[nid]])
        else:
            score_cf = 0.0

        features.append([score_ctr, score_cbf, score_cf])

    # Score all candidates of the impression in one call, and skip the call
    # entirely for impressions that would be discarded anyway.
    if len(set(labels)) > 1:
        click_probs = logreg.predict_proba(np.array(features))[:, 1]  # P(click)
        all_labels.append(labels)
        all_scores.append(list(click_probs))



Evaluating hybrid model...


Evaluating:   0%|          | 0/73152 [00:00<?, ?it/s][codecarbon INFO @ 23:05:13] Energy consumed for RAM : 0.000003 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:05:13] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:05:13] Energy consumed for All CPU : 0.000048 kWh
[codecarbon INFO @ 23:05:13] 0.000051 kWh of electricity used since the beginning.
Evaluating:   1%|▏         | 964/73152 [00:01<01:21, 890.37it/s] [codecarbon INFO @ 23:05:14] Energy consumed for RAM : 0.000004 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:05:14] Delta energy consumed for CPU with constant : 0.000011 kWh, power : 42.5 W
[codecarbon INFO @ 23:05:14] Energy consumed for All CPU : 0.000059 kWh
[codecarbon INFO @ 23:05:14] 0.000063 kWh of electricity used since the beginning.
Evaluating:   3%|▎         | 1853/73152 [00:01<01:05, 1094.66it/s][codecarbon INFO @ 23:05:15] Energy consumed for RAM : 0.000005 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:05:15] Delta en

In [40]:

# Define evaluation metrics
def mrr_score(labels, scores):
    """Reciprocal rank of the first clicked item when sorted by descending score.

    Args:
        labels: per-candidate labels; an item counts as clicked when its label == 1.
        scores: per-candidate scores, aligned with ``labels``.

    Returns:
        1 / rank of the highest-scored clicked item, or 0.0 if none is clicked.
    """
    ranking = np.argsort(scores)[::-1]
    ranked_labels = np.array(labels)[ranking]
    hit_positions = np.flatnonzero(ranked_labels == 1)
    if hit_positions.size == 0:
        return 0.0
    return 1.0 / (int(hit_positions[0]) + 1)

def dcg_score(labels, scores, k):
    """Discounted cumulative gain over the ``k`` highest-scored candidates.

    Args:
        labels: per-candidate gains (here the click labels).
        scores: per-candidate scores used for ranking, aligned with ``labels``.
        k: number of top-ranked candidates to include.

    Returns:
        Sum of gain / log2(position + 1) over the top-k positions (1-based).
    """
    top_k = np.argsort(scores)[::-1][:k]
    relevance = np.array(labels)[top_k]
    # Position i (0-based) is discounted by log2(i + 2).
    log_discounts = np.log2(np.arange(2, len(relevance) + 2))
    return np.sum(relevance / log_discounts)

def ndcg_score(labels, scores, k):
    """DCG@k of the given ranking divided by the DCG@k of the ideal ranking.

    The ideal ranking is obtained by ranking the candidates by their own labels.

    Returns:
        The normalized value, or 0.0 when the ideal DCG is not positive.
    """
    actual = dcg_score(labels, scores, k)
    best_possible = dcg_score(labels, labels, k)
    if best_possible > 0:
        return actual / best_possible
    return 0.0

# Aggregate the per-impression metrics over all evaluated impressions
impression_pairs = list(zip(all_labels, all_scores))

# AUC needs both a positive and a negative example within the impression
auc = [roc_auc_score(lbls, scs) for lbls, scs in impression_pairs if len(set(lbls)) > 1]
mrr = [mrr_score(lbls, scs) for lbls, scs in impression_pairs]
ndcg5 = [ndcg_score(lbls, scs, 5) for lbls, scs in impression_pairs]
ndcg10 = [ndcg_score(lbls, scs, 10) for lbls, scs in impression_pairs]

# Report the mean of each metric
print("\n--- Hybrid Model Evaluation ---")
print(f"AUC: {np.mean(auc):.4f}")
print(f"MRR: {np.mean(mrr):.4f}")
print(f"nDCG@5: {np.mean(ndcg5):.4f}")
print(f"nDCG@10: {np.mean(ndcg10):.4f}")


[codecarbon INFO @ 23:06:29] Energy consumed for RAM : 0.000067 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:06:29] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:06:29] Energy consumed for All CPU : 0.000946 kWh
[codecarbon INFO @ 23:06:29] 0.001013 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:06:29] 0.000379 g.CO2eq/s mean an estimation of 11.948448486708168 kg.CO2eq/year
[codecarbon INFO @ 23:06:30] Energy consumed for RAM : 0.000067 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:06:30] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon INFO @ 23:06:30] Energy consumed for All CPU : 0.000958 kWh
[codecarbon INFO @ 23:06:30] 0.001025 kWh of electricity used since the beginning.
[codecarbon INFO @ 23:06:31] Energy consumed for RAM : 0.000068 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:06:31] Delta energy consumed for CPU with constant : 0.000012 kWh, power : 42.5 W
[codecarbon IN


--- Hybrid Model Evaluation ---
AUC: 0.6920
MRR: 0.3955
nDCG@5: 0.3780
nDCG@10: 0.4371


In [41]:
# Stop tracking
emissions = tracker.stop()
# --- Generate and print report ---
# Reads the last row of the tracker's CSV output, formats a short text report,
# prints it and stores it under emissions/hybrid.txt. Report generation is
# best-effort: any failure is printed and the metrics below are still saved.
try:
    df = pd.read_csv("hybrid.csv")
    emissions_data = df.iloc[-1]

    duration_hr = emissions_data['duration'] / 3600  # 'duration' is divided by 3600 to get hours
    energy_kwh = emissions_data['energy_consumed']
    cpu_power = emissions_data['cpu_power']

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # tracker.stop() may return None; formatting None with ':.6f' would raise
    # and lose the whole report, so fall back to a placeholder instead.
    emissions_text = f"{emissions:.6f}" if emissions is not None else "n/a"

    report = f"""\
📄 Emissions Report – {timestamp}
====================================
🌱 Total Emissions:     {emissions_text} kg CO2eq

🕒 Duration:            {duration_hr:.2f} hours
⚡ Energy Consumed:     {energy_kwh:.4f} kWh
🧠 CPU Power:           {cpu_power:.2f} W

🌍 Machine:             MacBook Air (CPU Only)
====================================
"""

    print(report)

    os.makedirs("emissions", exist_ok=True)
    # The report contains emoji: write it as UTF-8 explicitly, otherwise the
    # platform default encoding can raise UnicodeEncodeError.
    with open("emissions/hybrid.txt", "w", encoding="utf-8") as f:
        f.write(report)

except Exception as e:
    print(f"⚠️ Error generating emissions report: {e}")
os.makedirs("results", exist_ok=True)
# Save overall metrics after they are calculated (tab-separated, one metric per line)
with open("results/evaluation_metrics.txt", "w", encoding="utf-8") as f:
    f.write("Metric\tValue\n")
    f.write(f"AUC\t{np.mean(auc):.6f}\n")
    f.write(f"MRR\t{np.mean(mrr):.6f}\n")
    f.write(f"nDCG@5\t{np.mean(ndcg5):.6f}\n")
    f.write(f"nDCG@10\t{np.mean(ndcg10):.6f}\n")

print("Scores and metrics saved to results/ directory")

[codecarbon INFO @ 23:06:32] Energy consumed for RAM : 0.000069 kWh. RAM Power : 3.0 W
[codecarbon INFO @ 23:06:32] Delta energy consumed for CPU with constant : 0.000005 kWh, power : 42.5 W
[codecarbon INFO @ 23:06:32] Energy consumed for All CPU : 0.000974 kWh
[codecarbon INFO @ 23:06:32] 0.001042 kWh of electricity used since the beginning.


📄 Emissions Report – 2025-04-28 23:06:32
🌱 Total Emissions:     0.000031 kg CO2eq

🕒 Duration:            0.02 hours
⚡ Energy Consumed:     0.0010 kWh
🧠 CPU Power:           42.50 W

🌍 Machine:             MacBook Air (CPU Only)

Scores and metrics saved to results/ directory


In [42]:
print("Saving model components...")

# Persist the fitted logistic regression model as a pickle file
model_path = 'hybrid_logreg_model_2.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(logreg, model_file)

print("Done! All components saved successfully.")

Saving model components...
Done! All components saved successfully.
