# Abdelrahman Ayman Samy Mohamed, 222100930
# Yassmin Mohamed Mahmoud Metwally, 222101910
# Shahd Mamdouh Ali Hassan, 222102250
# Seif Amr Abdelhafez abdo , 222102312

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, vstack, load_npz  
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD

# Loading Data

In [2]:
interactions = pd.read_csv('../data/user_interactions.csv')     # columns: user_id, course_index, rating
courses = pd.read_csv('../data/courses_processed.csv')          

# Load content feature matrix (sparse)
item_feature_matrix = load_npz('../results/item_feature_matrix.npz')

print("Loaded interactions:", interactions.shape)
print("Loaded courses:", courses.shape)
print("Loaded item_feature_matrix:", item_feature_matrix.shape)

# Ensure indices valid
interactions = interactions[interactions['course_index'].between(0, len(courses) - 1)].copy()

Loaded interactions: (54944, 5)
Loaded courses: (4551, 15)
Loaded item_feature_matrix: (4551, 5032)


# Train/Validation split

In [3]:
train_df, val_df = train_test_split(interactions, test_size=0.2, random_state=42)

# CF Model (SVD) on TRAIN

In [4]:
train_users = train_df['user_id'].unique()
user_id_to_u = {u: i for i, u in enumerate(train_users)}

train_df = train_df.copy()
train_df['user_u'] = train_df['user_id'].map(user_id_to_u)

n_users = len(train_users)
n_items = len(courses)

R_train = csr_matrix(
    (train_df['rating'].values,
    (train_df['user_u'].values, train_df['course_index'].values)),
    shape=(n_users, n_items)
)

k = 20  
svd = TruncatedSVD(n_components=k, random_state=42)
U = svd.fit_transform(R_train)   # (n_users, k)
S = svd.singular_values_         # (k,)
Vt = svd.components_             # (k, n_items)

CF_pred = (U * S) @ Vt           # predicted ratings 
CF_pred = np.clip(CF_pred, 1, 5)

print("\nCF_pred shape:", CF_pred.shape)


CF_pred shape: (5000, 4551)


# CB Model

In [5]:
profiles = []
for u in train_users:
    u_rows = train_df[train_df['user_id'] == u]
    idx = u_rows['course_index'].values.astype(int)
    r = u_rows['rating'].values.astype(float)

    feats = item_feature_matrix[idx]                 # sparse
    weighted_sum = (feats.T.multiply(r)).T.sum(axis=0)
    profile = weighted_sum / (r.sum() + 1e-10)
    profiles.append(csr_matrix(profile))

user_profile_train = vstack(profiles, format='csr')

CB_scores = cosine_similarity(user_profile_train, item_feature_matrix)  # (n_users, n_items)
print("CB_scores shape:", CB_scores.shape)

CB_scores shape: (5000, 4551)


# Normalizing scores to [0,1]

In [6]:
def minmax_2d(M):
    mn = M.min()
    mx = M.max()
    if (mx - mn) < 1e-12:
        return np.zeros_like(M)
    return (M - mn) / (mx - mn)

CB_norm = minmax_2d(CB_scores)
CF_norm = minmax_2d(CF_pred)

#  Validating alphas using RMSE

In [7]:
val_known = val_df[val_df['user_id'].isin(user_id_to_u)].copy()
val_known['user_u'] = val_known['user_id'].map(user_id_to_u)

def rmse(y_true, y_pred):
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))

alphas = [0.3, 0.5, 0.7]
results = []

print("\nValidation rows with known users:", len(val_known), "/", len(val_df))

for a in alphas:
    hybrid01 = a * CB_norm + (1 - a) * CF_norm      # in [0,1]
    preds01 = hybrid01[val_known['user_u'].values, val_known['course_index'].values]
    preds = 1 + preds01 * 4                         # back to [1,5] scale
    score = rmse(val_known['rating'].values, preds)
    results.append((a, score))
    print(f"alpha={a:.1f} -> RMSE={score:.4f}")

best_alpha, best_rmse = sorted(results, key=lambda x: x[1])[0]
print(f"\nBest alpha = {best_alpha:.1f} with RMSE = {best_rmse:.4f}")


Validation rows with known users: 10809 / 10809
alpha=0.3 -> RMSE=1.3193
alpha=0.5 -> RMSE=1.1214
alpha=0.7 -> RMSE=1.0355

Best alpha = 0.7 with RMSE = 1.0355


# Top-N using best alpha(0.7)

In [8]:
def recommend_hybrid(user_id, n=10):
    if user_id not in user_id_to_u:
        raise ValueError("User not in TRAIN split. Choose a user from train_users.")

    u = user_id_to_u[user_id]
    hybrid01 = best_alpha * CB_norm + (1 - best_alpha) * CF_norm
    scores = hybrid01[u].copy()

    # remove already rated courses
    rated = set(interactions.loc[interactions['user_id'] == user_id, 'course_index'].astype(int).tolist())
    if len(rated) > 0:
        scores[list(rated)] = -1

    top_idx = np.argsort(scores)[::-1][:n]

    # pick a title column that exists
    title_col = 'course_title'
    if title_col not in courses.columns:
        # fallback: use first column as "title"
        title_col = courses.columns[0]

    recs = []
    for i in top_idx:
        recs.append({
            "course_index": int(i),
            "hybrid_score_0to1": float(scores[i]),
            "title": str(courses.iloc[i][title_col])
        })
    return pd.DataFrame(recs)


test_user = 'U00000'
if test_user not in user_id_to_u:
    test_user = train_users[0]

top10_df = recommend_hybrid(test_user, n=10)
print(f"\nTop-10 hybrid recommendations for user {test_user} (alpha={best_alpha:.1f}):")
display(top10_df)


Top-10 hybrid recommendations for user U00000 (alpha=0.7):


Unnamed: 0,course_index,hybrid_score_0to1,title
0,4244,0.9575,Marketing Gerencial
1,4185,0.945849,Introduction to Typography
2,4222,0.93948,Liderazgo efectivo para el siglo XXI
3,4015,0.938529,Fundamentals of Reinforcement Learning
4,4527,0.938453,Wonders of Ancient Egypt
5,4535,0.938215,Étudier en France: French Intermediate course ...
6,3946,0.936884,Essentials in Clinical Simulations Across the ...
7,4208,0.936834,Leading: Human Resource Management and Leadership
8,4451,0.936666,The Art of Music Production
9,3738,0.93662,Autodesk CAD/CAM/CAE for Mechanical Engineering


# Loading Data

In [9]:
TITLE_COL = 'course_title' if 'course_title' in courses.columns else courses.columns[0]

interactions = interactions[interactions['course_index'].between(0, len(courses) - 1)].copy()
print("Interactions:", interactions.shape)
print("Courses:", courses.shape)
print("Item features:", item_feature_matrix.shape)

Interactions: (54044, 5)
Courses: (4551, 15)
Item features: (4551, 5032)


# Popularity Baseline

In [10]:
item_pop = interactions.groupby('course_index').size().sort_values(ascending=False)
mostpop_list = item_pop.index.to_numpy()

def recommend_mostpop(exclude_items=set(), n=10):
    recs = []
    for i in mostpop_list:
        if int(i) not in exclude_items:
            recs.append(int(i))
        if len(recs) == n:
            break
    return recs

# Train CF model (SVD)

In [11]:
users = interactions['user_id'].unique()
user_id_to_u = {u: i for i, u in enumerate(users)}

n_users = len(users)
n_items = len(courses)

df = interactions.copy()
df['user_u'] = df['user_id'].map(user_id_to_u)

R = csr_matrix(
    (df['rating'].values, (df['user_u'].values, df['course_index'].values)),
    shape=(n_users, n_items)
)

k = 20
svd = TruncatedSVD(n_components=k, random_state=42)
U = svd.fit_transform(R)
S = svd.singular_values_
Vt = svd.components_
CF_pred = np.clip((U * S) @ Vt, 1, 5)

def minmax_2d(M):
    mn = M.min()
    mx = M.max()
    if (mx - mn) < 1e-12:
        return np.zeros_like(M)
    return (M - mn) / (mx - mn)

CF_norm = minmax_2d(CF_pred)

# build cold-start user

In [12]:
def build_user_profile_from_ratings(rated_items, ratings):
    feats = item_feature_matrix[np.array(rated_items, dtype=int)]
    r = np.array(ratings, dtype=float)
    weighted_sum = (feats.T.multiply(r)).T.sum(axis=0)
    profile = weighted_sum / (r.sum() + 1e-10)
    return csr_matrix(profile)

def cb_scores_from_profile(profile_vec):
    return cosine_similarity(profile_vec, item_feature_matrix).ravel()

def minmax_1d(v):
    mn, mx = float(np.min(v)), float(np.max(v))
    if (mx - mn) < 1e-12:
        return np.zeros_like(v)
    return (v - mn) / (mx - mn)

# Hybrid recommend for cold-start user

In [13]:
def recommend_hybrid_coldstart(user_id, rated_items, ratings, alpha=0.7, n=10):
    profile = build_user_profile_from_ratings(rated_items, ratings)
    cb01 = minmax_1d(cb_scores_from_profile(profile))

    if user_id in user_id_to_u:
        cf01 = CF_norm[user_id_to_u[user_id]]
        hybrid01 = alpha * cb01 + (1 - alpha) * cf01
    else:
        hybrid01 = cb01

    hybrid01 = hybrid01.copy()
    hybrid01[np.array(rated_items, dtype=int)] = -1

    top_idx = np.argsort(hybrid01)[::-1][:n]
    return [int(i) for i in top_idx], hybrid01[top_idx]

# Picking a demo user with >=10 ratings

In [14]:
user_counts = interactions.groupby('user_id').size().sort_values(ascending=False)
eligible_users = user_counts[user_counts >= 10].index.tolist()
if len(eligible_users) == 0:
    raise ValueError("No user has >=10 ratings in the whole dataset.")

demo_user = eligible_users[0]
user_all = interactions[interactions['user_id'] == demo_user].copy()
print(f"\nDemo user: {demo_user} has {len(user_all)} total ratings.")


Demo user: U03600 has 12 total ratings.


# Runing cold-start tests: m = 3,5,10

In [15]:
alpha_used = 0.7
ms = [3, 5, 10]

for m in ms:
    # simulate that we only know m ratings about the user
    known = user_all.sample(n=m, random_state=42)
    rated_items = known['course_index'].astype(int).tolist()
    ratings = known['rating'].astype(float).tolist()

    rec_h, score_h = recommend_hybrid_coldstart(demo_user, rated_items, ratings, alpha=alpha_used, n=10)
    rec_p = recommend_mostpop(exclude_items=set(rated_items), n=10)

    print("\n" + "="*85)
    print(f"Cold-start scenario: only m={m} known ratings (alpha={alpha_used})")
    print("- Known ratings (input to model):")
    for ci, r in zip(rated_items, ratings):
        print(f"  course_index={ci:5d} | rating={r:.1f} | {courses.iloc[ci][TITLE_COL]}")

    print("\n- Hybrid Top-10:")
    for rank, ci in enumerate(rec_h, 1):
        print(f"  {rank:2d}) {ci:5d} | score01={float(score_h[rank-1]):.4f} | {courses.iloc[ci][TITLE_COL]}")

    print("\n- MostPop Top-10:")
    for rank, ci in enumerate(rec_p, 1):
        print(f"  {rank:2d}) {ci:5d} | pop_count={int(item_pop.get(ci, 0))} | {courses.iloc[ci][TITLE_COL]}")


Cold-start scenario: only m=3 known ratings (alpha=0.7)
- Known ratings (input to model):
  course_index= 3256 | rating=1.9 | JavaScript Fundamentals
  course_index= 3226 | rating=1.7 | Learn Spring Framework Practically - Hands On Recipies
  course_index= 3231 | rating=1.4 | The Most Comprehensive Web Development Course

- Hybrid Top-10:
   1)  3032 | score01=0.9231 | PHP Development with the Laravel Framework
   2)  3251 | score01=0.9131 | HTML5 And CSS3 - Build Modern Responsive Websites
   3)  2592 | score01=0.9009 | Amazon s3 Mastery - THE How-To' Guides For Amazon S3
   4)  3325 | score01=0.8859 | Learn to make an HTML 5 website with a video background
   5)  2701 | score01=0.8618 | The Ultimate Web Development Course
   6)  3385 | score01=0.7939 | Wordpress Security
   7)  3247 | score01=0.7156 | Bootstrap Tutorial - Essentials From Basic to Advanced
   8)  3175 | score01=0.7146 | PHP For Beginners : Learn PHP From Scratch!
   9)  3219 | score01=0.7107 | Ultimate Web Designer &

# Loading Data

In [16]:
# relevance threshold 
REL_TH = 4.0

K = 10
SEED = 42
ALPHA = 0.7   

print("Interactions:", interactions.shape)
print("Courses:", courses.shape)
print("Item features:", item_feature_matrix.shape)


# 1) Train/Test split (row-wise)
train_df, test_df = train_test_split(interactions, test_size=0.2, random_state=SEED)

# Only evaluate users that appear in train (so CF/CB profiles exist)
train_users = train_df['user_id'].unique()
test_df = test_df[test_df['user_id'].isin(train_users)].copy()

print("Train:", train_df.shape, "Test:", test_df.shape)


# 2) Popularity baseline (MostPop) from TRAIN
item_pop = train_df.groupby('course_index').size().sort_values(ascending=False)
mostpop_list = item_pop.index.to_numpy()

def recommend_mostpop(exclude=set(), k=10):
    recs = []
    for i in mostpop_list:
        i = int(i)
        if i not in exclude:
            recs.append(i)
        if len(recs) == k:
            break
    return recs


# 3) Train CF (SVD) on TRAIN
user_id_to_u = {u:i for i,u in enumerate(train_users)}
n_users = len(train_users)
n_items = len(courses)

train_df2 = train_df.copy()
train_df2['user_u'] = train_df2['user_id'].map(user_id_to_u)

R_train = csr_matrix(
    (train_df2['rating'].values,
    (train_df2['user_u'].values, train_df2['course_index'].values)),
    shape=(n_users, n_items)
)

k_svd = 20
svd = TruncatedSVD(n_components=k_svd, random_state=SEED)
U = svd.fit_transform(R_train)
S = svd.singular_values_
Vt = svd.components_
CF_pred = np.clip((U * S) @ Vt, 1, 5)

def minmax_2d(M):
    mn = M.min()
    mx = M.max()
    if (mx - mn) < 1e-12:
        return np.zeros_like(M)
    return (M - mn) / (mx - mn)

CF_norm = minmax_2d(CF_pred)

# 4) Train CB user profiles on TRAIN + cosine scores

profiles = []
for u in train_users:
    u_rows = train_df[train_df['user_id'] == u]
    idx = u_rows['course_index'].values.astype(int)
    r = u_rows['rating'].values.astype(float)

    feats = item_feature_matrix[idx]
    weighted_sum = (feats.T.multiply(r)).T.sum(axis=0)
    profile = weighted_sum / (r.sum() + 1e-10)
    profiles.append(csr_matrix(profile))

user_profile_train = vstack(profiles, format='csr')
CB_scores = cosine_similarity(user_profile_train, item_feature_matrix)  # (n_users, n_items)

CB_norm = minmax_2d(CB_scores)


Interactions: (54044, 5)
Courses: (4551, 15)
Item features: (4551, 5032)
Train: (43235, 5) Test: (10809, 5)


# Recommenders (return top-K indices)

In [17]:
all_items = np.arange(n_items)

def recommend_random(exclude=set(), k=10, seed=SEED):
    rng = np.random.default_rng(seed)
    pool = np.array([i for i in all_items if int(i) not in exclude], dtype=int)
    if len(pool) <= k:
        return pool.tolist()
    return rng.choice(pool, size=k, replace=False).tolist()

def recommend_cf(user_id, exclude=set(), k=10):
    u = user_id_to_u[user_id]
    s = CF_norm[u].copy()
    if exclude:
        s[list(exclude)] = -1
    top = np.argsort(s)[::-1][:k]
    return [int(i) for i in top]

def recommend_cb(user_id, exclude=set(), k=10):
    u = user_id_to_u[user_id]
    s = CB_norm[u].copy()
    if exclude:
        s[list(exclude)] = -1
    top = np.argsort(s)[::-1][:k]
    return [int(i) for i in top]

def recommend_hybrid(user_id, exclude=set(), k=10, alpha=ALPHA):
    u = user_id_to_u[user_id]
    s = alpha * CB_norm[u] + (1 - alpha) * CF_norm[u]
    s = s.copy()
    if exclude:
        s[list(exclude)] = -1
    top = np.argsort(s)[::-1][:k]
    return [int(i) for i in top]

# Metrics: Precision, Recall, NDCG

In [18]:
def precision_at_k(recs, relevant, k=10):
    if k == 0:
        return 0.0
    recs_k = recs[:k]
    hits = sum(1 for x in recs_k if x in relevant)
    return hits / k

def recall_at_k(recs, relevant, k=10):
    if len(relevant) == 0:
        return np.nan
    recs_k = recs[:k]
    hits = sum(1 for x in recs_k if x in relevant)
    return hits / len(relevant)

def ndcg_at_k(recs, relevant, k=10):
    recs_k = recs[:k]
    # binary relevance
    gains = np.array([1.0 if x in relevant else 0.0 for x in recs_k], dtype=float)
    discounts = 1.0 / np.log2(np.arange(2, k + 2))  # ranks 1..k -> log2(rank+1)
    dcg = float(np.sum(gains * discounts))
    # ideal DCG: all relevant at the top
    ideal_gains = np.sort(gains)[::-1]
    idcg = float(np.sum(ideal_gains * discounts))
    return 0.0 if idcg == 0 else dcg / idcg

# Evaluating on users with at least 1 relevant item in test

In [19]:
test_users = test_df['user_id'].unique()

rows = []
for u in test_users:
    u_test = test_df[test_df['user_id'] == u]
    relevant = set(u_test.loc[u_test['rating'] >= REL_TH, 'course_index'].astype(int).tolist())
    if len(relevant) == 0:
        continue

    # exclude items the user already interacted with in TRAIN (standard offline eval)
    exclude = set(train_df.loc[train_df['user_id'] == u, 'course_index'].astype(int).tolist())

    rec_random = recommend_random(exclude=exclude, k=K, seed=SEED)
    rec_pop = recommend_mostpop(exclude=exclude, k=K)
    rec_cb = recommend_cb(u, exclude=exclude, k=K)
    rec_hybrid = recommend_hybrid(u, exclude=exclude, k=K, alpha=ALPHA)

    for name, recs in [
        ("Random", rec_random),
        ("MostPop", rec_pop),
        ("ContentBased", rec_cb),
        ("Hybrid", rec_hybrid),
    ]:
        rows.append({
            "user_id": u,
            "model": name,
            f"P@{K}": precision_at_k(recs, relevant, k=K),
            f"R@{K}": recall_at_k(recs, relevant, k=K),
            f"NDCG@{K}": ndcg_at_k(recs, relevant, k=K),
        })

results = pd.DataFrame(rows)

# aggregate (mean over users)
summary = results.groupby("model")[[f"P@{K}", f"R@{K}", f"NDCG@{K}"]].mean().reset_index()
summary = summary.sort_values(by=f"NDCG@{K}", ascending=False)

print("\n=== Task 11.1 Results (mean over evaluated users) ===")
display(summary)

print("\nEvaluated users:", results['user_id'].nunique(), "out of", len(test_users))


=== Task 11.1 Results (mean over evaluated users) ===


Unnamed: 0,model,P@10,R@10,NDCG@10
2,MostPop,0.012979,0.087098,0.072899
1,Hybrid,0.011715,0.066096,0.051973
0,ContentBased,0.001646,0.007076,0.007505
3,Random,3.8e-05,0.000191,0.000111



Evaluated users: 2612 out of 4524


# Comparison table showing all metrics

In [20]:
from tabulate import tabulate  

summary_11_2 = summary.copy()
metric_cols = [f"P@{K}", f"R@{K}", f"NDCG@{K}"]

# sort by best metric (NDCG) and round
summary_11_2 = summary_11_2.sort_values(by=f"NDCG@{K}", ascending=False)
summary_11_2[metric_cols] = summary_11_2[metric_cols].astype(float).round(4)

# 2) Print as markdown table (paste into report)
md_table = summary_11_2.to_markdown(index=False)
print(md_table)

# 3) Save outputs 
summary_11_2.to_csv("../results/task11_baseline_comparison.csv", index=False)
with open("../results/task11_baseline_comparison.md", "w", encoding="utf-8") as f:
    f.write(md_table)

summary_11_2  


| model        |   P@10 |   R@10 |   NDCG@10 |
|:-------------|-------:|-------:|----------:|
| MostPop      | 0.013  | 0.0871 |    0.0729 |
| Hybrid       | 0.0117 | 0.0661 |    0.052  |
| ContentBased | 0.0016 | 0.0071 |    0.0075 |
| Random       | 0      | 0.0002 |    0.0001 |


Unnamed: 0,model,P@10,R@10,NDCG@10
2,MostPop,0.013,0.0871,0.0729
1,Hybrid,0.0117,0.0661,0.052
0,ContentBased,0.0016,0.0071,0.0075
3,Random,0.0,0.0002,0.0001
