# Abdelrahman Ayman Samy Mohamed, 222100930
# Yassmin Mohamed Mahmoud Metwally, 222101910
# Shahd Mamdouh Ali Hassan, 222102250
# Seif Amr Abdelhafez abdo , 222102312

In [1]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, load_npz, save_npz
from sklearn.metrics.pairwise import cosine_similarity

# Loading Data

In [2]:
interactions = pd.read_csv('../data/user_interactions.csv')
courses = pd.read_csv('../data/courses_processed.csv')

print(f"Loaded interactions: {interactions.shape}")
print(f"Loaded courses: {courses.shape}")
print(f"Interactions columns: {interactions.columns.tolist()}")

# Ensure required columns exist
required_cols = {'user_id', 'course_index', 'rating'}
missing = required_cols - set(interactions.columns)
if missing:
    raise ValueError(f"Missing required interaction columns: {missing}")

# Drop invalid indices (safety)
valid_mask = interactions['course_index'].between(0, len(courses) - 1)
invalid = (~valid_mask).sum()
if invalid > 0:
    interactions = interactions[valid_mask].copy()
print(f"Valid interactions: {len(interactions):,} (dropped {invalid:,} invalid)")


Loaded interactions: (54944, 5)
Loaded courses: (4551, 15)
Interactions columns: ['user_id', 'course_index', 'course_title', 'rating', 'course_cluster']
Valid interactions: 54,044 (dropped 900 invalid)


# User-Item Rating Matrix

In [3]:
unique_users = interactions['user_id'].unique()
user_id_to_idx = {u: i for i, u in enumerate(unique_users)}
interactions['user_idx'] = interactions['user_id'].map(user_id_to_idx)

n_users = len(unique_users)
n_items = len(courses)

user_item_matrix = csr_matrix(
    (interactions['rating'].values,
    (interactions['user_idx'].values, interactions['course_index'].values)),
    shape=(n_users, n_items)
)

print(f"\nUser-item matrix shape: {user_item_matrix.shape}")
print(f"  - Users: {n_users:,}")
print(f"  - Courses: {n_items:,}")
print(f"  - Ratings (nnz): {user_item_matrix.nnz:,}")


User-item matrix shape: (5000, 4551)
  - Users: 5,000
  - Courses: 4,551
  - Ratings (nnz): 50,002


# Item-Item Cosine Similarity

In [17]:
print("\nComputing item-item cosine similarity (from ratings)...")
item_user_matrix = user_item_matrix.T  # (n_items, n_users)


item_item_sim = cosine_similarity(item_user_matrix, dense_output=False)
item_item_sim.setdiag(0)  # remove self similarity

print(f"✓ Item-item similarity computed: {item_item_sim.shape}")
print(f"  - Non-zeros: {item_item_sim.nnz:,}")


Computing item-item cosine similarity (from ratings)...
✓ Item-item similarity computed: (4551, 4551)
  - Non-zeros: 276,955


# Predictions

In [18]:
def predict_rating_itemcf(user_id, target_item_idx, user_item_matrix, item_item_sim, user_id_to_idx, k_neighbors=30, min_neighbors=3, eps=1e-10):
    """
    Item-CF with minimum neighbor threshold.
    If fewer than min_neighbors similar items exist, fallback to user's mean rating.
    """
    if user_id not in user_id_to_idx:
        return 3.0
    
    u = user_id_to_idx[user_id]
    user_row = user_item_matrix[u]
    rated_items = user_row.indices
    rated_ratings = user_row.data
    
    if rated_items.size == 0:
        return 3.0
    
    user_mean = float(np.mean(rated_ratings))
    
    # Get similarities
    sims = item_item_sim[target_item_idx, rated_items].toarray().ravel()
    
    # Filter to non-zero similarities
    nonzero_mask = sims != 0
    if nonzero_mask.sum() < min_neighbors:
        # Too few neighbors -> fallback to user mean
        return float(np.clip(user_mean, 1, 5))
    
    sims = sims[nonzero_mask]
    rated_ratings_filtered = rated_ratings[nonzero_mask]
    
    # Use top-K if we have more than K
    if len(sims) > k_neighbors:
        top_k_idx = np.argsort(np.abs(sims))[::-1][:k_neighbors]
        sims = sims[top_k_idx]
        rated_ratings_filtered = rated_ratings_filtered[top_k_idx]
    
    denom = np.sum(np.abs(sims)) + eps
    pred = np.sum(sims * rated_ratings_filtered) / denom
    return float(np.clip(pred, 1, 5))

# Top N

In [19]:
def recommend_top_n_itemcf(user_id, user_item_matrix, item_item_sim, user_id_to_idx, n=20, k_neighbors=30, min_neighbors=3):
    """
    Recommend top-N using item-CF with neighbor threshold.
    """
    if user_id not in user_id_to_idx:
        pop = np.asarray((user_item_matrix != 0).sum(axis=0)).ravel()
        top = np.argsort(pop)[::-1][:n]
        return [(int(i), None) for i in top]
    
    u = user_id_to_idx[user_id]
    user_row = user_item_matrix[u]
    rated = set(user_row.indices.tolist())
    
    preds = []
    for item_idx in range(user_item_matrix.shape[1]):
        if item_idx in rated:
            continue
        p = predict_rating_itemcf(user_id, item_idx, user_item_matrix, item_item_sim, user_id_to_idx, 
                                   k_neighbors=k_neighbors, min_neighbors=min_neighbors)
        preds.append((item_idx, p))
    
    preds.sort(key=lambda x: x[1], reverse=True)
    return preds[:n]

# Testing

In [20]:
test_user_id = interactions['user_id'].iloc[0]
print(f"\n=== Test user: {test_user_id} (user mean={user_item_matrix[user_id_to_idx[test_user_id]].data.mean():.2f}) ===")

top20 = recommend_top_n_itemcf(test_user_id, user_item_matrix, item_item_sim, user_id_to_idx, n=20, k_neighbors=30, min_neighbors=3)

print("\nRank | Course Index | Predicted Rating | Course Title")
print("-" * 95)
for r, (idx, pred) in enumerate(top20, 1):
    title = courses.iloc[idx]['course_title'] if 'course_title' in courses.columns else str(idx)
    pred_str = f"{pred:.2f}" if pred is not None else "N/A"
    print(f"{r:4d} | {idx:12d} | {pred_str:15s} | {title[:55]}")


=== Test user: U00000 (user mean=2.25) ===

Rank | Course Index | Predicted Rating | Course Title
-----------------------------------------------------------------------------------------------
   1 |         4082 | 4.94            | IBM Data Science
   2 |         1375 | 4.94            | Photoshop - Tratamento de pele profissional
   3 |         3951 | 4.93            | Everyday Excel, Part 1
   4 |         3760 | 4.93            | Blockchain: Foundations and Use Cases
   5 |         4132 | 4.92            | Introduction to Big Data
   6 |         4181 | 4.91            | Introduction to Systems Engineering
   7 |         4182 | 4.91            | Introduction to TensorFlow for Artificial Intelligence,
   8 |         3738 | 4.82            | Autodesk CAD/CAM/CAE for Mechanical Engineering
   9 |         4077 | 4.81            | Human Resource Management: HR for People Managers
  10 |         4387 | 4.80            | Six Sigma Green Belt
  11 |         4073 | 4.79            | Hotel M

# Build USER-ITEM rating matrix R

In [42]:
R = csr_matrix(
    (interactions['rating'].values,
     (interactions['user_idx'].values, interactions['course_index'].values)),
    shape=(n_users, n_items)
)

print(f"User-item matrix R: {R.shape}, nnz={R.nnz:,}")

User-item matrix R: (5000, 4551), nnz=50,002


# SVD With 20 Latent Factor

In [None]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split as surprise_split

print("=== SVD using Surprise library (designed for collaborative filtering) ===\n")

# 1) Prepare data in Surprise format
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(interactions[['user_id', 'course_index', 'rating']], reader)

# 2) Train SVD
trainset = data.build_full_trainset()
algo = SVD(n_factors=20, random_state=42)
algo.fit(trainset)

print(f"✓ SVD trained with {algo.n_factors} factors")

=== SVD using Surprise library (designed for collaborative filtering) ===

✓ SVD trained with 20 factors


# Prediction Of Ratings

In [44]:
def recommend_svd_surprise(user_id, n=20):
    # Get all courses user hasn't rated
    user_rated = set(interactions.loc[interactions['user_id'] == user_id, 'course_index'].values)
    all_courses = set(range(len(courses)))
    unrated = all_courses - user_rated
    
    # Predict for all unrated courses
    preds = []
    for course_idx in unrated:
        pred = algo.predict(user_id, course_idx)
        preds.append((course_idx, pred.est))  # pred.est is the estimated rating
    
    # Sort by predicted rating
    preds.sort(key=lambda x: x[1], reverse=True)
    return preds[:n]

# Top-N recommendations for target users

In [45]:
test_user = 'U00000'
top20 = recommend_svd_surprise(test_user, n=20)

print(f"\n=== Top-20 SVD Recommendations for {test_user} ===\n")
print("Rank | Course Index | Predicted Rating | Course Title")
print("-" * 95)
for rank, (idx, pred) in enumerate(top20, 1):
    title = courses.iloc[idx]['course_title'] if 'course_title' in courses.columns else str(idx)
    print(f"{rank:4d} | {idx:12d} | {pred:15.2f} | {title[:55]}")



=== Top-20 SVD Recommendations for U00000 ===

Rank | Course Index | Predicted Rating | Course Title
-----------------------------------------------------------------------------------------------
   1 |         4000 |            4.51 | Full Stack Web and Multiplatform Mobile App Development
   2 |         4278 |            4.48 | Music Business Foundations
   3 |         4105 |            4.47 | Inspired Leadership
   4 |         3230 |            4.45 | Basic Concepts of Web Development, HTTP and Java Servle
   5 |         4494 |            4.44 | Understanding Cancer Metastasis
   6 |         4190 |            4.42 | Introductory Human Physiology
   7 |         4073 |            4.42 | Hotel Management: Distribution, Revenue and Demand Mana
   8 |         4362 |            4.41 | Rethinking International Tax Law
   9 |         3997 |            4.40 | From the Big Bang to Dark Energy
  10 |         4443 |            4.40 | Teach English Now! Teaching Language Online
  11 |         