In [1]:
import pandas as pd
import numpy as np
import warnings

# Suppress division by zero warnings (handled manually)
warnings.filterwarnings("ignore")

# 1. CONFIGURATION

In [2]:
class Config:
    """Static settings read by the data-loading and recommendation steps."""

    # How many books are recommended per user.
    TOP_N = 10

    # Names of the columns looked up in the CSV files.
    RATING_COL = 'rating'
    BOOK_ID_COL = 'book_id'
    USER_ID_COL = 'user_id'

    # Input CSV files, passed as-is to pd.read_csv.
    TEST_USERS = 'test_users.csv'
    TRAIN_INTERACTIONS = 'interactions.csv'

# 2. DATA LOADING & MATRIX CONSTRUCTION

In [3]:
def load_data():
    """Load the interaction and test-user CSVs and build the dense rating matrix.

    Reads ``Config.TRAIN_INTERACTIONS`` and ``Config.TEST_USERS``, drops rows
    whose ID columns are missing, casts the IDs to ``int`` and gives every user
    (from either file) and every book (from the interactions file) a contiguous
    row / column index in sorted-ID order.

    Returns:
        tuple: ``(R, seen_mask, user_to_idx, idx_to_user, idx_to_book, test_users)``

        * ``R`` -- float32 array of shape (n_users, n_books) holding the stored
          ratings; 0 where there is no interaction or the rating is missing.
        * ``seen_mask`` -- bool array of the same shape, True wherever an
          interaction row exists (whatever its rating).
        * ``user_to_idx`` / ``idx_to_user`` -- user ID <-> row index mappings.
        * ``idx_to_book`` -- column index -> book ID mapping.
        * ``test_users`` -- the cleaned test-user DataFrame.
    """
    print("Loading Data...")
    interactions = pd.read_csv(Config.TRAIN_INTERACTIONS)
    test_users = pd.read_csv(Config.TEST_USERS)

    # 1. Standardize IDs to Integers
    # dropna(...).astype({...}) builds new frames instead of assigning columns
    # on a dropna() result, which can trigger chained-assignment warnings.
    interactions = (
        interactions
        .dropna(subset=[Config.USER_ID_COL, Config.BOOK_ID_COL])
        .astype({Config.USER_ID_COL: int, Config.BOOK_ID_COL: int})
    )
    test_users = (
        test_users
        .dropna(subset=[Config.USER_ID_COL])
        .astype({Config.USER_ID_COL: int})
    )

    # 2. Identify All Unique Users and Books
    # Test users are included so that users without any interaction still get a row.
    all_users = sorted(
        set(interactions[Config.USER_ID_COL].unique())
        | set(test_users[Config.USER_ID_COL].unique())
    )
    all_books = sorted(interactions[Config.BOOK_ID_COL].unique())

    # 3. Create Index Mappings
    user_to_idx = {uid: i for i, uid in enumerate(all_users)}
    idx_to_user = dict(enumerate(all_users))

    book_to_idx = {bid: i for i, bid in enumerate(all_books)}
    idx_to_book = dict(enumerate(all_books))

    n_users = len(all_users)
    n_books = len(all_books)

    print(f"Matrix Dimensions: {n_users} Users x {n_books} Books")

    # 4. Build Interaction Matrix R and Seen Mask
    # R: Stores the ratings
    # seen_mask: Stores True if ANY interaction happened (even if rating is 0)
    R = np.zeros((n_users, n_books), dtype=np.float32)
    seen_mask = np.zeros((n_users, n_books), dtype=bool)

    # Vectorized ID -> index lookup; every ID is a key of the mapping by construction.
    u_idxs = interactions[Config.USER_ID_COL].map(user_to_idx).to_numpy(dtype=np.intp)
    b_idxs = interactions[Config.BOOK_ID_COL].map(book_to_idx).to_numpy(dtype=np.intp)

    # A row with a missing rating is still an interaction: store 0 so that NaN
    # never enters R (one NaN would turn the affected norms, similarities and
    # predictions into NaN), while the seen mask below still records it.
    r_vals = interactions[Config.RATING_COL].fillna(0).to_numpy(dtype=np.float32)

    R[u_idxs, b_idxs] = r_vals
    seen_mask[u_idxs, b_idxs] = True  # Mark as seen regardless of rating value

    return R, seen_mask, user_to_idx, idx_to_user, idx_to_book, test_users

# 3. COSINE SIMILARITY & PREDICTION

In [4]:
def compute_cosine_similarity(M, kind='user'):
    """
    Computes Cosine Similarity: DotProduct(A, B) / (|A|*|B|)

    Args:
        M: 2-D array; rows are compared for ``kind='user'``, columns for
            ``kind='item'``.
        kind: ``'user'`` or ``'item'``.

    Returns:
        Square similarity matrix (rows x rows for 'user', columns x columns
        for 'item') with the diagonal set to 0. A vector whose norm is 0 gets
        similarity 0 with every other vector.

    Raises:
        ValueError: if ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    # Reject unknown kinds up front; otherwise no branch would define the result.
    if kind not in ('user', 'item'):
        raise ValueError(f"kind must be 'user' or 'item', got {kind!r}")

    print(f"Calculating {kind}-based Cosine Similarity:")

    M = np.asarray(M)
    by_user = kind == 'user'

    # Norm of each row (user) or each column (item), kept 2-D for broadcasting.
    norms = np.sqrt(np.sum(M**2, axis=1 if by_user else 0, keepdims=True))
    norms[norms == 0] = 1.0  # Avoid div/0: all-zero vectors stay all-zero
    M_norm = M / norms

    if by_user:
        # Sim(u, v) = R[u] . R[v] / (|R[u]| * |R[v]|)
        sim_matrix = np.dot(M_norm, M_norm.T)
    else:
        # Sim(i, j) = R[:,i] . R[:,j] / (|R[:,i]| * |R[:,j]|)
        sim_matrix = np.dot(M_norm.T, M_norm)

    # Self-similarity is zeroed so an entity never counts as its own neighbour.
    np.fill_diagonal(sim_matrix, 0)
    return sim_matrix

def predict(R, sim_matrix, kind='user'):
    """Score every (user, book) pair as a similarity-weighted sum of ratings.

    Args:
        R: 2-D rating array (one row per user, one column per book).
        sim_matrix: user x user similarities for ``kind='user'``,
            book x book similarities for ``kind='item'``.
        kind: ``'user'`` computes ``sim_matrix @ R``; ``'item'`` computes
            ``R @ sim_matrix``.

    Returns:
        Array with the same shape as ``R``. The sums are not divided by the
        total similarity, so the values are ranking scores rather than ratings
        on the original scale.

    Raises:
        ValueError: if ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    # Reject unknown kinds up front; otherwise no branch would define the result.
    if kind not in ('user', 'item'):
        raise ValueError(f"kind must be 'user' or 'item', got {kind!r}")

    print(f"Predicting ratings using {kind}-based method:")
    if kind == 'user':
        return np.dot(sim_matrix, R)
    return np.dot(R, sim_matrix)

# 4. GENERATE RECOMMENDATIONS

In [5]:
def save_recommendations(seen_mask, pred_matrix, user_to_idx, idx_to_book, target_uids, filename, top_n=None):
    """Write each target user's best-scoring unseen books to a CSV file.

    Args:
        seen_mask: bool array (users x books); True marks books the user has
            already interacted with, which are never recommended.
        pred_matrix: score array of the same shape. It is not modified.
        user_to_idx: mapping user ID -> row index.
        idx_to_book: mapping column index -> book ID.
        target_uids: iterable of user IDs to produce rows for. IDs missing
            from ``user_to_idx`` get an empty recommendation string.
        filename: path of the CSV to write.
        top_n: maximum number of books per user; defaults to ``Config.TOP_N``.

    The CSV has the columns ``user_id`` and ``recommendations``; the latter is
    a space-separated list of book IDs, best score first. A user with fewer
    than ``top_n`` unseen books gets a shorter list.
    """
    if top_n is None:
        top_n = Config.TOP_N

    results = []

    print(f"Extracting top {top_n} recommendations:")

    for uid in target_uids:
        if uid not in user_to_idx:
            results.append({'user_id': uid, 'recommendations': ""})
            continue

        u_idx = user_to_idx[uid]
        user_seen_bool = seen_mask[u_idx]

        # Work on a copy: indexing a row yields a view, so masking it in place
        # would overwrite the caller's pred_matrix.
        user_preds = np.array(pred_matrix[u_idx])

        # Use the seen_mask to filter out ALL interacted items (including 0 ratings)
        user_preds[user_seen_bool] = -np.inf

        # Best-first column indices; the slice also copes with top_n <= 0.
        top_indices = np.argsort(user_preds)[::-1][:max(top_n, 0)]

        # With fewer than top_n unseen books the tail of top_indices consists
        # of masked (seen) books -- drop them instead of recommending them.
        recs = [idx_to_book[i] for i in top_indices if not user_seen_bool[i]]

        # Format string
        recs_str = " ".join(map(str, recs))
        results.append({'user_id': uid, 'recommendations': recs_str})

    # Explicit columns keep the header present even when there are no target users.
    pd.DataFrame(results, columns=['user_id', 'recommendations']).to_csv(filename, index=False)
    print(f"***Saved results to {filename}")

# 5. MAIN

In [6]:
def main():
    """Run user-based and item-based CF end to end, writing one CSV per method."""
    # Load the rating matrix, seen mask and lookup tables; the index -> user
    # mapping is returned by load_data() but is not needed here.
    R, seen_mask, user_to_idx, _idx_to_user, idx_to_book, test_users = load_data()
    target_uids = test_users[Config.USER_ID_COL].unique()

    # (banner, similarity kind, output file) for each method, in execution order.
    runs = (
        ("\nMethod 1: User-Based CF", 'user', 'user_based_results.csv'),
        ("\nMethod 2: Item-Based CF", 'item', 'item_based_results.csv'),
    )

    for banner, kind, out_file in runs:
        print(banner)
        similarity = compute_cosine_similarity(R, kind=kind)
        scores = predict(R, similarity, kind=kind)
        save_recommendations(seen_mask, scores, user_to_idx, idx_to_book, target_uids, out_file)

    print("\nDone. You can now evaluate 'user_based_results.csv' and 'item_based_results.csv'.")

# Run the whole pipeline when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()

Loading Data...
Matrix Dimensions: 2285 Users x 6880 Books

Method 1: User-Based CF
Calculating user-based Cosine Similarity:
Predicting ratings using user-based method:
Extracting top 10 recommendations:
***Saved results to user_based_results.csv

Method 2: Item-Based CF
Calculating item-based Cosine Similarity:
Predicting ratings using item-based method:
Extracting top 10 recommendations:
***Saved results to item_based_results.csv

Done. You can now evaluate 'user_based_results.csv' and 'item_based_results.csv'.
