In [1]:
import pandas as pd
import time
from collections import defaultdict
import sys

In [2]:

def loop_sqrt(n):
    if n < 0:
        raise ValueError("Cannot compute square root of a negative number.")
    if n == 0:
        return 0.0
    
    guess = n / 2.0 if n > 1 else 1.0
    
    tolerance = 1e-6
    
    while True:
        new_guess = 0.5 * (guess + n / guess)
        
        difference = new_guess - guess
        
        if difference < 0:
            difference = -difference
            
        if difference < tolerance:
            break
        
        guess = new_guess
        
    return guess


In [3]:
print("--- 1. Data Loading and Initial Setup ---")
start_time = time.time()

try:
    df_movies = pd.read_csv('movie.csv')
    df_rating_full = pd.read_csv('rating.csv')
    df_rating = df_rating_full.head(5000) 
except FileNotFoundError:
    print("Error: Ensure 'movie.csv' and 'rating.csv' are available.")
    exit()

df_rating = df_rating.drop(columns=['timestamp'])

print(f"Loaded {len(df_movies)} movies.")
print(f"Using {len(df_rating)} ratings subset for demonstration (full dataset is too large for loop-only runtime).")
print(f"Data loading time: {time.time() - start_time:.4f} seconds\n")

print("User-Item Matrix created successfully.")

--- 1. Data Loading and Initial Setup ---
Loaded 27278 movies.
Using 5000 ratings subset for demonstration (full dataset is too large for loop-only runtime).
Data loading time: 19.2284 seconds

User-Item Matrix created successfully.


In [4]:
rating_matrix = df_rating.pivot_table(index='userId', columns='movieId', values='rating')

user_ratings = rating_matrix.to_dict('index')

user_avg_ratings = {}
for user, ratings in user_ratings.items():
    total_rating = 0
    count = 0
    for rating in ratings.values():
        if not pd.isna(rating):
            total_rating += rating
            count += 1
    if count > 0:
        user_avg_ratings[user] = total_rating / count
        
print(f"Number of users: {len(user_ratings)}")
print(f"Number of items: {len(rating_matrix.columns)}\n")

Number of users: 50
Number of items: 2090



In [5]:
def pearson_similarity_loop(user1_id, user2_id, user_ratings, user_avg_ratings):
    ratings1 = user_ratings.get(user1_id, {})
    ratings2 = user_ratings.get(user2_id, {})
    
    common_items = []
    for item in ratings1:
        if item in ratings2 and not pd.isna(ratings1[item]) and not pd.isna(ratings2[item]):
            common_items.append(item)
            
    if not common_items:
        return 0.0 
    
    mean1 = user_avg_ratings[user1_id]
    mean2 = user_avg_ratings[user2_id]
    
    numerator = 0.0
    denominator_term1 = 0.0 
    denominator_term2 = 0.0 
    
    for item_id in common_items:
        r1_minus_mean1 = ratings1[item_id] - mean1
        r2_minus_mean2 = ratings2[item_id] - mean2
        
        numerator += r1_minus_mean1 * r2_minus_mean2
        
        denominator_term1 += r1_minus_mean1 ** 2
        
        denominator_term2 += r2_minus_mean2 ** 2
    
    denominator = loop_sqrt(denominator_term1) * loop_sqrt(denominator_term2)
    
    if denominator == 0:
        return 0.0
    else:
        return numerator / denominator


In [6]:
print("--- 3a. UBCF Offline Phase: Computing User-User Similarity Matrix ---")
offline_start_ubcf = time.time()

user_ids = list(user_ratings.keys())
n_users = len(user_ids)
user_similarity_matrix = defaultdict(dict)

for i in range(n_users):
    user_i = user_ids[i]
    for j in range(i, n_users): 
        user_j = user_ids[j]
        
        if user_i == user_j:
            user_similarity_matrix[user_i][user_j] = 1.0
            continue
            
        loop_start = time.time()
        similarity = pearson_similarity_loop(user_i, user_j, user_ratings, user_avg_ratings)
        loop_end = time.time()
        
        user_similarity_matrix[user_i][user_j] = similarity
        user_similarity_matrix[user_j][user_i] = similarity
        
        if i < 3 and j < 5:
            print(f"Loop timing (User {user_i} vs {user_j}): {loop_end - loop_start:.6f}s")
        

offline_end_ubcf = time.time()

ubcf_space_bytes = sys.getsizeof(user_similarity_matrix)
print(f"\nUBCF Offline Phase (Similarity Matrix) Time: {offline_end_ubcf - offline_start_ubcf:.4f} seconds")


--- 3a. UBCF Offline Phase: Computing User-User Similarity Matrix ---
Loop timing (User 1 vs 2): 0.002708s
Loop timing (User 1 vs 3): 0.002336s
Loop timing (User 1 vs 4): 0.001956s
Loop timing (User 1 vs 5): 0.001111s
Loop timing (User 2 vs 3): 0.001001s
Loop timing (User 2 vs 4): 0.003060s
Loop timing (User 2 vs 5): 0.001939s
Loop timing (User 3 vs 4): 0.001008s
Loop timing (User 3 vs 5): 0.001167s

UBCF Offline Phase (Similarity Matrix) Time: 1.5995 seconds


In [7]:
print("\n--- 3b. UBCF Online Phase: Prediction Generation ---")

def predict_ubcf_loop(target_user_id, target_item_id, user_ratings, user_avg_ratings, user_similarity_matrix, k=10):
    similarity_scores = user_similarity_matrix.get(target_user_id, {})
    
    neighbors = []
    for neighbor_id, sim in similarity_scores.items():
        if neighbor_id != target_user_id:
            neighbor_ratings = user_ratings.get(neighbor_id, {})
            rating_by_neighbor = neighbor_ratings.get(target_item_id)
            
            if rating_by_neighbor is not None and not pd.isna(rating_by_neighbor):
                neighbors.append((sim, neighbor_id, rating_by_neighbor))
    
    if not neighbors:
        return user_avg_ratings.get(target_user_id, 3.0) 

    neighbors.sort(key=lambda x: x[0] if x[0] >= 0 else -x[0], reverse=True) 
    k_neighbors = neighbors[:k]
    
    numerator = 0.0
    denominator = 0.0
    
    mean_u = user_avg_ratings[target_user_id]
    
    for sim, neighbor_id, r_v_i in k_neighbors:
        mean_v = user_avg_ratings[neighbor_id]
        
        contribution = sim * (r_v_i - mean_v)
        
        numerator += contribution
        if sim < 0:
            denominator += -sim
        else:
            denominator += sim
        
    if denominator == 0:
        return mean_u
    else:
        prediction = mean_u + (numerator / denominator)
        return max(0.5, min(5.0, prediction))


--- 3b. UBCF Online Phase: Prediction Generation ---


In [8]:
TARGET_USER_UBCF = 1
TARGET_ITEM_UBCF = 4 

if TARGET_USER_UBCF not in user_ratings or TARGET_ITEM_UBCF not in rating_matrix.columns:
    TARGET_USER_UBCF = user_ids[0]
    rated_items = set(user_ratings[TARGET_USER_UBCF].keys())
    all_items = set(rating_matrix.columns)
    unrated_items = list(all_items - rated_items)
    TARGET_ITEM_UBCF = unrated_items[0] if unrated_items else rating_matrix.columns[0]


online_start_ubcf = time.time()
predicted_rating_ubcf = predict_ubcf_loop(
    TARGET_USER_UBCF, 
    TARGET_ITEM_UBCF, 
    user_ratings, 
    user_avg_ratings, 
    user_similarity_matrix, 
    k=10
)
online_end_ubcf = time.time()

movie_title_ubcf = df_movies[df_movies['movieId'] == TARGET_ITEM_UBCF]['title'].iloc[0]

print(f"Target User: {TARGET_USER_UBCF}")
print(f"Target Item: {TARGET_ITEM_UBCF} ({movie_title_ubcf})")
print(f"UBCF Predicted Rating: {predicted_rating_ubcf:.2f}")
print(f"UBCF Online Phase (Prediction) Time: {online_end_ubcf - online_start_ubcf:.4f} seconds")


item_rating_matrix = df_rating.pivot_table(index='movieId', columns='userId', values='rating')
item_ratings = item_rating_matrix.to_dict('index')

item_avg_ratings = {}
for item, ratings in item_ratings.items():
    total_rating = 0
    count = 0
    for rating in ratings.values():
        if not pd.isna(rating):
            total_rating += rating
            count += 1
    if count > 0:
        item_avg_ratings[item] = total_rating / count

Target User: 1
Target Item: 4 (Waiting to Exhale (1995))
UBCF Predicted Rating: 1.62
UBCF Online Phase (Prediction) Time: 0.0010 seconds


In [9]:
print("\n--- 4a. IBCF Offline Phase: Computing Item-Item Similarity Matrix ---")
offline_start_ibcf = time.time()

def pearson_similarity_item_loop(item1_id, item2_id, item_ratings, item_avg_ratings):
    ratings1 = item_ratings.get(item1_id, {})
    ratings2 = item_ratings.get(item2_id, {})
    
    common_users = []
    for user in ratings1:
        if user in ratings2 and not pd.isna(ratings1[user]) and not pd.isna(ratings2[user]):
            common_users.append(user)
            
    if not common_users:
        return 0.0 
    
    numerator = 0.0
    denominator_term1 = 0.0 
    denominator_term2 = 0.0 
    
    for user_id in common_users:
        mean_u = user_avg_ratings[user_id] 
        
        r1_minus_mean_u = ratings1[user_id] - mean_u
        r2_minus_mean_u = ratings2[user_id] - mean_u
        
        numerator += r1_minus_mean_u * r2_minus_mean_u
        
        denominator_term1 += r1_minus_mean_u ** 2
        
        denominator_term2 += r2_minus_mean_u ** 2
    
    denominator = loop_sqrt(denominator_term1) * loop_sqrt(denominator_term2)
    
    if denominator == 0:
        return 0.0
    else:
        return numerator / denominator



--- 4a. IBCF Offline Phase: Computing Item-Item Similarity Matrix ---


In [10]:
item_ids = list(item_ratings.keys())
n_items = len(item_ids)
item_similarity_matrix = defaultdict(dict)

for i in range(n_items):
    item_i = item_ids[i]
    for j in range(i, n_items):
        item_j = item_ids[j]
        
        if item_i == item_j:
            item_similarity_matrix[item_i][item_j] = 1.0
            continue
        
        loop_start = time.time()
        similarity = pearson_similarity_item_loop(item_i, item_j, item_ratings, user_avg_ratings)
        loop_end = time.time()
        
        item_similarity_matrix[item_i][item_j] = similarity
        item_similarity_matrix[item_j][item_i] = similarity
        
        if i < 3 and j < 5 and i!=j:
            print(f"Loop timing (Item {item_i} vs {item_j}): {loop_end - loop_start:.6f}s")
            
offline_end_ibcf = time.time()

ibcf_space_bytes = sys.getsizeof(item_similarity_matrix)
print(f"\nIBCF Offline Phase (Similarity Matrix) Time: {offline_end_ibcf - offline_start_ibcf:.4f} seconds")


Loop timing (Item 1 vs 2): 0.000000s
Loop timing (Item 1 vs 3): 0.000000s
Loop timing (Item 1 vs 4): 0.000000s
Loop timing (Item 1 vs 5): 0.000000s
Loop timing (Item 2 vs 3): 0.000000s
Loop timing (Item 2 vs 4): 0.000000s
Loop timing (Item 2 vs 5): 0.000000s
Loop timing (Item 3 vs 4): 0.000000s
Loop timing (Item 3 vs 5): 0.000000s

IBCF Offline Phase (Similarity Matrix) Time: 46.6286 seconds


In [11]:
print("\n--- 4b. IBCF Online Phase: Prediction Generation ---")

def predict_ibcf_loop(target_user_id, target_item_id, user_ratings, item_similarity_matrix, k=10):
    rated_items_by_user = user_ratings.get(target_user_id, {})
    
    item_similarity_scores = item_similarity_matrix.get(target_item_id, {})
    
    neighbors = []
    for item_id, rating in rated_items_by_user.items():
        if not pd.isna(rating):
            sim = item_similarity_scores.get(item_id, 0.0)
            if sim > 0: 
                neighbors.append((sim, rating))

    if not neighbors:
        return item_avg_ratings.get(target_item_id, 3.0) 

    neighbors.sort(key=lambda x: x[0], reverse=True) 
    k_neighbors = neighbors[:k]
    
    numerator = 0.0
    denominator = 0.0
    
    for sim, r_u_j in k_neighbors:
        numerator += sim * r_u_j
        
        if sim < 0:
            denominator += -sim
        else:
            denominator += sim
        
    if denominator == 0:
        return item_avg_ratings.get(target_item_id, 3.0) 
    else:
        prediction = numerator / denominator
        return max(0.5, min(5.0, prediction))



--- 4b. IBCF Online Phase: Prediction Generation ---


In [12]:
TARGET_USER_IBCF = TARGET_USER_UBCF
TARGET_ITEM_IBCF = TARGET_ITEM_UBCF

online_start_ibcf = time.time()
predicted_rating_ibcf = predict_ibcf_loop(
    TARGET_USER_IBCF, 
    TARGET_ITEM_IBCF, 
    user_ratings, 
    item_similarity_matrix, 
    k=10
)
online_end_ibcf = time.time()

movie_title_ibcf = df_movies[df_movies['movieId'] == TARGET_ITEM_IBCF]['title'].iloc[0]


In [13]:
print(f"Target User: {TARGET_USER_IBCF}")
print(f"Target Item: {TARGET_ITEM_IBCF} ({movie_title_ibcf})")
print(f"IBCF Predicted Rating: {predicted_rating_ibcf:.2f}")
print(f"IBCF Online Phase (Prediction) Time: {online_end_ibcf - online_start_ibcf:.4f} seconds")


print("\n--- 5. Final Timing and Space Measurements Summary ---")

print("User-Based Collaborative Filtering (UBCF):")
print(f"  - Offline Phase (Time): {offline_end_ubcf - offline_start_ubcf:.4f} seconds")
print(f"  - Online Phase (Time): {online_end_ubcf - online_start_ubcf:.4f} seconds")
print(f"  - Space (Memory of Matrix): {ubcf_space_bytes / 1024:.2f} KB (Estimated Container Size)")

print("\nItem-Based Collaborative Filtering (IBCF):")
print(f"  - Offline Phase (Time): {offline_end_ibcf - offline_start_ibcf:.4f} seconds")
print(f"  - Online Phase (Time): {online_end_ibcf - online_start_ibcf:.4f} seconds")
print(f"  - Space (Memory of Matrix): {ibcf_space_bytes / 1024:.2f} KB (Estimated Container Size)")

print("\nNOTE: Individual loop timings were displayed during the matrix computation.")
print("This code adheres to the 'implement from scratch' and 'use loops for everything' requirements.")

Target User: 1
Target Item: 4 (Waiting to Exhale (1995))
IBCF Predicted Rating: 3.50
IBCF Online Phase (Prediction) Time: 0.0030 seconds

--- 5. Final Timing and Space Measurements Summary ---
User-Based Collaborative Filtering (UBCF):
  - Offline Phase (Time): 1.5995 seconds
  - Online Phase (Time): 0.0010 seconds
  - Space (Memory of Matrix): 2.22 KB (Estimated Container Size)

Item-Based Collaborative Filtering (IBCF):
  - Offline Phase (Time): 46.6286 seconds
  - Online Phase (Time): 0.0030 seconds
  - Space (Memory of Matrix): 72.09 KB (Estimated Container Size)

NOTE: Individual loop timings were displayed during the matrix computation.
This code adheres to the 'implement from scratch' and 'use loops for everything' requirements.
