In [28]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split

In [29]:
# Read the two input tables from the local Data/ directory.
ratings = pd.read_csv('Data/ratings.csv')
movies = pd.read_csv('Data/movies.csv')

# The timestamp column is discarded; a new frame is bound instead of mutating in place.
ratings = ratings.drop(columns='timestamp')

# Inner join of every rating with its row in the movies table, keyed on movieId.
merged_data = ratings.merge(movies, on='movieId')

# Summary of the ratings table: size, distinct users/movies, histogram of rating values.
print(f"Total ratings: {len(ratings)}")
print(f"Users: {ratings['userId'].nunique()}")
print(f"Movies: {ratings['movieId'].nunique()}")
print(f"Rating distribution:\n{ratings['rating'].value_counts().sort_index()}")

Total ratings: 100836
Users: 610
Movies: 9724
Rating distribution:
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: rating, dtype: int64


In [30]:
# Hold out 20% of the ratings for evaluation. Stratifying on userId splits each
# user's ratings in roughly the same 80/20 proportion; the seed fixes the split.
train_data, test_data = train_test_split(
    ratings,
    test_size=0.2,
    stratify=ratings['userId'],
    random_state=42,
)

print(f"Train data: {len(train_data)} ratings")
print(f"Test data: {len(test_data)} ratings")

Train data: 80668 ratings
Test data: 20168 ratings


In [31]:
# Wide user x movie table built from the training ratings only.
# Cells with no training rating are filled with 0.
user_item_matrix = (
    train_data
    .pivot_table(values='rating', index='userId', columns='movieId')
    .fillna(0)
)
print(f"User-item matrix shape: {user_item_matrix.shape}")

User-item matrix shape: (610, 8977)


In [32]:
# Cosine similarity between every pair of rows (users) of the user-item matrix.
user_similarity = cosine_similarity(user_item_matrix)

# Same values, labelled by userId on both axes so they can be looked up by user.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

def get_user_based_recommendations(user_id, k=10, n_neighbors=None):
    """
    Recommend up to ``k`` unseen movies for ``user_id`` from its most similar users.

    Each movie is scored as the similarity-weighted sum of the neighbours'
    entries in ``user_item_matrix`` divided by the sum of the neighbours'
    similarities. Movies a neighbour has not rated contribute 0 to the sum.

    Parameters
    ----------
    user_id
        Label of the target user in ``user_similarity_df`` / ``user_item_matrix``.
    k : int, default 10
        Maximum number of movie ids to return.
    n_neighbors : int, optional
        Number of most-similar users to aggregate over. Defaults to ``k``,
        which matches the behaviour of calling the function with two arguments.

    Returns
    -------
    list
        Movie ids ordered by descending score; ``[]`` when the user is unknown
        or the selected neighbours have a total similarity of 0.
    """
    if user_id not in user_similarity_df.index:
        return []

    if n_neighbors is None:
        n_neighbors = k

    # Remove the target user by label rather than by assuming it sorts first:
    # with tied similarities the user is not guaranteed to be at position 0.
    neighbours = (
        user_similarity_df[user_id]
        .drop(user_id)
        .sort_values(ascending=False)
        .head(n_neighbors)
    )

    similarity_total = neighbours.sum()
    if neighbours.empty or similarity_total == 0:
        return []

    # (movies x neighbours) . (neighbours,) -> one weighted score per movie.
    neighbour_ratings = user_item_matrix.loc[neighbours.index]
    scores = neighbour_ratings.T.dot(neighbours) / similarity_total

    # Movies with a positive training rating from this user are never recommended.
    own_ratings = user_item_matrix.loc[user_id]
    seen = own_ratings[own_ratings > 0].index

    ranked = scores.drop(seen, errors='ignore').sort_values(ascending=False)
    return ranked.head(k).index.tolist()

In [33]:
# Cosine similarity between every pair of columns (movies) of the user-item
# matrix; the transpose puts one movie per row for cosine_similarity.
item_similarity = cosine_similarity(user_item_matrix.T)

# Same values, labelled by movieId on both axes.
item_similarity_df = pd.DataFrame(
    data=item_similarity,
    index=user_item_matrix.columns,
    columns=user_item_matrix.columns,
)

def get_item_based_recommendations(user_id, k=10):
    """
    Recommend up to ``k`` unseen movies for ``user_id`` using item-item similarity.

    A candidate movie's score is the sum, over every movie the user rated in
    ``user_item_matrix``, of ``similarity(candidate, rated) * rating``, divided
    by the number of movies the user rated.

    Parameters
    ----------
    user_id
        Label of the target user in ``user_item_matrix``.
    k : int, default 10
        Maximum number of movie ids to return.

    Returns
    -------
    list
        Movie ids ordered by descending score; ``[]`` when the user is unknown
        or has no positive rating in ``user_item_matrix``.
    """
    if user_id not in user_item_matrix.index:
        return []

    user_ratings = user_item_matrix.loc[user_id]
    user_rated_items = user_ratings[user_ratings > 0]

    if len(user_rated_items) == 0:
        return []

    # Only rated movies that have a similarity column can contribute.
    known_items = user_rated_items.index.intersection(item_similarity_df.columns)

    # One aligned matrix-vector product replaces a Python loop that added a
    # full-length Series per rated movie.
    weighted_sum = item_similarity_df[known_items].dot(user_rated_items.loc[known_items])

    # Constant divisor: does not affect the ranking, kept so scores stay on
    # the same scale as before.
    scores = weighted_sum / len(user_rated_items)

    # Remove already seen items
    ranked = scores.drop(user_rated_items.index, errors='ignore').sort_values(ascending=False)
    return ranked.head(k).index.tolist()

In [34]:
# SVD-based collaborative filtering: project the user-item matrix onto 20
# components, then map the projection back to movie space as predicted scores.
svd = TruncatedSVD(n_components=20, random_state=42)
latent_matrix = svd.fit_transform(user_item_matrix)
pred_matrix = latent_matrix @ svd.components_

# Label the reconstruction with the same userId rows / movieId columns.
pred_df = pd.DataFrame(
    data=pred_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.columns,
)

def get_svd_recommendations(user_id, k=10):
    """
    Return up to ``k`` movie ids with the highest entries in the user's row of
    ``pred_df``, skipping movies the user has a positive rating for in
    ``user_item_matrix``. Unknown users get an empty list.
    """
    if user_id in pred_df.index:
        predicted_scores = pred_df.loc[user_id]

        # Movies with a positive training rating are excluded from the ranking.
        training_row = user_item_matrix.loc[user_id]
        already_rated = training_row[training_row > 0].index

        candidates = predicted_scores.drop(already_rated, errors='ignore')
        top_k = candidates.sort_values(ascending=False).head(k)
        return top_k.index.tolist()

    return []

In [35]:
def precision_at_k(user_id, recommendation_func, test_data, k=10, threshold=4.0):
    """
    Calculate precision@k for one user against held-out test ratings.

    A movie is relevant when the user's rating for it in ``test_data`` is at
    least ``threshold``. Only the first ``k`` recommended items are scored.

    Parameters
    ----------
    user_id
        Value matched against the ``userId`` column of ``test_data`` and
        passed to ``recommendation_func``.
    recommendation_func : callable
        Called as ``recommendation_func(user_id, k)``; must return an iterable
        of movie ids ordered from best to worst.
    test_data : pandas.DataFrame
        Must contain ``userId``, ``movieId`` and ``rating`` columns.
    k : int, default 10
        Cut-off; must be positive.
    threshold : float, default 4.0
        Minimum test rating for a movie to count as relevant.

    Returns
    -------
    float
        ``hits / number_of_scored_recommendations`` (the denominator is smaller
        than ``k`` when fewer than ``k`` items were recommended); ``0.0`` when
        nothing was recommended; ``np.nan`` when the user has no test ratings,
        no relevant test items, or the recommender raised.

    Raises
    ------
    ValueError
        If ``k`` is not positive.
    """
    import warnings

    if k <= 0:
        raise ValueError("k must be a positive integer")

    user_test_ratings = test_data[test_data['userId'] == user_id]
    if len(user_test_ratings) == 0:
        return np.nan

    is_relevant = user_test_ratings['rating'] >= threshold
    relevant_items = set(user_test_ratings.loc[is_relevant, 'movieId'].tolist())
    if not relevant_items:
        return np.nan

    try:
        # Truncate to k so a recommender that returns extra items cannot
        # collect hits beyond the cut-off.
        recommendations = list(recommendation_func(user_id, k))[:k]
    except Exception as exc:
        # Keep the evaluation running, but make the failure visible instead of
        # silently turning it into a skipped user.
        func_name = getattr(recommendation_func, '__name__', repr(recommendation_func))
        warnings.warn(
            f"{func_name} failed for user {user_id!r}: {exc!r}",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.nan

    if not recommendations:
        return 0.0

    hits = len(set(recommendations) & relevant_items)
    return hits / len(recommendations)

In [38]:
def evaluate_all_methods(test_data, k=10, methods=None):
    """
    Evaluate recommendation methods with mean precision@k.

    Every method is scored with ``precision_at_k`` on each user that occurs
    both in ``test_data`` and in the index of ``user_item_matrix``. Users for
    which ``precision_at_k`` returns NaN are left out of the averages.
    Progress and summary lines are printed as a side effect.

    Parameters
    ----------
    test_data : pandas.DataFrame
        Held-out ratings; must contain a ``userId`` column and whatever
        ``precision_at_k`` reads from it.
    k : int, default 10
        Cut-off forwarded to ``precision_at_k``.
    methods : dict, optional
        Mapping ``display name -> recommender`` where each recommender is
        callable as ``recommender(user_id, k)``. Defaults to the user-based,
        item-based and SVD recommenders.

    Returns
    -------
    dict
        ``{name: {'mean_precision', 'std_precision', 'num_users_evaluated'}}``
        with zeros for a method that had no user with a valid precision.
    """
    if methods is None:
        methods = {
            'User-based CF': get_user_based_recommendations,
            'Item-based CF': get_item_based_recommendations,
            'SVD CF': get_svd_recommendations
        }

    # Get users that exist in both train and test sets; sorted so the
    # evaluation order is the same on every run.
    test_users = set(test_data['userId'].unique())
    train_users = set(user_item_matrix.index)
    common_users = sorted(test_users & train_users)

    print(f"Evaluating on {len(common_users)} users who appear in both train and test sets")

    results = {}

    for method_name, method_func in methods.items():
        print(f"\nEvaluating {method_name}...")

        per_user = (
            precision_at_k(user_id, method_func, test_data, k=k)
            for user_id in common_users
        )
        # NaN marks a user that could not be scored; exclude it from the stats.
        precisions = [p for p in per_user if not np.isnan(p)]

        if not precisions:
            results[method_name] = {
                'mean_precision': 0.0,
                'std_precision': 0.0,
                'num_users_evaluated': 0
            }
            print(f"{method_name} - No valid evaluations possible")
            continue

        mean_precision = np.mean(precisions)
        std_precision = np.std(precisions)
        results[method_name] = {
            'mean_precision': mean_precision,
            'std_precision': std_precision,
            'num_users_evaluated': len(precisions)
        }
        print(f"{method_name} - Mean Precision@{k}: {mean_precision:.4f}")
        print(f"Evaluated on {len(precisions)} users")

        # Show how many users got at least one hit, and their average precision.
        non_zero_precisions = [p for p in precisions if p > 0]
        if non_zero_precisions:
            print(f"Users with non-zero precision: {len(non_zero_precisions)} ({len(non_zero_precisions)/len(precisions)*100:.1f}%)")
            print(f"Mean precision for users with hits: {np.mean(non_zero_precisions):.4f}")

    return results

In [40]:
# Score every recommender on the held-out ratings with a cut-off of 5 items per user.
results = evaluate_all_methods(test_data, k=5)

Evaluating on 610 users who appear in both train and test sets

Evaluating User-based CF...
User-based CF - Mean Precision@5: 0.1950
Evaluated on 599 users
Users with non-zero precision: 343 (57.3%)
Mean precision for users with hits: 0.3405

Evaluating Item-based CF...
Item-based CF - Mean Precision@5: 0.1863
Evaluated on 599 users
Users with non-zero precision: 331 (55.3%)
Mean precision for users with hits: 0.3372

Evaluating SVD CF...
SVD CF - Mean Precision@5: 0.2397
Evaluated on 599 users
Users with non-zero precision: 357 (59.6%)
Mean precision for users with hits: 0.4022
