In [1]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict
import matplotlib.pyplot as plt
from src.utils.utils import safe_parse_feat

### Load the Data

In [2]:
# Define the path to the processed data
processed_path = "../data/processed/"

# Load the recommendations and ground truth data
content_based_recommendations = pd.read_csv(os.path.join(processed_path, "content_based_recommendations.csv"))
als_recommendations = pd.read_csv(os.path.join(processed_path, "als_recommendations.csv"))
interactions_test = pd.read_csv(os.path.join(processed_path, "interactions_test.csv"))
user_features = pd.read_csv(os.path.join(processed_path, "user_features_engineered.csv"))
video_metadata = pd.read_csv(os.path.join(processed_path, "video_metadata.csv"))

# Convert recommendations DataFrames to dictionaries
def load_recommendations(df):
    recommendations = {}
    for _, row in df.iterrows():
        user_id = row['user_id']
        recs = row.drop('user_id').dropna().astype(int).tolist()
        recommendations[user_id] = recs
    return recommendations

content_based_recommendations = load_recommendations(content_based_recommendations)
als_recommendations = load_recommendations(als_recommendations)

video_metadata["feat"] = video_metadata["feat"].apply(safe_parse_feat)
video_metadata = video_metadata[video_metadata["feat"].notnull()]

user_features["preferred_category"] = user_features["preferred_category"].apply(safe_parse_feat)
user_features = user_features[user_features["preferred_category"].notnull()]

user_features["friends_preferred_category"] = user_features["friends_preferred_category"].apply(safe_parse_feat)
user_features = user_features[user_features["friends_preferred_category"].notnull()]

### Build ground truth
(videos actually watched by each user)

In [3]:
ground_truth = defaultdict(set)
for _, row in interactions_test.iterrows():
    ground_truth[row['user_id']].add(row['video_id'])

# Create a dictionary to map video IDs to their categories
video_categories = video_metadata.set_index('video_id')['feat'].to_dict()

# Create a dictionary to map user IDs to their preferred categories and friends' preferred categories
user_preferred_category = user_features.set_index('user_id')['preferred_category'].to_dict()
user_friends_preferred_category = user_features.set_index('user_id')['friends_preferred_category'].to_dict()

### Evaluation function

In [4]:
def evaluate_recommendations(recommendations, ground_truth, video_categories, user_preferred_category, user_friends_preferred_category, k=10):
    precision_list = []
    category_precision_list = []
    hit_count = 0
    category_hit_count = 0
    user_count = 0

    for user_id, recs in recommendations.items():
        if user_id not in ground_truth:
            continue  # No test data for this user

        true_items = ground_truth[user_id]
        k_user = k
        recommended_items = recs[:k_user]
        hits = len(set(recommended_items) & true_items)

        # Get user's preferred categories and friends' preferred categories
        preferred_category = set(user_preferred_category.get(user_id, []))
        friends_preferred_category = set(user_friends_preferred_category.get(user_id, []))
        all_preferred_category = preferred_category.union(friends_preferred_category)

        # Check if recommended videos are in the user's or friends' preferred categories
        category_hits = 0
        for vid in recommended_items:
            video_category = set(video_categories.get(vid, []))
            if video_category.intersection(all_preferred_category):
                category_hits += 1

        precision = hits / k_user
        category_precision = category_hits / k_user
        hit = 1 if hits > 0 else 0
        category_hit = 1 if category_hits > 0 else 0

        precision_list.append(precision)
        category_precision_list.append(category_precision)
        hit_count += hit
        category_hit_count += category_hit
        user_count += 1

    precision_at_k = np.mean(precision_list)
    category_precision_at_k = np.mean(category_precision_list)
    hit_rate = hit_count / user_count if user_count > 0 else 0
    category_hit_rate = category_hit_count / user_count if user_count > 0 else 0

    return {
        'Precision@K': precision_at_k,
        'CatagoryPrecision@K': category_precision_at_k,
        'HitRate@K': hit_rate,
        'CategoryHitRate@K': category_hit_rate,
        'Evaluated Users': user_count
    }

### Evaluate recommendations

In [5]:
# Evaluate content-based recommendations
content_based_results = evaluate_recommendations(content_based_recommendations, ground_truth, video_categories, user_preferred_category, user_friends_preferred_category, k=30)
print('Content-Based Evaluation Results:', content_based_results)

# Evaluate ALS recommendations
als_results = evaluate_recommendations(als_recommendations, ground_truth, video_categories, user_preferred_category, user_friends_preferred_category, k=30)
print('ALS Evaluation Results:', als_results)

Content-Based Evaluation Results: {'Precision@K': np.float64(0.03939056112969156), 'CatagoryPrecision@K': np.float64(0.33333333333333326), 'HitRate@K': 0.6885451505016722, 'CategoryHitRate@K': 1.0, 'Evaluated Users': 7176}
ALS Evaluation Results: {'Precision@K': np.float64(0.02047565960609439), 'CatagoryPrecision@K': np.float64(0.33333333333333326), 'HitRate@K': 0.4488573021181717, 'CategoryHitRate@K': 1.0, 'Evaluated Users': 7176}


### Combine recommendations

In [6]:
# Combine recommendations using an ensemble method
def combine_recommendations(rec1, rec2, alpha=0.7):
    combined_recs = {}
    for user_id in rec1.keys():
        if user_id in rec2:
            # Get the scores for the recommended videos from both models
            rec1_scores = {vid: score for score, vid in enumerate(rec1[user_id])}
            rec2_scores = {vid: score for score, vid in enumerate(rec2[user_id])}

            # Combine the scores using the specified alpha
            combined_scores = {}
            all_vids = set(rec1[user_id]).union(set(rec2[user_id]))
            for vid in all_vids:
                score1 = rec1_scores.get(vid, 0)
                score2 = rec2_scores.get(vid, 0)
                combined_scores[vid] = alpha * score1 + (1 - alpha) * score2

            # Sort the videos based on the combined scores
            sorted_vids = sorted(combined_scores.keys(), key=lambda x: combined_scores[x], reverse=True)
            combined_recs[user_id] = sorted_vids
        else:
            combined_recs[user_id] = rec1[user_id]
    return combined_recs

# Combine content-based and ALS recommendations
combined_recommendations = combine_recommendations(content_based_recommendations, als_recommendations, alpha=0.7)

# Evaluate combined recommendations
combined_results = evaluate_recommendations(combined_recommendations, ground_truth, video_categories, user_preferred_category, user_friends_preferred_category, k=30)
print('Combined Evaluation Results:', combined_results)

Combined Evaluation Results: {'Precision@K': np.float64(0.05986157562244518), 'CatagoryPrecision@K': np.float64(0.6664901523597176), 'HitRate@K': 0.8078316610925307, 'CategoryHitRate@K': 1.0, 'Evaluated Users': 7176}


In [7]:
# Plot comparison of results
models = ['Content-Based', 'ALS', 'Combined']
precision_values = [content_based_results['Precision@K'], als_results['Precision@K'], combined_results['Precision@K']]
category_precision_values = [content_based_results['CategoryPrecision@K'], als_results['CategoryPrecision@K'],
    combined_results['CategoryPrecision@K']]
hit_rate_values = [content_based_results['HitRate@K'], als_results['HitRate@K'], combined_results['HitRate@K']]
category_hit_rate_values = [content_based_results['CategoryHitRate@K'], als_results['CategoryHitRate@K'], combined_results['CategoryHitRate@K']]

x = np.arange(len(models))
width = 0.2

fig1, ax1 = plt.subplots()
rects1 = ax1.bar(x - width, precision_values, width, label='Precision@K')
rects2 = ax1.bar(x + width, precision_values, width, label='CategoryPrecision@K')

ax1.set_xlabel('Models')
ax1.set_ylabel('Scores')
ax1.set_title('Comparison of Recommendation Models')
ax1.set_xticks(x)
ax1.set_xticklabels(models)
ax1.legend()

fig1.tight_layout()
plt.show()

fig2, ax2 = plt.subplots()
rects3 = ax2.bar(x, hit_rate_values, width, label='HitRate@K')
rects4 = ax2.bar(x + width, category_hit_rate_values, width, label='CategoryHitRate@K')

ax2.set_xlabel('Models')
ax2.set_ylabel('Score')
ax2.set_title('Comparison of Recommendation Models on Hit Rate')
ax2.set_xticks(x)
ax2.set_xticklabels(models)
ax2.legend()

fig2.tight_layout()
plt.show()


KeyError: 'CategoryPrecision@K'