In [1]:
# %pip install import_ipynb
import import_ipynb 
import cornac
from cornac.data import Dataset
import cornac.metrics as met
from cornac.eval_methods import BaseMethod
from data_loader import DataLoader # type: ignore

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
data_path = "data/"
data_loader = DataLoader(data_path)

In [3]:
train_dataset, test_dataset = data_loader.load_for_cornac(dataset_type='split')
print(train_dataset.shape)
train_dataset.head()

(513384, 4)


Unnamed: 0,ReviewId,RecipeId,AuthorId,Rating
0,826743,3745,345380,4
1,1247176,26217,406131,1
2,1250914,17123,355582,5
3,183560,123283,58104,4
4,1255493,110139,383795,5


In [4]:
from cornac.models import ItemKNN
from cornac.eval_methods import RatioSplit
import pandas as pd
import time

def train_item_knn(train_dataset, test_dataset, k_values=[5, 10, 20, 50], train_percentage=0.1, test_percentage=0.1, verbose=True, stratify_by='item'):
    """
    Train ItemKNN models for different k values and evaluate them.
    stratify_by: 'user', 'item', or None. If 'item', ensures all items are represented in train/test splits.
    Returns: (item_knn_models, item_knn_results, train_sample)
    """
    import numpy as np
    # Stratified sampling
    if stratify_by == 'user':
        train_sample = train_dataset.groupby('AuthorId', group_keys=False).apply(
            lambda x: x.sample(frac=train_percentage, random_state=42) if len(x) > 1 else x
        ).reset_index(drop=True)
        test_sample = test_dataset.groupby('AuthorId', group_keys=False).apply(
            lambda x: x.sample(frac=test_percentage, random_state=42) if len(x) > 1 else x
        ).reset_index(drop=True)
    elif stratify_by == 'item':
        train_sample = train_dataset.groupby('RecipeId', group_keys=False).apply(
            lambda x: x.sample(frac=train_percentage, random_state=42) if len(x) > 1 else x
        ).reset_index(drop=True)
        test_sample = test_dataset.groupby('RecipeId', group_keys=False).apply(
            lambda x: x.sample(frac=test_percentage, random_state=42) if len(x) > 1 else x
        ).reset_index(drop=True)
    else:
        train_sample = train_dataset.sample(frac=train_percentage, random_state=42)
        test_sample = test_dataset.sample(frac=test_percentage, random_state=42)

    if verbose:
        print(f"Train dataset size: {train_sample.shape}\nTest dataset size: {test_sample.shape}")

    from cornac.data import Dataset
    cornac_train_dataset = Dataset.from_uir(train_sample[['AuthorId', 'RecipeId', 'Rating']].values.tolist(), seed=42)

    import cornac.metrics as met
    metrics = [
        met.MSE(),
        met.RMSE(),
        met.MAE(),
        met.Precision(k=10),
        met.Recall(k=10),
        met.NDCG(k=10),
    ]
    from cornac.eval_methods import BaseMethod
    eval_method = BaseMethod.from_splits(train_sample[['AuthorId', 'RecipeId', 'Rating']].values, test_sample[['AuthorId', 'RecipeId', 'Rating']].values)

    item_knn_models = {}
    item_knn_results = []

    if verbose:
        print("\n" + "="*80)
        print("ITEM-BASED COLLABORATIVE FILTERING (ItemKNN) EVALUATION")
        print("="*80)

    for k in k_values:
        if verbose:
            print(f"\nTesting ItemKNN with k={k}...")
        item_knn = ItemKNN(k=k, similarity='cosine', verbose=verbose)
        start_time = time.time()
        item_knn.fit(cornac_train_dataset)
        total_time = time.time() - start_time
        # Evaluate metrics
        results = eval_method.evaluate(item_knn, metrics=metrics, user_based=False)
        model_result = {
            'model': f'ItemKNN(k={k})',
            'k': k,
            'results': results,
            'total_time': total_time,
            'item_knn': item_knn
        }
        item_knn_models[k] = item_knn
        item_knn_results.append(model_result)
        if verbose:
            print(f"  Total time: {total_time:.2f}s")
    return item_knn_models, item_knn_results, train_sample

# Diversified Recommendations: Maximal Marginal Relevance (MMR)

In this section, we implement the Maximal Marginal Relevance (MMR) algorithm to diversify recommendations. MMR balances the utility (relevance) of recommended items with their diversity, using a parameter $\alpha$ to control the trade-off. We compare the original recommendations from user-based and item-based collaborative filtering with the diversified recommendations and evaluate them using standard metrics (Precision@k, NDCG@k, Intra-list Diversity).


In [None]:
# Train the ItemKNN model (or load if already trained)
item_knn_models, item_knn_results, train_sample = train_item_knn(
    train_dataset, test_dataset, k_values=[10], train_percentage=0.5, test_percentage=0.1, verbose=False, stratify_by=None)
item_knn = item_knn_models[10]  # Use k=10 for demonstration



In [6]:
# Now we try to diversify the recommendations using MMR
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def mmr_diversify(utility_scores, item_ids, similarity_matrix, k=10, alpha=0.5, item_id_to_index=None):
    selected = [item_ids[0]]
    candidates = set(item_ids[1:])
    while len(selected) < k and candidates:
        mmr_scores = []
        for i in candidates:
            utility = utility_scores[i]
            i_idx = item_id_to_index[i]
            diversity = min(1 - similarity_matrix[i_idx, item_id_to_index[j]] for j in selected)
            mmr_score = (1 - alpha) * utility + alpha * diversity
            mmr_scores.append((i, mmr_score))
        next_item = max(mmr_scores, key=lambda x: x[1])[0]
        selected.append(next_item)
        candidates.remove(next_item)
    return selected

# Pick a user ID that is in the model and can be scored
for candidate_user in item_knn.user_ids:
    try:
        _ = item_knn.score(candidate_user, item_knn.item_ids[0])
        user_id = candidate_user
        break
    except Exception:
        continue
else:
    raise RuntimeError("No valid user found for scoring.")

# Get all items known to the model that can be scored for this user
utility_scores = {}
for item in item_knn.item_ids:
    try:
        utility_scores[item] = item_knn.score(user_id, item)
    except Exception:
        continue

# Recompute the similarity matrix using the train_sample
item_user_matrix = train_sample.pivot_table(index='RecipeId', columns='AuthorId', values='Rating', fill_value=0)
similarity_matrix = cosine_similarity(item_user_matrix)
item_id_to_index = {item_id: idx for idx, item_id in enumerate(item_user_matrix.index)}

# Get items not yet rated by the user
rated_items = set(train_sample[train_sample['AuthorId'] == user_id]['RecipeId'])
candidate_items = [item for item in item_user_matrix.index if item not in rated_items and item in utility_scores]

# Sort candidate items by utility
item_ids = sorted(candidate_items, key=lambda x: utility_scores[x], reverse=True)

# Perform MMR diversification
k = 10  # Number of items to recommend
alpha = 0.5  # Trade-off between utility and diversity
diversified_recommendations = mmr_diversify(utility_scores, item_ids, similarity_matrix, k=k, alpha=alpha, item_id_to_index=item_id_to_index)
# Print the diversified recommendations
print("Diversified Recommendations (MMR):")
for item in diversified_recommendations:
    print(f"Item ID: {item}, Utility Score: {utility_scores[item]:.4f}")

Diversified Recommendations (MMR):
Item ID: 3400, Utility Score: 5.0000
Item ID: 7682, Utility Score: 5.0000
Item ID: 8494, Utility Score: 5.0000
Item ID: 8603, Utility Score: 5.0000
Item ID: 12656, Utility Score: 5.0000
Item ID: 17232, Utility Score: 5.0000
Item ID: 18481, Utility Score: 5.0000
Item ID: 26121, Utility Score: 5.0000
Item ID: 33242, Utility Score: 5.0000
Item ID: 2670, Utility Score: 5.0000


In [7]:
# compare diversity reco with the original recommendations
original_recommendations = item_knn.recommend(user_id, k=k)
print("\nOriginal Recommendations:")
for item in original_recommendations:
    try:
        score = item_knn.score(user_id, item)
        print(f"Item ID: {item}, Utility Score: {score:.4f}")
    except Exception:
        # Skip items that cannot be scored
        print(f"Item ID: {item}, Utility Score: No score available for this item")
        continue


Original Recommendations:
Item ID: 78284, Utility Score: No score available for this item
Item ID: 244058, Utility Score: No score available for this item
Item ID: 75453, Utility Score: No score available for this item
Item ID: 373159, Utility Score: No score available for this item
Item ID: 56471, Utility Score: No score available for this item
Item ID: 13244, Utility Score: 4.5000
Item ID: 378571, Utility Score: No score available for this item
Item ID: 252536, Utility Score: No score available for this item
Item ID: 72614, Utility Score: No score available for this item
Item ID: 64599, Utility Score: No score available for this item
