In [1]:
!pip install recommenders

Collecting recommenders
  Downloading recommenders-1.2.1-py3-none-any.whl.metadata (13 kB)
Collecting category-encoders<3,>=2.6.0 (from recommenders)
  Downloading category_encoders-2.8.1-py3-none-any.whl.metadata (7.9 kB)
Collecting cornac<3,>=1.15.2 (from recommenders)
  Downloading cornac-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (37 kB)
Collecting locust<3,>=2.12.2 (from recommenders)
  Downloading locust-2.33.2-py3-none-any.whl.metadata (9.6 kB)
Collecting memory-profiler<1,>=0.61.0 (from recommenders)
  Downloading memory_profiler-0.61.0-py3-none-any.whl.metadata (20 kB)
Collecting retrying<2,>=1.3.4 (from recommenders)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Collecting scikit-surprise>=1.1.3 (from recommenders)
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Gettin

In [16]:
import numpy as np
import pandas as pd
from collections import Counter
import os
import pickle
from recommenders.models.sar import SAR
from recommenders.datasets import movielens
from recommenders.datasets.python_splitters import python_stratified_split
from recommenders.evaluation.python_evaluation import map_at_k, ndcg_at_k, precision_at_k, recall_at_k, rmse, mae


In [3]:
def load_data(size="100k"):
    """Load the MovieLens data and split it into train and test sets."""
    # Load the MovieLens data
    data = movielens.load_pandas_df(
        size=size,
        header=["UserId", "MovieId", "Rating", "Timestamp"],
        title_col="Title",
    )

    # Convert the float precision to 32-bit
    data["Rating"] = data["Rating"].astype(np.float32)

    # Split the data
    header = {
        "col_user": "UserId",
        "col_item": "MovieId",
        "col_rating": "Rating",
        "col_timestamp": "Timestamp",
        "col_prediction": "Prediction",
    }

    train, test = python_stratified_split(
        data, ratio=0.80, col_user=header["col_user"], col_item=header["col_item"], seed=42
    )

    return data, train, test, header

In [4]:
def train_sar_model(similarity_type="jaccard", time_decay_coefficient=30, timedecay_formula=True):
    """Train and save a SAR model."""
    # Load data
    data, train, test, header = load_data()

    # Create the model
    model = SAR(
        similarity_type=similarity_type,
        time_decay_coefficient=time_decay_coefficient,
        time_now=None,
        timedecay_formula=timedecay_formula,
        **header
    )

    # Train the model
    model.fit(train)

    # Save the model
    model_dir = "models"
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    model_path = os.path.join(model_dir, f"sar_{similarity_type}_model.pkl")
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)

    return model, data, train, test

In [5]:
def load_sar_model(similarity_type="jaccard"):
    """Load a trained SAR model or train one if it doesn't exist."""
    model_dir = "models"
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)

    model_path = os.path.join(model_dir, f"sar_{similarity_type}_model.pkl")
    if not os.path.exists(model_path):
        print(f"No trained model found for SAR with {similarity_type} similarity. Training now...")
        model, data, train, test = train_sar_model(similarity_type)
    else:
        # Load the model
        with open(model_path, 'rb') as f:
            model = pickle.load(f)
        data, train, test, _ = load_data()
        print(f"Loaded SAR model with {similarity_type} similarity")

    return model, data, train, test

In [6]:
def get_top_n_recommendations_sar(model, user_id, n=10, exclude_rated=True):
    """Generate top-N recommendations for a specific user using SAR."""
    # Load the data
    data, train, _, _ = load_data()

    # Create a test set for the specific user
    user_data = data[data["UserId"] == user_id]

    if exclude_rated:
        # Get movies that the user has already rated
        rated_movies = set(user_data["MovieId"].tolist())

        # Create a new dataframe with just the user and movies they haven't rated
        all_movies = set(data["MovieId"].unique())
        unrated_movies = all_movies - rated_movies

        # Create a test dataframe with the user and unrated movies
        test_user = pd.DataFrame({
            "UserId": [user_id] * len(unrated_movies),
            "MovieId": list(unrated_movies),
            "Rating": [0] * len(unrated_movies)  # Placeholder, not used for recommendations
        })
    else:
        # Return recommendations for all movies (including rated ones)
        test_user = pd.DataFrame({
            "UserId": [user_id] * len(data["MovieId"].unique()),
            "MovieId": data["MovieId"].unique(),
            "Rating": [0] * len(data["MovieId"].unique())
        })

    # Get top-k recommendations
    top_k = model.recommend_k_items(test_user, top_k=n, remove_seen=exclude_rated)

    # Join with movie titles
    top_k_with_titles = top_k.join(
        data[["MovieId", "Title"]].drop_duplicates().set_index("MovieId"),
        on="MovieId",
        how="inner",
    ).sort_values(by=["UserId", "Prediction"], ascending=False)

    # Format the results
    results = []
    for i, (_, row) in enumerate(top_k_with_titles.iterrows(), 1):
        results.append((i, row["MovieId"], row["Title"], row["Prediction"]))

    return results


In [7]:
def explain_recommendation_sar(model, user_id, movie_id, data=None, similarity_type="jaccard"):
    """Generate explanation for why a movie was recommended by SAR algorithm."""
    if data is None:
        data, _, _, _ = load_data()

    # Get the movie title
    movie_title = data[data["MovieId"] == movie_id]["Title"].values[0]

    # Start building explanation
    explanation = f"Why '{movie_title}' was recommended\n\n"
    reasons = []

    try:
        # 1. Access internal model components
        # Get the affinity matrix (user-item interactions)
        user_affinity = model.user_affinity
        # Get the similarity matrix (item-item similarity)
        item_similarity = model.item_similarity

        # 2. Find similar items that the user has interacted with
        # Get the index for the specified user and movie
        user_idx = model.user2index[user_id]
        item_idx = model.item2index[movie_id]

        # Find items the user has interacted with
        user_items = []
        # Check if it's a sparse matrix or dense ndarray
        if hasattr(user_affinity, 'toarray'):
            user_affinity_row = user_affinity[user_idx].toarray()[0]
        else:
            user_affinity_row = user_affinity[user_idx]

        for i, val in enumerate(user_affinity_row):
            if val > 0:
                user_items.append((i, val))

        # Sort by affinity score (highest first)
        user_items.sort(key=lambda x: x[1], reverse=True)

        # 3. Find similar items to the recommended movie
        similar_items = []
        # Check if it's a sparse matrix or dense ndarray
        if hasattr(item_similarity, 'toarray'):
            item_similarity_row = item_similarity[item_idx].toarray()[0]
        else:
            item_similarity_row = item_similarity[item_idx]

        for i, val in enumerate(item_similarity_row):
            if i != item_idx and val > 0:
                similar_items.append((i, val))

        # Sort by similarity score (highest first)
        similar_items.sort(key=lambda x: x[1], reverse=True)

        # 4. Find common items that the user liked and are similar to the recommended item
        common_items = []
        user_item_indices = [idx for idx, _ in user_items]

        for idx, sim_val in similar_items:
            if idx in user_item_indices:
                # Get the actual item/movie ID
                item_id = model.index2item[idx]
                # Get the movie title
                title = data[data["MovieId"] == item_id]["Title"].values[0]
                # Get the user's affinity score for this item
                affinity_val = user_affinity[user_idx, idx] if hasattr(user_affinity, 'toarray') else user_affinity[user_idx][idx]
                common_items.append((title, sim_val, affinity_val))

        # Only keep the top items
        common_items = common_items[:3]

        # 5. Calculate overall item popularity
        item_counts = np.bincount(data["MovieId"].astype(int))
        movie_count = item_counts[movie_id] if movie_id < len(item_counts) else 0
        total_users = data["UserId"].nunique()
        popularity_percentage = (movie_count / total_users) * 100

        # Add reasons based on our analysis

        # Movie popularity
        if popularity_percentage > 50:
            reasons.append(f"This movie is very popular, rated by {movie_count} users ({popularity_percentage:.1f}% of all users).")
        elif popularity_percentage > 30:
            reasons.append(f"This movie is quite popular, rated by {movie_count} users ({popularity_percentage:.1f}% of all users).")
        elif popularity_percentage > 10:
            reasons.append(f"This movie has been rated by {movie_count} users ({popularity_percentage:.1f}% of all users).")

        # Similarity to movies the user has rated
        if common_items:
            if len(common_items) == 1:
                reasons.append(f"This movie is similar to '{common_items[0][0]}', which you've rated before.")
            elif len(common_items) == 2:
                reasons.append(f"This movie is similar to '{common_items[0][0]}' and '{common_items[1][0]}', which you've rated before.")
            else:
                reasons.append(f"This movie is similar to '{common_items[0][0]}', '{common_items[1][0]}', and '{common_items[2][0]}', which you've rated before.")

        # Explain the similarity metric used
        if similarity_type == "jaccard":
            reasons.append("The recommendation is based on Jaccard similarity, which measures the proportion of users who rated both movies among all users who rated either movie.")
        elif similarity_type == "lift":
            reasons.append("The recommendation is based on Lift similarity, which gives higher weight to rare co-occurrences, favoring more unique or serendipitous recommendations.")
        elif similarity_type == "cooccurrence":
            reasons.append("The recommendation is based on raw co-occurrence counts, which tends to favor popular items that many users have rated.")
        elif similarity_type == "cosine":
            reasons.append("The recommendation is based on Cosine similarity, which measures the cosine of the angle between the item vectors.")
        else:
            reasons.append(f"The recommendation is based on {similarity_type} similarity.")

        # Find users with similar tastes who liked this movie
        # Get users who rated this movie highly
        high_ratings = data[(data["MovieId"] == movie_id) & (data["Rating"] >= 4)]
        if len(high_ratings) > 0:
            high_rating_users = high_ratings["UserId"].tolist()

            # Find overlapping movies between recommended user and users who liked this movie
            overlap_counts = []
            for high_rating_user in high_rating_users[:20]:  # Limit to 20 users for efficiency
                user_rated_movies = set(data[data["UserId"] == user_id]["MovieId"].tolist())
                other_user_rated_movies = set(data[data["UserId"] == high_rating_user]["MovieId"].tolist())
                overlap = len(user_rated_movies & other_user_rated_movies)
                if overlap > 5:  # Only consider users with significant overlap
                    overlap_counts.append((high_rating_user, overlap))

            # Sort by overlap
            overlap_counts.sort(key=lambda x: x[1], reverse=True)

            if overlap_counts:
                similar_users_count = len(overlap_counts)
                if similar_users_count == 1:
                    reasons.append(f"One user with similar movie tastes to yours rated this movie highly.")
                else:
                    reasons.append(f"{similar_users_count} users with similar movie tastes to yours rated this movie highly.")

        # Add reasons to explanation
        explanation += f"Based on the SAR algorithm with {similarity_type} similarity:\n\n"
        for i, reason in enumerate(reasons, 1):
            explanation += f"{i}. {reason}\n"

        if not reasons:
            explanation += "This movie matches patterns in your rating history based on the SAR algorithm, which finds items similar to ones you've rated highly."

        # Add technical details about the prediction score
        prediction_score = 0
        if hasattr(model, 'recommend_k_items'):
            # Create a small test dataframe for just this movie and user
            test_df = pd.DataFrame({
                "UserId": [user_id],
                "MovieId": [movie_id],
                "Rating": [0]  # Placeholder
            })
            recommendations = model.recommend_k_items(test_df)
            if not recommendations.empty:
                prediction_score = recommendations["Prediction"].values[0]

        explanation += f"\nTechnical details:\n"
        explanation += f"- Recommendation score: {prediction_score:.2f}\n"
        explanation += f"- Similarity metric: {similarity_type}\n"

        return explanation

    except Exception as e:
        import traceback
        traceback.print_exc()
        return f"Error creating explanation: {str(e)}"

In [18]:
def evaluate_sar_model(similarity_type="jaccard", time_decay_coefficient=30, top_k=10):
    """Evaluate a SAR model using standard ranking metrics including RMSE."""
    # Load data
    data, train, test, header = load_data()

    # Create and train the model
    model = SAR(
        similarity_type=similarity_type,
        time_decay_coefficient=time_decay_coefficient,
        time_now=None,
        timedecay_formula=True,
        **header
    )

    # Train the model
    model.fit(train)

    # Generate recommendations for all users in the test set
    top_k_recommendations = model.recommend_k_items(test, top_k=top_k, remove_seen=True)

    # Calculate evaluation metrics
    args = [test, top_k_recommendations]
    kwargs = dict(
        col_user="UserId",
        col_item="MovieId",
        col_rating="Rating",
        col_prediction="Prediction",
        relevancy_method="top_k",
        k=top_k,
    )

    eval_map = map_at_k(*args, **kwargs)
    eval_ndcg = ndcg_at_k(*args, **kwargs)
    eval_precision = precision_at_k(*args, **kwargs)
    eval_recall = recall_at_k(*args, **kwargs)

    # Calculate RMSE manually
    # First, prepare dataframes for RMSE calculation by merging test data with predictions
    test_df = test.copy()
    pred_df = top_k_recommendations.copy()

    # Merge test and prediction dataframes on UserId and MovieId
    merged_df = test_df.merge(
        pred_df,
        on=["UserId", "MovieId"],
        how="inner"
    )

    # Calculate RMSE manually using numpy
    true_ratings = merged_df["Rating"].values
    pred_ratings = merged_df["Prediction"].values
    eval_rmse = np.sqrt(np.mean((true_ratings - pred_ratings) ** 2))

    # Calculate MAE manually
    eval_mae = np.mean(np.abs(true_ratings - pred_ratings))

    # Print the results
    print(f"Model: SAR with {similarity_type} similarity")
    print(f"Top K:\t\t {top_k}")
    print(f"MAP:\t\t {eval_map:f}")
    print(f"NDCG:\t\t {eval_ndcg:f}")
    print(f"Precision@K:\t {eval_precision:f}")
    print(f"Recall@K:\t {eval_recall:f}")
    print(f"RMSE:\t\t {eval_rmse:f}")
    print(f"MAE:\t\t {eval_mae:f}")

    return eval_map, eval_ndcg, eval_precision, eval_recall, eval_rmse, eval_mae

In [9]:
evaluate_sar_model(similarity_type="jaccard", time_decay_coefficient=30, top_k=10)

100%|██████████| 4.81k/4.81k [00:01<00:00, 3.20kKB/s]


Model: SAR with jaccard similarity
Top K:		 10
MAP:		 0.211907
NDCG:		 0.340329
Precision@K:	 0.283033
Recall@K:	 0.184676


(0.21190701157726358,
 0.34032940522506155,
 0.28303287380699893,
 0.18467627295007502)

In [10]:
def recommend_movies_sar(similarity_type="jaccard", user_id=1, n=10, explain=False, time_decay=30):
    """Load a trained SAR model and generate recommendations for a user with explanations."""
    # Load or train model
    model, data, _, _ = load_sar_model(similarity_type)

    # Get recommendations
    recommendations = get_top_n_recommendations_sar(model, user_id, n)

    print(f"\nTop {n} movie recommendations for User {user_id} using SAR ({similarity_type}):")
    print("-" * 80)
    print(f"{'Rank':<5} {'Movie ID':<10} {'Title':<40} {'Score':<10}")
    print("-" * 80)

    for rank, movie_id, title, score in recommendations:
        print(f"{rank:<5} {movie_id:<10} {title:<40} {score:<10.2f}")

    # If explanation is requested, automatically explain top recommendations
    if explain and recommendations:
        for i in range(min(3, len(recommendations))):
            rank, movie_id, title, _ = recommendations[i]
            print("\n" + "=" * 80)
            print(f"Recommendation #{rank}: {title}")
            explanation = explain_recommendation_sar(model, user_id, movie_id, data, similarity_type)
            print(explanation)
            print("=" * 80)

    return recommendations

In [20]:
# Compare different similarity types
print("\n===== Comparing Different Similarity Types =====")
evaluate_sar_model(similarity_type="jaccard", top_k=10)
evaluate_sar_model(similarity_type="lift", top_k=10)
evaluate_sar_model(similarity_type="cooccurrence", top_k=10)


===== Comparing Different Similarity Types =====


100%|██████████| 4.81k/4.81k [00:01<00:00, 2.87kKB/s]


Model: SAR with jaccard similarity
Top K:		 10
MAP:		 0.211907
NDCG:		 0.340329
Precision@K:	 0.283033
Recall@K:	 0.184676
RMSE:		 29.379553
MAE:		 13.645811


100%|██████████| 4.81k/4.81k [00:01<00:00, 3.18kKB/s]


Model: SAR with lift similarity
Top K:		 10
MAP:		 0.000232
NDCG:		 0.000716
Precision@K:	 0.000636
Recall@K:	 0.000146
RMSE:		 2.755828
MAE:		 2.546989


100%|██████████| 4.81k/4.81k [00:01<00:00, 3.22kKB/s]


Model: SAR with cooccurrence similarity
Top K:		 10
MAP:		 0.185627
NDCG:		 0.307772
Precision@K:	 0.249947
Recall@K:	 0.152381
RMSE:		 10767.073747
MAE:		 5623.734445


(0.1856267605624395,
 0.3077720056272961,
 0.24994697773064686,
 0.15238144595186617,
 10767.073746594302,
 5623.734445384878)

In [19]:
recommend_movies_sar(similarity_type="jaccard", user_id=1, n=10, explain=True)
print("-" * 80)
recommend_movies_sar(similarity_type="lift", user_id=1, n=10, explain=True)
print("-" * 80)
recommend_movies_sar(similarity_type="cooccurrence", user_id=1, n=10, explain=True)

100%|██████████| 4.81k/4.81k [00:01<00:00, 2.86kKB/s]


Loaded SAR model with jaccard similarity


100%|██████████| 4.81k/4.81k [00:01<00:00, 2.91kKB/s]



Top 10 movie recommendations for User 1 using SAR (jaccard):
--------------------------------------------------------------------------------
Rank  Movie ID   Title                                    Score     
--------------------------------------------------------------------------------
1     204        Back to the Future (1985)                3.26      
2     403        Batman (1989)                            3.26      
3     433        Heathers (1989)                          3.21      
4     174        Raiders of the Lost Ark (1981)           3.21      
5     4          Get Shorty (1995)                        3.17      
6     98         Silence of the Lambs, The (1991)         3.15      
7     70         Four Weddings and a Funeral (1994)       3.15      
8     228        Star Trek: The Wrath of Khan (1982)      3.14      
9     367        Clueless (1995)                          3.12      
10    423        E.T. the Extra-Terrestrial (1982)        3.11      

Recommendation #

100%|██████████| 4.81k/4.81k [00:01<00:00, 2.93kKB/s]


Loaded SAR model with lift similarity


100%|██████████| 4.81k/4.81k [00:01<00:00, 2.92kKB/s]



Top 10 movie recommendations for User 1 using SAR (lift):
--------------------------------------------------------------------------------
Rank  Movie ID   Title                                    Score     
--------------------------------------------------------------------------------
1     1650       Butcher Boy, The (1998)                  0.31      
2     1634       Etz Hadomim Tafus (Under the Domin Tree) (1994) 0.31      
3     1532       Foreign Student (1994)                   0.31      
4     1635       Two Friends (1986)                       0.31      
5     1638       Normal Life (1996)                       0.31      
6     1106       Newton Boys, The (1998)                  0.31      
7     1639       Bitter Sugar (Azucar Amargo) (1996)      0.31      
8     1648       Niagara, Niagara (1997)                  0.31      
9     1651       Spanish Prisoner, The (1997)             0.31      
10    1637       Girls Town (1996)                        0.31      

Recommendati

100%|██████████| 4.81k/4.81k [00:01<00:00, 3.15kKB/s]


Loaded SAR model with cooccurrence similarity


100%|██████████| 4.81k/4.81k [00:01<00:00, 2.84kKB/s]



Top 10 movie recommendations for User 1 using SAR (cooccurrence):
--------------------------------------------------------------------------------
Rank  Movie ID   Title                                    Score     
--------------------------------------------------------------------------------
1     174        Raiders of the Lost Ark (1981)           1269.62   
2     181        Return of the Jedi (1983)                1238.93   
3     98         Silence of the Lambs, The (1991)         1170.24   
4     258        Contact (1997)                           1164.14   
5     204        Back to the Future (1985)                1118.60   
6     121        Independence Day (ID4) (1996)            1107.28   
7     288        Scream (1996)                            1066.90   
8     294        Liar Liar (1997)                         1000.99   
9     286        English Patient, The (1996)              994.41    
10    405        Mission: Impossible (1996)               993.19    

Recommendat

[(1, 174, 'Raiders of the Lost Ark (1981)', 1269.6239935452586),
 (2, 181, 'Return of the Jedi (1983)', 1238.9257878659648),
 (3, 98, 'Silence of the Lambs, The (1991)', 1170.2432807647017),
 (4, 258, 'Contact (1997)', 1164.1424516626464),
 (5, 204, 'Back to the Future (1985)', 1118.5957891404662),
 (6, 121, 'Independence Day (ID4) (1996)', 1107.2804050934321),
 (7, 288, 'Scream (1996)', 1066.9000458384182),
 (8, 294, 'Liar Liar (1997)', 1000.9928664157234),
 (9, 286, 'English Patient, The (1996)', 994.4112194684725),
 (10, 405, 'Mission: Impossible (1996)', 993.1863552621413)]