# Task
Build a movie recommendation system using the MovieLens 100K dataset ("https://www.kaggle.com/datasets/prajitdatta/movielens-100k-dataset"). The system should recommend movies based on user similarity computed from a user-item matrix. Evaluate the system using precision at K. Optionally, implement item-based collaborative filtering and matrix factorization (SVD).

## Load the data

### Subtask:
Load the movie ratings data and the movie information data.


**Reasoning**:
Load the movie ratings data and the movie information data into pandas DataFrames as instructed.



In [5]:
%pip install mlcroissant

Collecting mlcroissant
  Downloading mlcroissant-1.0.21-py2.py3-none-any.whl.metadata (10 kB)
Collecting jsonpath-rw (from mlcroissant)
  Downloading jsonpath-rw-1.4.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting rdflib (from mlcroissant)
  Downloading rdflib-7.1.4-py3-none-any.whl.metadata (11 kB)
Downloading mlcroissant-1.0.21-py2.py3-none-any.whl (144 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m144.4/144.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rdflib-7.1.4-py3-none-any.whl (565 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.1/565.1 kB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: jsonpath-rw
  Building wheel for jsonpath-rw (setup.py) ... [?25l[?25hdone
  Created wheel for jsonpath-rw: filename=jsonpath_rw-1.4.0-py3-none-any.whl size=15127 sha256=4811f1b912ca700706a73a995588e21db4cab564c14e2d8e05e358dcf8cce9e6
  Stored in directory: 

In [9]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/prajitdatta/movielens-100k-dataset?dataset_version_number=1...


100%|██████████| 4.77M/4.77M [00:00<00:00, 125MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/prajitdatta/movielens-100k-dataset/versions/1


In [11]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("prajitdatta/movielens-100k-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/movielens-100k-dataset


In [12]:
import pandas as pd
import os

# Construct the full path to the u.data file
ratings_file_path = os.path.join(path, 'ml-100k/u.data')

# Load the ratings data
# The u.data file has no header and the columns are user id, item id, rating, timestamp
ratings_df = pd.read_csv(ratings_file_path, sep='\t', header=None, names=['user_id', 'movie_id', 'rating', 'timestamp'])

display(ratings_df.head())

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [13]:
# Construct the full path to the u.item file
movies_file_path = os.path.join(path, 'ml-100k/u.item')

# Load the movies data
# The u.item file has no header and the columns are movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western |
# The genres are binary (0 or 1)
movies_df = pd.read_csv(movies_file_path, sep='|', header=None, encoding='latin-1', names=['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'])

display(movies_df.head())

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [15]:
# Create the user-item matrix using pivot_table
user_item_matrix = ratings_df.pivot_table(index='user_id', columns='movie_id', values='rating')

display(user_item_matrix.head())

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [16]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Fill NaN values with 0 for similarity calculation (or use a different strategy)
user_item_matrix_filled = user_item_matrix.fillna(0)

# Calculate cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix_filled)

# Convert the similarity matrix to a DataFrame for easier handling
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

display(user_similarity_df.head())

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.166931,0.04746,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.0,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.22679,0.161485,0.172268,0.105798
3,0.04746,0.110591,1.0,0.344151,0.021245,0.072415,0.066137,0.08306,0.06104,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.16189,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.0,0.031804,0.068044,0.09123,0.18806,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.0,0.237286,0.3736,0.24893,0.056847,0.201427,...,0.338794,0.08058,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941


In [32]:
def get_user_recommendations(user_id, n_recommendations=10):
    """
    Generates movie recommendations for a given user based on user similarity,
    considering movies not rated in the training set.

    Args:
        user_id (int): The ID of the user for whom to generate recommendations.
        n_recommendations (int): The number of recommendations to generate.

    Returns:
        list: A list of recommended movie titles.
    """
    if user_id not in user_similarity_df.index:
        return "User ID not found in the dataset."

    # Get the similarity scores for the target user
    user_similarities = user_similarity_df[user_id].sort_values(ascending=False)

    # Exclude the user themselves
    user_similarities = user_similarities.drop(user_id)

    # Get the movies rated by the target user in the TRAINING set
    rated_movies_train = train_user_item_matrix.loc[user_id].dropna().index

    # Initialize a list to store recommended movie IDs and their weighted ratings
    recommended_movies = {}

    # Iterate through similar users
    for similar_user_id, similarity_score in user_similarities.items():
        if similarity_score > 0: # Consider users with positive similarity
            # Get movies rated by the similar user
            similar_user_rated_movies = user_item_matrix.loc[similar_user_id].dropna()

            # Filter out movies already rated by the target user in the TRAINING set
            unseen_movies = similar_user_rated_movies.drop(rated_movies_train, errors='ignore')

            # Calculate weighted rating for unseen movies and add to recommended_movies
            for movie_id, rating in unseen_movies.items():
                if movie_id not in recommended_movies:
                    recommended_movies[movie_id] = 0
                recommended_movies[movie_id] += similarity_score * rating

    # Sort recommended movies by weighted rating
    recommended_movies = sorted(recommended_movies.items(), key=lambda item: item[1], reverse=True)

    # Get the top N recommended movie IDs
    top_n_movie_ids = [movie_id for movie_id, weighted_rating in recommended_movies[:n_recommendations]]

    # Get the movie titles for the recommended movie IDs
    recommended_movie_titles = movies_df[movies_df['movie_id'].isin(top_n_movie_ids)]['movie_title'].tolist()

    return recommended_movie_titles

In [18]:
from sklearn.model_selection import train_test_split

# Split the ratings data into training and testing sets
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

# Create user-item matrices for training and testing
train_user_item_matrix = train_df.pivot_table(index='user_id', columns='movie_id', values='rating')
test_user_item_matrix = test_df.pivot_table(index='user_id', columns='movie_id', values='rating')

display(train_user_item_matrix.head())
display(test_user_item_matrix.head())

movie_id,1,2,3,4,5,6,7,8,9,10,...,1668,1670,1671,1672,1673,1676,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,4.0,,3.0,,4.0,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


movie_id,1,2,3,4,5,6,7,8,9,10,...,1648,1649,1655,1656,1658,1669,1674,1675,1677,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,3.0,,5.0,,1.0,,,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,


In [31]:
def precision_at_k(recommendations, test_user_item_matrix, k=10):
    """
    Calculates Precision at K for a list of recommendations for multiple users.

    Args:
        recommendations (dict): A dictionary where keys are user_ids and values are lists of recommended movie titles.
        test_user_item_matrix (pd.DataFrame): The user-item matrix for the test set.
        k (int): The number of recommendations to consider.

    Returns:
        float: The average Precision at K across all users in the recommendations.
    """
    total_precision = 0
    num_users = 0

    for user_id, recommended_movie_titles in recommendations.items():
        # Check if the user exists in the test set and has actual ratings
        if user_id in test_user_item_matrix.index and test_user_item_matrix.loc[user_id].dropna().shape[0] > 0:
            num_users += 1
            # Get the movies the user actually rated in the test set
            actual_rated_movies = test_user_item_matrix.loc[user_id].dropna().index.tolist()

            # Get the movie IDs for the recommended movie titles
            recommended_movie_ids = movies_df[movies_df['movie_title'].isin(recommended_movie_titles)]['movie_id'].tolist()

            # Calculate the number of relevant recommendations in the top K
            # A recommendation is relevant if the user rated the movie in the test set
            relevant_recommendations = len(set(recommended_movie_ids[:k]) & set(actual_rated_movies))

            # Calculate precision for the user
            precision = relevant_recommendations / min(k, len(recommended_movie_ids)) if len(recommended_movie_ids) > 0 else 0
            total_precision += precision

    return total_precision / num_users if num_users > 0 else 0

In [20]:
# Generate recommendations for users in the test set
# For simplicity, we will generate recommendations for a subset of users in the test set
# You can adjust the number of users as needed
test_users = test_user_item_matrix.index.tolist()[:50] # Evaluate for the first 50 users in the test set

test_recommendations = {}
for user_id in test_users:
    test_recommendations[user_id] = get_user_recommendations(user_id, n_recommendations=10)

# Calculate Precision at K
precision_k = precision_at_k(test_recommendations, test_user_item_matrix, k=10)

print(f"Precision at K (K=10): {precision_k}")

Precision at K (K=10): 0.0


In [21]:
# Create the item-user matrix (transpose of the user-item matrix)
item_user_matrix = user_item_matrix.T

# Fill NaN values with 0 for similarity calculation (or use a different strategy)
item_user_matrix_filled = item_user_matrix.fillna(0)

# Calculate cosine similarity between items
item_similarity = cosine_similarity(item_user_matrix_filled)

# Convert the similarity matrix to a DataFrame for easier handling
item_similarity_df = pd.DataFrame(item_similarity, index=item_user_matrix.index, columns=item_user_matrix.index)

display(item_similarity_df.head())

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.0,0.0,0.035387,0.0,0.0,0.0,0.047183,0.047183
2,0.402382,1.0,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078299,0.078299
3,0.330245,0.273069,1.0,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.0,0.0,0.0,0.0,0.032292,0.0,0.0,0.0,0.0,0.096875
4,0.454938,0.502571,0.324866,1.0,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.0,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
5,0.286714,0.318836,0.212957,0.334239,1.0,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094211


In [22]:
def get_item_recommendations(user_id, n_recommendations=10):
    """
    Generates movie recommendations for a given user based on item similarity.

    Args:
        user_id (int): The ID of the user for whom to generate recommendations.
        n_recommendations (int): The number of recommendations to generate.

    Returns:
        list: A list of recommended movie titles.
    """
    if user_id not in user_item_matrix.index:
        return "User ID not found in the dataset."

    # Get the movies rated by the target user and their ratings
    user_ratings = user_item_matrix.loc[user_id].dropna()

    # Initialize a dictionary to store potential recommendations and their scores
    item_scores = {}

    # Iterate through the movies rated by the user
    for movie_id, rating in user_ratings.items():
        # Get the similarity scores for this movie with all other movies
        item_similarities = item_similarity_df[movie_id].sort_values(ascending=False)

        # Iterate through similar items
        for similar_movie_id, similarity_score in item_similarities.items():
            if similar_movie_id not in user_ratings.index: # Exclude movies already rated by the user
                if similar_movie_id not in item_scores:
                    item_scores[similar_movie_id] = 0
                # Weight the similarity score by the user's rating of the original item
                item_scores[similar_movie_id] += similarity_score * rating

    # Sort the potential recommendations by their scores
    recommended_movies = sorted(item_scores.items(), key=lambda item: item[1], reverse=True)

    # Get the top N recommended movie IDs
    top_n_movie_ids = [movie_id for movie_id, score in recommended_movies[:n_recommendations]]

    # Get the movie titles for the recommended movie IDs
    recommended_movie_titles = movies_df[movies_df['movie_id'].isin(top_n_movie_ids)]['movie_title'].tolist()

    return recommended_movie_titles

In [23]:
# Generate item-based recommendations for users in the test set
test_item_recommendations = {}
for user_id in test_users:
    test_item_recommendations[user_id] = get_item_recommendations(user_id, n_recommendations=10)

# Calculate Precision at K for item-based recommendations
precision_k_item_based = precision_at_k(test_item_recommendations, test_user_item_matrix, k=10)

print(f"Precision at K (K=10) for Item-Based CF: {precision_k_item_based}")
print(f"Precision at K (K=10) for User-Based CF: {precision_k}")

Precision at K (K=10) for Item-Based CF: 0.0
Precision at K (K=10) for User-Based CF: 0.0


In [24]:
from sklearn.decomposition import TruncatedSVD

# Fill NaN values with 0 for SVD (or use a different strategy like mean imputation)
user_item_matrix_svd = user_item_matrix.fillna(0)

# Apply Truncated SVD
# We'll choose a number of components (latent factors)
n_components = 50 # This is a hyperparameter that can be tuned
svd = TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(user_item_matrix_svd)

# Get the decomposed matrices
U = svd.transform(user_item_matrix_svd)
Vt = svd.components_

print("Shape of U (User-Latent Factor Matrix):", U.shape)
print("Shape of Vt (Latent Factor-Item Matrix):", Vt.shape)

Shape of U (User-Latent Factor Matrix): (943, 50)
Shape of Vt (Latent Factor-Item Matrix): (50, 1682)


In [25]:
# Predict ratings by reconstructing the matrix (approximately)
predicted_ratings_svd = U @ Vt

# Convert the predicted ratings array to a DataFrame
predicted_ratings_svd_df = pd.DataFrame(predicted_ratings_svd, index=user_item_matrix.index, columns=user_item_matrix.columns)

display(predicted_ratings_svd_df.head())

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.107651,2.517427,1.380815,2.980384,1.608721,1.750464,5.131549,1.615031,3.013657,2.978584,...,0.076903,-0.032477,-0.033798,-0.022532,0.079559,0.001434,0.004302,0.002868,0.04475,0.068175
2,2.055177,0.01848,-0.04056,0.577979,0.095909,0.329084,0.791529,0.198524,2.03517,0.575685,...,0.022124,0.013903,-0.004798,-0.003199,-0.019724,0.004629,0.013886,0.009257,0.027885,-0.018599
3,0.306473,-0.147295,-0.110987,-0.284082,0.067498,-0.005538,0.025827,0.577107,-0.241616,0.386882,...,-0.020658,-0.006683,0.05663,0.037753,-0.01457,0.014828,0.044484,0.029656,-0.002609,-0.00247
4,0.107305,-0.297067,0.045807,0.189322,0.059119,-0.130069,0.101155,-0.085816,-0.070552,-0.158555,...,-0.003696,0.006513,0.01149,0.00766,0.004451,0.006005,0.018015,0.01201,-0.009072,-0.031777
5,4.184258,2.049577,-0.138803,1.128229,0.776645,0.070275,1.705621,0.635517,1.067316,0.247696,...,0.010844,-0.093746,-0.040286,-0.026858,-0.030547,-0.002485,-0.007456,-0.004971,-0.017851,-0.011578


In [26]:
def get_svd_recommendations(user_id, n_recommendations=10):
    """
    Generates movie recommendations for a given user based on SVD predicted ratings.

    Args:
        user_id (int): The ID of the user for whom to generate recommendations.
        n_recommendations (int): The number of recommendations to generate.

    Returns:
        list: A list of recommended movie titles.
    """
    if user_id not in predicted_ratings_svd_df.index:
        return "User ID not found in the dataset."

    # Get the predicted ratings for the target user
    user_predicted_ratings = predicted_ratings_svd_df.loc[user_id]

    # Get the movies already rated by the user
    user_rated_movies = user_item_matrix.loc[user_id].dropna().index

    # Filter out movies already rated by the user and sort the remaining by predicted rating
    unseen_movies_predicted_ratings = user_predicted_ratings.drop(user_rated_movies, errors='ignore').sort_values(ascending=False)

    # Get the top N recommended movie IDs
    top_n_movie_ids = unseen_movies_predicted_ratings.head(n_recommendations).index.tolist()

    # Get the movie titles for the recommended movie IDs
    recommended_movie_titles = movies_df[movies_df['movie_id'].isin(top_n_movie_ids)]['movie_title'].tolist()

    return recommended_movie_titles

In [33]:
# Generate SVD-based recommendations for users in the test set
test_svd_recommendations = {}
for user_id in test_users:
    test_svd_recommendations[user_id] = get_svd_recommendations(user_id, n_recommendations=10)

# Calculate Precision at K for SVD recommendations
precision_k_svd = precision_at_k(test_svd_recommendations, test_user_item_matrix, k=10)

print(f"Precision at K (K=10) for SVD: {precision_k_svd}")
print(f"Precision at K (K=10) for Item-Based CF: {precision_k_item_based}")
print(f"Precision at K (K=10) for User-Based CF: {precision_k}")

Precision at K (K=10) for SVD: 0.0
Precision at K (K=10) for Item-Based CF: 0.0
Precision at K (K=10) for User-Based CF: 0.0


In [34]:
# Select a few test users to inspect
users_to_inspect = test_users[:5]

for user_id in users_to_inspect:
    print(f"Recommendations for User {user_id} (SVD):")
    svd_recommendations = get_svd_recommendations(user_id, n_recommendations=10)
    print(svd_recommendations)

    print(f"\nActual Rated Movies for User {user_id} in Test Set:")
    actual_rated_movies_ids = test_user_item_matrix.loc[user_id].dropna().index.tolist()
    actual_rated_movie_titles = movies_df[movies_df['movie_id'].isin(actual_rated_movies_ids)]['movie_title'].tolist()
    print(actual_rated_movie_titles)
    print("-" * 30)

Recommendations for User 1 (SVD):
['Heat (1995)', 'Secrets & Lies (1996)', "Schindler's List (1993)", 'Adventures of Priscilla, Queen of the Desert, The (1994)', 'True Lies (1994)', 'E.T. the Extra-Terrestrial (1982)', 'Trainspotting (1996)', 'Gandhi (1982)', 'In the Line of Fire (1993)', 'Dave (1993)']

Actual Rated Movies for User 1 in Test Set:
['Toy Story (1995)', 'Get Shorty (1995)', 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)', 'Babe (1995)', 'Angels and Insects (1995)', 'Muppet Treasure Island (1996)', 'Brothers McMullen, The (1995)', 'Crimson Tide (1995)', 'Crumb (1994)', 'Net, The (1995)', 'Legends of the Fall (1994)', 'Quiz Show (1994)', 'Santa Clause, The (1994)', 'Shawshank Redemption, The (1994)', 'While You Were Sleeping (1995)', 'Crow, The (1994)', 'Firm, The (1993)', 'Free Willy (1993)', 'Hot Shots! Part Deux (1993)', 'Jurassic Park (1993)', 'Home Alone (1990)', 'Aladdin (1992)', 'Fargo (1996)', 'All Dogs Go to Heaven 2 (1996)', 'Operation Dumbo Drop (1995)', 

In [35]:
import joblib
import os

# Define the directory to save the models
model_dir = 'saved_models'
os.makedirs(model_dir, exist_ok=True)

# Save the SVD model
joblib.dump(svd, os.path.join(model_dir, 'svd_model.joblib'))
print(f"SVD model saved to {model_dir}/svd_model.joblib")

# Save the user similarity matrix
joblib.dump(user_similarity_df, os.path.join(model_dir, 'user_similarity_matrix.joblib'))
print(f"User similarity matrix saved to {model_dir}/user_similarity_matrix.joblib")

# Save the item similarity matrix
joblib.dump(item_similarity_df, os.path.join(model_dir, 'item_similarity_matrix.joblib'))
print(f"Item similarity matrix saved to {model_dir}/item_similarity_matrix.joblib")

SVD model saved to saved_models/svd_model.joblib
User similarity matrix saved to saved_models/user_similarity_matrix.joblib
Item similarity matrix saved to saved_models/item_similarity_matrix.joblib


In [36]:
%pip install gradio



In [37]:
import joblib
import os
import pandas as pd

# Define the directory where models are saved
model_dir = 'saved_models'

# Load the SVD model
svd = joblib.load(os.path.join(model_dir, 'svd_model.joblib'))
print("SVD model loaded successfully.")

# Load necessary data
# Assuming movies_df and user_item_matrix are needed for recommendations
# If these were not saved, you would need to regenerate them from the raw data files
# For this example, let's assume user_item_matrix is needed to identify rated movies
# and movies_df is needed to get movie titles.
# We'll load them from the notebook's current state or regenerate if necessary.
# As they are available in the current notebook state, we will use them directly.
# In a real application, you would save and load these as well.

print("Assuming 'movies_df' and 'user_item_matrix' DataFrames are available in the environment.")
# If not available, you would load them like this:
# movies_df = pd.read_csv('path/to/movies.csv', ...)
# user_item_matrix = pd.read_csv('path/to/user_item_matrix.csv', ...)

# We also need the predicted_ratings_svd_df as the get_svd_recommendations function uses it
# Let's regenerate it from the loaded SVD model and user_item_matrix if it's not available
# If it's available in the environment, we'll use that.

if 'predicted_ratings_svd_df' not in globals() or predicted_ratings_svd_df is None:
    print("Regenerating predicted_ratings_svd_df...")
    # Fill NaN values with 0 for SVD (same as training)
    user_item_matrix_svd = user_item_matrix.fillna(0)
    # Predict ratings by reconstructing the matrix
    predicted_ratings_svd = svd.transform(user_item_matrix_svd) @ svd.components_
    # Convert the predicted ratings array to a DataFrame
    predicted_ratings_svd_df = pd.DataFrame(predicted_ratings_svd, index=user_item_matrix.index, columns=user_item_matrix.columns)
    print("predicted_ratings_svd_df regenerated.")
else:
    print("'predicted_ratings_svd_df' is already available.")

SVD model loaded successfully.
Assuming 'movies_df' and 'user_item_matrix' DataFrames are available in the environment.
'predicted_ratings_svd_df' is already available.


In [38]:
def get_svd_recommendations_gradio(user_id, n_recommendations=10):
    """
    Generates movie recommendations for a given user based on SVD predicted ratings
    for the Gradio interface.

    Args:
        user_id (int): The ID of the user for whom to generate recommendations.
        n_recommendations (int): The number of recommendations to generate.

    Returns:
        str: A formatted string of recommended movie titles or an error message.
    """
    # Ensure user_id is an integer
    try:
        user_id = int(user_id)
    except ValueError:
        return "Please enter a valid integer User ID."

    if user_id not in predicted_ratings_svd_df.index:
        return f"User ID {user_id} not found in the dataset. Please try a User ID between {predicted_ratings_svd_df.index.min()} and {predicted_ratings_svd_df.index.max()}."

    # Get the predicted ratings for the target user
    user_predicted_ratings = predicted_ratings_svd_df.loc[user_id]

    # Get the movies already rated by the user
    user_rated_movies = user_item_matrix.loc[user_id].dropna().index

    # Filter out movies already rated by the user and sort the remaining by predicted rating
    unseen_movies_predicted_ratings = user_predicted_ratings.drop(user_rated_movies, errors='ignore').sort_values(ascending=False)

    # Get the top N recommended movie IDs
    top_n_movie_ids = unseen_movies_predicted_ratings.head(n_recommendations).index.tolist()

    # Get the movie titles for the recommended movie IDs
    recommended_movie_titles = movies_df[movies_df['movie_id'].isin(top_n_movie_ids)]['movie_title'].tolist()

    if not recommended_movie_titles:
        return f"No recommendations found for User ID {user_id}."

    # Format the output for Gradio
    return "Recommended Movies:\n\n" + "\n".join([f"- {title}" for title in recommended_movie_titles])

In [39]:
import gradio as gr

# Create the Gradio interface
iface = gr.Interface(
    fn=get_svd_recommendations_gradio,
    inputs=gr.Number(label="Enter User ID (e.g., 1, 2, 3...)"),
    outputs="text",
    title="Movie Recommendation System (SVD)",
    description="Enter a User ID to get movie recommendations based on the SVD model."
)

# Launch the interface
iface.launch(debug=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://fab7fe3079a3cebab8.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.


KeyboardInterrupt: 