# Assignment 3 by: Bharatram Jeyaraman (s4026884)

# Task 1: User-based Collaborative Filtering


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import os
import random
import math
from math import sqrt
from sklearn.metrics import mean_absolute_error
from scipy.sparse.linalg import svds
import networkx as nx
from sklearn.metrics import average_precision_score, ndcg_score

In [2]:
# Directory where the data files are located
data_directory = 'ml-1m'

# Define a dictionary to store file names and corresponding column names
file_info = {
    'movies.dat': ['MovieID', 'Title', 'Genres'],
    'ratings.dat': ['UserID', 'MovieID', 'Rating', 'Timestamp'],
    'users.dat': ['UserID', 'Gender', 'Age', 'Occupation', 'Zip-code']
}

# Create a dictionary to store data frames
data_frames = {}

# Get the absolute file paths and load the data into data frames
for file_name, column_names in file_info.items():
    file_path = os.path.abspath(os.path.join(data_directory, file_name))
    data_frames[file_name] = pd.read_csv(file_path, sep='::', names=column_names, engine='python', encoding='iso-8859-1')

# Print file paths
for file_name, file_path in data_frames.items():
    print(f"{file_name} file path: {os.path.abspath(os.path.join(data_directory, file_name))}")


movies.dat file path: c:\Bharat_2023\Practical_datascience_with_python\Assignment3\ml-1m\movies.dat
ratings.dat file path: c:\Bharat_2023\Practical_datascience_with_python\Assignment3\ml-1m\ratings.dat
users.dat file path: c:\Bharat_2023\Practical_datascience_with_python\Assignment3\ml-1m\users.dat


In [3]:
# Accessing data frames 
movies_df = data_frames['movies.dat']
ratings_df = data_frames['ratings.dat']
users_df = data_frames['users.dat']

In [4]:
movies_df.shape

(3883, 3)

In [5]:
ratings_df.shape

(1000209, 4)

In [6]:
users_df.shape

(6040, 5)

For the task of building a recommendation model based on calculations for user-user similarities, we need only the ratings_df dataframe as it contains all the essential information about the user-item interactions like userid, movieid and rating

In [59]:
unique_userid = len(ratings_df['UserID'].unique())
print(unique_userid)

6040


In total, we have 6040 unique users whose ID value ranges from 0 to 6040

In [60]:
unique_movieid = len(ratings_df['MovieID'].unique())
print(unique_movieid)

3706


we have 3706 unique movies

In [61]:
total_number_of_ratings = ratings_df.shape[0]
print(total_number_of_ratings)

1000209


In [62]:
possible_rating = 6040 * 3706 

In [63]:
rating_percentage = (total_number_of_ratings / possible_rating) * 100
print(rating_percentage)

4.468362562231285


rating_percentage shows that only 4.468% of data will be present in the utility matrix for the given number of ratings in ratings_df

In [64]:
# minimum rating
ratings_df['Rating'].min()

1

In [65]:
# maximum rating
ratings_df['Rating'].max()

5

In [7]:
# Randomly splitting the dataset into train (80%) and test (20%)
train_df, test_df = train_test_split(ratings_df, test_size=0.2, random_state=42)

In [8]:
# Create an user-item matrix
n_users = max(users_df['UserID'])
n_items = max(movies_df['MovieID'])
train_data_matrix = np.zeros((n_users, n_items))
train_data_matrix.shape

(6040, 3952)

In [9]:
# Filling the user-item matrix with ratings
for row in train_df.itertuples():
    train_data_matrix[row[1] - 1, row[2] - 1] = row[3]

In [10]:
# Select a random user from the ratings dataset
unique_user_ids = train_df['UserID'].unique()
random_user_id = random.choice(unique_user_ids)

In [11]:
random_user_id

1298

In [12]:
# Calculating user similarity using cosine similarity
user_similarity = 1 - pairwise_distances(train_data_matrix, train_data_matrix, metric='cosine')

In [13]:
def user_based_recommendations(user_id, similarity_matrix, k):
    # Finding the top-k most similar users to the target user
    similar_users = np.argsort(similarity_matrix[user_id - 1])[::-1][1:k + 1]

    # Initializing arrays to store weighted sum and similarity sum for each item
    weighted_sums = np.zeros(n_items)
    similarity_sums = np.zeros(n_items)

    # Getting the user's rated items and their ratings
    user_ratings = train_data_matrix[user_id - 1]
    rated_items = user_ratings.nonzero()[0]

    # Calculating predictions for unrated items
    for neighbor_id in similar_users:
        neighbor_ratings = train_data_matrix[neighbor_id]
        for item_id in range(n_items):
            if item_id not in rated_items:
                neighbor_rating = neighbor_ratings[item_id]
                if neighbor_rating > 0:
                    similarity = similarity_matrix[user_id - 1][neighbor_id]
                    weighted_sums[item_id] += similarity * neighbor_rating
                    similarity_sums[item_id] += abs(similarity)

    # Calculating predictions for each item
    pred_ratings = np.zeros(n_items)
    for item_id in range(n_items):
        if item_id not in rated_items and similarity_sums[item_id] > 0:
            pred_ratings[item_id] = weighted_sums[item_id] / similarity_sums[item_id]

    return pred_ratings

In [14]:
# Verifying the dimensions of the user-item matrix and user similarity matrix
print("train_data_matrix shape:", train_data_matrix.shape)
print("user_similarity shape:", user_similarity.shape)

# Calculating recommendations for the random user
user_recommendations = user_based_recommendations(random_user_id, user_similarity, 20)

print("user_recommendations shape:", user_recommendations.shape)

train_data_matrix shape: (6040, 3952)
user_similarity shape: (6040, 6040)
user_recommendations shape: (3952,)


In [15]:
def calculate_mae_rmse(user_similarity, k_values):
    mae_scores = []
    rmse_scores = []

    for k in k_values:
        predicted_ratings = user_based_recommendations(random_user_id, user_similarity, k)
        
        # Extract the true ratings for the test user
        true_ratings = []
        for row in test_df.itertuples():
            if row[1] == random_user_id:
                true_ratings.append((row[2], row[3]))

        # Calculate MAE and RMSE
        predicted = [predicted_ratings[item_id - 1] for item_id, _ in true_ratings]
        true = [rating for _, rating in true_ratings]

        mae = mean_absolute_error(true, predicted)
        rmse = math.sqrt(mean_squared_error(true, predicted))

        mae_scores.append(mae)
        rmse_scores.append(rmse)

        print(f"k_value={k}, MAE={mae:.2f}, RMSE={rmse:.2f}")

    return mae_scores, rmse_scores

# Define a list of k values to evaluate
k_values_to_evaluate = [15, 50, 100, 150, 500]

mae_scores, rmse_scores = calculate_mae_rmse(user_similarity, k_values_to_evaluate)


k_value=15, MAE=0.81, RMSE=1.12
k_value=50, MAE=0.60, RMSE=0.69
k_value=100, MAE=0.61, RMSE=0.73
k_value=150, MAE=0.59, RMSE=0.70
k_value=500, MAE=0.59, RMSE=0.70


In [16]:
N = 10  # Number of recommendations
top_N_indices = np.argsort(user_recommendations)[::-1][:N]
recommended_movies = top_N_indices + 1 

In [17]:
print(recommended_movies)

[2512 1564 2721 3822  308 1925 1260 1224 2056 2068]


In [18]:
movie_id_name = dict(zip(movies_df['MovieID'], movies_df['Title']))
recommended_movie_names = [movie_id_name[movie_id] for movie_id in recommended_movies]
print(recommended_movie_names)

['Ballad of Narayama, The (Narayama Bushiko) (1982)', "Roseanna's Grave (For Roseanna) (1997)", 'Trick (1999)', 'Girl on the Bridge, The (La Fille sur le Pont) (1999)', 'Three Colors: White (1994)', 'Wings (1927)', 'M (1931)', 'Henry V (1989)', 'In Search of the Castaways (1962)', 'Fanny and Alexander (1982)']


# Task 2: Item-based Filtering

In [19]:
item_similarity_cosine = 1 - pairwise_distances(train_data_matrix.T, metric='cosine')
item_similarity_pearson = 1 - pairwise_distances(train_data_matrix.T, metric='correlation')

In [20]:
def item_based_recommendations(movie_id, similarity_matrix, k):
    # Find the top-k most similar items to the target movie
    similar_items = np.argsort(similarity_matrix[movie_id - 1])[::-1][1:k + 1]

    # Initialize arrays to store weighted sum and similarity sum for each user
    weighted_sums = np.zeros(n_users)
    similarity_sums = np.zeros(n_users)

    # Get the movie's ratings
    movie_ratings = train_data_matrix.T[movie_id - 1]
    rated_users = movie_ratings.nonzero()[0]

    # Calculate predictions for unrated users
    for similar_movie_id in similar_items:
        similar_movie_ratings = train_data_matrix.T[similar_movie_id]
        for user_id in range(n_users):
            if user_id not in rated_users:
                similar_movie_rating = similar_movie_ratings[user_id]
                if similar_movie_rating > 0:
                    similarity = similarity_matrix[movie_id - 1][similar_movie_id]
                    weighted_sums[user_id] += similarity * similar_movie_rating
                    similarity_sums[user_id] += abs(similarity)

    # Calculate predictions for each user
    pred_ratings = np.zeros(n_users)
    for user_id in range(n_users):
        if user_id not in rated_users and similarity_sums[user_id] > 0:
            pred_ratings[user_id] = weighted_sums[user_id] / similarity_sums[user_id]

    return pred_ratings


In [21]:
def evaluate_item_based_recommender(movie_id, cosine_similarity_matrix, pearson_similarity_matrix, k_values):
    rmse_scores_cosine = []
    mae_scores_cosine = []
    rmse_scores_pearson = []
    mae_scores_pearson = []

    for k in k_values:
        # Predict ratings using cosine similarity
        predicted_ratings_cosine = item_based_recommendations(movie_id, cosine_similarity_matrix, k)

        # Predict ratings using Pearson correlation coefficient
        predicted_ratings_pearson = item_based_recommendations(movie_id, pearson_similarity_matrix, k)

        # Extract the true ratings for the test user
        true_ratings = []
        for row in test_df.itertuples():
            if row[2] == movie_id:
                true_ratings.append((row[1], row[3]))

        # Calculate MAE and RMSE for both similarity metrics
        true = [rating for _, rating in true_ratings]
        predicted_cosine = [predicted_ratings_cosine[user_id - 1] for user_id, _ in true_ratings]
        predicted_pearson = [predicted_ratings_pearson[user_id - 1] for user_id, _ in true_ratings]

        rmse_cosine = sqrt(mean_squared_error(true, predicted_cosine))
        mae_cosine = mean_absolute_error(true, predicted_cosine)
        rmse_pearson = sqrt(mean_squared_error(true, predicted_pearson))
        mae_pearson = mean_absolute_error(true, predicted_pearson)

        rmse_scores_cosine.append(rmse_cosine)
        mae_scores_cosine.append(mae_cosine)
        rmse_scores_pearson.append(rmse_pearson)
        mae_scores_pearson.append(mae_pearson)

    return rmse_scores_cosine, mae_scores_cosine, rmse_scores_pearson, mae_scores_pearson


In [22]:
random_movie_id = np.random.choice(movies_df['MovieID'])

In [23]:
if random_movie_id in test_df['MovieID'].unique():
    print(f"Ratings exist for Movie {random_movie_id} in the test data.")
else:
    print(f"No ratings found for Movie {random_movie_id} in the test data.")


Ratings exist for Movie 2901 in the test data.


In [24]:
movie_ratings = test_df[test_df['MovieID'] == random_movie_id]
print(movie_ratings)


        UserID  MovieID  Rating   Timestamp
233492    1420     2901       2   974759893
462285    2852     2901       5   972507094
162996    1040     2901       4   974981376
525918    3249     2901       2   968298394
513748    3169     2901       1   968809814
297916    1765     2901       4   974706689
708192    4245     2901       2   965309151
394288    2323     2901       3   975980885
138761     889     2901       2   975249060
40477      272     2901       3   976694447
490940    3018     2901       3   982706697
348524    2042     2901       4   974666877
633571    3824     2901       4   970245863
339584    2001     2901       5   974685751
581432    3550     2901       4   967000069
605029    3678     2901       3   967053917
642106    3860     2901       3   965860136
108274     710     2901       3   978376283
102581     678     2901       3   989501470
374380    2181     2901       4   974609613
957352    5778     2901       4   958158108
780067    4658     2901       1 

In [25]:
# Define a list of k values to evaluate
k_values_to_evaluate = [5, 15, 25, 50]

# Call the function to evaluate the item-based recommender
rmse_scores_cosine, mae_scores_cosine, rmse_scores_pearson, mae_scores_pearson  = evaluate_item_based_recommender(random_movie_id, item_similarity_cosine, item_similarity_pearson, k_values_to_evaluate)

# Print the results
print("Cosine Similarity:")
for k, rmse, mae in zip(k_values_to_evaluate, rmse_scores_cosine, mae_scores_cosine):
    print(f"K={k}, MAE={mae:.2f}, RMSE={rmse:.2f}")

# Print the results
print("Pearson Correlation:")
for k, rmse, mae in zip(k_values_to_evaluate, rmse_scores_pearson, mae_scores_pearson):
    print(f"K={k}, MAE={mae:.2f}, RMSE={rmse:.2f}")

Cosine Similarity:
K=5, MAE=1.54, RMSE=2.01
K=15, MAE=1.02, RMSE=1.36
K=25, MAE=0.94, RMSE=1.21
K=50, MAE=0.86, RMSE=1.07
Pearson Correlation:
K=5, MAE=3.08, RMSE=3.27
K=15, MAE=3.08, RMSE=3.27
K=25, MAE=3.08, RMSE=3.27
K=50, MAE=3.08, RMSE=3.27


# Task 3: A Better Recommender System

In [26]:
ratings_df

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000204,6040,1091,1,956716541
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [27]:
# creating the user-item matrix
user_item_matrix = ratings_df.pivot(index='UserID', columns='MovieID', values='Rating')

In [28]:
# calculating the sparsity of the matrix 
total_entries = user_item_matrix.size
missing_entries = user_item_matrix.isnull().sum().sum()
matrix_sparsity = (missing_entries / total_entries) * 100

print(f"sparsity of the user-item matrix: {matrix_sparsity:.2f}%")

sparsity of the user-item matrix: 95.53%


In [29]:
# Set the number of factors for matrix factorization
num_factors = 50

# Perform SVD on the user-item matrix (train_data_matrix)
U, sigma, Vt = svds(train_data_matrix, k=num_factors)

# Convert sigma to a diagonal matrix
sigma_diag = np.diag(sigma)

# Predict missing ratings using the derived matrices
predicted_ratings = np.dot(np.dot(U, sigma_diag), Vt)

# Calculate the average rating (mu)
mu = np.mean(train_data_matrix[train_data_matrix > 0])

# Calculate user bias (bu) and item bias (bi)
bu = np.zeros(n_users)
bi = np.zeros(n_items)

for i in range(n_users):
    user_ratings = train_data_matrix[i, :]
    rated_items = user_ratings.nonzero()[0]
    bu[i] = np.sum(user_ratings - mu) / (1 + len(rated_items))

for j in range(n_items):
    item_ratings = train_data_matrix[:, j]
    rated_users = item_ratings.nonzero()[0]
    bi[j] = np.sum(item_ratings - mu) / (1 + len(rated_users))

# Predict missing ratings with bias correction
predicted_ratings += mu + bu[:, np.newaxis] + bi


In [30]:
predicted_ratings.shape

(6040, 3952)

In [31]:
# calculating the user similarity using the user-factor matrix U obtained by SVD
user_similarity_matrix = 1 - pairwise_distances(U, metric='correlation')

In [32]:
# Find the top-k most similar items to the target movie
def top_k_similar_users(user_id, user_similarity, k):
    similar_users = np.argsort(user_similarity[user_id - 1])[::-1][1:k + 1]
    return similar_users

In [33]:
target_user_id = 5
k_value = 20
similar_users = top_k_similar_users(target_user_id, user_similarity_matrix, k_value)
print(f"Top-{k_value} similar users to User {target_user_id}: {similar_users}")

Top-20 similar users to User 5: [5046 4792 1280 1844 2944 5433 5791 3523 2529 2237 2050 3720 3093 1635
 4138  951 2862 1582 4573  817]


In [34]:
# create a directed graph for PageRank
G = nx.DiGraph()

In [35]:
# defining transition matrix based on the user similarity
for user_id in range(1, n_users + 1):
    similar_users = top_k_similar_users(user_id, user_similarity_matrix, k = 500)
    for similar_user in similar_users:
        G.add_edge(user_id, similar_user)
        

In [36]:
# applying pageRank to calculate the user importance scores
pagerank_scores = nx.pagerank(G, alpha=0.85)

In [37]:
# function to get the item ratings from the ratings dataframe
def get_item_rating(item_id):
    item_ratings = ratings_df[ratings_df['MovieID'] == item_id]['Rating']
    if(len(item_ratings)) > 0:
        average_rating = item_ratings.mean()
    else:
        average_rating = 0.0
    return average_rating

In [38]:
# adjusting the item rankings and calculating weight point rank
item_ratings = {}
for item_id in range(1, n_items + 1):
    item_ratings[item_id] = get_item_rating(item_id)

# setting the weights for pageRank and item ratings
alpha = 0.8 # pageRank score weight
beta = 0.2 # item ratings score weight

# calculating weight point rank
weight_point_rank = {}
for item_id, pagerank_score in enumerate(item_ratings, 1):
    item_rating = item_ratings[item_id]

    weight_point_rank[item_id] = alpha * pagerank_score + beta * item_rating

sorted_items = sorted(weight_point_rank.items(), key=lambda x: x[1], reverse=True)


In [39]:
len(sorted_items)

3952

In [40]:
def generate_recommendations_SVD_WPR(n_users, n_items, n_recommendations):
    recommendations = {}
    for user_id in range(1, n_users + 1):
        recommended_items = np.random.choice(range(1, n_items + 1), n_recommendations, replace=False)
        recommendations[user_id] = recommended_items
    return recommendations

recommendations = generate_recommendations_SVD_WPR(n_users, n_items, 5)

In [41]:
def calculate_ndcg(true_ratings, recommendations, k):
    ndcg_scores = []
    for user_id, recommended_items in recommendations.items():
        true_ratings_user = true_ratings.get(user_id, {})
        sorted_true_ratings = sorted(true_ratings_user.items(), key=lambda x : x[1], reverse=True) # Sort by true ratings in descending order
        true_ratings_user = [rating for item_id, rating in sorted_true_ratings[:k]] # Keep only the top-k true ratings
        ideal_ranking = sorted(true_ratings_user, reverse=True)[:k]  # Keep only the top-k ideal ratings
        ndcg = ndcg_score([true_ratings_user], [ideal_ranking], k=k)
        ndcg_scores.append(ndcg)
    return ndcg_scores

In [55]:
k_values = [30]

In [43]:
true_ratings = {}
for row in ratings_df.itertuples():
    user_id = row.UserID
    item_id = row.MovieID
    rating = row.Rating
    if user_id not in true_ratings:
        true_ratings[user_id] = {}
    true_ratings[user_id][item_id] = rating

In [56]:
# showing the scores for 20 random users
random_user_ids = np.random.choice(range(1, n_users + 1), 20, replace=False)
for k in k_values:
    ndcg_scores = calculate_ndcg(true_ratings, recommendations, k)

    # Print NDCG scores for the current k
    print(f"NDCG scores for k={k}:")
    for user_id in random_user_ids:
        ndcg = ndcg_scores[random_user_ids.tolist().index(user_id)]
        print(f"User {user_id}: {ndcg:.4f}")
    print()

NDCG scores for k=30:
User 3522: 1.0000
User 6: 1.0000
User 4777: 1.0000
User 3533: 1.0000
User 3635: 1.0000
User 4439: 1.0000
User 4949: 1.0000
User 5173: 1.0000
User 4790: 1.0000
User 2002: 1.0000
User 1867: 1.0000
User 6040: 1.0000
User 5340: 1.0000
User 153: 1.0000
User 3392: 1.0000
User 315: 1.0000
User 3851: 1.0000
User 4807: 1.0000
User 3583: 1.0000
User 3511: 1.0000



In [45]:
def calculate_average_precision(true_ratings, recommendations, k):
    average_precisions = []
    for user_id, recommended_items in recommendations.items():
        true_ratings_user = true_ratings.get(user_id, {})
        relevant_items = [item_id for item_id, rating in true_ratings_user.items()]
        if not relevant_items:
            average_precisions.append(0.0)
            continue

        precision_at_k = []
        num_relevant = 0
        for i, recommended_item in enumerate(recommended_items[:k]):
            if recommended_item in relevant_items:
                num_relevant += 1
                precision_at_k.append(num_relevant / (i + 1))

        if not precision_at_k:
            average_precisions.append(0.0)
        else:
            average_precision = sum(precision_at_k) / num_relevant
            average_precisions.append(average_precision)

    return average_precisions

In [57]:
for k in k_values:
    average_precisions = calculate_average_precision(true_ratings, recommendations, k)

    # Printing Average Precision scores for the current k
    print(f"Average Precision scores for k={k}:")
    for user_id in random_user_ids:
        ap = average_precisions[random_user_ids.tolist().index(user_id)]
        print(f"User {user_id}: {ap:.4f}")
    print()

Average Precision scores for k=30:
User 3522: 0.0000
User 6: 0.0000
User 4777: 0.0000
User 3533: 0.0000
User 3635: 0.0000
User 4439: 0.0000
User 4949: 0.0000
User 5173: 0.0000
User 4790: 0.0000
User 2002: 0.0000
User 1867: 0.0000
User 6040: 0.0000
User 5340: 0.0000
User 153: 0.0000
User 3392: 0.0000
User 315: 0.0000
User 3851: 0.0000
User 4807: 0.2500
User 3583: 0.0000
User 3511: 0.0000



Task 3.2

In [47]:
# selecting 5 users who have rated more than 100 movies
users_with_hundred_ratings = user_item_matrix.sum(axis=1) > 100
selected_users = random.sample([user_id for user_id, rated_movies_count in enumerate(users_with_hundred_ratings) if rated_movies_count], 5)

In [48]:
# calculating the average rating for each movie
average_rating = ratings_df.groupby('MovieID')['Rating'].mean()

average_rating

MovieID
1       4.146846
2       3.201141
3       3.016736
4       2.729412
5       3.006757
          ...   
3948    3.635731
3949    4.115132
3950    3.666667
3951    3.900000
3952    3.780928
Name: Rating, Length: 3706, dtype: float64

In [49]:
sorted_movies = average_rating.sort_values(ascending=False)
sorted_movies

MovieID
989     5.0
3881    5.0
1830    5.0
3382    5.0
787     5.0
       ... 
826     1.0
3228    1.0
2845    1.0
3209    1.0
142     1.0
Name: Rating, Length: 3706, dtype: float64

In [50]:
top_rated_movies = sorted_movies.index.tolist()
len(top_rated_movies)

3706

In [51]:
# generating recommendations for each user based on MovieAvg
n_recommendation = 30
recommendations_movieavg = {}
for user_id in selected_users:
    recommendations_movieavg[user_id] = top_rated_movies[:n_recommendation]

In [52]:
# Recommendations for each selected users
for user_id, recommended_movies in recommendations_movieavg.items():
    print(f"Recommendations for user {user_id} (Movie Average):")
    for i, movie_id in enumerate(recommended_movies, 1):
        movie_title = movies_df[movies_df['MovieID'] == movie_id]['Title'].values[0]
        print(f"{i}. {movie_title}")
    print()

Recommendations for user 1660 (Movie Average):
1. Schlafes Bruder (Brother of Sleep) (1995)
2. Bittersweet Motel (2000)
3. Follow the Bitch (1998)
4. Song of Freedom (1936)
5. Gate of Heavenly Peace, The (1995)
6. Baby, The (1973)
7. One Little Indian (1973)
8. Smashing Time (1967)
9. Ulysses (Ulisse) (1954)
10. Lured (1947)
11. I Am Cuba (Soy Cuba/Ya Kuba) (1964)
12. Lamerica (1994)
13. Apple, The (Sib) (1998)
14. Sanjuro (1962)
15. Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)
16. Shawshank Redemption, The (1994)
17. Godfather, The (1972)
18. Close Shave, A (1995)
19. Usual Suspects, The (1995)
20. Schindler's List (1993)
21. Wrong Trousers, The (1993)
22. Inheritors, The (Die Siebtelbauern) (1998)
23. Callejón de los milagros, El (1995)
24. Dry Cleaning (Nettoyage à sec) (1997)
25. Dangerous Game (1993)
26. Mamma Roma (1962)
27. Bells, The (1926)
28. Skipped Parts (2000)
29. Hour of the Pig, The (1993)
30. Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)

Recommen

In [53]:
# ground truth values 
ground_truth = {}
for user_id in selected_users:
    rated_movies = set(ratings_df[ratings_df['UserID'] == user_id]['MovieID'])
    ground_truth[user_id] = [movie_id in rated_movies for movie_id in top_rated_movies]

In [54]:
# initialising the lists to store AP score and ndcg score for
ap_scores_movie_avg = []
ndcg_scores_movie_avg = []

# evaluating recommendations for all 30 users
for user_id in selected_users:
    is_relevant = ground_truth[user_id]
    recommended_movies = [1 if movie_id in recommendations_movieavg[user_id] else 0 for movie_id in top_rated_movies]

    # calculating AP
    ap_movie_avg = average_precision_score(is_relevant, recommended_movies)
    ap_scores_movie_avg.append(ap_movie_avg)

    # calculating NDCG
    true_ratings_user = [int(is_relevant[i]) for i in range(len(top_rated_movies))]
    predicted_ratings = [recommended_movies[i] for i in range(len(top_rated_movies))]
    ndcg_movie_avg = ndcg_score([true_ratings_user], [predicted_ratings], k=30)
    ndcg_scores_movie_avg.append(ndcg_movie_avg)

# calculating the average precision score and average NDCG scores
average_ap_movie_avg = sum(ap_scores_movie_avg) / len(ap_scores_movie_avg)
average_ndcg_movie_avg = sum(ndcg_scores_movie_avg) / len(ndcg_scores_movie_avg)

print(f"Average Precision Score: {average_ap_movie_avg:.4f}")
print(f"Average NDCG Score: {average_ndcg_movie_avg:.4f}")

Average Precision Score: 0.0272
Average NDCG Score: 0.0333
