In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

In [19]:
# Load the MovieLens 100k item catalogue (movie id -> title).
# `u.item` is pipe-delimited, latin-1 encoded and has no header row, so the full
# column layout is spelled out in `names`. Only the id and title are needed here,
# so `usecols` parses just those two instead of loading all 24 columns and
# dropping 22 of them afterwards.
movies_dataframe = pd.read_csv(
    "dataset/ml-100k/u.item",
    delimiter="|",
    encoding="latin1",
    names=["item id", "title", "release date",
           "video release date", "IMDb URL", "unknown",
           "Action", "Adventure", "Animation",
           "Children's", "Comedy", "Crime",
           "Documentary", "Drama", "Fantasy",
           "Film-Noir", "Horror", "Musical",
           "Mystery", "Romance", "Sci-Fi",
           "Thriller", "War", "Western"],
    usecols=["item id", "title"],
)
movies_dataframe

Unnamed: 0,item id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [20]:
# Load the u1 training split of the ratings file: tab-separated, no header row.
# The timestamp column is discarded right away; only (user, item, rating) is kept.
userbase1_dataframe = pd.read_csv(
    "dataset/ml-100k/u1.base",
    sep="\t",
    names=['user id', 'item id', 'rating', 'timestamp'],
).drop(columns=["timestamp"])
userbase1_dataframe

Unnamed: 0,user id,item id,rating
0,1,1,5
1,1,2,3
2,1,3,4
3,1,4,3
4,1,5,3
...,...,...,...
79995,943,1067,2
79996,943,1074,4
79997,943,1188,3
79998,943,1228,3


In [25]:
# Build the user x item rating table (rows: users, columns: items).
# Missing (user, item) pairs are filled with 0, i.e. 0 stands for "not rated".
user_item_matrix = (
    userbase1_dataframe
    .pivot(index='user id', columns='item id', values='rating')
    .fillna(0)
)

# Store the mostly-zero table in CSR form before computing similarities.
user_item_matrix_sparse = csr_matrix(user_item_matrix.to_numpy())

# Pairwise cosine similarity between user rows; entry [i, j] compares the
# i-th and j-th rows of user_item_matrix.
cosine_sim_matrix = cosine_similarity(user_item_matrix_sparse)

user id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.097021,0.052469,0.021162,0.193545,0.290114,0.200438,0.097786,0.060791,0.200926,...,0.252187,0.090728,0.216186,0.139478,0.156937,0.095265,0.222958,0.158151,0.133090,0.252778
2,0.097021,1.000000,0.051348,0.084426,0.015516,0.187717,0.074479,0.023262,0.108167,0.078176,...,0.083045,0.298811,0.308608,0.403810,0.282896,0.213233,0.189257,0.132806,0.126597,0.101784
3,0.052469,0.051348,1.000000,0.145660,0.037180,0.084526,0.015718,0.086400,0.000000,0.030909,...,0.041330,0.036956,0.135750,0.063940,0.102082,0.034064,0.133720,0.083537,0.081230,0.019676
4,0.021162,0.084426,0.145660,1.000000,0.017291,0.000000,0.037566,0.046087,0.000000,0.023232,...,0.014238,0.047742,0.127305,0.118947,0.089016,0.039116,0.120521,0.135669,0.125925,0.035586
5,0.193545,0.015516,0.037180,0.017291,1.000000,0.158552,0.170890,0.157488,0.053846,0.089508,...,0.284295,0.074215,0.086345,0.063759,0.126358,0.068145,0.198118,0.142141,0.150000,0.239803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.095265,0.213233,0.034064,0.039116,0.068145,0.099518,0.075901,0.039294,0.000000,0.031693,...,0.066039,0.431154,0.258021,0.226449,0.432666,1.000000,0.087687,0.180029,0.043264,0.144250
940,0.222958,0.189257,0.133720,0.120521,0.198118,0.289962,0.263561,0.094273,0.093677,0.246078,...,0.327153,0.107024,0.187536,0.181317,0.175158,0.087687,1.000000,0.145152,0.261376,0.241028
941,0.158151,0.132806,0.083537,0.135669,0.142141,0.093607,0.017469,0.106551,0.110612,0.014989,...,0.046952,0.203301,0.288318,0.234211,0.313400,0.180029,0.145152,1.000000,0.101642,0.095120
942,0.133090,0.126597,0.081230,0.125925,0.150000,0.178815,0.169986,0.067465,0.064484,0.156821,...,0.226440,0.073513,0.089588,0.129554,0.099385,0.043264,0.261376,0.101642,1.000000,0.182465


In [27]:
# Utility matrix: one row per user, one column per item, cell = rating.
# Items a user has not rated are filled with 0.
utility_matrix = (
    userbase1_dataframe
    .pivot(index='user id', columns='item id', values='rating')
    .fillna(0)
)
utility_matrix

item id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,0.0,4.0,1.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
940,0.0,0.0,0.0,2.0,0.0,0.0,4.0,5.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
941,5.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
# Wrap the raw similarity array in a DataFrame labelled by user id on both axes,
# so cosine_sim_df.loc[a, b] is the cosine similarity between the rating rows
# of users a and b.
cosine_sim_df = pd.DataFrame(
    cosine_sim_matrix,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)
cosine_sim_df

user id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.097021,0.052469,0.021162,0.193545,0.290114,0.200438,0.097786,0.060791,0.200926,...,0.252187,0.090728,0.216186,0.139478,0.156937,0.095265,0.222958,0.158151,0.133090,0.252778
2,0.097021,1.000000,0.051348,0.084426,0.015516,0.187717,0.074479,0.023262,0.108167,0.078176,...,0.083045,0.298811,0.308608,0.403810,0.282896,0.213233,0.189257,0.132806,0.126597,0.101784
3,0.052469,0.051348,1.000000,0.145660,0.037180,0.084526,0.015718,0.086400,0.000000,0.030909,...,0.041330,0.036956,0.135750,0.063940,0.102082,0.034064,0.133720,0.083537,0.081230,0.019676
4,0.021162,0.084426,0.145660,1.000000,0.017291,0.000000,0.037566,0.046087,0.000000,0.023232,...,0.014238,0.047742,0.127305,0.118947,0.089016,0.039116,0.120521,0.135669,0.125925,0.035586
5,0.193545,0.015516,0.037180,0.017291,1.000000,0.158552,0.170890,0.157488,0.053846,0.089508,...,0.284295,0.074215,0.086345,0.063759,0.126358,0.068145,0.198118,0.142141,0.150000,0.239803
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.095265,0.213233,0.034064,0.039116,0.068145,0.099518,0.075901,0.039294,0.000000,0.031693,...,0.066039,0.431154,0.258021,0.226449,0.432666,1.000000,0.087687,0.180029,0.043264,0.144250
940,0.222958,0.189257,0.133720,0.120521,0.198118,0.289962,0.263561,0.094273,0.093677,0.246078,...,0.327153,0.107024,0.187536,0.181317,0.175158,0.087687,1.000000,0.145152,0.261376,0.241028
941,0.158151,0.132806,0.083537,0.135669,0.142141,0.093607,0.017469,0.106551,0.110612,0.014989,...,0.046952,0.203301,0.288318,0.234211,0.313400,0.180029,0.145152,1.000000,0.101642,0.095120
942,0.133090,0.126597,0.081230,0.125925,0.150000,0.178815,0.169986,0.067465,0.064484,0.156821,...,0.226440,0.073513,0.089588,0.129554,0.099385,0.043264,0.261376,0.101642,1.000000,0.182465


# Recommendation System

In [71]:
def recommend_items_for_user(user_id, cosine_sim_df, utility_matrix, movies_dataframe, top_n=10, n_neighbors=None):
    """
    Recommend items for a given user based on ratings from similar users.

    Each item the user has not rated (utility value 0) is scored as the sum,
    over the most similar users, of ``neighbour rating * neighbour similarity``.
    The score is a raw weighted sum (not normalised by the similarity mass), so
    items rated by many neighbours naturally score higher.

    Parameters:
        user_id (int): The user for whom items are to be recommended. Must be a
            label of both ``cosine_sim_df`` and ``utility_matrix``.
        cosine_sim_df (pd.DataFrame): User x user similarity matrix, labelled by
            user id on both axes.
        utility_matrix (pd.DataFrame): User x item rating matrix, indexed by user
            id with item ids as columns; 0 means "not rated".
        movies_dataframe (pd.DataFrame): DataFrame with at least an 'item id'
            column (and, typically, 'title').
        top_n (int): Maximum number of items to recommend.
        n_neighbors (int | None): Number of most-similar users to aggregate over.
            Defaults to ``top_n``, which is what the function historically used.

    Returns:
        recommended_items_df (pd.DataFrame): Rows of ``movies_dataframe`` for the
        recommended items plus an 'aggregated_score' column, sorted by score in
        descending order. Items that none of the selected neighbours rated are
        never recommended, so fewer than ``top_n`` rows may be returned.

    Raises:
        KeyError: If ``user_id`` is not present in ``cosine_sim_df`` or
            ``utility_matrix``.
    """
    if n_neighbors is None:
        # Backward compatible default: neighbourhood size follows top_n.
        n_neighbors = top_n

    # Remove the target user by label rather than assuming it sorts first:
    # another user can tie at similarity 1.0, and a user with no ratings has
    # self-similarity 0. `.iloc` makes the positional slice explicit.
    neighbour_scores = (
        cosine_sim_df[user_id]
        .drop(labels=user_id, errors="ignore")
        .sort_values(ascending=False, kind="stable")
        .iloc[:n_neighbors]
    )

    # Items the target user has not rated yet (computed once, not per neighbour).
    target_ratings = utility_matrix.loc[user_id]
    unrated_items = target_ratings[target_ratings == 0].index

    # Neighbour ratings restricted to those items, weighted row-wise by similarity.
    neighbour_ratings = utility_matrix.loc[neighbour_scores.index, unrated_items]
    aggregated_scores = neighbour_ratings.mul(neighbour_scores, axis=0).sum(axis=0)

    # Only keep items at least one neighbour actually rated; otherwise zero-score
    # items would be used to pad the list up to top_n.
    rated_by_neighbour = neighbour_ratings.ne(0).any(axis=0)
    top_scores = (
        aggregated_scores[rated_by_neighbour]
        .sort_values(ascending=False, kind="stable")
        .iloc[:top_n]
    )

    # Attach titles and scores, best first.
    recommended_items_df = movies_dataframe[movies_dataframe['item id'].isin(top_scores.index)].copy()
    recommended_items_df['aggregated_score'] = recommended_items_df['item id'].map(top_scores)
    recommended_items_df = (
        recommended_items_df
        .sort_values(by='aggregated_score', ascending=False, kind="stable")
        .reset_index(drop=True)
    )

    return recommended_items_df

In [73]:
# Example usage: recommend up to 50 items for user 7
user_id = 7
recommended_items_df = recommend_items_for_user(
    user_id, cosine_sim_df, utility_matrix, movies_dataframe, top_n=50
)
print("Recommended items with titles and aggregated scores (sorted by aggregated_score in descending order):")
recommended_items_df

Recommended items with titles and aggregated scores (sorted by aggregated_score in descending order):


Unnamed: 0,item id,title,aggregated_score
0,174,Raiders of the Lost Ark (1981),83.618518
1,172,"Empire Strikes Back, The (1980)",71.21593
2,176,Aliens (1986),69.71323
3,483,Casablanca (1942),69.427736
4,195,"Terminator, The (1984)",69.381104
5,168,Monty Python and the Holy Grail (1974),68.866863
6,183,Alien (1979),68.842704
7,181,Return of the Jedi (1983),67.793836
8,191,Amadeus (1984),67.506399
9,22,Braveheart (1995),67.256344
