# Collaborative filtering
Emanuel de Jong (495804) - Erik Markvoort (519894)

# imports

the following modules are used for the collaborative filtering assignment.

In [1]:
from IPython.core.display import HTML
from movie_display import movie_display
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from ipywidgets import interact_manual
import ipywidgets as widgets

these global variables are used to determine which user the recommendations are based on and how many recommendations should be given.

In [2]:
# Zero-based row position in the ratings matrix of the user to recommend for
# (used as `numpy_data[USER_ID]`). Reassigned later by the program defaults
# and by `submit_all`.
USER_ID = 1
# Number of movies returned by a recommendation request.
RECOMMENDATION_COUNT = 5

## Dataset analysis

## similarity user 1 and 2 in cosine, pearson and adjusted cosine

To verify that the similarities are calculated correctly, the data from the examples in the slides is used as a 2D numpy array.
User A and user B are the two users between which the various similarities are calculated.

In [3]:
# Example ratings from the slides: one row per user, one column per item.
# A value of 0.0 marks a missing rating.
numpy_data = np.array(
    [
        [4.0, 0.0, 0.0, 5.0, 1.0, 0.0, 0.0],
        [5.0, 5.0, 4.0, 0.0, 0.0, 0.0, 0.0],
        [0.0, 0.0, 0.0, 2.0, 4.0, 5.0, 0.0],
        [0.0, 3.0, 0.0, 0.0, 0.0, 1.0, 3.0],
    ]
)
numpy_data

# Row positions of the two users that are compared in the cells below.
userA, userB = 0, 1

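To calculate the standard cosine similarity, the ratings of the two users are first multiplied item by item and summed. This sum is then divided by the product of the lengths (Euclidean norms) of the two users' rating vectors.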

In [None]:
# Numerator: sum of the item-by-item products of the two users' ratings.
a1 = np.sum(numpy_data[userA] * numpy_data[userB])

# Denominator: product of the two users' vector lengths.
lengthA = np.sqrt(np.sum(numpy_data[userA]**2))
lengthB = np.sqrt(np.sum(numpy_data[userB]**2))
b1 = (lengthA * lengthB)

cosineSimilarity = a1 / b1
print("regular cosine similarity: " + str(cosineSimilarity))

In [None]:
# Mean-centre user A: subtract the mean of A's rated items (values > 0)
# from those rated items only; unrated items stay 0.0.
ratedA = numpy_data[userA] > 0
meanA = np.sum(numpy_data[userA]) / np.sum(ratedA)
pearsonA = numpy_data[userA].copy()
pearsonA[ratedA] -= meanA

# Same centring for user B.
ratedB = numpy_data[userB] > 0
meanB = np.sum(numpy_data[userB]) / np.sum(ratedB)
pearsonB = numpy_data[userB].copy()
pearsonB[ratedB] -= meanB

# Cosine similarity of the two centred vectors.
a2 = np.sum(pearsonA * pearsonB)
lengthPearsonA = np.sqrt(np.sum(pearsonA**2))
lengthPearsonB = np.sqrt(np.sum(pearsonB**2))
b2 = (lengthPearsonA * lengthPearsonB)
pearsonSimilarity = a2 / b2
print("pearson cosine similarity: " + str(pearsonSimilarity))

In [None]:
# Centre every column on the mean of its rated entries (values > 0);
# unrated entries stay 0.0.
adjustedData = numpy_data.copy()
for column in range(adjustedData.shape[1]):
    rated = adjustedData[:, column] > 0
    mean = np.sum(adjustedData[:, column]) / np.sum(rated)
    adjustedData[rated, column] -= mean

# Cosine similarity of the two users' column-centred rows.
adjustedA = adjustedData[userA]
adjustedB = adjustedData[userB]
a3 = np.sum(adjustedA * adjustedB)
b3 = (np.sqrt(np.sum(adjustedA**2)) * np.sqrt(np.sum(adjustedB**2)))
adjustedCosineSimilarity = a3 / b3
print("adjusted cosine similarity: " + str(adjustedCosineSimilarity))

The dataset is read into a pandas dataframe. Then, the list of ratings is pivoted into a 2D matrix (users as rows, movies as columns) so that the calculations can be done on it, and every missing rating is converted into a 0.0 float. A mapping is also kept of which column index belongs to which movie ID.

In [None]:
csv = pd.read_csv('./dataset/ratings.csv')

# Give every movieId a dense column index (0..n-1, in ascending movieId order)
# and keep the lookup in both directions.
unique_movie_ids = sorted(csv['movieId'].unique())
reverse_movie_id_mapping = dict(enumerate(unique_movie_ids))
movie_id_mapping = {movie_id: idx for idx, movie_id in reverse_movie_id_mapping.items()}
csv['movieId_mapped'] = csv['movieId'].map(movie_id_mapping)

# One row per userId, one column per mapped movie index; missing ratings become 0.
pivot_df = csv.pivot(index='userId', columns='movieId_mapped', values='rating').fillna(0)

pivot_df.iloc[:5,:5]

The pandas dataframe is converted into a numpy 2d array to be able to make numpy calculations. 

In [None]:
# Plain 2D array of the pivoted ratings: one row per userId, one column per
# mapped movie index, 0 where the user gave no rating.
numpy_data = pivot_df.to_numpy()
# Preview the top-left 5x5 corner.
numpy_data[:5,:5]

## Similarity calculation methods

These methods make it possible to use the similarity matrices in multiple locations of the program. Here, we use scikit-learn's method to calculate the similarities into a matrix. For Pearson similarity, the mean of a row's non-zero values is subtracted from each non-zero value in that row, after which the standard cosine similarity is used. For adjusted cosine similarity, the mean of a column's non-zero values is subtracted from that column's non-zero values, after which the standard cosine similarity is calculated.

In [9]:
def get_cosine_similarity(data):
    """Return the pairwise cosine similarity between the rows of `data`.

    Thin wrapper around scikit-learn's `cosine_similarity`, so it can be
    selected interchangeably with the other similarity functions.
    """
    return cosine_similarity(data)

def get_pearson_similarity(data):
    """Return the row-by-row Pearson (mean-centred cosine) similarity matrix.

    For every row, the mean of its rated entries is subtracted from the
    entries greater than 0; entries equal to 0 (no rating) are left
    untouched. The standard cosine similarity is then computed between the
    centred rows.

    Args:
        data: 2D array-like of ratings, 0 meaning "not rated". It is not
            modified. Integer input is converted to float first.

    Returns:
        The square similarity matrix produced by `cosine_similarity`.
    """
    ratings = np.asarray(data, dtype=float)

    # Mean over the rated entries only; rows without any rating keep mean 0.
    rating_counts = np.count_nonzero(ratings, axis=1)
    row_sums = ratings.sum(axis=1)
    mean_user_rating = np.divide(
        row_sums,
        rating_counts,
        out=np.zeros_like(row_sums),
        where=rating_counts > 0,
    )

    # Centre only the rated cells so "not rated" stays exactly 0.
    centered_data = np.where(
        ratings > 0, ratings - mean_user_rating[:, np.newaxis], ratings
    )
    return cosine_similarity(centered_data)

def get_adjusted_cosine_similarity(data):
    """Return the row-by-row adjusted cosine similarity matrix.

    For every column, the mean of its rated entries is subtracted from the
    entries greater than 0; entries equal to 0 (no rating) are left
    untouched. The standard cosine similarity is then computed between the
    rows of the column-centred matrix.

    Args:
        data: 2D array-like of ratings, 0 meaning "not rated". It is not
            modified. Integer input is converted to float first.

    Returns:
        The square similarity matrix produced by `cosine_similarity`.
    """
    ratings = np.asarray(data, dtype=float)

    # Mean over the rated entries only. Columns without a single rating get
    # mean 0 instead of triggering a 0/0 division (NaN + RuntimeWarning).
    rating_counts = np.count_nonzero(ratings, axis=0)
    column_sums = ratings.sum(axis=0)
    mean_item_rating = np.divide(
        column_sums,
        rating_counts,
        out=np.zeros_like(column_sums),
        where=rating_counts > 0,
    )

    # Centre only the rated cells so "not rated" stays exactly 0.
    adjusted_data = np.where(ratings > 0, ratings - mean_item_rating, ratings)
    return cosine_similarity(adjusted_data)

## User-user recommendations

In the following method, the cosine similarity matrix chosen by the user is used to find the most similar users to the selected user. then, the highest rated movies among these similar users are given as a recommendation sorted from highest to lowest. This method is used in the eventual program.

In [10]:
def get_user_user_recommendations():
    """Recommend movies for `USER_ID` from the ratings of the most similar users.

    The user-user similarity matrix is computed with the currently selected
    `similarity_matrix_function` (previously Pearson was hard-coded, so the
    user's choice of similarity was ignored). The ratings of the 10 most
    similar other users are summed per movie, and the movies the selected
    user has not rated yet are ranked by that sum.

    Reads the globals `numpy_data`, `USER_ID`, `RECOMMENDATION_COUNT`,
    `similarity_matrix_function` and `reverse_movie_id_mapping`.

    Returns:
        A list of at most `RECOMMENDATION_COUNT` movie IDs, best first.
    """
    neighbour_count = 10
    user_similarity = np.asarray(similarity_matrix_function(numpy_data))

    # Every user except the selected one, most similar first. The stable sort
    # keeps the lower row index first on ties, like the original sorted().
    other_users = np.delete(np.arange(user_similarity.shape[0]), USER_ID)
    by_similarity = np.argsort(-user_similarity[USER_ID, other_users], kind="stable")
    similar_users = other_users[by_similarity[:neighbour_count]]

    # Per-movie sum of the neighbours' ratings.
    total_ratings = numpy_data[similar_users].sum(axis=0)

    # Only movies the selected user has not rated are candidates.
    movies_unrated = np.flatnonzero(numpy_data[USER_ID] == 0.0)
    by_score = np.argsort(-total_ratings[movies_unrated], kind="stable")
    recommendations_user_user = movies_unrated[by_score[:RECOMMENDATION_COUNT]]

    return [
        reverse_movie_id_mapping[int(movie_mapped_id)]
        for movie_mapped_id in recommendations_user_user
    ]
    

# validation hit-rate

To validate the results of the recommendations given for user-user recommendation, the hit-rate method is used. Ahead of the hit-rate calculation, all movies with fewer than 300 ratings in total are eliminated to find a more accurate similarity between users.
After elimination, a new dataset is made where, for each row, the highest value is changed into a 0.0 and the location of this value is saved.
Then, for each user, a user-user recommendation of 5 movies is made. If the user's highest rated movie is in this top 5, it is counted as a hit. The total number of hits is divided by the number of users to get the hit-rate.

In [11]:
def hitrate_user_user():
    """Return the leave-one-out hit-rate of the user-user recommender.

    Only movies with at least 300 ratings are kept. For every user the
    highest rated movie is hidden (set to 0.0), a top-5 recommendation is
    built from the 10 most similar other users, and it counts as a hit when
    the hidden movie is in that top 5.

    Fixes compared to the previous version:
    * the evaluated user is excluded from their own neighbours (the old
      check `j != tries` was always true);
    * only movies the user has not rated in the masked data are recommended,
      matching `get_user_user_recommendations`;
    * the unused lookup on `transposed_data[:, USER_ID]` is gone;
    * users with no rating left to hide can no longer score a hit;
    * an empty selection returns 0.0 instead of raising.

    Reads the globals `numpy_data` and `similarity_matrix_function`.

    Returns:
        hits / number of users, as a float.
    """
    non_zero_counts_cols = np.count_nonzero(numpy_data, axis=0)
    filtered_data = numpy_data[:, non_zero_counts_cols >= 300]

    tries = filtered_data.shape[0]
    if tries == 0 or filtered_data.shape[1] == 0:
        return 0.0

    # Hide each user's highest rated movie and remember where it was.
    user_indices = np.arange(tries)
    max_indices = np.argmax(filtered_data, axis=1)
    new_data = np.copy(filtered_data)
    new_data[user_indices, max_indices] = 0.0
    new_similarity = np.asarray(similarity_matrix_function(new_data))

    hitrate_space = 5
    neighbour_count = 10
    hits = 0

    for i in range(tries):
        # A user without any rating among the kept movies has nothing to
        # hide, so a "hit" on argmax's default index 0 would be meaningless.
        if filtered_data[i, max_indices[i]] <= 0:
            continue

        # Most similar other users first; stable sort keeps index order on ties.
        other_users = np.delete(user_indices, i)
        by_similarity = np.argsort(-new_similarity[i, other_users], kind="stable")
        similar_users = other_users[by_similarity[:neighbour_count]]

        total_ratings = new_data[similar_users].sum(axis=0)

        # Recommend only movies the user has not rated (the hidden one included).
        movies_unrated = np.flatnonzero(new_data[i] == 0)
        by_score = np.argsort(-total_ratings[movies_unrated], kind="stable")
        recommended_movies = movies_unrated[by_score[:hitrate_space]]

        if np.isin(max_indices[i], recommended_movies):
            hits += 1

    return hits / tries

## Item-item recommendations

For item-item recommendation, a minimum rating is set: only movies the user rated 3.5 or higher are taken into account when making recommendations.

In [12]:
# Lowest rating (inclusive, compared with >=) at which a movie counts as
# "highly rated" by the user.
MIN_USER_RATING = 3.5

to make item-item recommendations, the data is transposed into a numpy 2d array where rows are movies and columns are users. This is to make it possible to make a similarity matrix between movies instead of users.

In [13]:
# Ratings with movies as rows and users as columns, so that row-wise
# similarity functions compare movies instead of users.
transposed_data = np.transpose(numpy_data)

The following method is used to find the highest rated movies of the selected user. This is for display purposes, to show the user's top rated movies next to the top recommended movies.

In [14]:
def get_user_movies_with_high_rating():
    """Return the mapped indices of the movies `USER_ID` rated highly.

    A movie qualifies when the user's rating is greater than or equal to
    `MIN_USER_RATING`. The indices are ordered from the highest to the
    lowest rating.
    """
    # All ratings of the selected user, one entry per movie.
    ratings_of_user = transposed_data[:, USER_ID]
    # Positions of the movies that reach the minimum rating.
    liked_movies = np.flatnonzero(ratings_of_user >= MIN_USER_RATING)
    # Order those positions by descending rating.
    descending_order = np.argsort(-ratings_of_user[liked_movies])
    return liked_movies[descending_order]

In the following method, item-item recommendations are given. To make item-item recommendations, the indexes are saved for both the unrated movies (0.0) and the highly rated movies (>= 3.5). The similarity is calculated between the rated and the unrated movies. Then, using the 2 most similar rated movies, the expected rating is calculated as a similarity-weighted average: (similarity1 * rating1 + similarity2 * rating2) / (similarity1 + similarity2).
For the user, an expected rating is calculated for each unrated movie. Then, the unrated movies with the highest expected ratings are given as the item-item recommendation.

In [15]:
def get_user_highest_rated_unwatched_movies():
    """Recommend unrated movies for `USER_ID` with item-item filtering.

    For every movie the user has not rated, the two most similar movies
    among those the user rated at least `MIN_USER_RATING` are looked up in
    `used_matrix`, and the expected rating is the similarity-weighted
    average of the user's ratings for those two movies. The movies with the
    highest expected rating are returned.

    Compared to the previous version this no longer raises IndexError when
    the user has fewer than two highly rated movies (one neighbour is used,
    or an empty list is returned when there is none) or when there are
    fewer candidates than `RECOMMENDATION_COUNT`. Movies whose neighbour
    similarities sum to zero are skipped instead of producing NaN/inf
    scores that corrupt the ranking.

    Reads the globals `transposed_data`, `USER_ID`, `MIN_USER_RATING`,
    `used_matrix` (expected to be the movie-by-movie similarity matrix set
    by `submit_all`), `RECOMMENDATION_COUNT` and `reverse_movie_id_mapping`.

    Returns:
        A list of at most `RECOMMENDATION_COUNT` movie IDs, best first.
    """
    user_ratings = transposed_data[:, USER_ID]
    movies_rated = np.flatnonzero(user_ratings >= MIN_USER_RATING)
    movies_unrated = np.flatnonzero(user_ratings == 0)
    if movies_rated.size == 0 or movies_unrated.size == 0:
        return []

    # Similarity of every unrated movie (rows) to every liked movie (columns).
    similarities = np.asarray(used_matrix)[np.ix_(movies_unrated, movies_rated)]

    # The (up to) two most similar liked movies per unrated movie. The stable
    # sort keeps the lower movie index first on ties, like sorted() did.
    neighbour_count = min(2, movies_rated.size)
    nearest = np.argsort(-similarities, axis=1, kind="stable")[:, :neighbour_count]
    nearest_similarities = np.take_along_axis(similarities, nearest, axis=1)
    nearest_ratings = user_ratings[movies_rated[nearest]]

    # Similarity-weighted average of the user's ratings of the neighbours.
    weight_sums = nearest_similarities.sum(axis=1)
    usable = weight_sums != 0
    expected_ratings = (
        (nearest_similarities[usable] * nearest_ratings[usable]).sum(axis=1)
        / weight_sums[usable]
    )
    candidates = movies_unrated[usable]

    best_first = np.argsort(-expected_ratings, kind="stable")[:RECOMMENDATION_COUNT]
    return [
        reverse_movie_id_mapping[int(movie_mapped_id)]
        for movie_mapped_id in candidates[best_first]
    ]

To validate the results of the recommendations given for item-item recommendation, the hit-rate method is used. Ahead of the hit-rate calculation, all users with fewer than 500 ratings in total are eliminated to find a more accurate similarity between items.
After elimination, a new dataset is made where, for each row, the highest value is changed into a 0.0 and the location of this value is saved.
Then, for each user, an item-item recommendation of 20 movies is made. If the user's highest rated movie is in this top 20, it is counted as a hit. The total number of hits is divided by the number of users to get the hit-rate.

In [16]:
def hitrate_item_item():
    """Return the leave-one-out hit-rate of the item-item recommender.

    Only users with at least 500 ratings are kept. For every such user the
    highest rated movie is hidden (set to 0.0), a top-20 item-item
    recommendation is built, and it counts as a hit when the hidden movie is
    in that top 20.

    Fixes compared to the previous version:
    * the inner `for i in range(hitrate_space)` loop overwrote the outer
      user index, so the hit check always looked at `max_indices[19]`
      instead of the hidden movie of the evaluated user;
    * users with fewer than two highly rated movies, or with fewer than 20
      candidates, no longer raise IndexError;
    * movies whose neighbour similarities sum to zero are skipped instead of
      producing NaN/inf scores;
    * 0.0 is returned instead of a ZeroDivisionError when no user qualifies.

    Reads the globals `numpy_data`, `MIN_USER_RATING` and
    `similarity_matrix_function`.

    Returns:
        hits / number of evaluated users, as a float.
    """
    non_zero_counts = np.count_nonzero(numpy_data, axis=1)
    filtered_data = numpy_data[non_zero_counts >= 500]

    tries = filtered_data.shape[0]
    if tries == 0:
        return 0.0

    # Hide each user's highest rated movie and remember where it was.
    max_indices = np.argmax(filtered_data, axis=1)
    new_data = np.copy(filtered_data)
    new_data[np.arange(tries), max_indices] = 0.0

    # Movies as rows, so the similarity matrix is movie-by-movie.
    transposed_new = np.transpose(new_data)
    item_similarity = np.asarray(similarity_matrix_function(transposed_new))

    hitrate_space = 20
    hits = 0
    for user_index in range(tries):
        user_ratings = transposed_new[:, user_index]
        movies_rated = np.flatnonzero(user_ratings >= MIN_USER_RATING)
        movies_unrated = np.flatnonzero(user_ratings == 0)
        if movies_rated.size == 0 or movies_unrated.size == 0:
            continue  # nothing to base a recommendation on: counts as a miss

        # Similarity of every unrated movie (rows) to every liked movie (columns).
        similarities = item_similarity[np.ix_(movies_unrated, movies_rated)]

        # The (up to) two most similar liked movies per unrated movie; the
        # stable sort keeps the lower movie index first on ties.
        neighbour_count = min(2, movies_rated.size)
        nearest = np.argsort(-similarities, axis=1, kind="stable")[:, :neighbour_count]
        nearest_similarities = np.take_along_axis(similarities, nearest, axis=1)
        nearest_ratings = user_ratings[movies_rated[nearest]]

        # Similarity-weighted average of the user's ratings of the neighbours.
        weight_sums = nearest_similarities.sum(axis=1)
        usable = weight_sums != 0
        expected_ratings = (
            (nearest_similarities[usable] * nearest_ratings[usable]).sum(axis=1)
            / weight_sums[usable]
        )
        candidates = movies_unrated[usable]

        best_first = np.argsort(-expected_ratings, kind="stable")[:hitrate_space]
        recommendations_movies = candidates[best_first]

        if np.isin(max_indices[user_index], recommendations_movies):
            hits += 1

    return hits / tries

# Program

In the program, a user can fill in the parameters for how to make a recommendation.

this method is to convert user input into integers.

In [17]:
def num_from_str(value, max_value):
    """Extract a number between 1 and `max_value` from user input.

    The digit groups in `value` are read as whole numbers, and the first one
    that lies in `1..max_value` is returned. The previous substring check
    (`str(i) in value`) matched single digits inside larger numbers, so
    "12" came back as 1 and "20" as 2.

    Args:
        value: The text entered by the user.
        max_value: The largest number that is accepted.

    Returns:
        The first number in range found in `value`, or 1 when there is none.
    """
    import re

    for digits in re.findall(r"\d+", value):
        number = int(digits)
        if 1 <= number <= max_value:
            return number
    return 1

For the parameters, a standard value is given in the case the user gives no input.

In [18]:
# Defaults that apply until the widget form is submitted; `submit_all`
# overwrites them (except `similarity_matrix_method`) via `global`.
recommendation_method = "Item-Item"
similarity_matrix_method = "Pearson"
# Callable that turns a ratings matrix into a similarity matrix.
similarity_matrix_function = get_pearson_similarity
RECOMMENDATION_COUNT = 5
USER_ID = 5
# 1 = also print the hit-rate after recommending, 0 = skip it.
hitrate_bool = 1

The movie IDs in ratings.csv are not directly related to the displayable data in imdbdata.json. These methods help with linking the movie recommendations to the IMDb data.

In [19]:
# links.csv is used to look up the imdbId that belongs to a movieId;
# imdbdata.json holds the per-movie records that are handed to the display.
links_data = pd.read_csv('./dataset/links.csv')
imdb_data = pd.read_json('./dataset/imdbdata.json')

# Presumably so the ids compare equal to the imdbId values taken from
# links.csv in `movies_to_imdb` -- confirm against the JSON file's format.
imdb_data['imdbId'] = imdb_data['imdbId'].astype(int)

def movies_to_imdb(movies):
    """Return the IMDb records that belong to the given movie IDs.

    Each movie ID is translated to an imdbId through `links_data`, and that
    imdbId is looked up in `imdb_data`. Movies that are missing from either
    table are skipped; the order of `movies` is preserved.
    """
    found_movies = []
    for movie_id in movies:
        link_rows = links_data[links_data['movieId'] == movie_id]
        if link_rows.empty:
            continue  # no link known for this movie

        imdb_id = link_rows.iloc[0]['imdbId']
        imdb_rows = imdb_data[imdb_data['imdbId'] == imdb_id]
        if imdb_rows.empty:
            continue  # linked, but no IMDb record available

        found_movies.append(imdb_rows.iloc[0])

    return found_movies

These methods are to display movies with the provided html movie displayer, firstly, for the users highly rated movies and secondly for movies recommended to the user.

In [20]:
def display_high_rated():
    """Show the selected user's highest rated movies in the HTML movie display.

    At most `RECOMMENDATION_COUNT` movies are shown.
    """
    top_mapped_ids = get_user_movies_with_high_rating()[:RECOMMENDATION_COUNT]
    top_movie_ids = [reverse_movie_id_mapping[mapped_id] for mapped_id in top_mapped_ids]
    movies_to_show = movies_to_imdb(top_movie_ids)

    print("Movies rated highly by the user:")
    rendered_html = HTML(movie_display.show(movies_to_show))
    display(rendered_html)

def display_final_result():
    """Show the movies in the global `recommendations` in the HTML movie display."""
    recommended_movies = movies_to_imdb(recommendations)

    print("Movies reccomended")
    rendered_html = HTML(movie_display.show(recommended_movies))
    display(rendered_html)

In the following widget, a user can fill in the desired recommendation method, similarity matrix, recommendation count and user, and whether they want to see a hit-rate. When submit is clicked, it will display the results.

In [None]:
def submit_all(method, similarity, recommendation_count, user_id, hitrate):
    """Apply the user's selections, compute recommendations and print a summary.

    The selections are validated before any global is changed. Previously an
    unknown `method` failed with an UnboundLocalError on `data`, and an
    unknown `similarity` silently reused the function of the previous run.

    Updates the globals `recommendation_method`, `similarity_matrix_function`,
    `used_matrix`, `recommendations`, `RECOMMENDATION_COUNT`, `USER_ID` and
    `hitrate_bool`.

    Args:
        method: "User-User" or "Item-Item".
        similarity: "Cosine", "Pearson" or "Adjusted".
        recommendation_count: Number of movies to recommend.
        user_id: Row position of the user in `numpy_data`.
        hitrate: "Yes" to also print the hit-rate of the chosen method.

    Raises:
        ValueError: If `method` or `similarity` is not one of the options.
        IndexError: If `user_id` is not a valid row position of `numpy_data`.
    """
    global recommendation_method, similarity_matrix_function, used_matrix, recommendations
    global RECOMMENDATION_COUNT, USER_ID, hitrate_bool

    # Everything that depends on the recommendation method, in one place:
    # (ratings matrix for the similarity, recommender, hit-rate label, hit-rate function)
    method_settings = {
        "User-User": (
            numpy_data,
            get_user_user_recommendations,
            "Hit-rate user-user recommendation: ",
            hitrate_user_user,
        ),
        "Item-Item": (
            transposed_data,
            get_user_highest_rated_unwatched_movies,
            "Hit-rate item-item recommendation: ",
            hitrate_item_item,
        ),
    }
    similarity_functions = {
        "Cosine": get_cosine_similarity,
        "Pearson": get_pearson_similarity,
        "Adjusted": get_adjusted_cosine_similarity,
    }

    # Validate first, so a bad selection leaves the previous state intact.
    if method not in method_settings:
        raise ValueError(f"Unknown recommendation method: {method!r}")
    if similarity not in similarity_functions:
        raise ValueError(f"Unknown similarity method: {similarity!r}")
    if not 0 <= user_id < numpy_data.shape[0]:
        raise IndexError(
            f"user_id {user_id} is outside the available range 0..{numpy_data.shape[0] - 1}"
        )

    data, recommend, hitrate_label, hitrate_function = method_settings[method]

    RECOMMENDATION_COUNT = recommendation_count
    USER_ID = user_id
    recommendation_method = method
    similarity_matrix_function = similarity_functions[similarity]

    # Calculate similarity matrix
    used_matrix = similarity_matrix_function(data)

    # Generate recommendations based on method
    recommendations = recommend()

    # Set other parameters
    hitrate_bool = 1 if hitrate == "Yes" else 0

    # Output the selections
    print(f"Recommendation Method: {recommendation_method}")
    print(f"Similarity Method: {similarity}")
    print(f"Recommendation Count: {RECOMMENDATION_COUNT}")
    print(f"User ID: {USER_ID}")
    print(f"Hitrate Evaluation: {hitrate}")
    print(f"Similarity Matrix: {similarity}")

    if hitrate_bool == 1:
        print(hitrate_label + str(hitrate_function()))

# Create widgets for all interactions
method_dropdown = widgets.Dropdown(
    options=["User-User", "Item-Item"],
    description="Method:"
)

similarity_dropdown = widgets.Dropdown(
    options=["Cosine", "Pearson", "Adjusted"],
    description="Similarity:"
)

recommendation_slider = widgets.IntSlider(
    min=1, max=20, step=1, value=10, description="Count:"
)

# The selected value is used as a row position in numpy_data, so the range
# follows the loaded data instead of a hard-coded 0-500. That fixed range
# left later users unreachable and could exceed a smaller dataset.
_last_user_index = max(numpy_data.shape[0] - 1, 0)
user_slider = widgets.IntSlider(
    min=0, max=_last_user_index, step=1, value=min(10, _last_user_index), description="User ID:"
)

hitrate_dropdown = widgets.Dropdown(
    options=["No", "Yes"],
    description="Hitrate:"
)

# Button to submit the selections
submit_button = widgets.Button(description="Submit")

# Function to trigger when button is clicked
def on_submit_clicked(b):
    """Handle a click on the submit button.

    Reads the current value of every widget, runs `submit_all` with them and
    then shows the user's highly rated movies followed by the
    recommendations. `b` is the clicked button and is not used.
    """
    selections = {
        "method": method_dropdown.value,
        "similarity": similarity_dropdown.value,
        "recommendation_count": recommendation_slider.value,
        "user_id": user_slider.value,
        "hitrate": hitrate_dropdown.value,
    }
    submit_all(**selections)

    display_high_rated()
    display_final_result()


# Attach the button to the callback function, so that on_submit_clicked runs
# every time the Submit button is pressed.
submit_button.on_click(on_submit_clicked)

# Display the widgets in the order in which the user fills them in.
# NOTE(review): `display` is not imported in this file; presumably the
# notebook's built-in display function -- confirm.
display(method_dropdown, similarity_dropdown, recommendation_slider, user_slider, hitrate_dropdown, submit_button)
