# 1. Recommendation system

We will start by loading the csv data into a dataframe and then we will do some preprocessing 

In [18]:
import pandas as pd
from typing import List
import pandas as pd
import random


In [22]:
pd.set_option("display.max_colwidth", None)

# Load data
data = pd.read_csv("vodclickstream_uk_movies_03.csv")

# We convert datetime column to a date
data.datetime = pd.to_datetime(data.datetime)

# We convert release_date column to a date
data.release_date = pd.to_datetime(data.release_date, errors="coerce")

# We don't need the Row ID column
data.drop(data.columns[0], axis=1, inplace=True)

# Extract genres column to a new column genre_list that includes a list with all genres
data["genre_list"] = data.genres.apply(lambda row: [word.strip() for word in row.split(",")])

The function get_top_movies takes a user ID as input and extracts the top 10 most clicked movies for that user.
It filters the dataframe to include only rows associated with the given user, then groups the data by movie ID, title, and genres.
The resulting dataframe is sorted based on the number of clicks, and the top 10 movies are selected.
The function returns a dataframe with information about the top 10 movies for the specified user.

In [23]:
def get_top_movies(user_id: str) -> pd.DataFrame:
    
    
    # Filter all movies for the given user
    user_movies = data[data["user_id"] == user_id]

    # Group the data by user, movie ID, title, and genres, and count the number of clicks for each movie
    user_movies = user_movies.groupby(["user_id", "movie_id", "title", "genres"]).size().reset_index(name="clicks")

    # Sort the movies based on the number of clicks in descending order and select the top 10
    top_10_user_movies = user_movies.sort_values(by="clicks", ascending=False).head(10)[["title", "genres", "clicks"]]

    # Reset the index of the dataframe for a cleaner structure
    top_10_user_movies.reset_index(drop=True, inplace=True)
    
    return top_10_user_movies

In [30]:
# Choose the user you want to see the top 10 movies regarding the clicks
user_id = "a41dd22654"

get_top_movies(user_id)

Unnamed: 0,title,genres,clicks
0,Elle,"Crime, Drama, Thriller",3
1,Derren Brown: The Push,NOT AVAILABLE,2
2,Science Fiction Volume One: The Osiris Child,"Action, Adventure, Drama, Fantasy, Horror, Sci-Fi, Thriller",2
3,ARQ,"Action, Sci-Fi, Thriller",2
4,Warcraft,"Action, Adventure, Fantasy",1
5,The Big Short,"Biography, Comedy, Drama, History",1
6,I Don't Feel at Home in This World Anymore,"Comedy, Crime, Drama, Thriller",1
7,Happy Anniversary,Comedy,1
8,Mute,"Mystery, Sci-Fi, Thriller",1
9,To All the Boys I've Loved Before,"Comedy, Drama, Romance",1


# 1.2 Minhash Signatures

In [32]:
# Retrieve a list with all unique genres
# We're using a set to collect unique genres from the data
# The %%capture is used to suppress output in Jupyter cells

%%capture
unique_genres = set()
data["genre_list"].apply(lambda row: [unique_genres.add(value) for value in row])


UsageError: Line magic function `%%capture` not found.


Retrieve Unique Genres and Encode Binary Representation

Purpose: Collect unique genres using a set and encode a binary representation of genres for each movie.

Explanation: A set is used to gather distinct genres, and the binary representation is created by assigning 1 if a genre is present and 0 otherwise.

In [33]:
# Encode the genre list (which we treat as our shingles) of every movie to a one-hot list
# We create a binary representation of genres for each movie
data["one_hot_genre_list"] = data["genre_list"].apply(
    lambda genre_list: [1 if genre in genre_list else 0 for genre in unique_genres]
)

Define Hash Function for Minhash

Purpose: Implement hash functions for generating Minhash signatures.

Explanation: Twelve hash functions are defined to calculate hash values based on specific formulas. These functions are crucial for creating Minhash signatures.

In [37]:
# Define hash function for Minhash
def hash_function(element: List) -> List:
   
    
    # Define the hash functions
    hash_functions = [
        lambda x: (x + 1) % 29,
        lambda x: (3*x + 1) % 83,
        lambda x: (2*x + 4) % 59,
        lambda x: (3*x - 1) % 83,
        lambda x: (x << 1) % 59,
        lambda x: (x >> 1) % 19,
        lambda x: (x << 2) % 109,
        lambda x: (x >> 2) % 11,
        lambda x: (x << 3) % 211,
        lambda x: (x >> 3) % 7,
        lambda x: (x << 4) % 421,
        lambda x: (x >> 4) % 5
    ]

    # Calculate hash values based on the defined functions
    hashes_result = [hash_fn(element) for hash_fn in hash_functions]

    return hashes_result

# Define Minhash function
def minhash(genre_list: List) -> List:
    
    
    # Create list of size 12 with 'inf' as the default value
    similarity_signature = [float("inf")]*12

    # Iterate through every element of the genre list
    for row_index, genre in enumerate(genre_list):
        # Calculate hash values of the current genre and skip if genre is not present
        if genre == 1:
            # Retrieve hash values of the current genre based on the row index
            hashes_result = hash_function(row_index)
            
            # Only update the similarity signature if a new hash value is smaller than the current one
            for i in range(0, 12):
                similarity_signature[i] = min(similarity_signature[i], hashes_result[i])

    return similarity_signature

Define Minhash Function

Purpose: Generate Minhash similarity signatures for movies based on their binary genre representation.

Explanation: The function initializes a list of 12 values with 'inf' as the default. It iterates through the binary genre list, calculates hash values, and updates the similarity signature if a new hash value is smaller.

In [38]:
# Create the minhash similarity signature for every movie
# Apply the Minhash function to the binary representation of genres
data["minhash"] = data["one_hot_genre_list"].apply(lambda genre_list: minhash(genre_list))

Define Custom Hash Function for LSH

Purpose: Create a custom hash function for the Locality-Sensitive Hashing (LSH) process.

Explanation: The function calculates a hash value for a given bucket of values. It uses a prime number and iterates through the bucket, updating the hash value.

Define LSH Function

Purpose: Apply Locality-Sensitive Hashing to Minhash signatures, creating LSH bands.

Explanation: The function divides the Minhash similarity signature into bands and calculates hash values for each band using the custom hash function. The resulting hash values form the LSH bands.

In [39]:
# Define custom hash function for LSH
def custom_hash(bucket: List) -> int:
    
    # Initialize hash value and prime number
    hash_value = 0
    prime_number = 41

    # Iterate through every value of the bucket
    for element in bucket:
        # Calculate hash value
        hash_value = (hash_value * prime_number) + element

    return hash_value

# Define LSH function
def lsh(similarity_signature: List, num_buckets: int, num_rows: int) -> List:
    
    # Create empty list
    band_hashes = []

    # Iterate through every bucket
    for bucket_start in range(0, len(similarity_signature), num_rows):
        # Extract current bucket
        bucket = similarity_signature[bucket_start:bucket_start + num_rows]
        
        # Calculate band hash for the current bucket
        band_hash = custom_hash(bucket) % 997
        
        # Append hash value to the final band
        band_hashes.append(band_hash)

    return band_hashes

Create LSH Bands for Every Movie

Purpose: Apply the LSH function to Minhash similarity signatures for each movie.

Explanation: The LSH function is applied to the Minhash similarity signatures, creating LSH bands for every movie. The number of buckets and rows is specified to control the granularity of the bands.

In [40]:
# Given number of buckets and number of rows
num_buckets = 4
num_rows = 3

# Create LSH band for every movie
# Apply the LSH function to the Minhash Similarity Signature
data["lsh_bands"] = data["minhash"].apply(lambda similarity_signature: lsh(similarity_signature, num_buckets, num_rows))

In [41]:
data

Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id,genre_list,one_hot_genre_list,minhash,lsh_bands
0,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe,"[Comedy, Drama, Romance]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0]","[1, 1, 4, 32, 0, 0, 0, 0, 0, 0, 0, 0]","[729, 951, 0, 0]"
1,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510,"[Fantasy, Horror, Mystery, Thriller]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0]","[10, 28, 22, 26, 18, 4, 36, 2, 72, 1, 144, 0]","[34, 580, 850, 606]"
2,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf,"[Action, Thriller]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[10, 28, 22, 26, 18, 4, 36, 2, 72, 1, 144, 0]","[34, 580, 850, 606]"
3,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6,"[Action, Drama]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0]","[24, 70, 50, 68, 46, 11, 92, 5, 184, 2, 368, 1]","[393, 553, 506, 505]"
4,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, Fantasy",2004-11-19,a80d6fc2aa,a57c992287,"[Animation, Action, Adventure, Comedy, Family, Fantasy]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0]","[1, 1, 4, 47, 0, 0, 0, 0, 0, 0, 0, 0]","[729, 244, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...
671731,2019-06-30 21:37:08,851.0,Oprah Presents When They See Us Now,Talk-Show,2019-06-12,43cd23f30f,57501964fd,[Talk-Show],"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[5, 13, 12, 11, 8, 2, 16, 1, 32, 0, 64, 0]","[974, 875, 50, 630]"
671732,2019-06-30 21:49:34,91157.0,HALO Legends,"Animation, Action, Adventure, Family, Sci-Fi",2010-02-16,febf42d55f,d4fcb079ba,"[Animation, Action, Adventure, Family, Sci-Fi]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0]","[17, 49, 36, 47, 32, 8, 64, 4, 128, 2, 256, 1]","[712, 567, 200, 898]"
671733,2019-06-30 22:00:44,0.0,Pacific Rim,"Action, Adventure, Sci-Fi",2013-07-12,7b15e5ada1,4a14a2cd5a,"[Action, Adventure, Sci-Fi]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0]","[18, 52, 38, 50, 34, 8, 68, 4, 136, 2, 272, 1]","[524, 707, 950, 557]"
671734,2019-06-30 22:04:23,0.0,ReMastered: The Two Killings of Sam Cooke,"Documentary, Music",2019-02-08,52d49c515a,0b8163ea4b,"[Documentary, Music]","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[8, 22, 18, 20, 14, 3, 28, 1, 56, 0, 112, 0]","[410, 299, 306, 604]"


# 1.3 Locality-Sensitive Hashing (LSH)

Function to Retrieve Two Most Similar Users

This function aims to find the two most similar users to a given user based on movie genres. It starts by retrieving the unique Locality-Sensitive Hashing (LSH) bands of the target user. Then, it identifies movies with exactly the same LSH bands and calculates a similarity score for each user. Finally, it sorts the users by similarity, using movie count as a tiebreaker, and returns the top two similar users.

In [69]:
def get_two_most_similar_users(user_id: str) -> pd.DataFrame:
    
    # Extract the unique Locality-Sensitive Hashing (LSH) bands of the specified user
    user_lsh_bands = data[data["user_id"] == user_id]["lsh_bands"]
    user_lsh_bands = user_lsh_bands.explode().drop_duplicates().groupby(level=0).agg(list).tolist()

    # Initialize an empty list to store similar movies
    similar_movies = []
    
    # Iterate through each unique LSH band associated with the user
    for user_lsh_band in user_lsh_bands:
        
        # Collect movies with exactly the same LSH band
        similar_movies.append(data[data["lsh_bands"].apply(lambda lsh_band: lsh_band == user_lsh_band)])

    # Combine the results into a Pandas DataFrame
    similar_movies = pd.concat(similar_movies)
    
    # Group by user_id and LSH bands, retrieving movie_count
    similar_movies["lsh_bands"] = similar_movies["lsh_bands"].apply(tuple)
    similar_users = similar_movies.groupby(["user_id", "lsh_bands"]).size().reset_index(name="movie_count")
    
    # Group by user_id, obtaining LSH bands count and total movie_count for each user
    similar_users = similar_users.groupby("user_id").agg({"lsh_bands": "count", "movie_count": "sum"})
    similar_users = similar_users.rename(columns={"lsh_bands": "lsh_bands_count",
                                                  "movie_count": "movie_count"})
    
    # Calculate the similarity score for each user
    similar_users["similarity"] = (similar_users["lsh_bands_count"] / len(user_lsh_bands)).round(2)
    
    # Sort by the highest similarity score, using movie_count as a tiebreaker
    similar_users = similar_users.sort_values(by=["similarity", "movie_count"], ascending=[False, False])
    
    # Exclude the specified user and return only the top two users
    similar_users = similar_users[similar_users.index != user_id].head(2)
    
    return similar_users


In [70]:
# Given user id
user_id = "a41dd22654"

# Retrieve the two most similar users given the user
two_most_similar_users = get_two_most_similar_users(user_id)
two_most_similar_users

Unnamed: 0_level_0,lsh_bands_count,movie_count,similarity
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
29c1164b3b,8,72,0.57
4e9e38261c,8,53,0.57


Retrieving Movies from Two Most Similar Users

Following the identification of the two most similar users, this section retrieves all movies clicked by these users. The similarity score obtained from the previous step is added to prioritize movies clicked by both users. The resulting DataFrame is grouped by user, movie, and similarity, with aggregated values for the number of clicks. The final step involves sorting movies by commonality (both users clicked), user similarity, and clicks and returning the top 5 movies.

In [73]:
# Retrive all movies from the two most similar users
movies = data[data["user_id"].isin(two_most_similar_users.index.tolist())]

# Merge the movies with the previous table to add the similary score
movies = pd.merge(two_most_similar_users, movies, on="user_id")

# Group by user_id, movie_id, and similarity and calcualate new aggregated value clicks
movies = movies.groupby(["user_id", "movie_id", "similarity"]).size().reset_index(name="clicks")
movies = movies.groupby("movie_id").agg({"user_id": "count", "clicks": "sum", "similarity": "sum"})
movies = movies.rename(columns={"user_id": "common"})

# Replace the similariy score with the similarity score of the most similar user if the score is larger than that
threshold_similarity = two_most_similar_users.loc[two_most_similar_users.index[0], "similarity"]
movies.loc[movies["similarity"] > threshold_similarity,"similarity"] = threshold_similarity

# Order by the common column first to recommend the movies which both users clicked on and than the movies from
# the user with the highest similarity and clicks as the tie breaker and return only the first 5 movies
movies = movies.sort_values(by=["common", "similarity", "clicks"], ascending=[False, False, False]).head(5)
# Retrieve the title of every movie and set movie_id as the index
top_5_similar_movies = data[data["movie_id"].isin(movies.index.tolist())]
top_5_similar_movies = top_5_similar_movies.drop_duplicates(subset=["movie_id"]).loc[:, ["movie_id", "title"]]
top_5_similar_movies = top_5_similar_movies.reset_index(drop=True,inplace=False).set_index("movie_id")
top_5_similar_movies

Unnamed: 0_level_0,title
movie_id,Unnamed: 1_level_1
57a9265c0b,The Do-Over
09b19a990f,30 Minutes or Less
f5f284408b,Cowboys & Aliens
51a1a42126,Avengers: Age of Ultron
30ea555fe5,Assassin's Creed
