In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import *
from collections import *
import collections
import heapq

## 1. Recommendation sytem 
Implementing a recommendation system is critical for businesses and digital platforms that want to thrive in today's competitive environment. These systems use data-driven personalization to tailor content, products, and services to individual user preferences. The latter improves user engagement, satisfaction, retention, and revenue through increased sales and cross-selling opportunities. In this section, you will attempt to implement a recommendation system by identifying similar users' preferences and recommending movies they watch to the study user. 

To be more specific, you will implement your version of the [**LSH algorithm**](https://www.learndatasci.com/tutorials/building-recommendation-engine-locality-sensitive-hashing-lsh-python/), which will take as input the user's preferred genre of movies, find the most similar users to this user, and recommend the most watched movies by those who are more similar to the user. 

__Data__: The data you will be working with can be found [here](https://www.kaggle.com/datasets/vodclickstream/netflix-audience-behaviour-uk-movies).

Looking at the data, you can see that there is data available for each user for the movies the user <ins>clicked on</ins>. Gather the __title and genre__ of the __maximum top 10 movies__ that each user clicked on regarding the __number of clicks__.


In [2]:
df = pd.read_csv('/home/theballer/Desktop/Sapienza Courses/ADM/ADM-HW4-Dataset/vodclickstream_uk_movies_03.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [4]:
df.groupby(by=['title', 'genres']).count().sort_values(by='duration', ascending=False).reset_index()[['title', 'genres']].head(10)

Unnamed: 0,title,genres
0,Black Mirror: Bandersnatch,"Drama, Mystery, Sci-Fi, Thriller"
1,Bright,"Action, Fantasy, Thriller"
2,Avengers: Age of Ultron,"Action, Adventure, Sci-Fi"
3,Annihilation,"Adventure, Drama, Horror, Mystery, Sci-Fi, Thr..."
4,Hot Fuzz,"Action, Comedy, Mystery, Thriller"
5,Deadpool,"Action, Adventure, Comedy, Sci-Fi"
6,Bird Box,"Drama, Horror, Sci-Fi"
7,FYRE: The Greatest Party That Never Happened,"Documentary, Music"
8,The Big Short,"Biography, Comedy, Drama, History"
9,The Hitman's Bodyguard,"Action, Comedy, Crime, Thriller"


### 1.2 Minhash Signatures 
Using the movie genre and user_ids, try to implement your min-hash signatures so that users with similar interests in a genre appear in the same bucket. 

__Important note:__ You must write your minhash function from scratch.  You are not permitted to use any already implemented hash functions.  Read the class materials and, if necessary, conduct an internet search.  The description of hash functions in the [book](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) may be helpful as a reference.


In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [6]:
df_lsh = df.loc[:,['user_id', 'genres']]

In [7]:
stemmer = PorterStemmer()
#drop potential null values from the description column
df_lsh = df_lsh.dropna(subset=['genres'])
#uses apply method with list comprehension to tokenize each row and stem each word as well as filter them on alphanumeric and prevent stopwords
df_lsh['genres_clean'] = df_lsh.genres.apply(lambda row: [stemmer.stem(word) for word in nltk.word_tokenize(row)])

In [8]:
remove_words = [',', 'avail', 'not']
df_lsh.genres_clean = df_lsh.genres_clean.apply(lambda row: [word for word in row if word not in remove_words]) 

In [9]:
vocabulary = set()
df_lsh.genres_clean.apply(lambda row: [vocabulary.add(word) for word in row]) 

0                           [None, None, None]
1                     [None, None, None, None]
2                                 [None, None]
3                                 [None, None]
4         [None, None, None, None, None, None]
                          ...                 
671731                                  [None]
671732          [None, None, None, None, None]
671733                      [None, None, None]
671734                            [None, None]
671735                            [None, None]
Name: genres_clean, Length: 671736, dtype: object

In [10]:
df_lsh = df_lsh.groupby(by='user_id').agg({'genres_clean': 'sum'})

In [11]:
df_lsh.genres_clean = df_lsh.genres_clean.apply(lambda row: set(row))

In [12]:
df_lsh = df_lsh.reset_index()

In [13]:
vocabulary = list(vocabulary)
df_lsh.genres_clean = df_lsh.genres_clean.apply(lambda row: list(row))

In [14]:
df_lsh.head()

Unnamed: 0,user_id,genres_clean
0,00004e2862,"[crime, drama, thriller]"
1,000052a0a0,"[thriller, fantasi, music, comedi, adventur, a..."
2,000090e7c8,"[mysteri, sci-fi, thriller]"
3,000118a755,[horror]
4,000296842d,"[mysteri, sci-fi, drama, thriller]"


In [15]:
shingle_dict = {genre: i for i, genre in enumerate(vocabulary)}

In [16]:
N = len(shingle_dict)
n_sig = 10
params = np.random.randint(N, size=[n_sig,2])

In [17]:
def _permuteRow(row):
    return (params@np.array([1,row]))%N

In [18]:
sig = np.full((n_sig, df_lsh.shape[0]), np.inf)

In [19]:
for j, row in df_lsh.iterrows():
    for shingle in row['genres_clean']:
        orig_row = shingle_dict[shingle]
        curr_col = _permuteRow(orig_row)
        sig[:,j] = np.minimum(sig[:,j],curr_col)

In [20]:
sig = sig.astype(int)
pd.DataFrame(sig)

  sig = sig.astype(int)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,161908,161909,161910,161911,161912,161913,161914,161915,161916,161917
0,5,1,5,21,5,1,0,1,1,1,...,0,1,1,0,9,8,5,1,1,10
1,2,0,2,6,2,1,7,1,0,1,...,1,1,1,1,3,9,2,1,1,22
2,2,0,0,1,0,2,12,0,6,0,...,2,0,6,4,18,2,0,2,4,2
3,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,3,1,10,17,3,2,0,2,2,1,...,0,1,2,0,18,16,3,2,1,3
5,6,1,6,21,6,1,6,11,1,1,...,1,1,1,1,16,1,6,1,1,6
6,0,0,5,9,1,0,1,4,3,4,...,0,0,4,1,6,0,1,0,4,0
7,0,0,10,23,10,0,20,13,8,4,...,0,0,8,4,7,0,10,0,4,0
8,23,23,23,23,23,23,23,23,23,23,...,23,23,23,23,23,23,23,23,23,23
9,12,0,9,4,9,14,2,11,5,8,...,2,5,13,2,10,9,11,3,8,20


### 1.3 Locality-Sensitive Hashing (LSH)

Now that your buckets are ready, it's time to ask a few queries. We will provide you with some user_ids and ask you to recommend at __most five movies__ to the user to watch based on the movies clicked by similar users. 

To recommend at most five movies given a user_id, use the following procedure: 

1. Identify the <ins>two most similar</ins> users to this user.
2. If these two users have any movies __in common__, recommend those movies based on the total number of clicks by these users.
3. If there are __no more common__ movies, try to propose the most clicked movies by the __most similar user first__, followed by the other user. 

__Note:__ At the end of the process, we expect to see at most five movies recommended to the user.

In [21]:
def fastCandidatePairs(sig_mat, b, r):
    n, d = sig_mat.shape
    assert(n==b*r)
    hashbuckets = collections.defaultdict(set)
    bands = np.array_split(sig_mat, b, axis=0)
    for i,band in enumerate(bands):
        for j in range(d):
            # The last value must be made a string, to prevent accidental
            # key collisions of r+1 integers when we really only want
            # keys of r integers plus a band index
            band_id = tuple(list(band[:,j])+[str(i)])
            hashbuckets[band_id].add(j)
    bucket_candidates = list()
    for bucket in hashbuckets.values():
        if len(bucket) > 1:
            bucket_candidates.append(bucket)
    return bucket_candidates

In [22]:
sig.shape

(10, 161918)

In [88]:
candidate_pairs = fastCandidatePairs(sig, b=2, r=5)

In [89]:
def score(target_user_id, current_user_id, df):
    current_user = df.genres_clean.iloc[current_user_id]
    target_user = df.genres_clean.iloc[target_user_id]
    return len(set(target_user).intersection(set(current_user)))/len(set(target_user).union(set(current_user)))


In [90]:
def query(candidate_pairs, query_results, user_ids, df):
    for user_id in user_ids:
        target_user_id = df.loc[df.user_id == user_id].index[0]
        for bucket in candidate_pairs:
            if target_user_id in bucket:
                max_heap = []
                for current_user_id in bucket:
                    if current_user_id != target_user_id:
                        current_score = score(target_user_id, current_user_id, df)
                        heapq.heappush(max_heap, (current_score, current_user_id))
                        if len(max_heap) > 2:
                            heapq.heappop(max_heap)
                query_results[target_user_id] = max_heap
    return query_results

In [136]:
df.user_id[100]

'ad08fad2ec'

In [137]:
user_ids = [df.user_id[100], df.user_id[140000], df.user_id[51478]]

In [138]:
lsh_results = query(candidate_pairs, {}, user_ids, df_lsh)

In [139]:
lsh_results

{109317: [(1.0, 160034), (1.0, 160180)],
 89168: [(0.75, 134721), (0.8, 17496)],
 121616: [(1.0, 155637), (1.0, 157204)]}

In [148]:
recommendations_df = pd.DataFrame(columns=['user_id', 'recommended_movies'])

for user_id in user_ids:
    recommended_movies = []

    # Identify two most similar users
    (_, second), (_, first) = lsh_results[df_lsh.loc[df_lsh.user_id == user_id].index[0]]
    
    # Let's find the movies that were already watched by the target user, 
    # to not accidentally reccommend those movies
    df_target_movies = set(df.loc[df.user_id == user_id].title)
    # DataFrames for the first and second users
    df_first = df.loc[df.user_id == df_lsh.user_id.iloc[first]]
    df_second = df.loc[df.user_id == df_lsh.user_id.iloc[second]]

    # Sets of movies for each user
    movies_first = set(df_first.title)
    movies_second = set(df_second.title)

    # Find common movies
    common_movies = movies_first.intersection(movies_second)
    common_movies = [movie for movie in common_movies if movie not in df_target_movies]

    if common_movies:
        combined_df = pd.concat([df_first, df_second])
        combined_df = combined_df[combined_df.title.isin(common_movies)]
        combined_df = combined_df.groupby('title').agg({'user_id': 'count'}).reset_index().sort_values(by='user_id', ascending=False)
        recommended_movies.extend(combined_df.title.tolist())

    # Add movies from each user if needed
    for df_user in [df_first, df_second]:
        if len(recommended_movies) < 5:
            top_movies = df_user.groupby('title').agg({'user_id':'count'}).reset_index().sort_values(by='user_id', ascending=False)
            for movie in top_movies.title:
                if len(recommended_movies) < 5 and movie not in recommended_movies and movie not in df_target_movies:
                    recommended_movies.append(movie)
    
    new_row = pd.DataFrame({'user_id': [user_id], 'recommended_movies': [recommended_movies]})
    recommendations_df = pd.concat([recommendations_df, new_row], ignore_index=True)

In [149]:
recommendations_df.recommended_movies.iloc[1]

['Bring It On: All or Nothing',
 'Me & Earl & the Dying Girl',
 'Love, Rosie',
 '8 Mile',
 'Testament of Youth']