In [2]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import *
from collections import *
import collections
import heapq 

## 1. Recommendation sytem 
Implementing a recommendation system is critical for businesses and digital platforms that want to thrive in today's competitive environment. These systems use data-driven personalization to tailor content, products, and services to individual user preferences. The latter improves user engagement, satisfaction, retention, and revenue through increased sales and cross-selling opportunities. In this section, you will attempt to implement a recommendation system by identifying similar users' preferences and recommending movies they watch to the study user. 

To be more specific, you will implement your version of the [**LSH algorithm**](https://www.learndatasci.com/tutorials/building-recommendation-engine-locality-sensitive-hashing-lsh-python/), which will take as input the user's preferred genre of movies, find the most similar users to this user, and recommend the most watched movies by those who are more similar to the user. 

__Data__: The data you will be working with can be found [here](https://www.kaggle.com/datasets/vodclickstream/netflix-audience-behaviour-uk-movies).

Looking at the data, you can see that there is data available for each user for the movies the user <ins>clicked on</ins>. Gather the __title and genre__ of the __maximum top 10 movies__ that each user clicked on regarding the __number of clicks__.


In [3]:
df = pd.read_csv('/home/theballer/Desktop/Sapienza Courses/ADM/ADM-HW4-Dataset/vodclickstream_uk_movies_03.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [5]:
df.groupby(by=['title', 'genres']).count().sort_values(by='duration', ascending=False).reset_index()[['title', 'genres']].head(10)

Unnamed: 0,title,genres
0,Black Mirror: Bandersnatch,"Drama, Mystery, Sci-Fi, Thriller"
1,Bright,"Action, Fantasy, Thriller"
2,Avengers: Age of Ultron,"Action, Adventure, Sci-Fi"
3,Annihilation,"Adventure, Drama, Horror, Mystery, Sci-Fi, Thr..."
4,Hot Fuzz,"Action, Comedy, Mystery, Thriller"
5,Deadpool,"Action, Adventure, Comedy, Sci-Fi"
6,Bird Box,"Drama, Horror, Sci-Fi"
7,FYRE: The Greatest Party That Never Happened,"Documentary, Music"
8,The Big Short,"Biography, Comedy, Drama, History"
9,The Hitman's Bodyguard,"Action, Comedy, Crime, Thriller"


### 1.2 Minhash Signatures 
Using the movie genre and user_ids, try to implement your min-hash signatures so that users with similar interests in a genre appear in the same bucket. 

__Important note:__ You must write your minhash function from scratch.  You are not permitted to use any already implemented hash functions.  Read the class materials and, if necessary, conduct an internet search.  The description of hash functions in the [book](http://infolab.stanford.edu/~ullman/mmds/ch3n.pdf) may be helpful as a reference.


In [8]:
df.head()

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287


In [9]:
df_lsh = df.loc[:,['user_id', 'genres']]

In [10]:
stemmer = PorterStemmer()
#drop potential null values from the description column
df_lsh = df_lsh.dropna(subset=['genres'])
#uses apply method with list comprehension to tokenize each row and stem each word as well as filter them on alphanumeric and prevent stopwords
df_lsh['genres_clean'] = df_lsh.genres.apply(lambda row: [stemmer.stem(word) for word in nltk.word_tokenize(row)])

In [11]:
remove_words = [',', 'avail', 'not']
df_lsh.genres_clean = df_lsh.genres_clean.apply(lambda row: [word for word in row if word not in remove_words]) 

In [12]:
vocabulary = set()
df_lsh.genres_clean.apply(lambda row: [vocabulary.add(word) for word in row]) 

0                           [None, None, None]
1                     [None, None, None, None]
2                                 [None, None]
3                                 [None, None]
4         [None, None, None, None, None, None]
                          ...                 
671731                                  [None]
671732          [None, None, None, None, None]
671733                      [None, None, None]
671734                            [None, None]
671735                            [None, None]
Name: genres_clean, Length: 671736, dtype: object

In [13]:
df_lsh = df_lsh.groupby(by='user_id').agg({'genres_clean': 'sum'})

In [14]:
df_lsh.genres_clean = df_lsh.genres_clean.apply(lambda row: set(row))

In [15]:
df_lsh = df_lsh.reset_index()

In [16]:
vocabulary = list(vocabulary)
df_lsh.genres_clean = df_lsh.genres_clean.apply(lambda row: list(row))

In [18]:
df_lsh.head()

Unnamed: 0,user_id,genres_clean
0,00004e2862,"[thriller, crime, drama]"
1,000052a0a0,"[thriller, famili, sport, adventur, sci-fi, mu..."
2,000090e7c8,"[thriller, mysteri, sci-fi]"
3,000118a755,[horror]
4,000296842d,"[thriller, mysteri, sci-fi, drama]"


In [19]:
shingle_dict = {genre: i for i, genre in enumerate(vocabulary)}

In [20]:
N = len(shingle_dict)
n_sig = 10
params = np.random.randint(N, size=[n_sig,2])

In [21]:
def _permuteRow(row):
    return (params@np.array([1,row]))%N

In [26]:
sig = np.full((n_sig, df_lsh.shape[0]), np.inf)

In [27]:
for j, row in df_lsh.iterrows():
    for shingle in row['genres_clean']:
        orig_row = shingle_dict[shingle]
        curr_col = _permuteRow(orig_row)
        sig[:,j] = np.minimum(sig[:,j],curr_col)

In [28]:
sig = sig.astype(int)
pd.DataFrame(sig)

  sig = sig.astype(int)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,161908,161909,161910,161911,161912,161913,161914,161915,161916,161917
0,1,0,9,19,9,1,14,16,0,7,...,1,1,0,7,8,1,14,1,5,1
1,8,3,8,23,8,8,3,8,3,3,...,3,3,3,3,18,3,8,3,3,8
2,0,0,10,15,0,0,0,11,4,8,...,0,4,8,0,17,10,0,0,10,0
3,5,0,6,0,5,5,5,8,9,7,...,4,7,9,5,21,4,5,1,4,5
4,17,1,1,16,1,6,15,9,3,9,...,0,3,5,3,7,0,9,0,0,21
5,13,1,10,5,10,0,20,11,1,6,...,0,1,7,1,8,2,13,0,2,19
6,16,0,0,1,0,2,8,0,2,0,...,2,0,2,2,14,6,0,2,2,16
7,7,1,17,22,7,1,0,1,1,1,...,0,1,1,0,24,17,7,1,1,7
8,4,0,4,0,4,3,20,3,1,1,...,3,1,1,3,14,7,4,3,3,7
9,13,13,13,13,13,13,13,13,13,13,...,13,13,13,13,13,13,13,13,13,13


### 1.3 Locality-Sensitive Hashing (LSH)

Now that your buckets are ready, it's time to ask a few queries. We will provide you with some user_ids and ask you to recommend at __most five movies__ to the user to watch based on the movies clicked by similar users. 

To recommend at most five movies given a user_id, use the following procedure: 

1. Identify the <ins>two most similar</ins> users to this user.
2. If these two users have any movies __in common__, recommend those movies based on the total number of clicks by these users.
3. If there are __no more common__ movies, try to propose the most clicked movies by the __most similar user first__, followed by the other user. 

__Note:__ At the end of the process, we expect to see at most five movies recommended to the user.

In [29]:
def fastCandidatePairs(sig_mat, b, r):
    n, d = sig_mat.shape
    assert(n==b*r)
    hashbuckets = collections.defaultdict(set)
    bands = np.array_split(sig_mat, b, axis=0)
    for i,band in enumerate(bands):
        for j in range(d):
            # The last value must be made a string, to prevent accidental
            # key collisions of r+1 integers when we really only want
            # keys of r integers plus a band index
            band_id = tuple(list(band[:,j])+[str(i)])
            hashbuckets[band_id].add(j)
    bucket_candidates = list()
    for bucket in hashbuckets.values():
        if len(bucket) > 1:
            bucket_candidates.append(bucket)
    return bucket_candidates

In [30]:
sig.shape

(10, 161918)

In [31]:
candidate_pairs = fastCandidatePairs(sig, b=2, r=5)

In [32]:
len(candidate_pairs)

1459

In [77]:
target_user_ids = [4, 6, 10000]

In [52]:
def score(target_user_id, current_user_id, df):
    current_user = df.genres_clean.iloc[current_user_id]
    target_user = df.genres_clean.iloc[target_user_id]
    return len(set(target_user).intersection(set(current_user)))/len(set(target_user).union(set(current_user)))


In [78]:
query_results = {}
for target_user_id in target_user_ids:
    for bucket in candidate_pairs:
        if target_user_id in bucket:
            max_heap = []
            for current_user_id in bucket:
                if current_user_id != target_user_id:
                    current_score = score(target_user_id, current_user_id, df_lsh)
                    heapq.heappush(max_heap, (current_score, current_user_id))
                    if len(max_heap) > 2:
                        heapq.heappop(max_heap)
            query_results[target_user_id] = max_heap

In [79]:
query_results

{4: [(1.0, 161220), (1.0, 161561)],
 6: [(1.0, 161797), (1.0, 161865)],
 10000: [(0.9285714285714286, 142198), (1.0, 99427)]}

In [38]:
df_lsh

Unnamed: 0,user_id,genres_clean
0,00004e2862,"[thriller, crime, drama]"
1,000052a0a0,"[thriller, famili, sport, adventur, sci-fi, mu..."
2,000090e7c8,"[thriller, mysteri, sci-fi]"
3,000118a755,[horror]
4,000296842d,"[thriller, mysteri, sci-fi, drama]"
...,...,...
161913,fffd9bf758,"[mysteri, crime, documentari]"
161914,fffe7b777b,"[thriller, sci-fi, drama]"
161915,fffeac83be,"[sport, drama, comedi, documentari, war, histo..."
161916,ffff2c5f9e,"[thriller, comedi, documentari, mysteri, action]"


In [39]:
df_lsh.user_id.iloc[142198]

'ffffd36adf'

In [81]:
df.loc[df.user_id==df_lsh.user_id.iloc[142198]]

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
110552,169325,2017-06-28 12:58:35,0.0,Ghostbusters II,"Action, Comedy, Fantasy, Sci-Fi",1989-06-16,bb4eab862f,e0d059e32c
110638,169411,2017-06-28 14:04:09,0.0,Puss in Book: Trapped in an Epic Tale,"Animation, Short, Action, Adventure, Comedy, F...",2017-06-20,6a63d1aff2,e0d059e32c
112476,171249,2017-06-30 21:55:19,0.0,Left Behind,"Action, Drama, Fantasy, Mystery, Thriller",2014-10-03,de7e3a7873,e0d059e32c
112499,171272,2017-06-30 21:55:19,0.0,Labyrinth,"Adventure, Family, Fantasy, Musical",1986-06-27,d23d0988fb,e0d059e32c
112953,171726,2017-07-01 23:34:11,0.0,The Circle,"Drama, Sci-Fi, Thriller",2017-04-28,0fc283bc6d,e0d059e32c
115594,174367,2017-07-04 17:10:07,48843.0,The Amazing Spider-Man,"Action, Adventure, Sci-Fi",2012-07-03,91ca994ce3,e0d059e32c
116839,175612,2017-07-06 07:19:30,0.0,The Amazing Spider-Man,"Action, Adventure, Sci-Fi",2012-07-03,91ca994ce3,e0d059e32c
117260,176033,2017-07-06 20:04:59,0.0,A Good Marriage,"Crime, Drama, Thriller",2014-10-03,333b94a355,e0d059e32c
117994,176767,2017-07-07 20:58:46,0.0,What the Health,Documentary,2017-03-07,40e7fed935,e0d059e32c


In [80]:
df.loc[df.user_id==df_lsh.user_id.iloc[99427]]

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
537078,595851,2019-01-16 13:50:30,1069.0,Black Mirror: Bandersnatch,"Drama, Mystery, Sci-Fi, Thriller",2018-12-28,e847f14da5,9d7fd44f17
537481,596254,2019-01-16 08:17:58,0.0,Black Mirror: Bandersnatch,"Drama, Mystery, Sci-Fi, Thriller",2018-12-28,e847f14da5,9d7fd44f17
537573,596346,2019-01-16 08:38:51,5113.0,Black Mirror: Bandersnatch,"Drama, Mystery, Sci-Fi, Thriller",2018-12-28,e847f14da5,9d7fd44f17
537596,596369,2019-01-16 22:24:40,0.0,Prisoners,"Crime, Drama, Mystery, Thriller",2013-09-20,603b1a4226,9d7fd44f17
537697,596470,2019-01-16 12:44:04,3986.0,Black Mirror: Bandersnatch,"Drama, Mystery, Sci-Fi, Thriller",2018-12-28,e847f14da5,9d7fd44f17
537878,596651,2019-01-17 04:40:58,600.0,Prisoners,"Crime, Drama, Mystery, Thriller",2013-09-20,603b1a4226,9d7fd44f17
540195,598968,2019-01-19 15:21:50,4567.0,Prisoners,"Crime, Drama, Mystery, Thriller",2013-09-20,603b1a4226,9d7fd44f17
542587,601360,2019-01-21 22:14:51,0.0,Pokémon the Movie: I Choose You!,"Animation, Action, Adventure, Family, Fantasy",2017-11-05,5fd3acfff3,9d7fd44f17
548491,607264,2019-01-28 11:35:30,11324.0,Reincarnated,"Documentary, Music",2013-03-22,1c0dacfba3,9d7fd44f17
550024,608797,2019-01-30 23:15:28,0.0,The Legend of 420,"Documentary, Comedy",2017-09-20,1f02514a0d,9d7fd44f17


In [61]:
a = set(df.loc[df.user_id==df_lsh.user_id.iloc[99427]].title.tolist()) 
b = set(df.loc[df.user_id==df_lsh.user_id.iloc[99427]].title.tolist())
a.intersection(b)


{"The Hitman's Bodyguard"}

In [65]:
a = df.loc[df.user_id==df_lsh.user_id.iloc[161649]]

In [66]:
b = df.loc[df.user_id==df_lsh.user_id.iloc[161735]]

In [70]:
combined_df = pd.concat([a,b])

In [99]:
common_movies = combined_df.groupby(by='title').agg({'user_id': 'count'}).reset_index().sort_values(by='user_id', ascending=False).title.tolist()

In [100]:
common_movies

["The Hitman's Bodyguard"]