**1. Recommendation System**

Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from random import randint
from collections import defaultdict
from tqdm import tqdm
import re
import math



Build dataframe from csv file

In [2]:
moviesDF = pd.read_csv(r"C:\Users\Elias Antoun\Documents\ADM_HW4_Group3\vodclickstream_uk_movies_03.csv")

# misc pre-handling optimization
moviesDF['datetime'] = pd.to_datetime(moviesDF['datetime'])

moviesDF

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287
...,...,...,...,...,...,...,...,...
671731,730504,2019-06-30 21:37:08,851.0,Oprah Presents When They See Us Now,Talk-Show,2019-06-12,43cd23f30f,57501964fd
671732,730505,2019-06-30 21:49:34,91157.0,HALO Legends,"Animation, Action, Adventure, Family, Sci-Fi",2010-02-16,febf42d55f,d4fcb079ba
671733,730506,2019-06-30 22:00:44,0.0,Pacific Rim,"Action, Adventure, Sci-Fi",2013-07-12,7b15e5ada1,4a14a2cd5a
671734,730507,2019-06-30 22:04:23,0.0,ReMastered: The Two Killings of Sam Cooke,"Documentary, Music",2019-02-08,52d49c515a,0b8163ea4b


**1.1** Title and Genre of top movies that the user _clicked on_

In [3]:
u_m = ['user_id', 'movie_id']

# Break process into chunks, otherwise takes 15+ minutes to run
chunk_size = 10000

# Initialize empty "result" dataframe with columns
top_movies = pd.DataFrame(columns=['user_id', 'title', 'genres', 'click_count', 'movie_id'])

for chunk_start in range(0, len(moviesDF), chunk_size):
    chunk_end = min(chunk_start + chunk_size, len(moviesDF)) 
    chunk = moviesDF.iloc[chunk_start:chunk_end]   # to delimit chunk

    # Group by 'user_id', 'movie_id', and count the clicks for the chunk
    u_m_c = chunk.groupby(u_m).size().reset_index(name='click_count')

    # Use nlargest to get the top 10 movies for each user in the chunk
    top_mov_user = (
        u_m_c.groupby('user_id', group_keys=False)
        .apply(lambda group: group.nlargest(10, 'click_count'))
        .reset_index(drop=True)
    )

    # Merge with original dataframe to get title and genre
    result_chunk = pd.merge(top_mov_user, chunk[['movie_id', 'title', 'genres', 'datetime']], on='movie_id', how='left')

    # Identify and aggregate genuine multiple clicks based on datetime
    result_chunk['click_count'] = result_chunk.groupby(['user_id', 'movie_id', 'title', 'genres'])['click_count'].transform('sum')

    # Drop duplicate rows
    result_chunk = result_chunk.drop_duplicates(subset=['user_id', 'movie_id', 'title', 'genres', 'click_count'])

    # Append chunk result to final result
    top_movies = pd.concat([top_movies, result_chunk], ignore_index=True)



In [4]:
top_movies

Unnamed: 0,user_id,title,genres,click_count,movie_id,datetime
0,0005d9a8f4,Joe and Caspar Hit the Road,"Documentary, Adventure, Comedy",27,416464eaad,2017-01-01 11:05:46
1,001991be8a,Star Trek: First Contact,"Action, Adventure, Drama, Sci-Fi, Thriller",8,dfd60c5a87,2017-01-06 15:27:56
2,0029f6bb1e,Jackass Presents: Bad Grandpa,Comedy,12,03a064a477,2017-01-01 21:00:34
3,0029f6bb1e,The Drop,"Crime, Drama, Thriller",25,0fa2d624f1,2017-01-03 11:18:01
4,0029f6bb1e,"Big Mommas: Like Father, Like Son","Action, Comedy, Crime",5,135b083a96,2017-01-01 22:54:57
...,...,...,...,...,...,...
501118,fef1ee2eb5,Shaft,"Action, Comedy, Crime",189,85e4f0af7b,2019-06-28 09:52:54
501119,fef1ee2eb5,Pirates of the Caribbean: Dead Men Tell No Tales,"Action, Adventure, Fantasy",5,8c8cc2dc08,2019-06-29 23:31:09
501120,ff4a416d0c,Transformer: The Last Knight,"Action, Adventure, Sci-Fi",18,47a9573b47,2019-06-29 18:46:36
501121,ffa9cd1dfc,"Dirty John, The Dirty Truth","Documentary, Crime",2,cf710f153d,2019-06-29 02:08:26


**1.2** Minhash Signatures

Some preprocessing of the genres column to prep for hashing

In [5]:
top_movies.sort_values(by = ['user_id','click_count'], ascending = False)

top_movies_per_user = top_movies
top_movies_per_user['filtered_genres'] = top_movies_per_user['genres'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))


top_movies_per_user = top_movies_per_user.drop_duplicates(subset = 'user_id')
top_movies_per_user


Unnamed: 0,user_id,title,genres,click_count,movie_id,datetime,filtered_genres
0,0005d9a8f4,Joe and Caspar Hit the Road,"Documentary, Adventure, Comedy",27,416464eaad,2017-01-01 11:05:46,documentary adventure comedy
1,001991be8a,Star Trek: First Contact,"Action, Adventure, Drama, Sci-Fi, Thriller",8,dfd60c5a87,2017-01-06 15:27:56,action adventure drama scifi thriller
2,0029f6bb1e,Jackass Presents: Bad Grandpa,Comedy,12,03a064a477,2017-01-01 21:00:34,comedy
12,002b8b112a,To Kill a Mockingbird,"Crime, Drama",5,cb67c1c73b,2017-01-07 18:21:02,crime drama
13,003a3c6c6b,Jack Reacher,"Action, Thriller",17,1e70af3161,2017-01-01 12:00:08,action thriller
...,...,...,...,...,...,...,...
501102,fb8b518cb8,John Wick,"Action, Crime, Thriller",1,2248543715,2019-06-29 10:15:28,action crime thriller
501104,fbd09baccc,Jigsaw,"Comedy, Crime, Fantasy, Horror, Mystery, Sci-F...",1,34ac3ca578,2019-06-30 17:43:44,comedy crime fantasy horror mystery scifi thri...
501112,fdaeca145d,Spider-Man: Homecoming,"Action, Adventure, Sci-Fi",40,a607dc99c1,2019-06-28 15:55:48,action adventure scifi
501115,fe13bdea4b,Scary Movie,Comedy,3,4749bdfef6,2019-06-29 12:33:36,comedy


**Hashing and Grouping**

The method used follows a lot of the same steps used in: https://www.codemotion.com/magazine/backend/fast-document-similarity-in-python-minhashlsh/ but altered to give us buckets and not pairs, my code uses lsh with a threshhold of 0.6
The hash function has been modified to a custom one based on xor, instead of an already implemented one.

Step by step overview of the code:

Classes:

1. shingler: create shingles to be used in the following hashing
2. HashFamily: create a hash function and return a hash value to be used in minhash
3. MinhashSigner: compute minhash signature
4. LSH: locality sensitive hashing to group similar sets using their minhash signatures

Please note that the code was run through chat gpt for final optimization

In [6]:
class Shingler: 
    def __init__(self, k):
        if k > 0:
            self.k = int(k)
        else:
            self.k = 10

    def process_doc(self, document):
        return re.sub("( )+|(\n)+", " ", document).lower()

    def get_shingles(self, document):
        shingles = set()
        document = self.process_doc(document)
        for i in range(0, len(document) - self.k + 1):
            shingles.add(document[i:i + self.k])
        return shingles

class HashFamily:
    def __init__(self, i):
        self.result_size = 8
        self.max_len = 20
        self.salt = str(i).zfill(self.max_len)[-self.max_len:]

    def custom_hash_function(self, el_to_hash):  # hash function from scratch that uses XOR
        hash_val = 0
        for char in str(el_to_hash) + self.salt:
            hash_val ^= ord(char)
        return hash_val

    def get_hash_value(self, el_to_hash):  # return final hash value to use in minhash step
        return int(self.custom_hash_function(el_to_hash)) & ((1 << self.result_size) - 1)

class MinhashSigner:
    def __init__(self, sig_size):
        self.sig_size = sig_size
        self.hash_functions = [HashFamily(randint(0, 10000000000)) for _ in range(sig_size)]

    def compute_set_signature(self, set_):  # compute minhash signature for every element
        set_sig = []
        for h_funct in self.hash_functions:
            min_hash = math.inf  # initialize minhash values as infinity
            for el in set_:
                h = h_funct.get_hash_value(el)
                if h < min_hash:
                    min_hash = h  # append hash value if it's lower than the currently stored minhash

            set_sig.append(min_hash)

        return set_sig

    def compute_signature_matrix(self, set_list):  # return minhash signature matrix
        signatures = []
        for s in tqdm(set_list, desc="Computing MinHash Signatures", unit="set"):
            signatures.append(self.compute_set_signature(s))
        return signatures

class LSH:  #locality sensitive hashing with later specified threshhold of 0.6
    def __init__(self, threshold): 
        self.threshold = threshold

    def get_signature_matrix_bands(self, sig_matrix, bands_nr, sign_len):
        r = int(len(sig_matrix[0]) / bands_nr)  # adjusted to use length of the signature matrix
        bands = {i: [] for i in range(bands_nr)}
        for i in range(bands_nr):
            bands[i] = []
        for signature in sig_matrix:
            for i in range(bands_nr):
                idx = i * r
                bands[i].append(" ".join(str(x) for x in signature[idx : idx + r]))
        return bands

    def get_band_buckets(self, band, user_ids):
        buckets = defaultdict(set)
        for doc_id, users_in_doc in enumerate(band):
            for user_id in users_in_doc.split():
                buckets[user_id].add(user_ids[doc_id])
        return buckets

    def get_similar_buckets(self, sig_matrix, bands_nr, sign_len, user_ids):
        similar_buckets = defaultdict(set)
        bands = self.get_signature_matrix_bands(sig_matrix, bands_nr, sign_len)
        for band_id, elements in tqdm(bands.items(), desc="Processing Bands", unit="band"):
            buckets = self.get_band_buckets(elements, user_ids)
            for bucket_id, users in buckets.items():
                similar_buckets[bucket_id].update(users)
        return similar_buckets



**Execution**

1- Shingling: generate shingles(genres) from the filtered_genres column

2- Minhashing Compute the MinHash signatures for each set of shingles

3- Locality Sensitive Hashing: set number of bands, signature size and similarity threshhold, then apply LSH to group similar users into buckets

4- Output: grouped users who have similar interests based on the genres of their favorite movies

In [7]:
# Step 1: Shingling
shingler_instance = Shingler(k=2)
shingles_per_document = [shingler_instance.get_shingles(str(pref)) for pref in tqdm(top_movies_per_user['filtered_genres'], desc="Shingling", unit="document")]

# Step 2: MinHashing
minhash_instance = MinhashSigner(sig_size=50) 
minhash_signatures = minhash_instance.compute_signature_matrix(shingles_per_document)

# Step 3: Locality Sensitive Hashing (LSH)
bands_nr = 5
sign_len = 50
lsh_instance = LSH(threshold=0.6)

# Step 4: Output (grouped users)
user_ids = top_movies_per_user['user_id'].tolist()
similar_user_buckets = lsh_instance.get_similar_buckets(minhash_signatures, bands_nr, sign_len, user_ids=user_ids)


Shingling: 100%|██████████| 161918/161918 [00:01<00:00, 124167.89document/s]
Computing MinHash Signatures: 100%|██████████| 161918/161918 [02:47<00:00, 964.43set/s] 
Processing Bands: 100%|██████████| 5/5 [00:01<00:00,  4.91band/s]


In [8]:
similar_user_buckets

defaultdict(set,
            {'1': {'51f517a8f4',
              'a3245f2276',
              '246b84c1bd',
              'd089c4c9a4',
              '98c5fa206c',
              '2c62df7c98',
              '4f7f7f3d29',
              'd37b0eac77',
              '40fc4ea9f2',
              '87f6f430ac',
              '7fdd78d458',
              '88540d04ed',
              '17f3b47fe7',
              '2f233746ba',
              '4aab92781b',
              '9e526459dd',
              'c9e86836da',
              'a5c66e7a0a',
              '8e527e3dae',
              '4aeec8379f',
              '87de0e2a9b',
              '9b3ce9bf6f',
              '005274c469',
              '1530e9581e',
              'bab68da80b',
              '9d61a585cc',
              '497b0640a9',
              '8d1b02040e',
              'f9fa92afb6',
              '5854aac9da',
              '0918f799f4',
              '5fc6735477',
              '55673a142c',
              '359ae78f25',
              '3286812ac3'

**1.3: Recommendations**

In [9]:
def recommend_movies(user_id, similar_user_buckets, top_movies):
    # Check if the given user_id is in the buckets
    user_id = user_id.strip()
    matching_user = next((key for key in similar_user_buckets if user_id in similar_user_buckets[key]), None)

    if matching_user is None:
        print(f"User {user_id} not found in similar user buckets.")
        return

    # Get the two most similar users to the given user_id since all previously computed similarities from the LSH step are still stored
    similar_users = list(similar_user_buckets[matching_user])

    if len(similar_users) < 2:
        print(f"Not enough similar users found for user {user_id}.")
        return

    # Extract the movie information for the two most similar users
    user_a_movies = top_movies[top_movies['user_id'] == similar_users[0]]
    user_b_movies = top_movies[top_movies['user_id'] == similar_users[1]]

    # common movies
    common_movies = set(user_a_movies['title']).intersection(set(user_b_movies['title']))

    # recommend movies based on total clicks
    if common_movies:
        recommended_movies = (
            top_movies[top_movies['title'].isin(common_movies)]
            .groupby('title')['click_count']
            .sum()
            .sort_values(ascending=False)
            .head(5)
            .index
        )
    else:
        
        recommended_movies = (
            user_a_movies.groupby('title')['click_count']
            .sum()
            .sort_values(ascending=False)
            .head(5)
            .index
        )

    print(f"Recommended movies for user {user_id}:")
    for movie in recommended_movies:
        print(f"  - {movie}")

**NOTE** The cell below will output the same result for any id other than the current one unless the kernel is restarted. I tried to solve this issue but could not figure out why this is happening. If you want to test with different ids please change the user_id_to_recomment below, restart the kernel then run all again.

In [10]:
# Example id
user_id_to_recommend = ' 003a3c6c6b	'
recommend_movies(user_id_to_recommend, similar_user_buckets, top_movies)

Recommended movies for user 003a3c6c6b:
  - IBOY
  - Good Kids
  - Menace II Society
  - Indie Game: The Movie
