**1. Recommendation System**

Imports

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cosine
from random import randint
from collections import defaultdict
from tqdm import tqdm
import re
import math



Build dataframe from csv file

In [2]:
moviesDF = pd.read_csv(r"C:\Users\Elias Antoun\Documents\ADM_HW4_Group3\vodclickstream_uk_movies_03.csv")

# misc pre-handling optimization
moviesDF['datetime'] = pd.to_datetime(moviesDF['datetime'])

moviesDF

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287
...,...,...,...,...,...,...,...,...
671731,730504,2019-06-30 21:37:08,851.0,Oprah Presents When They See Us Now,Talk-Show,2019-06-12,43cd23f30f,57501964fd
671732,730505,2019-06-30 21:49:34,91157.0,HALO Legends,"Animation, Action, Adventure, Family, Sci-Fi",2010-02-16,febf42d55f,d4fcb079ba
671733,730506,2019-06-30 22:00:44,0.0,Pacific Rim,"Action, Adventure, Sci-Fi",2013-07-12,7b15e5ada1,4a14a2cd5a
671734,730507,2019-06-30 22:04:23,0.0,ReMastered: The Two Killings of Sam Cooke,"Documentary, Music",2019-02-08,52d49c515a,0b8163ea4b


**1.1** Title and Genre of top movies that the user _clicked on_

In [3]:
# Drop duplicates from the Dataframe
moviesDF = moviesDF.drop_duplicates(subset=['user_id', 'movie_id', 'datetime'])

# Group data
aggregated_data = moviesDF.groupby(["user_id", "movie_id"]).agg({
    "title": "first",
    "datetime": "size",  # Rename this to click_count
    "genres": "first",
    "duration": "sum"
}).reset_index().rename(columns={"datetime": "click_count"})

# Sort and get top 10 movies for each user
top_movies_per_user = aggregated_data.sort_values(by=["user_id", "click_count", "duration"], ascending=[True, False, False]).groupby("user_id").head(10)

# Display the result
top_movies_per_user

Unnamed: 0,user_id,movie_id,title,click_count,genres,duration
0,00004e2862,9bfee795ff,Hannibal,1,"Crime, Drama, Thriller",0.0
2,000052a0a0,4718f9963c,Looper,3,"Action, Drama, Sci-Fi, Thriller",6736.0
6,000052a0a0,7314699c23,Frailty,3,"Crime, Drama, Thriller",5667.0
3,000052a0a0,4fa0b092d6,Jumanji,3,"Adventure, Comedy, Family, Fantasy",4387.0
5,000052a0a0,6275614f9a,Resident Evil,2,"Action, Horror, Sci-Fi",4279.0
...,...,...,...,...,...,...
502495,fffeac83be,5a650007e7,To the Bone,1,Drama,129.0
502499,fffeac83be,9e1f83ce16,Stop at Nothing: The Lance Armstrong Story,1,"Documentary, Biography, Sport",0.0
502504,ffff2c5f9e,6467fee6b6,Hot Fuzz,1,"Action, Comedy, Mystery, Thriller",0.0
502505,ffff2c5f9e,9ab62a3f2c,Forks Over Knives,1,Documentary,0.0


**1.2** Minhash Signatures

Some preprocessing of the genres column to prep for hashing

In [4]:
top_movies_per_user.sort_values(by = ['user_id','click_count'], ascending = False)
top_movies_per_user['filtered_genres'] = top_movies_per_user['genres'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))


top_movies_per_user = top_movies_per_user.drop_duplicates(subset = 'user_id')
top_movies_per_user


Unnamed: 0,user_id,movie_id,title,click_count,genres,duration,filtered_genres
0,00004e2862,9bfee795ff,Hannibal,1,"Crime, Drama, Thriller",0.0,crime drama thriller
2,000052a0a0,4718f9963c,Looper,3,"Action, Drama, Sci-Fi, Thriller",6736.0,action drama scifi thriller
12,000090e7c8,eb72fbc6ee,Mute,1,"Mystery, Sci-Fi, Thriller",0.0,mystery scifi thriller
14,000118a755,4c3d7b724e,From Dusk till Dawn (franchise),2,NOT AVAILABLE,0.0,not available
15,000296842d,e847f14da5,Black Mirror: Bandersnatch,5,"Drama, Mystery, Sci-Fi, Thriller",906.0,drama mystery scifi thriller
...,...,...,...,...,...,...,...
502492,fffd9bf758,ad9b7ae449,The Fear of 13,1,"Documentary, Crime, Mystery",8495.0,documentary crime mystery
502493,fffe7b777b,0fc283bc6d,The Circle,1,"Drama, Sci-Fi, Thriller",1785.0,drama scifi thriller
502502,fffeac83be,cb6d7afd76,Amanda Knox,2,"Documentary, Crime",88185.0,documentary crime
502504,ffff2c5f9e,6467fee6b6,Hot Fuzz,1,"Action, Comedy, Mystery, Thriller",0.0,action comedy mystery thriller


**Hashing and Grouping**

The method used follows a lot of the same steps used in: https://www.codemotion.com/magazine/backend/fast-document-similarity-in-python-minhashlsh/
The hash function has been modified to a custom one based on xor, instead of an already implemented one.

Step by step overview of the code:

Classes:

1. shingler: create shingles to be used in the following hashing
2. HashFamily: create a hash function and return a hash value to be used in minhash
3. MinhashSigner: compute minhash signature
4. LSH: locality sensitive hashing to group similar sets using their minhash signatures

In [5]:
class Shingler:
    def __init__(self, k):
        if k > 0:
            self.k = int(k)
        else:
            self.k = 10

    def process_doc(self, document):
        return re.sub("( )+|(\n)+", " ", document).lower()

    def get_shingles(self, document):
        shingles = set()
        document = self.process_doc(document)
        for i in range(0, len(document) - self.k + 1):
            shingles.add(document[i:i + self.k])
        return shingles

class HashFamily:
    def __init__(self, i):
        self.result_size = 8
        self.max_len = 20
        self.salt = str(i).zfill(self.max_len)[-self.max_len:]

    def custom_hash_function(self, el_to_hash):
        hash_val = 0
        for char in str(el_to_hash) + self.salt:
            hash_val ^= ord(char)
        return hash_val

    def get_hash_value(self, el_to_hash):
        return int(self.custom_hash_function(el_to_hash)) & ((1 << self.result_size) - 1)

class MinhashSigner:
    def __init__(self, sig_size):
        self.sig_size = sig_size
        self.hash_functions = [HashFamily(randint(0, 10000000000)) for _ in range(sig_size)]

    def compute_set_signature(self, set_):
        set_sig = []
        for h_funct in self.hash_functions:
            min_hash = math.inf
            for el in set_:
                h = h_funct.get_hash_value(el)
                if h < min_hash:
                    min_hash = h

            set_sig.append(min_hash)

        return set_sig

    def compute_signature_matrix(self, set_list):
        signatures = []
        for s in tqdm(set_list, desc="Computing MinHash Signatures", unit="set"):
            signatures.append(self.compute_set_signature(s))
        return signatures

class LSH:
    def __init__(self, threshold):
        self.threshold = threshold

    def get_signature_matrix_bands(self, sig_matrix, bands_nr, sign_len):
        r = int(len(sig_matrix[0]) / bands_nr)  # Adjusted to use length of the signature matrix
        bands = {i: [] for i in range(bands_nr)}
        for i in range(bands_nr):
            bands[i] = []
        for signature in sig_matrix:
            for i in range(bands_nr):
                idx = i * r
                bands[i].append(" ".join(str(x) for x in signature[idx : idx + r]))
        return bands

    def get_band_buckets(self, band, user_ids):
        buckets = defaultdict(set)
        for doc_id, users_in_doc in enumerate(band):
            for user_id in users_in_doc.split():
                buckets[user_id].add(user_ids[doc_id])
        return buckets

    def get_similar_buckets(self, sig_matrix, bands_nr, sign_len, user_ids):
        similar_buckets = defaultdict(set)
        bands = self.get_signature_matrix_bands(sig_matrix, bands_nr, sign_len)
        for band_id, elements in tqdm(bands.items(), desc="Processing Bands", unit="band"):
            buckets = self.get_band_buckets(elements, user_ids)
            for bucket_id, users in buckets.items():
                similar_buckets[bucket_id].update(users)
        return similar_buckets



**Execution**

1- Shingling: generate shingles(genres) from the filtered_genres column

2- Minhashing Compute the MinHash signatures for each set of shingles

3- Locality Sensitive Hashing: set number of bands, signature size and similarity threshhold, then apply LSH to group similar users into buckets

4- Output: grouped users who have similar interests based on the genres of their favorite movies

In [8]:
# Step 1: Shingling
shingler_instance = Shingler(k=2)
shingles_per_document = [shingler_instance.get_shingles(str(pref)) for pref in tqdm(top_movies_per_user['filtered_genres'], desc="Shingling", unit="document")]

# Step 2: MinHashing
minhash_instance = MinhashSigner(sig_size=50) 
minhash_signatures = minhash_instance.compute_signature_matrix(shingles_per_document)

# Step 3: Locality Sensitive Hashing (LSH)
bands_nr = 5
sign_len = 50
lsh_instance = LSH(threshold=0.5)

# Step 4: Output (grouped users)
user_ids = top_movies_per_user['user_id'].tolist()
similar_user_buckets = lsh_instance.get_similar_buckets(minhash_signatures, bands_nr, sign_len, user_ids=user_ids)

# Print the content of the first two buckets
for bucket_id, users in list(similar_user_buckets.items())[:2]:
    print(f"Bucket {bucket_id}: {users}")


Shingling: 100%|██████████| 161918/161918 [00:01<00:00, 119416.03document/s]
Computing MinHash Signatures: 100%|██████████| 161918/161918 [02:44<00:00, 981.92set/s] 
Processing Bands: 100%|██████████| 5/5 [00:00<00:00,  5.13band/s]

Bucket 3: {'b79be62244', 'e5878d8f2d', '790ee1dae0', '1386c7f78e', 'a5c9c06075', 'a5821a7ca7', 'f2fc7cf852', '0f4a3f40ad', 'd58e3aaa18', 'd2a381776c', 'b42639ec45', '42f7cfd99c', '3e5198d81c', 'c8b66cb432', 'b00942ae9a', '87e6f0b492', '3c9a82e26f', '526e1b62af', '36014cc3bc', 'd5eba30bb9', '246c777bcf', '9ea7d9be8f', '0c688262e5', '320125faf0', 'a8d794c76e', 'a4bf76a683', '8b18be85da', 'cbea6fbdb6', '8a7491f33f', '6f0e5728fa', '9473c073be', '1f3fe75ca8', '85ed0615ae', '0b3fbae193', '4d95ecb051', '6eb3e0b7fe', 'ba141951d7', '37747fd6c3', 'ae3c971964', 'd47e237797', '79a50fd340', '376363acba', 'f98372b4fe', '34a1a129ba', 'b0dbe2c291', 'bf419071ca', 'ed445406de', '743e4d73ed', 'f3369bb682', 'fa77ac3d8f', 'eea6afaddd', 'd62ac5b14e', '41139219ff', '14df713dd1', '389680d3e5', '4a440a8975', 'fa6a05722f', '911afc91ac', '0addb8d6ce', 'f198d2f93c', '1f6c8bdd68', '48fa489a69', '45d7a14a43', 'c1f4ab52c0', 'd78bb030dc', '563d4273bd', 'f40ecfd9e0', '36435c7e4f', 'b867d26bd1', 'fc38c07f2b', 'c7ee49e0




**1.3: Recommendations**