**1. Recommendation System**

Imports

In [11]:
import pandas as pd
import csv
import numpy as np
from scipy.spatial.distance import cosine
from random import randint


Build dataframe from csv file

In [2]:
moviesDF = pd.read_csv(r"C:\Users\Elias Antoun\Documents\ADM_HW4_Group3\vodclickstream_uk_movies_03.csv")

# misc pre-handling optimization
moviesDF['datetime'] = pd.to_datetime(moviesDF['datetime'])

moviesDF

Unnamed: 0.1,Unnamed: 0,datetime,duration,title,genres,release_date,movie_id,user_id
0,58773,2017-01-01 01:15:09,0.0,"Angus, Thongs and Perfect Snogging","Comedy, Drama, Romance",2008-07-25,26bd5987e8,1dea19f6fe
1,58774,2017-01-01 13:56:02,0.0,The Curse of Sleeping Beauty,"Fantasy, Horror, Mystery, Thriller",2016-06-02,f26ed2675e,544dcbc510
2,58775,2017-01-01 15:17:47,10530.0,London Has Fallen,"Action, Thriller",2016-03-04,f77e500e7a,7cbcc791bf
3,58776,2017-01-01 16:04:13,49.0,Vendetta,"Action, Drama",2015-06-12,c74aec7673,ebf43c36b6
4,58777,2017-01-01 19:16:37,0.0,The SpongeBob SquarePants Movie,"Animation, Action, Adventure, Comedy, Family, ...",2004-11-19,a80d6fc2aa,a57c992287
...,...,...,...,...,...,...,...,...
671731,730504,2019-06-30 21:37:08,851.0,Oprah Presents When They See Us Now,Talk-Show,2019-06-12,43cd23f30f,57501964fd
671732,730505,2019-06-30 21:49:34,91157.0,HALO Legends,"Animation, Action, Adventure, Family, Sci-Fi",2010-02-16,febf42d55f,d4fcb079ba
671733,730506,2019-06-30 22:00:44,0.0,Pacific Rim,"Action, Adventure, Sci-Fi",2013-07-12,7b15e5ada1,4a14a2cd5a
671734,730507,2019-06-30 22:04:23,0.0,ReMastered: The Two Killings of Sam Cooke,"Documentary, Music",2019-02-08,52d49c515a,0b8163ea4b


**1.1** Title and Genre of top movies that the user _clicked on_

In [10]:
# Initial grouping
u_m = ['user_id', 'movie_id']

# Break process into chunks, otherwise takes 15+ minutes to run
chunk_size = 10000

top_10DF = pd.DataFrame(columns=['user_id', 'title', 'genres', 'click_count']) #initialize empty "result" dataframe


for chunk_start in range(0, len(moviesDF), chunk_size):
    chunk_end = min(chunk_start + chunk_size, len(moviesDF)) 
    chunk = moviesDF.iloc[chunk_start:chunk_end]   # to delimit chunk

    # Group by 'user_id', 'movie_id', and count the clicks for the chunk
    u_m_c = chunk.groupby(u_m).size().reset_index(name='click_count')

    # Use nlargest to get the top 10 movies for each user in the chunk
    top_mov_user = ( u_m_c.groupby('user_id', group_keys=False)
        .apply(lambda group: group.nlargest(10, 'click_count'))
        .reset_index(drop=True))

    # Merge with the original dataframe to get movie details: title and genres
    result = pd.merge(top_mov_user, chunk[['movie_id', 'title', 'genres', 'datetime']], on='movie_id', how='left')

    # Some movies were showing up as duplicate entries due to the user id and movie id being the same but on different dates,
    #  so to avoid duplicates and wrong counts in the final output:
    result['click_count'] = result.groupby(['user_id', 'movie_id', 'title', 'genres'])['click_count'].transform('sum')
    result = result.drop_duplicates(subset=['user_id', 'movie_id', 'title', 'genres', 'click_count'])

    # Append chunk result to final result
    top_10DF = pd.concat([top_10DF, result], ignore_index=True)

top_10DF


Unnamed: 0,user_id,title,genres,click_count,movie_id,datetime
0,0005d9a8f4,Joe and Caspar Hit the Road,"Documentary, Adventure, Comedy",27,416464eaad,2017-01-01 11:05:46
1,001991be8a,Star Trek: First Contact,"Action, Adventure, Drama, Sci-Fi, Thriller",8,dfd60c5a87,2017-01-06 15:27:56
2,0029f6bb1e,Jackass Presents: Bad Grandpa,Comedy,12,03a064a477,2017-01-01 21:00:34
3,0029f6bb1e,The Drop,"Crime, Drama, Thriller",25,0fa2d624f1,2017-01-03 11:18:01
4,0029f6bb1e,"Big Mommas: Like Father, Like Son","Action, Comedy, Crime",5,135b083a96,2017-01-01 22:54:57
...,...,...,...,...,...,...
501118,fef1ee2eb5,Shaft,"Action, Comedy, Crime",189,85e4f0af7b,2019-06-28 09:52:54
501119,fef1ee2eb5,Pirates of the Caribbean: Dead Men Tell No Tales,"Action, Adventure, Fantasy",5,8c8cc2dc08,2019-06-29 23:31:09
501120,ff4a416d0c,Transformer: The Last Knight,"Action, Adventure, Sci-Fi",18,47a9573b47,2019-06-29 18:46:36
501121,ffa9cd1dfc,"Dirty John, The Dirty Truth","Documentary, Crime",2,cf710f153d,2019-06-29 02:08:26


**1.2** Minhash Signatures

In [15]:
# Prep the genres column to be used with the hash function (convert it to a set)
def convert_to_set(genres):
    if pd.notna(genres):
        return set(genres.split(', '))
    else:
        return set()
    
top_10DF['genres_set'] = top_10DF['genres'].apply(convert_to_set)

In [22]:

# Function to calculate MinHash signature
def simple_hash(value, seed, max_value):
    return (value + seed) % max_value

def minhash_signature(genres_set, num_hashes, max_value):
    minhash_signature = np.full(num_hashes, np.inf)

    for genre in genres_set:
        hashes = [simple_hash(hash(genre), seed, max_value) for seed in range(num_hashes)]
        minhash_signature[:num_hashes] = np.minimum(minhash_signature[:num_hashes], hashes)

    return minhash_signature

# Parameters for MinHash
num_hashes = 100
max_value = 1000

# Create MinHash signatures for each user
top_10DF['minhash_signature'] = top_10DF['genres_set'].apply(minhash_signature, args=(num_hashes, max_value))

# Function to create buckets based on MinHash signatures
def create_buckets(df, threshold=0.8):
    buckets = {}
    for index, row in df.iterrows():
        user_id = row['user_id']
        signature = row['minhash_signature']

        # Find buckets with similar signatures
        similar_buckets = [key for key, value in buckets.items() if np.sum(value == signature) / num_hashes >= threshold]

        if not similar_buckets:
            # Create a new bucket if no similar buckets are found
            buckets[user_id] = signature
        else:
            # Add the user to the first similar bucket
            first_bucket = similar_buckets[0]
            buckets[first_bucket] = np.concatenate((buckets[first_bucket], signature), axis=None)

    return buckets

# Create buckets
buckets = create_buckets(top_10DF)

# Print the contents of the buckets
for key, value in buckets.items():
    print(f"Bucket {key}: {len(value)} users")

ValueError: operands could not be broadcast together with shapes (200,) (100,) 