In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
file_path = '/content/drive/MyDrive/MyDatasetFolder/vodclickstream_uk_movies_03.csv'

In [3]:
# Load the dataframe
df_movies = pd.read_csv(file_path)

# Exercise 1

## 1.1 Gather the title and genre of the maximum top 10 movies that each user clicked on regarding the number of clicks.

In [14]:
# Create a vairable named 'movie counts'where I group by 'user_id' and 'movie_id', and count occurrences
movie_counts = df_movies.groupby(['user_id', 'movie_id']).size().reset_index(name='count')

# Rank movies for each user group and filter for top 10 by count
movie_counts['rank'] = movie_counts.groupby('user_id')['count'].rank(ascending=False, method='first')
top_movies = movie_counts[movie_counts['rank'] <= 10]

# Merge with original DataFrame for movie details and select relevant columns 'user_id', 'count', 'movie_id', 'title', 'genres'
df_top10 = (top_movies.merge(df_movies, on=['user_id', 'movie_id'])
            [['user_id', 'count', 'movie_id', 'title', 'genres']])

# Drop duplicate rows, this is because it showed all the occurrance of each click.
df_top10 = df_top10.drop_duplicates(subset=['user_id', 'movie_id'])

df_top10


Unnamed: 0,user_id,count,movie_id,title,genres
0,00004e2862,1,9bfee795ff,Hannibal,"Crime, Drama, Thriller"
1,000052a0a0,1,43bb6de314,Green Room,"Horror, Music, Thriller"
2,000052a0a0,9,4718f9963c,Looper,"Action, Drama, Sci-Fi, Thriller"
11,000052a0a0,3,4fa0b092d6,Jumanji,"Adventure, Comedy, Family, Fantasy"
14,000052a0a0,1,59d313ed8b,Ant-Man,"Action, Adventure, Comedy, Sci-Fi"
...,...,...,...,...,...
613388,fffeac83be,2,cb6d7afd76,Amanda Knox,"Documentary, Crime"
613390,fffeac83be,1,dda0eae17b,Enemy at the Gates,"Drama, History, War"
613391,ffff2c5f9e,1,6467fee6b6,Hot Fuzz,"Action, Comedy, Mystery, Thriller"
613392,ffff2c5f9e,1,9ab62a3f2c,Forks Over Knives,Documentary


## 1.2 Minhash Signatures

---

### Create **mapping** of each genre and its associated id

---


This is useful because I can later on work with more simplicity when associating genres to movies

In [16]:
# First I deleted genre 'NOT AVAILABLE' because it would have messed the outcomes. Two users having NOT AVAILABLE in common are not more likely to have similar preferences in genres.
df_movies_filtered = df_movies[df_movies['genres'] != 'NOT AVAILABLE'].copy()

# make a column in df_movies_filtered named 'genres_list'
df_movies_filtered['genres_list'] = df_movies_filtered.genres.apply(lambda row: [word.strip() for word in row.split(',')])

# Initialize unique_genres var
unique_genres = set()

# fill unique_genres
df_movies_filtered['genres_list'].apply(lambda row: [unique_genres.add(value) for value in row])

# Convert unique_genres to a dictionary with IDs
mapping = {genre: genre_id for genre_id, genre in enumerate(unique_genres)}

print(mapping)


{'Drama': 0, 'Romance': 1, 'News': 2, 'History': 3, 'War': 4, 'Animation': 5, 'Mystery': 6, 'Musical': 7, 'Thriller': 8, 'Crime': 9, 'Music': 10, 'Fantasy': 11, 'Adventure': 12, 'Short': 13, 'Comedy': 14, 'Reality-TV': 15, 'Sport': 16, 'Action': 17, 'Sci-Fi': 18, 'Talk-Show': 19, 'Family': 20, 'Horror': 21, 'Film-Noir': 22, 'Western': 23, 'Biography': 24, 'Documentary': 25}


### Make a list of genres watched by each user_id and associate it to the mapping

---



In [8]:
# Initialize dictionary named user_genres in which we will have key = user_id and value = list of unique genres watched
user_genres = {}

# Iterate through each row in df_movies_filtered
for index, row in df_movies_filtered.iterrows():
    # Extract the user ID and genres list from the row which is currently iterating
    user_id = row['user_id']
    genres_list = row['genres_list']

    # Check if user ID is not already in user_genres
    if user_id not in user_genres:
        # If not, add user ID as the key and the value is a list of unique genre IDs ensuring that the genre is a string and removing any duplicates
        user_genres[user_id] = list(set(mapping[genre.strip()] for genre in genres_list if isinstance(genre, str)))
    else:
        # If the user ID already exists in the dictionary, extend the existing genre list with new genre IDs from the current row, avoiding duplicates
        user_genres[user_id].extend(
            genre_id for genre_id in set(mapping[genre.strip()] for genre in genres_list if isinstance(genre, str))
            if genre_id not in user_genres[user_id]
        )

# Print the first 5 entries in user_genres to show it works
for user_id, genres_list in list(user_genres.items())[:5]:
    print(f"User ID: {user_id}, Genres List: {genres_list}")


User ID: 1dea19f6fe, Genres List: [0, 1, 14]
User ID: 544dcbc510, Genres List: [8, 11, 21, 6, 1, 14, 0, 12]
User ID: 7cbcc791bf, Genres List: [8, 17, 9, 12, 14, 1, 5, 11, 20]
User ID: ebf43c36b6, Genres List: [0, 17, 5, 11, 12, 20, 8, 9, 24, 18, 14]
User ID: a57c992287, Genres List: [5, 11, 12, 14, 17, 20, 0, 8, 6, 18, 24, 4, 16, 9, 13, 10, 21, 3, 23, 25]


## Create Minhash Signature

In [20]:
import random

# Here a make permutations of the order of 0 to 25. We wil the associate this order to our mapping
n_permutations = 100
min_value = 0
max_value = 25

# Initialize minhash_signatures in which we will record the first value for each permutation which is inlcuded in the user set
minhash_signatures = {}
permutations = [random.sample(range(min_value, max_value + 1), max_value + 1) for _ in range(n_permutations)]
print(permutations, "/n")

# Iterate over the range of permutations
for n in range(1, n_permutations + 1):
    minhash_signatures[n] = []  # Initialize an empty list for each permutation in the minhash_signatures where n is the n of the permutation

    for user_id, genres_list in user_genres.items():
        # Check if any value in the current permutation is in the user's genres list
        for i, permutation in enumerate(permutations[n - 1]):
            if permutation in genres_list:
                # If found, append the user ID and the matching permutation value to the signatures and break so that we will get the first genre to appear in the permutations which has been seen by the user
                minhash_signatures[n].append((user_id, permutation))
                break

# minhash_ginature is too big I will only show a snippet
for n in range(1, 5):
    if minhash_signatures[n]:
        print(f"minhash_signatures[{n}]: {minhash_signatures[n][:3]}")
    else:
        print(f"minhash_signatures[{n}]: No matches found")


[[18, 10, 13, 21, 25, 14, 12, 15, 19, 23, 16, 8, 7, 22, 5, 3, 11, 9, 0, 6, 20, 17, 4, 24, 2, 1], [13, 2, 15, 12, 6, 3, 14, 22, 19, 11, 21, 24, 7, 16, 25, 0, 4, 1, 10, 17, 20, 9, 8, 18, 5, 23], [2, 13, 11, 1, 12, 7, 9, 5, 18, 14, 15, 16, 24, 8, 10, 19, 0, 20, 3, 21, 22, 17, 25, 23, 6, 4], [1, 25, 7, 6, 4, 14, 2, 9, 8, 20, 13, 11, 22, 16, 21, 24, 12, 15, 23, 18, 19, 5, 10, 3, 17, 0], [7, 11, 0, 1, 18, 4, 2, 14, 9, 5, 23, 21, 25, 8, 3, 13, 12, 20, 6, 15, 17, 10, 24, 19, 16, 22], [23, 6, 3, 2, 24, 4, 17, 19, 9, 22, 25, 11, 21, 12, 8, 16, 20, 10, 18, 15, 5, 0, 14, 7, 1, 13], [24, 4, 10, 14, 1, 18, 21, 7, 5, 13, 22, 11, 25, 15, 8, 23, 19, 20, 2, 12, 9, 17, 0, 16, 3, 6], [1, 18, 23, 21, 2, 8, 7, 6, 15, 25, 24, 14, 22, 17, 9, 20, 11, 16, 0, 10, 19, 4, 12, 3, 5, 13], [6, 15, 0, 22, 14, 16, 23, 10, 8, 5, 17, 21, 3, 11, 9, 12, 13, 20, 4, 24, 2, 1, 25, 19, 7, 18], [17, 14, 8, 7, 15, 13, 18, 19, 12, 11, 3, 24, 9, 5, 6, 0, 2, 4, 16, 22, 20, 21, 10, 1, 23, 25], [23, 0, 14, 16, 1, 24, 17, 19, 8, 12, 2

It is correct, user id 1dea19f6fe watches "Drama" (0), "Romance" (1) and "Commedy" (14). As you can see in the first permutation it gets 14 as it's the first genre that appears, also on the second permutation while in the third one the genre 1 (Romance) appears first.

## 1.3 Locality-Sensitive Hashing (LSH)

In [21]:
input_user_id = 'a57c992287'  # user id to comapre

input_user_signature = [] # signature of input id
other_users_signatures = {} # signatures of other user id

# Iterate over each permutation's signatures in minhash_signatures
for _, tuples in minhash_signatures.items():
  for user_id, signature in tuples:
    # Check if the user_id matches our input user
    if user_id == input_user_id:
      # add the signature to the input user's signature list
      input_user_signature.append(signature)
    else:
      # Check if the user_id is already in other_users_signatures
      if user_id not in other_users_signatures:
        other_users_signatures[user_id] = []
      # Append the signature to the existing list for the user
      other_users_signatures[user_id].append(signature)

# Calculate the similarity score for each user how many times the got the same minhash value / total number of permutations
similarity_scores = {}

for user_id, other_user_signature in other_users_signatures.items():
    count_common_values = sum(1 for i, j in zip(other_user_signature, input_user_signature) if i == j)
    similarity_score = count_common_values / len(input_user_signature)
    similarity_scores[user_id] = similarity_score

# Sort similarity_scores by values in descending order
sorted_similarity_scores = sorted(similarity_scores.items(), key=lambda x: x[1], reverse=True)

# Top 2 users
top_2_users = sorted_similarity_scores[:2]
top_2_users_id = [item[0] for item in top_2_users] # Variable which we will use later. List of user_id

print("Top 2 Similarity Scores:")
for user_id, similarity_score in top_2_users:
    print(f"User ID: {user_id}, Similarity Score: {similarity_score}")



Top 2 Similarity Scores:
User ID: f76ecc2439, Similarity Score: 0.95
User ID: ec9f1ee809, Similarity Score: 0.95


In [22]:
# make a filtered df for user in the top_2_users_id list called similar_movies_clicks
similar_movies_clicks = df_movies[df_movies['user_id'].isin(top_2_users_id)]

# count occurrances
movie_id_counts_similar = similar_movies_clicks.groupby(['user_id', 'movie_id', 'title']).size().reset_index(name='count')

# I chose to use sets in order to use the intersection functionlater
user_a = set()
user_b = set()

# movies whatched by a in user_a
filtered_df_a = movie_id_counts_similar[movie_id_counts_similar['user_id'] == top_2_users_id[0]]
user_a.update(filtered_df_a['movie_id'].unique())

# movies whatched by b in user_b
filtered_df_b = movie_id_counts_similar[movie_id_counts_similar['user_id'] == top_2_users_id[1]]
user_b.update(filtered_df_b['movie_id'].unique())

# use the intersection function to see those in common between a and b
similar_common_movies = list(user_a.intersection(user_b))
similar_noncommon_movies = list(user_a.union(user_b))

# list in descending order by click the movies seen by both a and b and those watched indiviually by them
result_common = movie_id_counts_similar[movie_id_counts_similar['movie_id'].isin(similar_common_movies)].groupby(['title']).agg({'count': 'sum'}).reset_index().sort_values(by='count', ascending=False)
result_noncommon = movie_id_counts_similar[movie_id_counts_similar['movie_id'].isin(similar_noncommon_movies)].sort_values(by='count', ascending=False)[['title', 'count']]

# concatenate so that we will first see the ones in result_common (seen by both) and then the once which they did not both see
concatenated_result = pd.concat([result_common, result_noncommon])

# show top 5 movies
concatenated_result.head(5)


Unnamed: 0,title,count
59,Nerdland,10
42,Exodus: Gods and Kings,5
25,The Avengers,4
16,Star Wars: The Force Awakens,4
28,Arthur Christmas,4
