## The following notebook walks through the code of the algorithm step by step
<br/>
The code below downloads the dataset as a gzip file.

In [None]:
import urllib.request
# Local path the compressed dataset is saved to
filename = 'ratebeer.json.gz'
# Fetch the gzip archive from the given URL and store it at `filename`
urllib.request.urlretrieve('https://datarepo.eng.ucsd.edu/mcauley_group/data/beer/ratebeer.json.gz', filename) # Download Dataset

The code below extracts the gzipped JSON file into a regular .json file.

In [None]:
import gzip
import shutil

def unzip_gzip(input_file, output_file):
    """Decompress the gzip archive at ``input_file`` into ``output_file``.

    Both files are opened in binary mode and the decompressed bytes are
    copied from one file object to the other.
    """
    with gzip.open(input_file, 'rb') as compressed, open(output_file, 'wb') as plain:
        shutil.copyfileobj(compressed, plain)

# Decompress the downloaded archive into data.json for the parsing step
unzip_gzip('ratebeer.json.gz', 'data.json')

The code below reads the JSON file and stores it as a list of Python dictionaries.

In [None]:
import json

def json_to_jsonlines(input_file):
    """Parse a line-delimited review dump into a list of trimmed dictionaries.

    Every line of ``input_file`` is parsed on its own. Single quotes are
    swapped for double quotes before parsing so that lines written with
    single-quoted strings become valid JSON.

    Args:
        input_file: Path of the text file to read, one record per line.

    Returns:
        A list of dicts, in file order, with the keys
        ``"review/profileName"``, ``"rating"`` (taken from
        ``"review/overall"``), ``"beer/beerId"`` and ``"name"`` (taken from
        ``"beer/name"``). Lines that cannot be parsed, or that lack one of
        the required fields, are skipped.
    """
    records = []
    with open(input_file, 'r') as lines:
        for line in lines:
            try:
                # NOTE: the blanket quote swap also rewrites apostrophes
                # inside values, so such lines fail to parse and are skipped.
                data_dict = json.loads(line.replace("'", "\""))
                record = {
                    "review/profileName": data_dict["review/profileName"],
                    "rating": data_dict['review/overall'],
                    "beer/beerId": data_dict["beer/beerId"],
                    "name": data_dict["beer/name"],
                }
            except (ValueError, KeyError, TypeError):
                # Best-effort load: skip malformed or incomplete lines, but
                # let unrelated errors (e.g. KeyboardInterrupt) propagate.
                continue
            records.append(record)
    return records

# Parse the extracted file and keep the trimmed records in memory for the next cell
jsonHolder = json_to_jsonlines("data.json")

The code below takes the stored Python dictionaries and writes them out to a JSON Lines file (.jsonl), a basic JSON format with one entry per line.

In [None]:
def convert_to_jsonl(entries=None, output_file='data.jsonl'):
    """Write records to a JSON Lines file, one JSON object per line.

    Args:
        entries: Iterable of JSON-serialisable records. When omitted, the
            notebook-level ``jsonHolder`` list is used, so the original
            zero-argument call keeps working.
        output_file: Destination path; defaults to ``'data.jsonl'``. An
            existing file is overwritten.
    """
    if entries is None:
        entries = jsonHolder
    with open(output_file, 'w') as f:
        f.writelines(json.dumps(entry) + '\n' for entry in entries)

# Write the parsed records held in jsonHolder to data.jsonl
convert_to_jsonl()

The code below starts up the Dask distributed cluster. Here we set the number of workers, the number of threads each worker uses, and the memory limit for each worker. It also starts up the Dask dashboard, which shows the tasks and workers as they run in parallel.

In [None]:
from dask.distributed import Client, LocalCluster, default_client

# If a client is already running, close it before creating a new one
try:
    if default_client() is not None:
        default_client().close()
except ValueError:
    # Presumably raised by default_client() when no client exists yet; nothing to close
    pass

cluster = LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='4GB') # Set number of workers, threads, and max memory for each worker
client = Client(cluster)
# Print the dashboard address so the running tasks and workers can be inspected
dashboard_link = client.dashboard_link
print ("Dask Dashboard link: ", dashboard_link)

The code below reads the JSON Lines file into a Dask bag of dictionaries. Here we can also set our dataset sample size.
Comment out the random_sample line to perform the analysis on all reviews in the dataset.

In [None]:
import dask
from dask import bag as db
import dask.dataframe as dd
import pandas as pd
import json
import numpy as np
# Read the JSON Lines file as a bag of text lines, using a block size of 10MB
data_bag = db.read_text('data.jsonl', blocksize="10MB")
# Parse every line into a Python dictionary
data_bag = data_bag.map(json.loads)

data_bag = data_bag.random_sample(0.1) # Set dataset sample size, comment this line out for the whole dataset
# Counting the elements triggers a computation over the (sampled) bag
print(f"You are preforming the algorithm on {data_bag.count().compute()} unique reviews")

The code below filters out data that may cause sparsity issues. It works by first getting the frequencies of occurrence of the beerIds and users, and storing these in corresponding Python dictionaries. The dataset is then filtered to remove any beers or profiles that don't meet the required threshold. I chose 50 for the beer count because, in a dataset of over 2.8 million reviews, a beer that occurs fewer than 50 times is extremely unlikely to be the predicted beer, and if it is, it's unlikely that the user will actually enjoy it. I chose a profile threshold of 10 because a user who has reviewed fewer than 10 beers is unlikely to have a distinguishable pattern. Another reason for setting this threshold is that it reduces the likelihood of the user having the same binary matrix as another user. This matters because, if two users have had exactly the same beers, there is no beer that one has tried and the other has not, so no beer can be predicted for that user.

In [None]:
# Count how many reviews each user and each beer has in the (sampled) bag
profile_name_counts = data_bag.pluck("review/profileName").frequencies().compute()
beer_id_counts = data_bag.pluck("beer/beerId").frequencies().compute()

# Store the counts in Python dictionaries for direct lookup inside the filters
profile_name_counts_dict = dict(profile_name_counts)
beer_id_counts_dict = dict(beer_id_counts)

# Filter the data set by the given thresholds for profiles and beers.
# Both comparisons are strict: a beer needs more than 50 reviews and a user more than 10.
# The user counts were taken before the beer filter, so they are not recomputed after beers are removed.
filtered_bag = data_bag.filter(lambda x: beer_id_counts_dict[x["beer/beerId"]] > 50)
filtered_bag = filtered_bag.filter(lambda x: profile_name_counts_dict[x["review/profileName"]] > 10)


# Materialise the filtered reviews as a plain Python list
result = filtered_bag.compute()

print(f"After filtering the data you are left with {len(result)} reviews")

In [None]:
# Wrap the filtered reviews in a new bag with 16 partitions; this replaces the earlier data_bag
data_bag = db.from_sequence(result, npartitions=16)

# Build a beerId -> beer name lookup, used at the end of the notebook to print the recommended beer's name
beer_with_name = data_bag.map(lambda x: (x["beer/beerId"], x["name"]))
beer_with_name = beer_with_name.compute()
# dict() keeps one name per beerId (the last pair seen for that id)
beer_with_name = dict(beer_with_name)

The code below gets all the unique profile names and beers, then computes the lengths of these lists. It then creates an empty utility matrix, populated with all 0s, whose dimensions are given by the number of users and the number of beers.

In [None]:
# Get list of unique names and unique beers
unique_profile_names = data_bag.pluck('review/profileName').distinct().compute()
unique_beer_ids = data_bag.pluck("beer/beerId").distinct().compute()

# Get the number of users and beers in the data set
num_profiles = len(unique_profile_names)
num_beers = len(unique_beer_ids)

# Create an empty utility matrix populated with all 0s:
# a list with one NumPy vector per user, each vector holding one entry per beer
utility_matrix = [np.zeros(num_beers) for _ in range(num_profiles)]

The code below creates a mapping from the beer ids and profile names given by the dataset to the corresponding indexes inside the utility matrix.

In [None]:
# Position of each user in unique_profile_names -> row index in the utility matrix
user_index_map = {user: idx for idx, user in enumerate(unique_profile_names)}
# Position of each beer in unique_beer_ids -> column index in the utility matrix
beer_index_map = {beer: idx for idx, beer in enumerate(unique_beer_ids)}

The code below takes the bag of user reviews and the beer and profile name mappings, and uses them to update the utility matrix, populating it with each user's rating for the given beer.

In [None]:
def update_sinle_record(row, user_index_map, beer_index_map):
    """Translate one review into utility-matrix coordinates.

    Looks up the matrix indexes for the review's beer and user, and parses
    the rating as the integer in front of the ``/`` in ``row['rating']``.

    Returns:
        A ``(beer_index, user_index, rating)`` tuple.
    """
    beer_col = beer_index_map[row['beer/beerId']]
    user_row = user_index_map[row['review/profileName']]
    score_text, _, _ = row['rating'].partition('/')
    return beer_col, user_row, int(score_text)

def apply_update(row):
    """Convert one review using the notebook-level index maps.

    Single-argument wrapper around ``update_sinle_record`` so it can be
    passed directly to a bag's ``map``.
    """
    return update_sinle_record(
        row,
        user_index_map=user_index_map,
        beer_index_map=beer_index_map,
    )

# Spread the reviews over 128 partitions, then map every review to its (beer index, user index, rating) triple
partitioned_bag = data_bag.repartition(npartitions=128)
updates = partitioned_bag.map(apply_update).compute()

# Update the utility matrix sequentially
for beer_id, user_id, rating in updates:
    # Rows are users and columns are beers; a later review of the same (user, beer) pair overwrites an earlier one
    utility_matrix[user_id][beer_id] = rating


The code below clears the workers' memory, which helps to fix some of the memory issues I was having.

In [None]:
import gc

def clear_worker_data():
    """Run a garbage-collection pass in the process that executes this function."""
    # Imported inside the body so the function does not depend on module-level names
    from gc import collect
    collect()


# Execute the clean-up function through the client, then even out memory use
client.run(clear_worker_data) # Do garbage collection
client.rebalance() # Rebalance the memory across workers

The two cells below let you select the user you want a beer recommendation for.

In [None]:
# List every user name that can be entered in the next cell
print(unique_profile_names)

In [None]:
# Ask for a user name; it must match a key of user_index_map exactly, otherwise the lookup below raises KeyError
user_name = input("Please select a user name")
user_id = user_index_map[user_name] # Get the real index of the user
test_user = utility_matrix[user_id] # Get the test user's row of the utility matrix, to use in cosine comparison

The code below calculates the cosine similarity between the given test user and every other user in the utility matrix. This returns a list of tuples, each containing a user's utility vector and that user's similarity score. It then takes the top 10 most similar users.

In [None]:
def cosine_similarity(u, v):
    """Return the cosine similarity of the vectors ``u`` and ``v``.

    Args:
        u: First vector (array-like of numbers).
        v: Second vector, of the same length as ``u``.

    Returns:
        ``dot(u, v) / (||u|| * ||v||)``, or ``0.0`` when either vector has
        zero length. Returning 0.0 rather than NaN keeps a later sort by
        similarity well defined.
    """
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    if denominator == 0:
        # An all-zero vector has no direction; treat it as "not similar".
        return 0.0
    return np.dot(u, v) / denominator

def calculate_similarity(utility_matrix, test_user):
    """Pair every vector in ``utility_matrix`` with its similarity to ``test_user``.

    ``utility_matrix`` must offer a ``map`` method (the notebook passes a
    bag); the mapped collection of ``(vector, similarity)`` tuples is
    returned as-is.
    """
    def _pair_with_score(vector):
        return (vector, cosine_similarity(vector, test_user))

    return utility_matrix.map(_pair_with_score)

utility_matrix_bag = db.from_sequence(utility_matrix) # Convert the utility matrix to a bag, allowing it to be processed in parallel
similarities = calculate_similarity(utility_matrix_bag, test_user) # Calculate cosine similarity for each user in the utility matrix

# sorted() iterates over the bag, which presumably triggers the actual computation - TODO confirm
# NOTE(review): the [1:11] slice assumes the selected user sorts first; another user with the same similarity score could take that slot instead
top_similar_users = sorted(similarities, key=lambda x: x[1], reverse=True)[1:11] # Take the top 10 most similar users. Excludes the most similar one, as this is the test user themselves

The following code takes the top 10 most similar users, as a list of tuples of the form (<userVector>, <similarity>), and multiplies each column in the userVector by the similarity score.

In [None]:
def get_weighted_matrix(similar_users):
    """Scale each similar user's rating vector by that user's similarity score.

    Args:
        similar_users: Sequence of ``(rating_vector, similarity)`` tuples.

    Returns:
        A list of ``(user_id, weighted_vector)`` tuples in input order.
        ``weighted_vector`` is a new NumPy array; the input vectors are left
        untouched, so calling this function again on the same input gives
        the same result instead of applying the weights a second time.

    NOTE(review): the first tuple element is the notebook-level ``user_id``
    (kept from the original implementation), not an identifier of the
    similar user - confirm whether anything relies on it.
    """
    weighted_matrix = []
    for user_tup in similar_users:  # For each of the similar users
        ratings, similarity = user_tup[0], user_tup[1]
        # Vectorized product; allocates a fresh array rather than writing in place
        weighted_vector = similarity * np.asarray(ratings, dtype=float)
        weighted_matrix.append((user_id, weighted_vector))

    return weighted_matrix

# Weight the rating vectors of the most similar users by their similarity to the selected user
weighted_matrix = get_weighted_matrix(top_similar_users)

This final part of the code gives the beer recommendation. It takes all the weighted userVectors of the similar users and adds them together, so that each column/beer has a total sum of weighted beer ratings. We can then take the highest-rated beer and recommend it to the user.

In [None]:
def recommend_beer(weighted_matrix, num_similar, num_beers):
    """Add up the weighted rating vectors of the similar users, beer by beer.

    Args:
        weighted_matrix: Sequence of ``(user_id, weighted_vector)`` tuples;
            only the vector at index 1 of each tuple is read.
        num_similar: Number of leading entries of ``weighted_matrix`` to sum.
        num_beers: Length of the result; only the first ``num_beers``
            entries of each vector are used.

    Returns:
        A NumPy array of length ``num_beers`` whose entry ``j`` is the total
        weighted rating of beer ``j``. The caller picks the recommendation
        from this array.
    """
    weighted_sum = np.zeros(num_beers)
    for i in range(num_similar):
        # One vectorized add per user; the per-beer summation order is
        # unchanged (user 0 first, then user 1, ...).
        weighted_sum += np.asarray(weighted_matrix[i][1])[:num_beers]
    return weighted_sum

# Number of similar users that were actually found (at most 10, given the slice used to build the list)
num_similar = len(top_similar_users)
    
weighted_sum = recommend_beer(weighted_matrix, num_similar, num_beers)
# Map the best-scoring column back to its beerId and then to the beer's name
# NOTE(review): nothing here excludes beers the selected user has already rated, so the result may be a beer they have had - confirm intent
print(beer_with_name[unique_beer_ids[np.argmax(weighted_sum)]])