In [None]:
import urllib.request
filename = 'ratebeer.json.gz'
urllib.request.urlretrieve('https://datarepo.eng.ucsd.edu/mcauley_group/data/beer/ratebeer.json.gz', filename)

In [None]:
import gzip
import shutil

def unzip_gzip(input_file, output_file):
    with gzip.open(input_file, 'rb') as f_in:
        with open(output_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

unzip_gzip('ratebeer.json.gz', 'data.json')

In [None]:
import json

def json_to_jsonlines(input_file):
   jsonHolder =  []
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder.append({"review/profileName": data_dict["review/profileName"], "rating": data_dict['review/overall'], "beer/beerId": data_dict["beer/beerId"], "name": data_dict["beer/name"]})
       except:
         pass
   return jsonHolder

input_file = 'data.json'
jsonHolder = json_to_jsonlines(input_file)

In [None]:
import json

def get_beer_with_names(input_file):
   jsonHolder =  {}
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder[data_dict["beer/beerId"]] = data_dict["beer/name"]
       except:
         pass
   return jsonHolder

input_file = 'data.json'
beer_with_name = get_beer_with_names(input_file)

In [None]:
def convert_to_jsonl():
    with open('data.jsonl', 'w') as f:
      for entry in jsonHolder:
        json.dump(entry, f)
        f.write('\n')

convert_to_jsonl()

In [31]:
from dask.distributed import Client, LocalCluster, default_client

try:
    if default_client() is not None:
        default_client().close()
except ValueError:
    pass

cluster = LocalCluster(n_workers=8, threads_per_worker=1, memory_limit='4GB')
client = Client(cluster)
dashboard_link = client.dashboard_link
print ("Dask Dashboard link: ", dashboard_link)

Perhaps you already have a cluster running?
Hosting the HTTP server on port 37963 instead


Dask Dashboard link:  http://127.0.0.1:37963/status


In [32]:
import dask
from dask import bag as db
import dask.dataframe as dd
import pandas as pd
import json
import numpy as np
data_bag = db.read_text('data.jsonl', blocksize="10MB")
data_bag = data_bag.map(json.loads)

# data_bag = data_bag.random_sample(0.1)
print(data_bag.count().compute())

2783710


In [33]:
profile_name_counts = data_bag.pluck("review/profileName").frequencies().compute()
beer_id_counts = data_bag.pluck("beer/beerId").frequencies().compute()

profile_name_counts_dict = dict(profile_name_counts)
beer_id_counts_dict = dict(beer_id_counts)

# filtered_bag = data_bag.filter(lambda x: profile_name_counts_dict[x["review/profileName"]] > 20 and beer_id_counts_dict[x["beer/beerId"]] > 100)
filtered_bag = data_bag.filter(lambda x: beer_id_counts_dict[x["beer/beerId"]] > 50)
filtered_bag = filtered_bag.filter(lambda x: profile_name_counts_dict[x["review/profileName"]] > 20)


result = filtered_bag.compute()

print(len(result))

1947817


In [34]:
data_bag = db.from_sequence(result, npartitions=16)


In [59]:


beer_with_name = data_bag.map(lambda x: (x["beer/beerId"], x["name"]))
beer_with_name = beer_with_name.compute()
beer_with_name = dict(beer_with_name)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


In [6]:
# profile_name_counts = data_bag.pluck("review/profileName").frequencies().compute()
# profile_name_counts_dict = dict(profile_name_counts)

# filtered_bag = data_bag.filter(lambda x: profile_name_counts_dict[x["review/profileName"]] > 10)
# result = filtered_bag.compute()
# print(len(result))

In [2]:
# data_bag = db.from_sequence(result)
# print(data_bag.count().compute())
# profile_name_counts = data_bag.pluck("beer/beerId").frequencies().compute()
# beer_ids_counts = data_bag.pluck("beer/beerId").frequencies().compute()
# beer_ids_count_dict = dict(beer_ids_counts)
# filtered_bag = data_bag.filter(lambda x: profile_name_counts_dict[x["review/profileName"]] > 10)

# result = filtered_bag.compute()
# print(len(result))

In [37]:
unique_profile_names = data_bag.pluck('review/profileName').distinct().compute()
unique_beer_ids = data_bag.pluck("beer/beerId").distinct().compute()

num_profiles = len(unique_profile_names)
num_beers = len(unique_beer_ids)

print(num_beers)
print(num_profiles)

8963
6479


In [38]:

utility_matrix = [np.zeros(num_beers) for _ in range(num_profiles)]
# binary_matrix = [np.zeros(num_beers) for _ in range(num_profiles)]
print(len(utility_matrix))

6479


In [39]:
user_index_map = {user: idx for idx, user in enumerate(unique_profile_names)}
beer_index_map = {beer: idx for idx, beer in enumerate(unique_beer_ids)}

In [40]:
client.restart()



In [9]:
# def update_utility(row, utility_matrix, user_index_map, beer_index_map):
#     profile_id = user_index_map[row['review/profileName']]
#     beer_id = beer_index_map[row['beer/beerId']]
#     rating = int(row['rating'].split('/')[0])
#     utility_matrix[profile_id][beer_id] = rating

# def apply_update(row):
#     global utility_matrix
#     global user_index_map
#     global beer_index_map
#     update_utility(row, utility_matrix, user_index_map, beer_index_map)

# partitioned_bag = data_bag.repartition(npartitions=128)
# result = partitioned_bag.map(apply_update).compute()
# print(len(result))

In [41]:
def update_sinle_record(row, user_index_map, beer_index_map):
    beer_id = beer_index_map[row['beer/beerId']]
    profile_id = user_index_map[row['review/profileName']]
    rating = int(row['rating'].split('/')[0])
    return (beer_id, profile_id, rating)

def apply_update(row):
    global user_index_map
    global beer_index_map
    return update_sinle_record(row, user_index_map, beer_index_map)

partitioned_bag = data_bag.repartition(npartitions=128)
updates = partitioned_bag.map(apply_update).compute()

for beer_id, user_id, rating in updates:
    utility_matrix[user_id][beer_id] = rating

print(utility_matrix)

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


[array([14.,  0., 13., ..., 14., 13.,  0.]), array([14., 15., 11., ..., 16., 11., 10.]), array([15.,  0.,  0., ...,  0.,  0.,  0.]), array([16., 16., 14., ..., 15.,  0.,  0.]), array([13.,  0.,  0., ...,  0., 10.,  0.]), array([16.,  0.,  0., ...,  0.,  0.,  0.]), array([12., 13., 13., ..., 14., 10.,  0.]), array([13., 11., 10., ...,  0.,  0.,  0.]), array([14., 13.,  0., ..., 14., 13.,  0.]), array([13., 14.,  0., ..., 13., 12.,  0.]), array([14., 15.,  0., ..., 12.,  0.,  0.]), array([13., 12., 11., ...,  6., 12.,  0.]), array([13., 12., 11., ..., 13., 14.,  0.]), array([13., 14.,  0., ..., 15., 12.,  0.]), array([12.,  0.,  0., ...,  0.,  0.,  0.]), array([13.,  0.,  0., ..., 15.,  0.,  0.]), array([15., 15.,  0., ..., 12., 11.,  0.]), array([10.,  0.,  0., ..., 16.,  1.,  0.]), array([15.,  0.,  0., ...,  0., 14.,  0.]), array([10.,  0.,  0., ..., 14., 12.,  0.]), array([14.,  0.,  0., ..., 15., 10.,  0.]), array([15., 14., 10., ...,  0.,  0.,  0.]), array([16.,  0.,  0., ..., 15.,

In [42]:
# def update_utility(row, utility_matrix, user_index_map, beer_index_map):
#     profile_id = user_index_map[row['review/profileName']]
#     beer_id = beer_index_map[row['beer/beerId']]
#     rating = int(row['rating'].split('/')[0])
#     utility_matrix[profile_id][beer_id] = rating
#     return utility_matrix[profile_id]


# partitioned_bag = data_bag.repartition(npartitions=128)
# utility_matrix = partitioned_bag.map(lambda x: update_utility(x, utility_matrix, user_index_map, beer_index_map)).compute()
print(len(utility_matrix))

6479


In [43]:
import gc

def clear_worker_data():
    import gc
    gc.collect()


client.run(clear_worker_data)
client.rebalance()
# client.restart()

In [13]:
# def update_binary_utility(row, utility_matrix, user_index_map, beer_index_map):
#     profile_id = user_index_map[row['review/profileName']]
#     beer_id = beer_index_map[row['beer/beerId']]
#     utility_matrix[profile_id][beer_id] = 1
#     return utility_matrix[profile_id]

# partitioned_bag = data_bag.repartition(npartitions=16)
# binary_matrix = partitioned_bag.map(lambda x: update_binary_utility(x, utility_matrix, user_index_map, beer_index_map)).compute()
# print(binary_matrix)

In [44]:
print(unique_profile_names)

['BBB63', 'Cornfield', 'merlin48', 'MI2CA', 'WabashMan', 'paultheguru', 'bu11zeye', 'thirdeye11', 'travita', 'mar', 'kramer', 'Cavie', 'blutt59', 'BMan1113VR', 'Maltajo', 'jason', 'CaptainCougar', 'bitbucket', 'dchmela', 'scrizzz', 'durhambeer', 'ucusty', 'emacgee', 'Dorwart', 'FlacoAlto', 'alexsdad06', 'after4ever', 'JCB', 'otakuden', 'hopscotch', 'Sparky', 'jsquire', 'hopdog', 'jcwattsrugger', 'smith4498', 'Immy', 'mgumby10', 'TheBeerGod', 'Drake', 'LooseCannon', 'decaturstevo', 'EithCubes', 'thedm', 'Tmoney99', 'brentfeesh', 'heemer77', 'kp', 'puzzl', 'Suttree', 'shp555', 'beerguy101', 'alexanderj', 'JohnC', 'Taverner', 'MoDog', 'Optigon', 'Kevster', 'Acknud', 'goldtwins', 'hotstuff', 'beastiefan2k', 'JoeMcPhee', 'golubj', 'notalush', 'GMCC2181', 'eaglefan538', 'Cletus', 'SpudClampDawg', 'TheBeerOrg', 'pantanap', 'tjthresh', 'bdigital', 'IrishBoy', 'Odeed', 'StFun', 'shrubber85', 'adamlangolf', 'GG', 'Snojerk321', 'drfabulous', 'mmmbeer', 'DWestrick', 'kwoeltje', 'csbosox', '502Flav

In [45]:
user_name = input("Please select a user name")
user_id = user_index_map[user_name]
test_user = utility_matrix[user_id]
print(test_user)

[14.  0. 13. ... 14. 13.  0.]


In [46]:
import gc

def clear_worker_data():
    import gc
    gc.collect()


client.run(clear_worker_data)
client.rebalance()

In [None]:
# def check_for_same_binary_matrix(test_user, all_users, num_beers, utility_matrix):
#     for i in range(len(all_users)):
#         if np.array_equal(all_users[i], test_user):
#             print(i)
#             utility_matrix[i] = np.zeros(num_beers)

#     return utility_matrix

# utility_matrix = check_for_same_binary_matrix(binary_matrix[user_id], binary_matrix, num_beers, utility_matrix)
# binary_matrix = []

In [47]:
print(utility_matrix)

[array([14.,  0., 13., ..., 14., 13.,  0.]), array([14., 15., 11., ..., 16., 11., 10.]), array([15.,  0.,  0., ...,  0.,  0.,  0.]), array([16., 16., 14., ..., 15.,  0.,  0.]), array([13.,  0.,  0., ...,  0., 10.,  0.]), array([16.,  0.,  0., ...,  0.,  0.,  0.]), array([12., 13., 13., ..., 14., 10.,  0.]), array([13., 11., 10., ...,  0.,  0.,  0.]), array([14., 13.,  0., ..., 14., 13.,  0.]), array([13., 14.,  0., ..., 13., 12.,  0.]), array([14., 15.,  0., ..., 12.,  0.,  0.]), array([13., 12., 11., ...,  6., 12.,  0.]), array([13., 12., 11., ..., 13., 14.,  0.]), array([13., 14.,  0., ..., 15., 12.,  0.]), array([12.,  0.,  0., ...,  0.,  0.,  0.]), array([13.,  0.,  0., ..., 15.,  0.,  0.]), array([15., 15.,  0., ..., 12., 11.,  0.]), array([10.,  0.,  0., ..., 16.,  1.,  0.]), array([15.,  0.,  0., ...,  0., 14.,  0.]), array([10.,  0.,  0., ..., 14., 12.,  0.]), array([14.,  0.,  0., ..., 15., 10.,  0.]), array([15., 14., 10., ...,  0.,  0.,  0.]), array([16.,  0.,  0., ..., 15.,

In [48]:
import gc

def clear_worker_data():
    import gc
    gc.collect()


client.run(clear_worker_data)
client.rebalance()

In [49]:
utility_matrix_bag = db.from_sequence(utility_matrix)

In [50]:
client.restart()



In [51]:
def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    # print(f"{dot_product} / ({norm_u} * {norm_v})")
    similarity = dot_product / (norm_u * norm_v)
    # print(similarity)
    return similarity


def calculate_similarity(utility_matrix, test_user):
    sims = utility_matrix.map(lambda x: (x, cosine_similarity(x, test_user)))
    return sims

similarities = calculate_similarity(utility_matrix_bag, test_user)

print(similarities.compute())

This may cause some slowdown.
Consider scattering data ahead of time and using futures.


[(array([14.,  0., 13., ..., 14., 13.,  0.]), 0.9999999999999999), (array([14., 15., 11., ..., 16., 11., 10.]), 0.5377771041881646), (array([15.,  0.,  0., ...,  0.,  0.,  0.]), 0.29303829258920916), (array([16., 16., 14., ..., 15.,  0.,  0.]), 0.4768215824779747), (array([13.,  0.,  0., ...,  0., 10.,  0.]), 0.28176041755745385), (array([16.,  0.,  0., ...,  0.,  0.,  0.]), 0.2720212462162411), (array([12., 13., 13., ..., 14., 10.,  0.]), 0.5906406482781178), (array([13., 11., 10., ...,  0.,  0.,  0.]), 0.3974301083572115), (array([14., 13.,  0., ..., 14., 13.,  0.]), 0.4169140121778793), (array([13., 14.,  0., ..., 13., 12.,  0.]), 0.4281302186581495), (array([14., 15.,  0., ..., 12.,  0.,  0.]), 0.5184608722671706), (array([13., 12., 11., ...,  6., 12.,  0.]), 0.3788298853565256), (array([13., 12., 11., ..., 13., 14.,  0.]), 0.48434813360908435), (array([13., 14.,  0., ..., 15., 12.,  0.]), 0.49464265597019363), (array([12.,  0.,  0., ...,  0.,  0.,  0.]), 0.15890809678629442), (arr

In [52]:
top_similar_users = sorted(similarities, key=lambda x: x[1], reverse=True)[1:11]
print(top_similar_users)

[(array([13., 13., 11., ..., 10.,  8.,  0.]), 0.6180448332476652), (array([ 0., 12.,  0., ...,  0., 11.,  9.]), 0.613779317635128), (array([15., 15.,  0., ..., 12., 11.,  0.]), 0.6103569073786884), (array([ 9., 10.,  0., ..., 13., 11.,  0.]), 0.5926445689045644), (array([15., 15., 12., ..., 14., 11.,  0.]), 0.5916002893370599), (array([12., 13., 13., ..., 14., 10.,  0.]), 0.5906406482781178), (array([13., 13.,  0., ..., 14.,  0.,  0.]), 0.578642809391248), (array([14., 14.,  0., ..., 13.,  0.,  0.]), 0.5755797647842387), (array([ 0.,  0.,  0., ...,  0., 14.,  0.]), 0.573749158774832), (array([ 0.,  0.,  0., ..., 13., 11.,  8.]), 0.573610074640418)]


In [53]:
def get_weighted_matrix(similar_users, utility_matrix):
    weighted_matrix = []
    for user_tup in similar_users:
        similarity = user_tup[1]
        user_matrix = user_tup[0]
        for i in range(len(user_matrix)):
            user_matrix[i] = similarity * user_matrix[i]
        weighted_matrix.append((user_id, user_matrix))
    
    return weighted_matrix

weighted_matrix = get_weighted_matrix(top_similar_users, utility_matrix)

print(weighted_matrix)

[(0, array([8.03458283, 8.03458283, 6.79849317, ..., 6.18044833, 4.94435867,
       0.        ])), (0, array([0.        , 7.36535181, 0.        , ..., 0.        , 6.75157249,
       5.52401386])), (0, array([9.15535361, 9.15535361, 0.        , ..., 7.32428289, 6.71392598,
       0.        ])), (0, array([5.33380112, 5.92644569, 0.        , ..., 7.7043794 , 6.51909026,
       0.        ])), (0, array([8.87400434, 8.87400434, 7.09920347, ..., 8.28240405, 6.50760318,
       0.        ])), (0, array([7.08768778, 7.67832843, 7.67832843, ..., 8.26896908, 5.90640648,
       0.        ])), (0, array([7.52235652, 7.52235652, 0.        , ..., 8.10099933, 0.        ,
       0.        ])), (0, array([8.05811671, 8.05811671, 0.        , ..., 7.48253694, 0.        ,
       0.        ])), (0, array([0.        , 0.        , 0.        , ..., 0.        , 8.03248822,
       0.        ])), (0, array([0.        , 0.        , 0.        , ..., 7.45693097, 6.30971082,
       4.5888806 ]))]


In [54]:
def recommend_beer(weighted_matrix, num_similar, num_beers):
    weighted_sum = np.zeros(num_beers)
    for i in range(num_similar):
        for j in range(num_beers):
            weighted_sum[j] = weighted_sum[j] + weighted_matrix[i][1][j]
    return weighted_sum

num_similar = len(top_similar_users)
num_beers = len(utility_matrix[0])
    
weighted_sum = recommend_beer(weighted_matrix, num_similar, num_beers)
# print(beer_with_name[unique_beer_ids[np.argmax(weighted_sum)]])

In [55]:
print(beer_with_name[unique_beer_ids[np.argmax(weighted_sum)]])

TypeError: list indices must be integers or slices, not str