#Sample project
In this abbreviated study, we answer the question:
What are the most common (non stop-word) words associated with restaurant reviews that are 1, 2, 3, 4, and 5 star?

Method: we will compute word frequencies after first removing all words in NLTK's stop word category. We will then group by restaurant star rating.

##Download data
We will use a filtered dataset collected by from Google Maps (see https://cseweb.ucsd.edu/~jmcauley/datasets.html#google_restaurants)


In [1]:
import urllib.request
filename = 'ratebeer.json.gz'
urllib.request.urlretrieve('https://datarepo.eng.ucsd.edu/mcauley_group/data/beer/ratebeer.json.gz', filename)

('ratebeer.json.gz', <http.client.HTTPMessage at 0x7fa79f93b850>)

In [2]:
import gzip
import shutil

def unzip_gzip(input_file, output_file):
    with gzip.open(input_file, 'rb') as f_in:
        with open(output_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

unzip_gzip('ratebeer.json.gz', 'data.json')

In [1]:
import json

def json_to_jsonlines(input_file):
   jsonHolder =  []
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder.append({"review/profileName": data_dict["review/profileName"], "rating": data_dict['review/overall'], "beer/beerId": data_dict["beer/beerId"]})
       except:
         pass
   return jsonHolder

input_file = 'data.json'
jsonHolder = json_to_jsonlines(input_file)


In [2]:
import json

def get_berr_with_names(input_file):
   jsonHolder =  {}
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder[data_dict["beer/beerId"]] = data_dict["beer/name"]
       except:
         pass
   return jsonHolder

input_file = 'data.json'
beer_with_name = get_berr_with_names(input_file)

In [3]:
def convert_to_jsonl():
    with open('data.jsonl', 'w') as f:
      for entry in jsonHolder:
        json.dump(entry, f)
        f.write('\n')

convert_to_jsonl()

In [4]:
#shows a sample of the first and last 10 extracted reviews
!head -10 'data.jsonl'

{"review/profileName": "hopdog", "rating": "13/20", "beer/beerId": "63836"}
{"review/profileName": "TomDecapolis", "rating": "13/20", "beer/beerId": "63836"}
{"review/profileName": "hopdog", "rating": "8/20", "beer/beerId": "64125"}
{"review/profileName": "JFGrind", "rating": "14/20", "beer/beerId": "71719"}
{"review/profileName": "egajdzis", "rating": "16/20", "beer/beerId": "71719"}
{"review/profileName": "PhillyBeer2112", "rating": "17/20", "beer/beerId": "71719"}
{"review/profileName": "PhillyBeer2112", "rating": "11/20", "beer/beerId": "71715"}
{"review/profileName": "PhillyBeer2112", "rating": "14/20", "beer/beerId": "1470"}
{"review/profileName": "PhillyBeer2112", "rating": "14/20", "beer/beerId": "7721"}
{"review/profileName": "TomDecapolis", "rating": "12/20", "beer/beerId": "64126"}


## Loading the data into a Dask Dataframe
See https://docs.dask.org/en/latest/generated/dask.dataframe.read_json.html

In [14]:
import dask
from dask import bag as db
import dask.dataframe as dd
import pandas as pd
import json
import numpy as np
data_bag = db.read_text('data.jsonl', blocksize="1MB")

ddf = dd.from_pandas(pd.DataFrame([json.loads(entry) for entry in data_bag]), npartitions=8)

print("First three items of data_bag:")
print(data_bag.take(3))

# Print the first three items of the Dask DataFrame
print("\nFirst three items of ddf:")
print(ddf.head(3))


First three items of data_bag:
('{"review/profileName": "hopdog", "rating": "13/20", "beer/beerId": "63836"}\n', '{"review/profileName": "TomDecapolis", "rating": "13/20", "beer/beerId": "63836"}\n', '{"review/profileName": "hopdog", "rating": "8/20", "beer/beerId": "64125"}\n')

First three items of ddf:
  review/profileName rating beer/beerId
0             hopdog  13/20       63836
1       TomDecapolis  13/20       63836
2             hopdog   8/20       64125


In [15]:
unique_beer_ids = ddf['beer/beerId'].unique().compute().tolist()
beer_index_map = {beer: idx for idx, beer in enumerate(unique_beer_ids)}

print(f"Unique Beer Ids: {unique_beer_ids}")
print(len(unique_beer_ids))

Unique Beer Ids: ['91592', '114513', '25951', '110497', '58510', '142974', '129305', '113770', '131184', '157915', '135578', '4140', '5987', '5979', '87862', '83555', '97703', '121430', '39195', '21151', '3355', '115891', '130190', '130187', '28034', '155907', '12940', '23010', '149824', '24948', '10708', '75203', '95553', '79988', '156842', '20191', '54681', '127577', '18328', '18326', '18324', '88369', '88368', '50919', '33251', '71011', '71015', '30725', '72188', '122731', '71310', '68417', '47081', '156109', '49367', '17251', '17275', '18169', '18170', '39022', '104830', '96380', '90425', '90461', '97007', '113900', '112929', '120394', '76917', '33314', '127521', '38352', '106294', '72062', '94585', '107950', '36382', '62426', '146597', '57762', '121045', '105128', '88623', '44605', '152944', '48716', '28956', '104141', '26686', '130878', '84125', '123965', '144368', '123070', '139523', '136036', '67542', '110702', '59173', '98898', '46358', '14761', '21427', '124604', '73386', '64

In [16]:
unique_user_names = ddf['review/profileName'].unique().compute().tolist()
print(f"Unique User Names: {unique_user_names}")
print(len(unique_user_names))


Unique User Names: ['TomDecapolis', 'JFGrind', 'CaptainCougar', 'MI2CA', 'madsberg', 'WabashMan', 'paultheguru', 'mgumby10', 'TheBeerGod', 'Optigon', 'mmmbeer', 'DWestrick', 'csbosox', 'jdmhawk', '3fourths', 'Ty5592', 'Duster72', 'jpm30', 'JMFG', 'Beerowulf', 'steelcitybrew', 'thehinge', 'misterbeer', 'lovemyipas', 'CelticBrew', 'jkwalking05', 'omhper', 'TearsforBeers', 'cb', 'WallyWalrus', 'Fratto', 'Elkas', 'jimmack34', 'LarvalChemist', 'PLundsgaard', 'NachlamSie', 'HoppyHoosier', '5000', 'smatty', 'bmcginni', 'JoeTheYounger', 'fiver29', 'Sammy', 'mkofron', 'daknole', 'Schroppfy', 'brewblackhole', 'Rastacouere', 'footbalm', 'muenster', 'peter', 'Lumpy', 'hippie4beer', 'matta', 'RCL', 'ericandersnavy', 'AllAboutStout', 'RKanis', 'Vir4030', 'douglas88', 'dionysus', 'Styles', 'kuphish', 'GeneralGao', 'theisti', 'BeerHawk', '19Ei8hty', 'MrBunn', 'monkeygirl', 'davidajensen', 'SledgeJr', 'csaso', 'NEB_WineNBeer', 'eboats', 'ryanfolty', 'Vertical Bacon Strips', 'sseb69', 'tiggmtl', 'TipsyM

In [17]:
# def create_utility_matrix_zeros(unique_names_list, unique_beer_names):
#   user_arrays = {user: np.zeros(len(unique_beer_names)) for user in unique_names_list}
#   return user_arrays

# utility_matrix = create_utility_matrix_zeros(unique_user_names, unique_beer_names)
import dask.array as da

def create_utility_matrix_zeros_dask(unique_names_list, unique_beer_ids):
    # Determine the dimensions of the utility matrix
    num_users = len(unique_names_list)
    num_beers = len(unique_beer_ids)
    
    # Create a Dask array of zeros with the desired shape
    utility_matrix = da.zeros((num_users, num_beers))
    
    return utility_matrix

utility_matrix_dask = create_utility_matrix_zeros_dask(unique_user_names, unique_beer_ids)

In [18]:
print(utility_matrix_dask.shape)
print(unique_beer_ids)
print(unique_user_names)

(28437, 109279)
['91592', '114513', '25951', '110497', '58510', '142974', '129305', '113770', '131184', '157915', '135578', '4140', '5987', '5979', '87862', '83555', '97703', '121430', '39195', '21151', '3355', '115891', '130190', '130187', '28034', '155907', '12940', '23010', '149824', '24948', '10708', '75203', '95553', '79988', '156842', '20191', '54681', '127577', '18328', '18326', '18324', '88369', '88368', '50919', '33251', '71011', '71015', '30725', '72188', '122731', '71310', '68417', '47081', '156109', '49367', '17251', '17275', '18169', '18170', '39022', '104830', '96380', '90425', '90461', '97007', '113900', '112929', '120394', '76917', '33314', '127521', '38352', '106294', '72062', '94585', '107950', '36382', '62426', '146597', '57762', '121045', '105128', '88623', '44605', '152944', '48716', '28956', '104141', '26686', '130878', '84125', '123965', '144368', '123070', '139523', '136036', '67542', '110702', '59173', '98898', '46358', '14761', '21427', '124604', '73386', '646

In [19]:
ratings = ddf.compute().tolist()
print(ratings)

AttributeError: 'DataFrame' object has no attribute 'tolist'

In [13]:
import dask.array as da
import pandas as pd
import dask.dataframe as dd

# Example Dask DataFrame
dask_df = dd.from_pandas(pd.DataFrame({"review/profileName": ["hopdog", "user2"],
                                       "rating": ["13/20", "18/20"],
                                       "beer/beerId": ["63836", "12345"]}), npartitions=2)

# Example Dask array
shape = (len(dask_df["review/profileName"].unique()), len(dask_df["beer/beerId"].unique()))
dask_array = da.zeros(shape, chunks=(1000, 1000))

# Convert Dask DataFrame to pandas DataFrame
pandas_df = dask_df.compute()

# Map beer IDs and user names to indices
beer_ids = pandas_df["beer/beerId"].unique()
user_names = pandas_df["review/profileName"].unique()

beer_id_map = {beer_id: idx for idx, beer_id in enumerate(beer_ids)}
user_name_map = {user_name: idx for idx, user_name in enumerate(user_names)}

# Function to parse rating and populate array
def populate_array(partition):
    for row in partition.itertuples(index=False):
        user_idx = user_name_map[row["review/profileName"]]
        beer_idx = beer_id_map[row["beer/beerId"]]
        rating = int(row["rating"].split("/")[0])  # Extracting the rating value
        yield user_idx, beer_idx, rating

# Apply function to each partition of the DataFrame and fill the Dask array
def fill_array(df_partition):
    data = list(populate_array(df_partition))
    if data:
        user_indices, beer_indices, ratings = zip(*data)
        dask_array[user_indices, beer_indices] = ratings

# Use map_partitions to parallelize the operation across partitions
dask_df.map_partitions(fill_array).compute()

# Display Dask array
print(dask_array.compute())

ValueError: Metadata inference failed in `fill_array`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
TypeError('tuple indices must be integers or slices, not str')

Traceback:
---------
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/dataframe/core.py", line 7175, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/tmp/ipykernel_176350/2169674767.py", line 34, in fill_array
    data = list(populate_array(df_partition))
  File "/tmp/ipykernel_176350/2169674767.py", line 27, in populate_array
    user_idx = user_name_map[row["review/profileName"]]


In [13]:

user_index_map = {user: idx for idx, user in enumerate(unique_user_names)}
beer_index_map = {beer: idx for idx, beer in enumerate(unique_beer_ids)}

print(user_index_map)
# Normalize rating values
def parse_rating(rating_str):
    return int(rating_str.split('/')[0])

def process_rating(rating):
    rating
    user_idx = user_index_map[rating["review/profileName"]]
    beer_idx = beer_index_map[rating["beer/beerId"]]
    rating_value = parse_rating(rating["rating"])
    utility_matrix_dask[user_idx, beer_idx] = rating_value

ddf.map_partitions(process_rating).compute()


utility_matrix_np = utility_matrix_dask.compute()



{'TomDecapolis': 0, 'JFGrind': 1, 'CaptainCougar': 2, 'MI2CA': 3, 'madsberg': 4, 'WabashMan': 5, 'paultheguru': 6, 'mgumby10': 7, 'TheBeerGod': 8, 'Optigon': 9, 'mmmbeer': 10, 'DWestrick': 11, 'csbosox': 12, 'jdmhawk': 13, '3fourths': 14, 'Ty5592': 15, 'Duster72': 16, 'jpm30': 17, 'JMFG': 18, 'Beerowulf': 19, 'steelcitybrew': 20, 'thehinge': 21, 'misterbeer': 22, 'lovemyipas': 23, 'CelticBrew': 24, 'jkwalking05': 25, 'omhper': 26, 'TearsforBeers': 27, 'cb': 28, 'WallyWalrus': 29, 'Fratto': 30, 'Elkas': 31, 'jimmack34': 32, 'LarvalChemist': 33, 'PLundsgaard': 34, 'NachlamSie': 35, 'HoppyHoosier': 36, '5000': 37, 'smatty': 38, 'bmcginni': 39, 'JoeTheYounger': 40, 'fiver29': 41, 'Sammy': 42, 'mkofron': 43, 'daknole': 44, 'Schroppfy': 45, 'brewblackhole': 46, 'Rastacouere': 47, 'footbalm': 48, 'muenster': 49, 'peter': 50, 'Lumpy': 51, 'hippie4beer': 52, 'matta': 53, 'RCL': 54, 'ericandersnavy': 55, 'AllAboutStout': 56, 'RKanis': 57, 'Vir4030': 58, 'douglas88': 59, 'dionysus': 60, 'Styles':

ValueError: Metadata inference failed in `process_rating`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
TypeError("unhashable type: 'Series'")

Traceback:
---------
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask_expr/_expr.py", line 3987, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/tmp/ipykernel_120609/1054239308.py", line 11, in process_rating
    user_idx = user_index_map[rating["review/profileName"]]


In [None]:
print(utility_matrix_np)

In [22]:
# def populate_utility_matrix(utility_matrix, beer_index_map, unique_user_names, data_bag):
#   for entry in data_bag:
#     entry_dict = json.loads(entry)

#     beer_id = entry_dict.get("beer/beerId")
#     user_name = entry_dict.get("review/profileName")
#     rating = entry_dict.get("rating")
#     rating = rating.split("/")[0]

#     if beer_id in beer_index_map and user_name in utility_matrix:
#         beer_index = beer_index_map[beer_id]
#         utility_matrix[user_name][beer_index] = rating
#   return utility_matrix

# populated_utility_matrix = populate_utility_matrix(utility_matrix_dask, beer_index_map, unique_user_names, data_bag)


KeyboardInterrupt: 

In [6]:
def get_user(utility_matrix, user):
  return utility_matrix[user]

name = str(input("Select a user"))
user = get_user(populated_utility_matrix, name)
print(user)


[13.  8.  0. ...  0.  0.  0.]


In [15]:
import dask.bag as db
import numpy as np

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

def calculate_similarity(user_data, target_user):
    user, vector = user_data
    similarity = cosine_similarity(vector, target_user)
    return (user, similarity)

def filter_same(user_data, target_user):
    result = 0
    for i in range(len(user_data)):
        if user_data[i] != 0 and target_user[i] != 0:
            result += 1
        if user_data[i] == 0 and target_user[i] == 0:
            result += 1
    return result == len(list)

user_list = list(populated_utility_matrix.items())
print(len(user_list))
user_bag = db.from_sequence(user_list)
filtered_user_bag = user_bag.filter(lambda x: filter_same(x[1], target_user))
print(filtered_user_bag.compute())
target_user = user

similarities_bag = user_bag.map(lambda x: calculate_similarity(x, target_user)).compute()
top_similar_users = sorted(similarities_bag, key=lambda x: x[1], reverse=True)[1:10]

# print("Top 10 most similar users:")
# for similar_user, similarity in top_similar_users:
#     print(f"User: {similar_user}, Similarity: {similarity}")

28437


TypeError: object of type 'type' has no len()

Traceback
---------
  File "/usr/local/lib/python3.10/dist-packages/dask/local.py", line 225, in execute_task
    result = _execute_task(task, data)
  File "/usr/local/lib/python3.10/dist-packages/dask/core.py", line 127, in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
  File "/usr/local/lib/python3.10/dist-packages/dask/bag/core.py", line 1860, in reify
    seq = list(seq)
  File "/tmp/ipykernel_125094/160886648.py", line 28, in <lambda>
  File "/tmp/ipykernel_125094/160886648.py", line 23, in filter_same


In [44]:
# def find_unreviewed_beers(similar_users, query_user, utility_matrix, name):
#     # Beers reviewed by similar users
#     similar_users_beers = set()
#     for user in similar_users:
#         user_vector = utility_matrix[user[0]]
#         for i, rating in enumerate(user_vector):
#             if rating > 0:
#                 similar_users_beers.add(i)

#     # Beers reviewed by the query user
#     query_user_vector = utility_matrix[name]
#     query_user_beers = {i for i, rating in enumerate(query_user_vector) if rating > 0}

#     # Beers reviewed by similar users but not by the query user
#     unreviewed_beers = similar_users_beers - query_user_beers

#     # Convert beer indices back to beer IDs
#     beer_index_map_inv = {v: k for k, v in beer_index_map.items()}
#     unreviewed_beer_ids = [beer_index_map_inv[idx] for idx in unreviewed_beers]

#     return unreviewed_beer_ids

# result = find_unreviewed_beers(top_similar_users, user, populated_utility_matrix, name)
# print(len(result))

# count = 0
# for i in populated_utility_matrix['TomDecapolis']:
#   if i != 0:
#     count += 1

# print(count)

# count = 0
# for i in populated_utility_matrix[name]:
#   if i != 0:
#     count += 1

# print(count)


0


In [52]:
def q1c(similar_users, test_user, populated_utility_matrix, unique_beer_names):
    result = []
    for i in range(len(unique_beer_names)):
        flag = True
        holder = []
        for p in similar_users:
            if populated_utility_matrix[p[0]][i] == 0:
                holder.append(p[0])
                flag = False
            
        if flag:
            result.append(i)

 

    new = []
    for i in range(len(test_user)):
        if test_user[i] == 0 and i in result:
            new.append(i)

    again = []
    for i in unique_beer_names:
        try:
            if int(i) in new:
                again.append(i)
        except ValueError as e:
            print(f"Error: {e} occurred while converting {i} to an integer.")

    recommended_show_names_bag = db.from_sequence(again)
    return recommended_show_names_bag

result = q1c(top_similar_users, user, populated_utility_matrix, unique_beer_names)

print(result.compute())

[('TomDecapolis', 0.596343102191683), ('egajdzis', 0.5929820677848494), ('DocLock', 0.5466473684610734), ('michael-pollack', 0.5190846845358148)]
63836
64125
71719
71715
1470
7721
64126
91592
114513
77833
125204
58511
25286
19690
19692
25951
25285
138014
100381
30306
86870
86866
105681
129598
110497
93821
81138
91689
98380
19694
87520
58510
56572
118474
107733
63156
142974
128815
19691
35731
94962
129305
102023
110443
19689
100955
113770
115220
109897
131184
157915
100954
135578
145726
120267
156970
19646
123015
137916
134179
4140
137983
26363
54890
76622
4141
135419
4138
149529
151588
129662
41770
63796
65833
63795
69151
66008
65835
65465
74323
69248
5985
19007
18196
5988
5977
5984
5982
5978
35550
13655
35580
5983
19006
5980
5981
19554
5987
13671
5979
5976
5986
95253
95246
79264
115618
109364
126255
97702
125702
85087
130479
103745
83556
87862
151891
92331
87861
162988
100020
90044
83555
97703
83306
134555
116348
103746
83310
101128
108068
116349
99723
121430
151890
142650
99724
96133