#Sample project
In this abbreviated study, we answer the question:
What are the most common (non stop-word) words associated with restaurant reviews that are 1, 2, 3, 4, and 5 star?

Method: we will compute word frequencies after first removing all words in NLTK's stop word category. We will then group by restaurant star rating.

##Download data
We will use a filtered dataset collected by from Google Maps (see https://cseweb.ucsd.edu/~jmcauley/datasets.html#google_restaurants)


In [1]:
import urllib.request
filename = 'ratebeer.json.gz'
urllib.request.urlretrieve('https://datarepo.eng.ucsd.edu/mcauley_group/data/beer/ratebeer.json.gz', filename)

('ratebeer.json.gz', <http.client.HTTPMessage at 0x7fd4aa2f3bb0>)

In [24]:
import gzip
import shutil

def unzip_gzip(input_file, output_file):
    with gzip.open(input_file, 'rb') as f_in:
        with open(output_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

unzip_gzip('ratebeer.json.gz', 'data.json')

In [25]:
import json

def json_to_jsonlines(input_file):
   jsonHolder =  []
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder.append({"review/profileName": data_dict["review/profileName"], "rating": data_dict['review/overall'], "beer/beerId": data_dict["beer/beerId"]})
       except:
         pass
   return jsonHolder

input_file = 'data.json'
jsonHolder = json_to_jsonlines(input_file)


In [11]:
import json

def get_berr_with_names(input_file):
   jsonHolder =  {}
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder[data_dict["beer/beerId"]] = data_dict["beer/name"]
       except:
         pass
   return jsonHolder

input_file = 'data.json'
beer_with_name = get_berr_with_names(input_file)

In [29]:
def convert_to_jsonl():
    with open('data.jsonl', 'w') as f:
      for entry in jsonHolder:
        json.dump(entry, f)
        f.write('\n')

convert_to_jsonl()

In [30]:
#shows a sample of the first and last 10 extracted reviews
!head -10 'data.jsonl'

{"review/profileName": "hopdog", "rating": "13/20", "beer/beerId": "63836"}
{"review/profileName": "TomDecapolis", "rating": "13/20", "beer/beerId": "63836"}
{"review/profileName": "hopdog", "rating": "8/20", "beer/beerId": "64125"}
{"review/profileName": "JFGrind", "rating": "14/20", "beer/beerId": "71719"}
{"review/profileName": "egajdzis", "rating": "16/20", "beer/beerId": "71719"}
{"review/profileName": "PhillyBeer2112", "rating": "17/20", "beer/beerId": "71719"}
{"review/profileName": "PhillyBeer2112", "rating": "11/20", "beer/beerId": "71715"}
{"review/profileName": "PhillyBeer2112", "rating": "14/20", "beer/beerId": "1470"}
{"review/profileName": "PhillyBeer2112", "rating": "14/20", "beer/beerId": "7721"}
{"review/profileName": "TomDecapolis", "rating": "12/20", "beer/beerId": "64126"}


## Loading the data into a Dask Dataframe
See https://docs.dask.org/en/latest/generated/dask.dataframe.read_json.html

In [1]:
import dask
from dask import bag as db
import dask.dataframe as dd
import pandas as pd
import json
import numpy as np
data_bag = db.read_text('data.jsonl', blocksize="1MB")

ddf = dd.from_pandas(pd.DataFrame([json.loads(entry) for entry in data_bag]), npartitions=1)

print("First three items of data_bag:")
print(data_bag.take(3))

# Print the first three items of the Dask DataFrame
print("\nFirst three items of ddf:")
print(ddf.head(3))

First three items of data_bag:
('{"review/profileName": "hopdog", "rating": "13/20", "beer/beerId": "63836"}\n', '{"review/profileName": "TomDecapolis", "rating": "13/20", "beer/beerId": "63836"}\n', '{"review/profileName": "hopdog", "rating": "8/20", "beer/beerId": "64125"}\n')

First three items of ddf:
  review/profileName rating beer/beerId
0             hopdog  13/20       63836
1       TomDecapolis  13/20       63836
2             hopdog   8/20       64125


In [2]:
unique_beer_names = ddf['beer/beerId'].unique().compute().tolist()
beer_index_map = {beer: idx for idx, beer in enumerate(unique_beer_names)}

print(f"Unique Beer Names: {unique_beer_names}")
print(len(unique_beer_names))

Unique Beer Names: ['63836', '64125', '71719', '71715', '1470', '7721', '64126', '91592', '114513', '77833', '125204', '58511', '25286', '19690', '19692', '25951', '25285', '138014', '100381', '30306', '86870', '86866', '105681', '129598', '110497', '93821', '81138', '91689', '98380', '19694', '87520', '58510', '56572', '118474', '107733', '63156', '142974', '128815', '19691', '35731', '94962', '129305', '102023', '110443', '19689', '100955', '113770', '115220', '109897', '131184', '157915', '100954', '135578', '145726', '120267', '156970', '19646', '123015', '137916', '134179', '4140', '137983', '26363', '54890', '76622', '4141', '135419', '4138', '149529', '151588', '129662', '41770', '63796', '65833', '63795', '69151', '66008', '65835', '65465', '74323', '69248', '5985', '19007', '18196', '5988', '5977', '5984', '5982', '5978', '35550', '13655', '35580', '5983', '19006', '5980', '5981', '19554', '5987', '13671', '5979', '5976', '5986', '95253', '95246', '79264', '115618', '109364', 

In [3]:
unique_user_names = ddf['review/profileName'].unique().compute().tolist()
print(f"Unique User Names: {unique_user_names}")
print(len(unique_user_names))


Unique User Names: ['hopdog', 'TomDecapolis', 'JFGrind', 'egajdzis', 'PhillyBeer2112', 'JJClark', 'CaptainCougar', 'StFun', 'MI2CA', 'vyvvy', 'kp', 'xmarcnolanx', 'Ughsmash', 'fiulijn', 'Ungstrup', 'yespr', 'Theis', 'Dedollewaitor', 'joergen', 'KimJohansen', 'madsberg', 'Papsoe', 'tjthresh', 'awiseman01', 'BBB63', 'Cornfield', 'merlin48', 'WabashMan', 'paultheguru', 'bu11zeye', 'thirdeye11', 'travita', 'mar', 'kramer', 'Cavie', 'blutt59', 'BMan1113VR', 'Maltajo', 'jason', 'bitbucket', 'dchmela', 'scrizzz', 'durhambeer', 'ucusty', 'emacgee', 'Dorwart', 'FlacoAlto', 'alexsdad06', 'after4ever', 'JCB', 'otakuden', 'hopscotch', 'Sparky', 'jsquire', 'jcwattsrugger', 'smith4498', 'Immy', 'mgumby10', 'TheBeerGod', 'Drake', 'LooseCannon', 'decaturstevo', 'EithCubes', 'thedm', 'Tmoney99', 'brentfeesh', 'heemer77', 'puzzl', 'Suttree', 'shp555', 'beerguy101', 'alexanderj', 'JohnC', 'Taverner', 'MoDog', 'Optigon', 'Kevster', 'Acknud', 'goldtwins', 'hotstuff', 'beastiefan2k', 'JoeMcPhee', 'golubj', 

In [4]:
def create_utility_matrix_zeros(unique_names_list, unique_beer_names):
  user_arrays = {user: np.zeros(len(unique_beer_names)) for user in unique_names_list}
  return user_arrays

utility_matrix = create_utility_matrix_zeros(unique_user_names, unique_beer_names)

In [5]:
def populate_utility_matrix(utility_matrix, beer_index_map, unique_user_names, data_bag):
  for entry in data_bag:
    entry_dict = json.loads(entry)

    beer_id = entry_dict.get("beer/beerId")
    user_name = entry_dict.get("review/profileName")
    rating = entry_dict.get("rating")
    rating = rating.split("/")[0]

    if beer_id in beer_index_map and user_name in utility_matrix:
        beer_index = beer_index_map[beer_id]
        utility_matrix[user_name][beer_index] = rating
  return utility_matrix

populated_utility_matrix = populate_utility_matrix(utility_matrix, beer_index_map, unique_user_names, data_bag)


In [6]:
def get_user(utility_matrix, user):
  return utility_matrix[user]

name = str(input("Select a user"))
user = get_user(populated_utility_matrix, name)
print(user)


[13.  8.  0. ...  0.  0.  0.]


In [15]:
import dask.bag as db
import numpy as np

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

def calculate_similarity(user_data, target_user):
    user, vector = user_data
    similarity = cosine_similarity(vector, target_user)
    return (user, similarity)

def filter_same(user_data, target_user):
    result = 0
    for i in range(len(user_data)):
        if user_data[i] != 0 and target_user[i] != 0:
            result += 1
        if user_data[i] == 0 and target_user[i] == 0:
            result += 1
    return result == len(list)

user_list = list(populated_utility_matrix.items())
print(len(user_list))
user_bag = db.from_sequence(user_list)
filtered_user_bag = user_bag.filter(lambda x: filter_same(x[1], target_user))
print(filtered_user_bag.compute())
target_user = user

similarities_bag = user_bag.map(lambda x: calculate_similarity(x, target_user)).compute()
top_similar_users = sorted(similarities_bag, key=lambda x: x[1], reverse=True)[1:10]

# print("Top 10 most similar users:")
# for similar_user, similarity in top_similar_users:
#     print(f"User: {similar_user}, Similarity: {similarity}")

28437


TypeError: object of type 'type' has no len()

Traceback
---------
  File "/usr/local/lib/python3.10/dist-packages/dask/local.py", line 225, in execute_task
    result = _execute_task(task, data)
  File "/usr/local/lib/python3.10/dist-packages/dask/core.py", line 127, in _execute_task
    return func(*(_execute_task(a, cache) for a in args))
  File "/usr/local/lib/python3.10/dist-packages/dask/bag/core.py", line 1860, in reify
    seq = list(seq)
  File "/tmp/ipykernel_125094/160886648.py", line 28, in <lambda>
  File "/tmp/ipykernel_125094/160886648.py", line 23, in filter_same


In [44]:
# def find_unreviewed_beers(similar_users, query_user, utility_matrix, name):
#     # Beers reviewed by similar users
#     similar_users_beers = set()
#     for user in similar_users:
#         user_vector = utility_matrix[user[0]]
#         for i, rating in enumerate(user_vector):
#             if rating > 0:
#                 similar_users_beers.add(i)

#     # Beers reviewed by the query user
#     query_user_vector = utility_matrix[name]
#     query_user_beers = {i for i, rating in enumerate(query_user_vector) if rating > 0}

#     # Beers reviewed by similar users but not by the query user
#     unreviewed_beers = similar_users_beers - query_user_beers

#     # Convert beer indices back to beer IDs
#     beer_index_map_inv = {v: k for k, v in beer_index_map.items()}
#     unreviewed_beer_ids = [beer_index_map_inv[idx] for idx in unreviewed_beers]

#     return unreviewed_beer_ids

# result = find_unreviewed_beers(top_similar_users, user, populated_utility_matrix, name)
# print(len(result))

# count = 0
# for i in populated_utility_matrix['TomDecapolis']:
#   if i != 0:
#     count += 1

# print(count)

# count = 0
# for i in populated_utility_matrix[name]:
#   if i != 0:
#     count += 1

# print(count)


0


In [52]:
def q1c(similar_users, test_user, populated_utility_matrix, unique_beer_names):
    result = []
    for i in range(len(unique_beer_names)):
        flag = True
        holder = []
        for p in similar_users:
            if populated_utility_matrix[p[0]][i] == 0:
                holder.append(p[0])
                flag = False
            
        if flag:
            result.append(i)

 

    new = []
    for i in range(len(test_user)):
        if test_user[i] == 0 and i in result:
            new.append(i)

    again = []
    for i in unique_beer_names:
        try:
            if int(i) in new:
                again.append(i)
        except ValueError as e:
            print(f"Error: {e} occurred while converting {i} to an integer.")

    recommended_show_names_bag = db.from_sequence(again)
    return recommended_show_names_bag

result = q1c(top_similar_users, user, populated_utility_matrix, unique_beer_names)

print(result.compute())

[('TomDecapolis', 0.596343102191683), ('egajdzis', 0.5929820677848494), ('DocLock', 0.5466473684610734), ('michael-pollack', 0.5190846845358148)]
63836
64125
71719
71715
1470
7721
64126
91592
114513
77833
125204
58511
25286
19690
19692
25951
25285
138014
100381
30306
86870
86866
105681
129598
110497
93821
81138
91689
98380
19694
87520
58510
56572
118474
107733
63156
142974
128815
19691
35731
94962
129305
102023
110443
19689
100955
113770
115220
109897
131184
157915
100954
135578
145726
120267
156970
19646
123015
137916
134179
4140
137983
26363
54890
76622
4141
135419
4138
149529
151588
129662
41770
63796
65833
63795
69151
66008
65835
65465
74323
69248
5985
19007
18196
5988
5977
5984
5982
5978
35550
13655
35580
5983
19006
5980
5981
19554
5987
13671
5979
5976
5986
95253
95246
79264
115618
109364
126255
97702
125702
85087
130479
103745
83556
87862
151891
92331
87861
162988
100020
90044
83555
97703
83306
134555
116348
103746
83310
101128
108068
116349
99723
121430
151890
142650
99724
96133