In [1]:
import urllib.request
filename = 'ratebeer.json.gz'
urllib.request.urlretrieve('https://datarepo.eng.ucsd.edu/mcauley_group/data/beer/ratebeer.json.gz', filename)

('ratebeer.json.gz', <http.client.HTTPMessage at 0x7fdc57892830>)

In [1]:
from dask.distributed import Client, LocalCluster, default_client

try:
    if default_client() is not None:
        default_client().close()
except ValueError:
    pass

cluster = LocalCluster(n_workers=8)
client = Client(cluster)
dashboard_link = client.dashboard_link
print ("Dask Dashboard link: ", dashboard_link)

Dask Dashboard link:  http://127.0.0.1:8787/status


In [2]:
import gzip
import shutil

def unzip_gzip(input_file, output_file):
    with gzip.open(input_file, 'rb') as f_in:
        with open(output_file, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

unzip_gzip('ratebeer.json.gz', 'data.json')

In [3]:
import json

def json_to_jsonlines(input_file):
   jsonHolder =  []
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder.append({"review/profileName": data_dict["review/profileName"], "rating": data_dict['review/overall'], "beer/beerId": data_dict["beer/beerId"]})
       except:
         pass
   return jsonHolder

input_file = 'data.json'
jsonHolder = json_to_jsonlines(input_file)

In [4]:
import json

def get_beer_with_names(input_file):
   jsonHolder =  {}
   with open(input_file, 'r') as input_file:
     for obj in input_file:
       try:
         data_dict = json.loads(obj.replace("'", "\""))
         jsonHolder[data_dict["beer/beerId"]] = data_dict["beer/name"]
       except:
         pass
   return jsonHolder

input_file = 'data.json'
beer_with_name = get_beer_with_names(input_file)

In [5]:
def convert_to_jsonl():
    with open('data.jsonl', 'w') as f:
      for entry in jsonHolder:
        json.dump(entry, f)
        f.write('\n')

convert_to_jsonl()

In [6]:
!head -10 'data.jsonl'

{"review/profileName": "hopdog", "rating": "13/20", "beer/beerId": "63836"}
{"review/profileName": "TomDecapolis", "rating": "13/20", "beer/beerId": "63836"}
{"review/profileName": "hopdog", "rating": "8/20", "beer/beerId": "64125"}
{"review/profileName": "JFGrind", "rating": "14/20", "beer/beerId": "71719"}
{"review/profileName": "egajdzis", "rating": "16/20", "beer/beerId": "71719"}
{"review/profileName": "PhillyBeer2112", "rating": "17/20", "beer/beerId": "71719"}
{"review/profileName": "PhillyBeer2112", "rating": "11/20", "beer/beerId": "71715"}
{"review/profileName": "PhillyBeer2112", "rating": "14/20", "beer/beerId": "1470"}
{"review/profileName": "PhillyBeer2112", "rating": "14/20", "beer/beerId": "7721"}
{"review/profileName": "TomDecapolis", "rating": "12/20", "beer/beerId": "64126"}


In [13]:
import dask.dataframe as dd

# Read the JSON lines file
file_path = 'data.jsonl'

dtype = {
    "review/profileName": "object",
    "rating": "object",  # Keep it as object initially because it needs further processing
    "beer/beerId": "object"  # Initially read as object (string)
}

ddf = dd.read_json(file_path, lines=True, blocksize="15MB", dtype=dtype)


In [8]:
# print(ddf.compute())

       review/profileName rating beer/beerId
0                  hopdog  13/20       63836
1            TomDecapolis  13/20       63836
2                  hopdog   8/20       64125
3                 JFGrind  14/20       71719
4                egajdzis  16/20       71719
...                   ...    ...         ...
198954          rauchbier  15/20       12240
198955       imdownthepub  12/20        5677
198956           SilkTork  13/20        5677
198957             omhper  10/20        5677
198958                Bov  15/20        5677

[2783710 rows x 3 columns]


In [12]:
# import dask
# from dask import bag as db
# import dask.dataframe as dd
# import pandas as pd
# import json
# import numpy as np
# data_bag = db.read_text('data.jsonl', blocksize="1MB")

# ddf = dd.from_pandas(pd.DataFrame([json.loads(entry) for entry in data_bag]), npartitions=8)



KeyboardInterrupt: 



In [14]:
unique_profile_names = ddf['review/profileName'].unique().compute()
unique_beer_ids = ddf['beer/beerId'].unique().compute()



In [15]:
num_profiles = len(unique_profile_names)
num_beers = len(unique_beer_ids)

In [16]:
import dask.array as da

def create_utility_matrix_zeros_dask(unique_names_len, unique_beer_ids_len):
    utility_matrix = da.zeros((unique_names_len, unique_beer_ids_len))
    return utility_matrix

utility_matrix = create_utility_matrix_zeros_dask(num_profiles, num_beers)
print(utility_matrix.shape)

(28437, 109279)


In [17]:
user_index_map = {user: idx for idx, user in enumerate(unique_profile_names)}
beer_index_map = {beer: idx for idx, beer in enumerate(unique_beer_ids)}

In [18]:
# import pandas as pd
# ratings_df = pd.DataFrame(ratings)
# ratings_df["rating"] = ratings_df["rating"].apply(lambda x: int(x.split("/")[0]))

In [11]:
# def update_utility_matrix(row):
#     user_index = user_index_map[row["review/profileName"]]
#     beer_index = beer_index_map[row["beer/beerId"]]
#     rating = row["rating"]
#     utility_matrix[user_index, beer_index] = rating

# ratings_df.apply(update_utility_matrix, axis=1)



KeyboardInterrupt: 

In [21]:
def update_utility(row):
    print(row)
    profile_idx = user_index_map[row['review/profileName'].iloc[0]]
    beer_idx = beer_index_map[row['beer/beerId'].iloc[0]]
    rating = float(row['rating'].split('/')[0])  # Assuming rating is a string like '13/20', taking the numerator
    utility_matrix[profile_idx, beer_idx] = rating

print(ddf.compute())
ddf.map_partitions(update_utility).compute()

       review/profileName rating beer/beerId
0                  hopdog  13/20       63836
1            TomDecapolis  13/20       63836
2                  hopdog   8/20       64125
3                 JFGrind  14/20       71719
4                egajdzis  16/20       71719
...                   ...    ...         ...
198954          rauchbier  15/20       12240
198955       imdownthepub  12/20        5677
198956           SilkTork  13/20        5677
198957             omhper  10/20        5677
198958                Bov  15/20        5677

[2783710 rows x 3 columns]
  review/profileName rating beer/beerId
0                  a      a           a
1               <NA>   <NA>        <NA>


ValueError: Metadata inference failed in `update_utility`.

You have supplied a custom function and Dask is unable to 
determine the type of output that that function returns. 

To resolve this please provide a meta= keyword.
The docstring of the Dask function you ran should have more information.

Original error is below:
------------------------
KeyError('a')

Traceback:
---------
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask_expr/_expr.py", line 3987, in _emulate
    return func(*_extract_meta(args, True), **_extract_meta(kwargs, True))
  File "/tmp/ipykernel_133859/697536015.py", line 3, in update_utility
    profile_idx = user_index_map[row['review/profileName'].iloc[0]]


In [63]:
def parse_rating(rating_str):
    return int(rating_str.split('/')[0])

def process_rating(rating):
    print(rating)
    user_idx = user_index_map[rating["review/profileName"]]
    beer_idx = beer_index_map[rating["beer/beerId"]]
    rating_value = parse_rating(rating["rating"])
    utility_matrix[user_idx, beer_idx] = rating_value

ddf.map(process_rating).compute()


utility_matrix_np = utility_matrix.compute()

foo


ValueError: Metadata inference failed in `map`.

Original error is below:
------------------------
TypeError('string indices must be integers')

Traceback:
---------
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/dataframe/utils.py", line 195, in raise_on_meta_error
    yield
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/dataframe/core.py", line 6851, in elemwise
    meta = partial_by_order(*parts, function=op, other=other)
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/utils.py", line 1476, in partial_by_order
    return function(*args2, **kwargs)
  File "/csse/users/jbr257/.local/lib/python3.10/site-packages/dask/utils.py", line 1241, in __call__
    return getattr(__obj, self.method)(*args, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py", line 10120, in map
    return self.apply(infer).__finalize__(self, "map")
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py", line 10034, in apply
    return op.apply().__finalize__(self, method="apply")
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 837, in apply
    return self.apply_standard()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 963, in apply_standard
    results, res_index = self.apply_series_generator()
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/apply.py", line 979, in apply_series_generator
    results[i] = self.func(v, *self.args, **self.kwargs)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/frame.py", line 10118, in infer
    return x._map_values(func, na_action=na_action)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/base.py", line 921, in _map_values
    return algorithms.map_array(arr, mapper, na_action=na_action, convert=convert)
  File "/usr/local/lib/python3.10/dist-packages/pandas/core/algorithms.py", line 1814, in map_array
    return lib.map_infer(values, mapper, convert=convert)
  File "lib.pyx", line 2920, in pandas._libs.lib.map_infer
  File "/tmp/ipykernel_110032/1802243121.py", line 6, in process_rating
    user_idx = user_index_map[rating["review/profileName"]]
