In [2]:
import gzip
from collections import defaultdict
import pickle

In [3]:
def parse(path):
    """Yield one record per line of a gzip-compressed text file.

    Each line is evaluated as a Python expression and the resulting object
    is yielded, so the file is streamed instead of being loaded at once.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file to read (opened in text mode).

    Yields
    ------
    object
        The value produced by evaluating each line.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed; previously the file was left open.
    with gzip.open(path, 'rt') as g:
        for l in g:
            # SECURITY: eval() executes whatever code the file contains.
            # Only run this on trusted data; if every line is a plain
            # literal, ast.literal_eval would be a safer replacement.
            yield eval(l)

# Integer ids for users and beers, handed out in first-seen order.
userIDs = {}
itemIDs = {}
# One (time, user, item, rating) tuple per usable review.
interactions = []

for review in parse("ratebeer.json.gz"):
    # Skip any record that is missing one of the fields read below.
    if not all(field in review for field in ('review/profileName', 'beer/beerId',
                                              'review/time', 'review/overall')):
        continue

    user = review['review/profileName']
    beer = review['beer/beerId']
    # setdefault only assigns a new id the first time a key is seen.
    userIDs.setdefault(user, len(userIDs))
    itemIDs.setdefault(beer, len(itemIDs))

    timestamp = int(review['review/time'])
    # 'review/overall' is split on '/' and stored as first part / second part.
    parts = review['review/overall'].split('/')
    rating = float(parts[0]) / float(parts[1])
    interactions.append((timestamp, user, beer, rating))

# Tuples compare element by element, so this orders by time first and uses
# user, item and rating as tie-breakers.
interactions.sort()

# Hold out the final 10% of the sorted interaction list as the test set;
# everything before the cut is the "used" portion.
test_idx = int(len(interactions) * 0.9)
interactions_used, interactions_test = interactions[:test_idx], interactions[test_idx:]

# Group each portion by user: user -> [(time, item, rating), ...]
interactionsPerUser_test = defaultdict(list)
interactionsPerUser_used = defaultdict(list)
for rows, grouped in ((interactions_test, interactionsPerUser_test),
                      (interactions_used, interactionsPerUser_used)):
    for when, who, beer, score in rows:
        grouped[who].append((when, beer, score))

# Split each user's "used" history into a train part and a verify part.
# Flat lists hold (time, user, item, rating); the per-user dicts hold
# user -> [(time, item, rating), ...].
interactions_train = []
interactions_ver = []

interactionsPerUser_train = defaultdict(list)
interactionsPerUser_ver = defaultdict(list)

# The loop variable is deliberately NOT named `interactions`: that name is the
# module-level list holding the full dataset, and rebinding it here would leave
# it pointing at a single user's history once the loop finishes.
for user, user_history in interactionsPerUser_used.items():
    # First 80% of this user's entries go to train, the rest to verify.
    # int() truncates, so a user with a single entry ends up with an empty
    # train list and that entry in verify.
    train_idx = int(len(user_history) * 0.8)
    train_part = user_history[:train_idx]
    ver_part = user_history[train_idx:]

    interactionsPerUser_train[user] = train_part
    interactionsPerUser_ver[user] = ver_part
    interactions_train.extend((t, user, i, r) for t, i, r in train_part)
    interactions_ver.extend((t, user, i, r) for t, i, r in ver_part)

In [4]:
# summary of the data
# Now we have 3 sets of data: train, verify, test
# test: 10% of the entire data (292417) --> interactionsPerUser_test, interactions_test

# used: 90% of the entire data (2631746 = 2091056 + 540690) --> interactionsPerUser_used, interactions_used
#   - train: first ~80% of each user's used interactions (2091056) --> interactionsPerUser_train, interactions_train
#   - verify: remaining ~20% of each user's used interactions (540690) --> interactionsPerUser_ver, interactions_ver

# format of the data
# interactionsPerUser_xxx: [user] = [(time, item, rating)]
# interactions_xxx: (time, user, item, rating)

In [5]:
# Integer ids restricted to users/items that occur in the training split,
# assigned in first-seen order.
userIDs_train = {}
itemIDs_train = {}
for t, u, i, r in interactions_train:
    userIDs_train.setdefault(u, len(userIDs_train))
    itemIDs_train.setdefault(i, len(itemIDs_train))

# Reserve an id for the 'dummy' placeholder item (used as the "previous item"
# when a user has no earlier interaction).
# setdefault keeps this cell idempotent: `itemIDs` survives across cell runs,
# so a plain `itemIDs['dummy'] = len(itemIDs)` would move the dummy id from
# N to N+1 the second time the cell is executed.
itemIDs_train.setdefault('dummy', len(itemIDs_train))
itemIDs.setdefault('dummy', len(itemIDs))
# Rows of (time, user, item, lastItem, rating) for each split; lastItem is the
# item from the user's preceding row, or 'dummy' when there is none.
interactionsWithPrevious = []
interactionsWithPrevious_ver = []
interactionsWithPrevious_test = []

# Training rows: every user's chain starts from the 'dummy' placeholder.
for u, history in interactionsPerUser_train.items():
    previous = 'dummy'
    for when, beer, score in history:
        interactionsWithPrevious.append((when, u, beer, previous, score))
        previous = beer

# Verify rows: the chain continues from the user's last training item, or from
# 'dummy' when the user has no training interactions.
for u, ver_history in interactionsPerUser_ver.items():
    # .get() does not insert a key into the defaultdict for unknown users.
    train_history = interactionsPerUser_train.get(u, [])
    previous = train_history[-1][1] if train_history else 'dummy'

    for when, beer, score in ver_history:
        interactionsWithPrevious_ver.append((when, u, beer, previous, score))
        previous = beer

# Test rows: seed each user's chain with their most recent earlier item --
# the last verify item if there is one, otherwise the last train item,
# otherwise the 'dummy' placeholder.
# The loop variable is deliberately NOT named `interactions`, so the
# module-level list of all interactions is not rebound by this loop.
for user, test_history in interactionsPerUser_test.items():
    # .get() avoids inserting keys into the defaultdicts for unseen users.
    ver_history = interactionsPerUser_ver.get(user)
    train_history = interactionsPerUser_train.get(user)
    if ver_history:
        lastItem = ver_history[-1][1]
    elif train_history:
        lastItem = train_history[-1][1]
    else:
        lastItem = 'dummy'

    for (t, i, r) in test_history:
        interactionsWithPrevious_test.append((t, user, i, lastItem, r))
        lastItem = i

In [6]:
# Preview of the training rows, each shaped
# (time, user, item, lastItem, rating): first three rows, then the row count.
for k in range(3):
    print(interactionsWithPrevious[k])
print(len(interactionsWithPrevious))

# Same preview for the verify rows.
for k in range(3):
    print(interactionsWithPrevious_ver[k])
print(len(interactionsWithPrevious_ver))

(955497600, 'billb', '132', 'dummy', 0.85)
(956448000, 'billb', '544', '132', 0.85)
(956448000, 'billb', '547', '544', 0.5)
2091056
(1010534400, 'billb', '2090', '10325', 0.8)
(1010707200, 'billb', '4082', '2090', 0.7)
(1010966400, 'billb', '7323', '4082', 0.85)
540690


In [7]:
# user -> set of items that user interacted with, one mapping per split.
itemsPerUser_train = defaultdict(set)
itemsPerUser_ver = defaultdict(set)
itemsPerUser_test = defaultdict(set)

for rows, seen in ((interactions_train, itemsPerUser_train),
                   (interactions_ver, itemsPerUser_ver),
                   (interactions_test, itemsPerUser_test)):
    # Rows are (time, user, item, rating); only user and item are needed.
    for _, who, beer, _ in rows:
        seen[who].add(beer)

# beerIDs: the keys of the id maps, in insertion order.
items_train = list(itemIDs_train)
items = list(itemIDs)

In [9]:
# Bundle every prepared structure into one dict and pickle it to disk.
beer_variables = dict(
    # id maps: xxxIDs[name] = integer id
    userIDs=userIDs,
    itemIDs=itemIDs,
    userIDs_train=userIDs_train,
    itemIDs_train=itemIDs_train,

    # flat lists of (time, user, item, rating)
    interactions_test=interactions_test,
    interactions_used=interactions_used,
    interactions_train=interactions_train,
    interactions_ver=interactions_ver,

    # per-user histories: user -> [(time, item, rating), ...]
    interactionsPerUser_test=interactionsPerUser_test,
    interactionsPerUser_used=interactionsPerUser_used,
    interactionsPerUser_train=interactionsPerUser_train,
    interactionsPerUser_ver=interactionsPerUser_ver,

    # lists of (time, user, item, lastItem, rating)
    interactionsWithPrevious=interactionsWithPrevious,
    interactionsWithPrevious_ver=interactionsWithPrevious_ver,
    interactionsWithPrevious_test=interactionsWithPrevious_test,

    # user -> set of items
    itemsPerUser_train=itemsPerUser_train,
    itemsPerUser_ver=itemsPerUser_ver,
    itemsPerUser_test=itemsPerUser_test,

    # lists of item keys
    items_train=items_train,
    items=items,
)

with open('beer_variables.pkl', 'wb') as fout:
    pickle.dump(beer_variables, fout)