In [2]:
import json, sys, os, gc, ast
import pandas as pd
from tqdm import tqdm
from collections import defaultdict, Counter

# Name of the dataset; it selects the sub-directory of ../data/ used by both paths below.
project = 'AmazonReviews2018'

# Raw dumps are read from the "raw" sub-directory, processed files are written next to it.
path_to_raw = f'../data/{project}/raw/'
path_to_processed = f'../data/{project}/'

# Domain name -> file name of that domain's raw review dump.
domains = dict(
    Books='Books.json',
    Movies='Movies_and_TV.json',
)

In [3]:
# this cell is required for colab only
#
# Mounts Google Drive and re-roots path_to_raw / path_to_processed onto the
# project folder inside the mounted drive.

from google.colab import drive
drive.mount('/content/drive')

colab_root = './drive/MyDrive/Science/Multidomain RecSys/'

# Rewrite a path only while it still carries the local '../' prefix. Without this
# guard, running the cell a second time would slice three more characters off the
# already rewritten path and silently point at a wrong location.
if path_to_raw.startswith('../'):
    path_to_raw = colab_root + path_to_raw[3:]
if path_to_processed.startswith('../'):
    path_to_processed = colab_root + 'source/iz-dev/' + path_to_processed[3:]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
for d in tqdm(domains):
    # part 1: reduce every raw review to three fields (user id, item id, timestamp)
    # and store the result as one JSON object per line in a per-domain tmp file.
    new_dir = path_to_processed + d
    # exist_ok makes the separate isdir() check unnecessary and keeps re-runs harmless.
    os.makedirs(new_dir, exist_ok=True)

    path_to_json = path_to_raw+d+'/'+domains[d]
    path_to_tmp = new_dir+'/tmp_'+domains[d]

    # Mode 'w' already discards the output of an earlier run, so the file does not
    # have to be emptied first and then reopened for appending.
    with open(path_to_json, 'r') as in_f, open(path_to_tmp, 'w') as out_f:
        for line in in_f:
            review = json.loads(line)
            record = {
                'user_id': review['reviewerID'],
                'item_id': review['asin'],
                'timestamp': review['unixReviewTime']
            }
            out_f.write(json.dumps(record)+'\n')

In [8]:
for d in tqdm(domains):
    new_dir = path_to_processed + d
    path_to_json = new_dir +'/tmp_'+domains[d]

    # part 2: count how many interactions every user and every item has in the tmp file.
    print('\n...part 2...')
    user_interactions_count = {}
    item_interactions_count = {}

    with open(path_to_json, 'r') as f:
        for line in f:
            line = json.loads(line)
            user_id = line['user_id']
            item_id = line['item_id']
            user_interactions_count[user_id] = user_interactions_count.get(user_id, 0) + 1
            item_interactions_count[item_id] = item_interactions_count.get(item_id, 0) + 1

    # Minimum number of interactions a user / an item needs in order to be kept.
    threshold = 5
    good_users = {user_id for user_id, count in user_interactions_count.items() if count >= threshold}
    good_items = {item_id for item_id, count in item_interactions_count.items() if count >= threshold}

    del user_interactions_count, item_interactions_count
    gc.collect()


    # part 3: collect the interactions whose user AND item both passed the first cut.
    print('...part 3...')
    user_history = defaultdict(list)
    item_history = defaultdict(list)

    with open(path_to_json, 'r') as f:
        for line in f:
            line = json.loads(line)

            user_raw_id = line['user_id']
            item_raw_id = line['item_id']

            if user_raw_id in good_users and item_raw_id in good_items:
                user_history[user_raw_id].append(item_raw_id)
                item_history[item_raw_id].append(user_raw_id)

    del good_users, good_items
    gc.collect()


    # part 4: removing items can push users below the threshold and vice versa, so
    # the filter is repeated until the sizes of both surviving sets stop changing.
    print('...part 4...')
    is_changed = True
    good_users = set()
    good_items = set()

    while is_changed:
        old_state = (len(good_users), len(good_items))

        good_users = {user_id for user_id, events in user_history.items() if len(events) >= threshold}
        good_items = {item_id for item_id, events in item_history.items() if len(events) >= threshold}

        # Entries that failed the threshold are dropped right away: their histories can
        # only get shorter in later passes, so they can never qualify again, and
        # keeping them would only cost memory and time on every further pass.
        user_history = {
            user_id: [item_id for item_id in events if item_id in good_items]
            for user_id, events in user_history.items()
            if user_id in good_users
        }

        item_history = {
            item_id: [user_id for user_id in events if user_id in good_users]
            for item_id, events in item_history.items()
            if item_id in good_items
        }

        new_state = (len(good_users), len(good_items))
        is_changed = (old_state != new_state)
        print(old_state, new_state)

    # No loop variable is deleted here: such a name is unbound when the histories
    # are empty, and deleting it would then raise NameError.
    del user_history, item_history
    gc.collect()

    # The sets are stored as their Python repr and read back with ast.literal_eval
    # in part 5.
    # NOTE(review): an empty set is written as 'set()', which ast.literal_eval only
    # accepts on Python 3.9+ - confirm the runtime if empty results are possible.
    with open(new_dir +'/tmp_good_users.txt', 'w') as f:
        f.write(str(good_users))

    with open(new_dir +'/tmp_good_items.txt', 'w') as f:
        f.write(str(good_items))

    del good_users, good_items
    gc.collect()

In [None]:
# Running total of processed lines across all domains; used only for progress messages.
lines_counter = 0
for d in tqdm(domains):
    new_dir = path_to_processed + d
    path_to_json = new_dir +'/tmp_'+domains[d]

    # part 5: rebuild the per-user and per-item histories, this time with timestamps,
    # restricted to the users and items stored by part 4.
    print('\n...part 5...')
    user_history = defaultdict(list)
    item_history = defaultdict(list)
    with open(new_dir +'/tmp_good_users.txt', 'r') as f:
        good_users =  ast.literal_eval(f.read())
    with open(new_dir +'/tmp_good_items.txt', 'r') as f:
        good_items =  ast.literal_eval(f.read())

    with open(path_to_json, 'r') as f:
        for line in f:
            line = json.loads(line)

            user_raw_id = line['user_id']
            item_raw_id = line['item_id']
            interaction_timestamp = line['timestamp']

            if user_raw_id in good_users and item_raw_id in good_items:
                user_history[user_raw_id].append([item_raw_id, interaction_timestamp])
                item_history[item_raw_id].append([user_raw_id, interaction_timestamp])

            lines_counter += 1
            if lines_counter % 1000000 == 0:
                print('...%s lines processed...'%(str(lines_counter)))

    # Mode 'w' instead of 'a': appending would stack the output of every re-run on
    # top of stale lines from earlier runs. The line format itself is unchanged
    # (a JSON string '<id>:<list repr>') because part 6 parses exactly that.
    with open(new_dir +'/tmp_user_history.json', 'w') as f:
        for user_id,history in user_history.items():
            f.write(json.dumps(str(user_id)+':'+str(history))+'\n')

    with open(new_dir +'/tmp_item_history.json', 'w') as f:
        for item_id,history in item_history.items():
            f.write(json.dumps(str(item_id)+':'+str(history))+'\n')

    # Release the large per-domain structures before the next domain is loaded.
    del good_users, good_items
    del user_history, item_history
    gc.collect()

In [None]:
for d in tqdm(domains):
    new_dir = path_to_processed + d
    path_to_json = new_dir +'/tmp_'+domains[d]

    # part 6: load the histories written by part 5, replace raw ids with consecutive
    # integers starting at 1, and order every history by timestamp.
    print('...part 6...')
    user_history = {}
    item_history = {}

    with open(new_dir +'/tmp_user_history.json', 'r') as f:
        for line in f:
            # Split on the first ':' only, so a ':' inside the serialized history
            # cannot cut the payload short.
            raw_id, raw_events = json.loads(line).split(':', 1)
            user_history[raw_id] = ast.literal_eval(raw_events)

    with open(new_dir +'/tmp_item_history.json', 'r') as f:
        for line in f:
            raw_id, raw_events = json.loads(line).split(':', 1)
            item_history[raw_id] = ast.literal_eval(raw_events)

    # Minimum history length required for a user / an item to be kept.
    threshold = 5
    user_mapping = {}  # raw user id -> consecutive integer id
    item_mapping = {}  # raw item id -> consecutive integer id
    tmp_user_history = {}
    tmp_item_history = {}

    for user_id, events in tqdm(user_history.items()):
        processed_history = []

        for item_id, item_timestamp in events:
            # setdefault hands out the next free integer the first time an id is seen.
            processed_item_id = item_mapping.setdefault(item_id, len(item_mapping) + 1)
            processed_history.append([processed_item_id, item_timestamp])

        if len(processed_history) >= threshold:
            processed_user_id = user_mapping.setdefault(user_id, len(user_mapping) + 1)
            tmp_user_history[processed_user_id] = sorted(processed_history, key=lambda x: x[1])

    # NOTE(review): the item-side result built below is never written to disk in this
    # cell, and both mappings are discarded afterwards - confirm whether it is still needed.
    for item_id, events in tqdm(item_history.items()):
        processed_history = []

        for user_id, user_timestamp in events:
            processed_user_id = user_mapping.setdefault(user_id, len(user_mapping) + 1)
            processed_history.append([processed_user_id, user_timestamp])

        if len(processed_history) >= threshold:
            processed_item_id = item_mapping.setdefault(item_id, len(item_mapping) + 1)
            tmp_item_history[processed_item_id] = sorted(processed_history, key=lambda x: x[1])

    user_history = tmp_user_history
    item_history = tmp_item_history

    # Loop variables are deliberately not deleted here: they are unbound when the
    # input files are empty, and deleting them would then raise NameError.
    del user_mapping, item_mapping
    del tmp_user_history, tmp_item_history
    gc.collect()


    # part 7: write one line per user: the user id followed by the user's item ids
    # in timestamp order, separated by spaces.
    print('...part 7...')
    for output_name in ['all_data.txt']:
        with open(new_dir+'/'+output_name, 'w') as f:
            # The loop variable has its own name so it no longer overwrites the
            # item_history dict. The events were already sorted by timestamp in
            # part 6 (stable sort, same key), so they are not sorted again.
            for user_id, user_events in user_history.items():
                f.write(' '.join([str(user_id)] + [str(event[0]) for event in user_events]))
                f.write('\n')

    del user_history, item_history
    gc.collect()