In [1]:
import os
import sys
import json
import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from collections import defaultdict, Counter

%matplotlib inline

In [2]:
path_to_df = '../data/Sports/Sports_and_Outdoors_5.json'

# Column-wise accumulator with the three output columns declared up front, so the
# resulting frame always has the same schema (even if the file holds no reviews).
review_columns = {'user_id': [], 'item_id': [], 'timestamp': []}

with open(path_to_df, 'r') as f:
    # Iterate over the file object itself: lines are streamed one at a time
    # instead of loading the whole file into memory via readlines().
    for line in f:
        if not line.strip():
            # Skip blank lines (e.g. a trailing newline) rather than failing in json.loads.
            continue
        review = json.loads(line)
        review_columns['user_id'].append(review['reviewerID'])
        review_columns['item_id'].append(review['asin'])
        review_columns['timestamp'].append(review['unixReviewTime'])

print(f'Number of events: {len(review_columns["user_id"])}')

# `df` is only ever bound to the DataFrame; the raw accumulator keeps its own name.
df = pd.DataFrame.from_dict(review_columns)

Number of events: 296337


In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,user_id,item_id,timestamp
0,AIXZKN4ACSKI,1881509818,1390694400
1,A1L5P841VIO02V,1881509818,1328140800
2,AB2W04NI4OEAD,1881509818,1330387200
3,A148SVSWKTJKU6,1881509818,1328400000
4,AAAWJ6LW9WMOO,1881509818,1366675200


In [4]:
# Count missing values per column (isna is the canonical name for isnull).
df.isna().sum()

user_id      0
item_id      0
timestamp    0
dtype: int64

In [5]:
# Largest raw user id (string comparison) and the shape of the distinct-id array.
df['user_id'].max(), df['user_id'].unique().shape

('AZZXSM5W248P8', (35598,))

In [6]:
# Replace raw reviewer ids with integer codes; factorize yields 0-based codes, so shift to start at 1.
df['user_id'] = pd.factorize(df['user_id'])[0] + 1
# Sanity check: smallest code, largest code, and number of distinct codes.
df['user_id'].min(), df['user_id'].max(), df['user_id'].unique().shape

(1, 35598, (35598,))

In [7]:
# Replace raw item ids (asin) with integer codes; factorize yields 0-based codes, so shift to start at 1.
df['item_id'] = pd.factorize(df['item_id'])[0] + 1
# Sanity check: smallest code, largest code, and number of distinct codes.
df['item_id'].min(), df['item_id'].max(), df['item_id'].unique().shape

(1, 18357, (18357,))

In [8]:
# Inspect the first five rows now that user_id and item_id hold integer codes.
df.head(n=5)

Unnamed: 0,user_id,item_id,timestamp
0,1,1,1390694400
1,2,1,1328140800
2,3,1,1330387200
3,4,1,1328400000
4,5,1,1366675200


In [9]:
# Number of distinct users in the frame.
df['user_id'].nunique()

35598

In [10]:
# Number of distinct items in the frame.
df['item_id'].nunique()

18357

In [11]:
# Show the first five rows of the frame once more.
df.head(n=5)

Unnamed: 0,user_id,item_id,timestamp
0,1,1,1390694400
1,2,1,1328140800
2,3,1,1330387200
3,4,1,1328400000
4,5,1,1366675200


In [12]:
# Turn the frame into a list of plain-Python event dicts (one per interaction).
#
# The three columns are zipped together instead of calling DataFrame.iterrows():
# iterrows() constructs a Series for every row, which is needlessly slow and may
# change dtypes across the row. Passing `total` lets tqdm render a real progress
# bar instead of a bare counter.
data = [
    {
        'user_id': int(user_id),      # int() converts numpy integers to built-in ints
        'item_id': int(item_id),
        'timestamp': int(timestamp),
    }
    for user_id, item_id, timestamp in tqdm(
        zip(df['user_id'], df['item_id'], df['timestamp']),
        total=len(df),
    )
]

print(len(data))

296337it [00:10, 27833.06it/s]

296337





In [13]:
# Index every interaction twice: by the user who made it and by the item it touched.
user_history = defaultdict(list)
item_history = defaultdict(list)

for event in tqdm(data):
    uid, iid, ts = event['user_id'], event['item_id'], event['timestamp']

    # Each side stores the id of the *other* entity together with the event time.
    user_history[uid].append({'item_id': iid, 'timestamp': ts})
    item_history[iid].append({'user_id': uid, 'timestamp': ts})

# Iteratively keep only users/items with at least `threshold` interactions.
# Dropping events on one side can push entities on the other side below the
# threshold, so the pass is repeated until the survivor counts stop changing.
threshold = 5
good_users, good_items = set(), set()
is_changed = True

while is_changed:
    old_state = (len(good_users), len(good_items))

    # Entities that currently satisfy the minimum-history requirement.
    good_users = {uid for uid, events in user_history.items() if len(events) >= threshold}
    good_items = {iid for iid, events in item_history.items() if len(events) >= threshold}

    # Keep only the events whose counterpart survived this round.
    # Note: every key stays in the dict; only the event lists shrink.
    user_history = {
        uid: [event for event in events if event['item_id'] in good_items]
        for uid, events in user_history.items()
    }
    item_history = {
        iid: [event for event in events if event['user_id'] in good_users]
        for iid, events in item_history.items()
    }

    new_state = (len(good_users), len(good_items))
    is_changed = old_state != new_state
    print(old_state, new_state)

100%|██████████████████████████████████████████████████████████████████████| 296337/296337 [00:00<00:00, 832411.33it/s]


(0, 0) (35598, 18357)
(35598, 18357) (35598, 18357)


In [14]:
def _remap_events(events, id_key, id_mapping):
    """Return copies of `events` with `event[id_key]` replaced by its mapped id.

    Ids not yet present in `id_mapping` are registered on the fly and receive
    the next free 1-based value (`len(id_mapping) + 1`). `id_mapping` is
    mutated in place; the order of `events` is preserved.
    """
    remapped = []
    for event in events:
        new_id = id_mapping.setdefault(event[id_key], len(id_mapping) + 1)
        remapped.append({id_key: new_id, 'timestamp': event['timestamp']})
    return remapped


# old id -> new consecutive id, filled in order of first encounter
user_mapping, item_mapping = {}, {}
tmp_user_history = defaultdict(list)
tmp_item_history = defaultdict(list)

# Pass 1: user side. Item ids are registered for every event visited, while a
# user only gets a new id once its history is known to meet the threshold.
for raw_user_id, events in tqdm(user_history.items()):
    remapped = _remap_events(events, 'item_id', item_mapping)

    if len(remapped) < threshold:
        continue

    new_user_id = user_mapping.setdefault(raw_user_id, len(user_mapping) + 1)
    remapped.sort(key=lambda event: event['timestamp'])  # chronological, stable
    tmp_user_history[new_user_id] = remapped


# Pass 2: item side, mirroring pass 1 with the roles of users and items swapped.
for raw_item_id, events in tqdm(item_history.items()):
    remapped = _remap_events(events, 'user_id', user_mapping)

    if len(remapped) < threshold:
        continue

    new_item_id = item_mapping.setdefault(raw_item_id, len(item_mapping) + 1)
    remapped.sort(key=lambda event: event['timestamp'])  # chronological, stable
    tmp_item_history[new_item_id] = remapped

# From here on the histories are keyed by (and contain) the remapped ids.
user_history = tmp_user_history
item_history = tmp_item_history

100%|█████████████████████████████████████████████████████████████████████████| 35598/35598 [00:00<00:00, 99018.12it/s]
100%|█████████████████████████████████████████████████████████████████████████| 18357/18357 [00:00<00:00, 43397.96it/s]


In [15]:
# Summary statistics of the processed dataset.
user_history_lens = [len(events) for events in user_history.values()]
item_history_lens = [len(events) for events in item_history.values()]

print('Users count:', len(user_mapping))
print('Items count:', len(item_mapping))
print('Actions count:', sum(user_history_lens))
print('Avg user history len:', np.mean(user_history_lens))
print('Avg item history len:', np.mean(item_history_lens))

Users count: 35598
Items count: 18357
Actions count: 296337
Avg user history len: 8.324540704533963
Avg item history len: 16.142997221768262


In [16]:
# Write one line per user: "<user_id> <item_id> <item_id> ..." with items in
# chronological order.
with open('../data/Sports/all_data.txt', 'w') as f:
    # The per-user event list is deliberately NOT named `item_history`: that name
    # is the global item -> events mapping, and rebinding it here would silently
    # corrupt any cell that is (re-)run after this one.
    for user_id, user_events in user_history.items():
        ordered_events = sorted(user_events, key=lambda x: x['timestamp'])
        item_ids = [str(item_event['item_id']) for item_event in ordered_events]
        f.write(' '.join([str(user_id)] + item_ids))
        f.write('\n')