In [107]:
import json
import matplotlib.pyplot as plt
import numpy as np
import random
import time

# Load data

In [25]:
content = []
with open("./data/user_dedup.json") as f:
    for line in f:
        content.append(json.loads(line))

In [28]:
print(len(content)," lines in total")

3041797  lines in total


# Group Data

In [82]:
def extract(key):
    key_list = []
    for line in content:
        key_list.append(line[key])
    return list(set(key_list))
def group_count(key):
    count = {}
    for line in content:
        if(line[key] in count):
            count[line[key]] += 1
        else:
            count[line[key]] = 1
    return count
def report_count(data,key):
    values = np.array(list(data.values()))
    minimum = np.min(values)
    maximum = np.max(values)
    average = np.mean(values)
    c1000 = len(values[values>1000])
    c100 = len(values[values>100])
    c10 = len(values[values>10])
    c0 = len(values[values<10])
    c1 = len(values[values == 1])
    print("On average, each {} appears {} times, range from {} to {}".format(key,average,minimum,maximum))
    print("{} {} appears more than 1000".format(c1000, key))
    print("{} {} appears more than 100".format(c100, key))
    print("{} {} appears more than 10".format(c10, key))
    print("{} {} appears less than 10".format(c0, key))
    print("{} {} appears only once".format(c1, key))

In [84]:
# user
users = extract("reviewerID")
print("There are",len(users)," unique users")
user_group = group_count("reviewerID")
report_count(user_group,'users')

There are 778696  unique users
On average, each users appears 3.906270226121619 times, range from 1 to 44557
26 users appears more than 1000
1211 users appears more than 100
50742 users appears more than 10
720863 users appears less than 10
400180 users appears only once


In [86]:
# product
products = extract("asin")
print("There are",len(products)," unique products")
product_group = group_count("asin")
report_count(product_group,'products')

There are 1435533  unique products
On average, each products appears 2.118932131828387 times, range from 1 to 939
0 products appears more than 1000
651 products appears more than 100
29714 products appears more than 10
1400916 products appears less than 10
1009030 products appears only once


# Filter Data
whether remove sparse data? What's the criterion?

if necessary, remove product data, don't remove user data

# Train/Validation/Test Split
1. users that appears only once couldn't be validation/test data
2. hold out out-of-matrix validation/test data and in-matrix validation/test data respectively
3. Obtain TEST data according to above 2 constraints. Then random split TEST data into validation and test data, the formmer is for tuning hyper-parameter, the latter is for report performance

In [149]:
new_product_ratio = 0.2
im_ratio = 0.1
val_test_ratio = 0.6
TRAIN = []
VAL = []
TEST = []
im_TEST = []
om_TEST = []

start = time.time()
TRAIN = [line for line in content if user_group[line['reviewerID']] <= 1]
CANDIDATE = [line for line in content if user_group[line['reviewerID']] > 1]
cost =time.time() - start
print("remove once users: {} s".format(cost))
start = time.time()
# choose out-of-matrix test data
cn_new_product = int(len(products) * new_product_ratio)
new_product = sorted(np.random.choice(list(product_group.keys()),cn_new_product))
SCAN = sorted(CANDIDATE,key = lambda x:x['asin'])
s = 0
for line in SCAN:
    for i in range(s,len(new_product)):
        if(line['asin'] == new_product[i] ):
            om_TEST.append(line)
            s = i
            CANDIDATE.remove(line)
            break
print("choose {} out of {} products as new product in out-of-matrix test dataset".format(cn_new_product,len(products)))
cost =time.time() - start
print("out-of-matrix: {} s".format(cost))
start = time.time()
# choose in-matrix test data
for line in CANDIDATE:
    if(np.random.random(1)[0] < im_ratio ):
        im_TEST.append(line)
    else:
        TRAIN.append(line)
cost =time.time() - start
print("in-matrix: {} s".format(cost))
start = time.time()
# split TEST into validation and test
for line in im_TEST:
    if(np.random.random(1)[0] <  val_test_ratio):
        VAL.append(line)
    else:
        TEST.append(line)
cn_im_val = len(VAL)
cn_im_test = len(TEST)
print("{} of im_TEST feed to validation data set".format(cn_im_val))
print("{} of im_TEST feed to validation data set".format(cn_im_test))
for line in om_TEST:
    if(np.random.random(1)[0] <  val_test_ratio):
        VAL.append(line)
    else:
        TEST.append(line)
cn_om_val = len(VAL) - cn_im_val
cn_om_test = len(TEST) - cn_im_test
print("{} of om_TEST feed to validation data set".format(cn_om_val))
print("{} of om_TEST feed to validation data set".format(cn_om_test))
print("TRAIN Size: {}".format(len(TRAIN)))
print("Validation Size: {}".format(len(VAL)))
print("TEST Size: {}".format(len(TEST)))
cost =time.time() - start
print("split TEST data: {} s".format(cost))

remove once users: 2.0579540729522705 s


KeyboardInterrupt: 