## Preprocess Steps

* Build vocab files for user and item, textline format: field  value  idx
* Filter multicore rating data
* Holdout train-val-test dataset

In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

DATA_ROOT = "~/datasets/"
PROCESSED_DATA_ROOT = "~/workspace/experiments/"
data_path = DATA_ROOT + "amazon_rating_only/"

In [2]:
# data file name, main_cat field in meta
domain = ["Movies_and_TV", "Movies_and_TV", 40, 10]

target_path = PROCESSED_DATA_ROOT + "amz_" + domain[0] + "/"

### Rating Data

In [3]:
import pandas as pd
df = pd.read_table(data_path + domain[0] + ".csv", sep=",", names = ["ItemID", "UserID", "Response", "Timestamp"])
df[:3]

Unnamed: 0,ItemID,UserID,Response,Timestamp
0,871167042,A2IC3NZN488KWK,5.0,1399161600
1,871167042,A3OT9BYASFGU2X,4.0,1398470400
2,871167042,A28GK1G2KDXHRP,5.0,1397692800


In [4]:
n_user, n_item = len(df.UserID.unique()), len(df.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"sparsity: {1.0 - len(df) / (n_user * n_item)}")

#user: 12483678
#item: 2681297
sparsity: 0.9999990352633114


In [5]:
from data.preprocess import run_multicore, run_multicore_asymetric
# multicore_data = run_multicore(df[["UserID", "ItemID", "Response", "Timestamp"]], n_core = domain[2])
multicore_data = run_multicore_asymetric(df[["UserID", "ItemID", "Response", "Timestamp"]], n_core_user = domain[2], n_core_item = domain[3])
# multicore_data = run_multicore_asymetric(multicore_data, n_core_user = 40, n_core_item = 10)

N-core is set to [5,100]
Filtering (25,5)-core data
Iteration 1


100%|██████████| 32292099/32292099 [00:55<00:00, 578194.20it/s]


Number of removed record: 30237115
Iteration 2


100%|██████████| 2054984/2054984 [00:04<00:00, 459340.88it/s]


Number of removed record: 942823
Iteration 3


100%|██████████| 1112161/1112161 [00:02<00:00, 509268.58it/s]


Number of removed record: 427996
Iteration 4


100%|██████████| 684165/684165 [00:01<00:00, 475807.74it/s]


Number of removed record: 152870
Iteration 5


100%|██████████| 531295/531295 [00:00<00:00, 604135.10it/s]


Number of removed record: 84459
Iteration 6


100%|██████████| 446836/446836 [00:00<00:00, 512922.23it/s]


Number of removed record: 50486
Iteration 7


100%|██████████| 396350/396350 [00:00<00:00, 756167.53it/s]


Number of removed record: 33237
Iteration 8


100%|██████████| 363113/363113 [00:00<00:00, 811367.36it/s]


Number of removed record: 23617
Iteration 9


100%|██████████| 339496/339496 [00:00<00:00, 787554.34it/s]


Number of removed record: 18529
Iteration 10


100%|██████████| 320967/320967 [00:00<00:00, 786311.65it/s]


Number of removed record: 13719
Iteration 11


100%|██████████| 307248/307248 [00:00<00:00, 707826.47it/s]


Number of removed record: 10345
Iteration 12


100%|██████████| 296903/296903 [00:00<00:00, 812444.71it/s]


Number of removed record: 8631
Iteration 13


100%|██████████| 288272/288272 [00:00<00:00, 720921.75it/s]


Number of removed record: 8155
Iteration 14


100%|██████████| 280117/280117 [00:00<00:00, 751718.29it/s]


Number of removed record: 7663
Iteration 15


100%|██████████| 272454/272454 [00:00<00:00, 814353.15it/s]


Number of removed record: 5809
Iteration 16


100%|██████████| 266645/266645 [00:00<00:00, 835169.88it/s]


Number of removed record: 5201
Iteration 17


100%|██████████| 261444/261444 [00:00<00:00, 733864.04it/s]


Number of removed record: 4456
Iteration 18


100%|██████████| 256988/256988 [00:00<00:00, 677125.62it/s]


Number of removed record: 4096
Iteration 19


100%|██████████| 252892/252892 [00:00<00:00, 818093.70it/s]


Number of removed record: 3582
Iteration 20


100%|██████████| 249310/249310 [00:00<00:00, 874612.90it/s]


Number of removed record: 2639
Iteration 21


100%|██████████| 246671/246671 [00:00<00:00, 806263.28it/s]


Number of removed record: 2126
Iteration 22


100%|██████████| 244545/244545 [00:00<00:00, 873063.60it/s]


Number of removed record: 2108
Iteration 23


100%|██████████| 242437/242437 [00:00<00:00, 901445.16it/s]


Number of removed record: 1582
Iteration 24


100%|██████████| 240855/240855 [00:00<00:00, 923760.64it/s]


Number of removed record: 1326
Iteration 25


100%|██████████| 239529/239529 [00:00<00:00, 938182.52it/s]


Number of removed record: 688
Iteration 26


100%|██████████| 238841/238841 [00:00<00:00, 842240.47it/s]


Number of removed record: 376
Iteration 27


100%|██████████| 238465/238465 [00:00<00:00, 824794.69it/s]


Number of removed record: 234
Iteration 28


100%|██████████| 238231/238231 [00:00<00:00, 968009.45it/s]


Number of removed record: 131
Iteration 29


100%|██████████| 238100/238100 [00:00<00:00, 826319.89it/s]


Number of removed record: 98
Iteration 30


100%|██████████| 238002/238002 [00:00<00:00, 783367.27it/s]


Number of removed record: 29
Iteration 31


100%|██████████| 237973/237973 [00:00<00:00, 929987.22it/s] 


Number of removed record: 10
Iteration 32


100%|██████████| 237963/237963 [00:00<00:00, 893171.84it/s]


Number of removed record: 0
Size change: 32292099 --> 237963


In [6]:
n_user, n_item = len(multicore_data.UserID.unique()), len(multicore_data.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"#record: {len(multicore_data)}")
print(f"sparsity: {1.0 - len(multicore_data) / (n_user * n_item)}")

#user: 7076
#item: 3354
#record: 237963
sparsity: 0.989973287719025


In [7]:
items = {iid: False for iid in multicore_data['ItemID'].unique()}
print(len(items))

3354


In [8]:
# import pandas as pd
from tqdm import tqdm
# pd.read_csv(data_path + "meta/filtered_meta.csv", sep = '\t', nrows = 3)
item_meta = {}
with open(data_path + "meta/filtered_meta.csv", 'r') as fin:
    fin.readline()
    for i,line in tqdm(enumerate(fin)):
        meta_info = line.strip().split("\t")
        item_id = meta_info[0]
        if item_id in items:
            item_meta[item_id] = meta_info
            del items[item_id]
            if len(items) == 0:
                break
print("Item meta info of items in data set:")
print(f"Found: {len(item_meta)}")
print(f"Missing: {len(items)}")

2683667it [00:04, 561166.20it/s]

Item meta info of items in data set:
Found: 3354
Missing: 0





In [9]:
selected_rows = [True] * len(multicore_data)
for i,iid in enumerate(multicore_data["ItemID"]):
    if iid in items:
        selected_rows[i] = False
multicore_data = multicore_data[selected_rows]
multicore_data = run_multicore_asymetric(multicore_data, domain[2], domain[3])

N-core is set to [5,100]
Filtering (25,5)-core data
Iteration 1


100%|██████████| 237963/237963 [00:00<00:00, 837305.56it/s]


Number of removed record: 0
Size change: 237963 --> 237963


In [10]:
n_user, n_item = len(multicore_data.UserID.unique()), len(multicore_data.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"#record: {len(multicore_data)}")
print(f"sparsity: {1.0 - len(multicore_data) / (n_user * n_item)}")

#user: 7076
#item: 3354
#record: 237963
sparsity: 0.989973287719025


In [11]:
multicore_data = multicore_data.sort_values(by=['UserID','Timestamp'])

In [12]:
from utils import set_random_seed
set_random_seed(9)
from data.preprocess import holdout_data_sequential, recheck_exist
trainset, valset, testset = holdout_data_sequential(multicore_data, holdout_type = "warm", ratio = [0.8,0.1,0.1])
trainset = trainset.reset_index(drop = True)
valset = valset.reset_index(drop = True)
testset = testset.reset_index(drop = True)
# recheck if there is any unseen item in val or test, if there is move corresponding user history into train
trainset, valset, testset = recheck_exist(trainset, valset, testset, field_name = "ItemID")

Build user history


237963it [00:00, 491040.86it/s]


Holdout user histories


100%|██████████| 7076/7076 [00:00<00:00, 8082.36it/s]


Move unseen ItemID from val to train
0/20660, finish in 626060009.0s.   Moving user data
Before moving: Target DataFrame: 196643, Source Data Frame: 20660


100%|██████████| 81/81 [00:00<00:00, 3320.19it/s]
100%|██████████| 6995/6995 [00:01<00:00, 4755.67it/s]



#user moved: 81
After moving: Target DataFrame: 197028, Source Data Frame: 20275
Move unseen ItemID from test to train, this may also move users in val to train
0/20660, finish in 406372547.1s.   Val --> Train
Moving user data
Before moving: Target DataFrame: 197028, Source Data Frame: 20275


100%|██████████| 181/181 [00:00<00:00, 6589.67it/s]
100%|██████████| 6846/6846 [00:01<00:00, 6015.01it/s]



#user moved: 181
After moving: Target DataFrame: 197550, Source Data Frame: 19753
Test --> Train
Moving user data
Before moving: Target DataFrame: 197550, Source Data Frame: 20660


100%|██████████| 181/181 [00:00<00:00, 4608.15it/s]
100%|██████████| 6895/6895 [00:01<00:00, 4793.62it/s]



#user moved: 181
After moving: Target DataFrame: 198275, Source Data Frame: 19935


In [13]:
len(trainset["UserID"].unique()), len(valset["UserID"].unique()), len(testset["UserID"].unique())

(7076, 6846, 6895)

In [14]:
len(trainset["ItemID"].unique()), len(valset["ItemID"].unique()), len(testset["ItemID"].unique())

(3354, 1323, 1866)

In [15]:
len(trainset), len(valset), len(testset)

(198275, 19753, 19935)

In [16]:
from utils import setup_path
save_path = target_path + "tsv_data/"
setup_path(save_path, is_dir = True)
trainset.to_csv(save_path + "train.tsv", sep = '\t', index = False)
valset.to_csv(save_path + "val.tsv", sep = '\t', index = False)
testset.to_csv(save_path + "test.tsv", sep = '\t', index = False)



### Meta Data

See fields description [here](https://nijianmo.github.io/amazon/index.html)

15023058 lines

Fields: 'category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'image', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat', 'similar_item', 'date', 'price', 'asin'

Selected fields: "asin", "category", "price", "brand"


In [17]:
# selected_fields = ["asin", "category", "price", "also_buy", "also_view", "brand"]
selected_fields = ["asin", "category", "price", "brand"]
# import numpy as np

# def price_to_id(price):
#     if price == '':
#         return 0
#     p = price.strip().replace('$','').split('.')
#     l,v = len(p[0]), int(p[0][0])
# #     return (l-1) * 3 + (0 if v<3 else 1 if v < 6 else 2) + 1
#     return (l-1) * 10 + v + 1

# def clean(textline):
#     return textline.replace(',','_').replace(' ','').replace('&','_').replace('\t','')

# with open(data_path + "meta/filtered_meta.csv", 'w') as fout:
#     fout.write("ItemID\tCategory\tMin_Price\tMax_Price\tAlso_Buy\tAlso_View\tBrand\n")
#     with open(data_path + "meta/All_Amazon_Meta.json", 'r') as fin:
#         for i,line in enumerate(fin):
#             if i % 100000 == 0:
#                 print(f"#line: {i}", end = '\r')
#             info = eval(line)
#             fout.write(info['asin'] + "\t") # ItemID
#             fout.write(",".join([clean(c) for c in info['category'] \
#                                  if len(c) < 30]) + "\t") # Category
#             try:
#                 if len(info['price']) > 20:
#                     fout.write("0\t0\t")
#                 else:
#                     price = info['price'].split("-") # Min_Price and Max_Price
#                     if len(price) == 2:
#                         fout.write(f"{price_to_id(price[0])}\t{price_to_id(price[1])}\t")
#                     else:
#                         pidx = price_to_id(price[0])
#                         fout.write(f"{pidx}\t{pidx}\t")
#             except:
#                 fout.write("0\t0\t")
#             fout.write(",".join(info['also_buy']) + "\t")
#             fout.write(",".join(info['also_view']) + "\t")
#             fout.write(clean(info['brand'][:30]) + "\n")
#     print(f"#line: {i}", end = '\r')

In [18]:
import pandas as pd
pd.read_csv(data_path + "meta/filtered_meta.csv", sep = '\t', nrows = 3)

Unnamed: 0,ItemID,Category,Min_Price,Max_Price,Also_Buy,Also_View,Brand
0,6305121869,"Clothing_Shoes_Jewelry,Women,Clothing,Tops_Tee...",10,12,,,Ninasill_Blouse
1,6318708057,"Clothing_Shoes_Jewelry,Traditional_CulturalWea...",12,12,,,Coolred-Women
2,6342506256,"Clothing_Shoes_Jewelry,Men,Clothing,Shorts,Car...",13,13,,"B07CRJ95M7,B008AHISU4,B07B8F98W2,B07DD98Q7R,B0...",Gaok


In [19]:
items = {iid: False for iid in multicore_data['ItemID'].unique()}
print(len(items))

20026


In [20]:
from tqdm import tqdm
item_meta = {}
with open(data_path + "meta/filtered_meta.csv", 'r') as fin:
    fin.readline()
    for i,line in tqdm(enumerate(fin)):
        meta_info = line.strip().split("\t")
        item_id = meta_info[0]
        if item_id in items:
            item_meta[item_id] = meta_info
            del items[item_id]
            if len(items) == 0:
                break
print("Item meta info of items in data set:")
print(f"Found: {len(item_meta)}")
print(f"Missing: {len(items)}")

8225386it [00:13, 603302.33it/s]

Item meta info of items in data set:
Found: 20026
Missing: 0





In [21]:
import pandas as pd
item_meta_df = pd.DataFrame.from_dict(item_meta, orient = "index", columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"])
item_meta_df = item_meta_df[["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"]]
item_meta_df = item_meta_df.reset_index(drop = True)
# item_meta_df = pd.DataFrame.from_dict(item_meta, orient = "index", columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"])
# item_meta_df.insert(0, 'ItemID', item_meta_df.index)
item_meta_df[:3]

Unnamed: 0,ItemID,Category,MinPrice,MaxPrice,Brand
0,0789743035,"Electronics,Computers_amp;Accessories,Computer...",0,0,VisitAmazon'sJohnRayPage
1,0972683275,"Electronics,Accessories_amp;Supplies,Audio_amp...",14,14,VideoSecu
2,106171327X,"Electronics,Computers_amp;Accessories,MemoryCa...",12,12,SanDisk


In [22]:
from utils import setup_path
save_path = target_path + "meta_data/item.meta"
setup_path(save_path, is_dir = False)
item_meta_df.to_csv(save_path, sep = '\t', index = False)



In [23]:
from data.preprocess import build_vocab
from utils import setup_path
save_path = target_path + "meta_data/item_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(item_meta_df, save_path, ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"])



In [24]:
import pandas as pd
# item_fields_meta = pd.DataFrame({"field_name": ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"], 
#                                  "field_type": ["nominal", "nominal", "ordinal", "ordinal", "nominal", "nominal", "nominal"], 
#                                  "value_type": ["int", "str", "int", "int", "int", "int", "str"], 
#                                  "field_enc": ["v2id", "v2onehot", "v2id", "v2id", "v2multid", "v2multid", "v2id"], 
#                                  "vocab_key": ["ItemID", "Category", "MinPrice", "MaxPrice", "ItemID", "ItemID", "Brand"]})
item_fields_meta = pd.DataFrame({"field_name": ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"], 
                                 "field_type": ["nominal", "nominal", "ordinal", "ordinal", "nominal"], 
                                 "value_type": ["str", "str", "int", "int", "str"], 
                                 "field_enc": ["v2id", "v2onehot", "v2id", "v2id", "v2id"], 
                                 "vocab_key": ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"]})
item_fields_meta.to_csv(target_path + "meta_data/item_fields.meta", 
                        sep = '\t', index = False)

### User Meta

In [25]:
users = {uid: uid for uid in multicore_data['UserID'].unique()}
user_meta_df = pd.DataFrame.from_dict(users, orient = "index", columns = ["UserID"])
user_meta_df = user_meta_df.reset_index(drop = True)
user_meta_df[:3]

Unnamed: 0,UserID
0,A100UD67AHFODS
1,A100WO06OQR8BQ
2,A1013Q9SD2KIE1


In [26]:
from utils import setup_path
save_path = target_path + "meta_data/user.meta"
setup_path(save_path, is_dir = False)
user_meta_df.to_csv(save_path, sep = '\t', index = False)



In [27]:
from data.preprocess import build_vocab
from utils import setup_path
save_path = target_path + "meta_data/user_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(user_meta_df, save_path, ["UserID"])



In [28]:
import pandas as pd
user_fields_meta = pd.DataFrame({"field_name": ["UserID"], 
                          "field_type": ["nominal"], 
                          "value_type": ["str"], 
                          "field_enc": ["v2id"], 
                          "vocab_key": ["UserID"]})
user_fields_meta.to_csv(target_path + "meta_data/user_fields.meta", sep = '\t', index = False)