## Preprocess Steps

* Build vocab files for user and item, textline format: field  value  idx
* Filter multicore rating data
* Holdout train-val-test dataset

In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

DATA_ROOT = "/home/sl1471/public/"
PROCESSED_DATA_ROOT = "/home/sl1471/workspace/experiments/"
data_path = DATA_ROOT + "amazon_rating_only/"

In [2]:
# data file name, main_cat field in meta
# domain = ["Books", "Books", 50, 50]
# domain = ["Clothing_Shoes_and_Jewelry", "Clothing_Shoes_Jewelry", 25, 5]
# domain = ["Home_and_Kitchen", "Home_and_Kitchen", 25, 5]
domain = ["Electronics", "Electronics", 30, 5]
# domain = ["Sports_and_Outdoors", "Sports", 10, 5]
# domain = ["Movies_and_TV", "Movies_and_TV", 30, 10]
# domain = ["Video_Games", "Video_Games", 15,  5]


target_path = PROCESSED_DATA_ROOT + "amz_" + domain[0] + "/"

### Rating Data

In [3]:
import pandas as pd
df = pd.read_table(data_path + domain[0] + ".csv", sep=",", names = ["ItemID", "UserID", "Response", "Timestamp"])
df[:3]

Unnamed: 0,ItemID,UserID,Response,Timestamp
0,60009810,A1N070NS9CJQ2I,5.0,1026864000
1,60009810,A3P0KRKOBQK1KN,5.0,1025913600
2,60009810,A192HO2ICJ75VU,5.0,1025654400


In [4]:
n_user, n_item = len(df.UserID.unique()), len(df.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"sparsity: {1.0 - len(df) / (n_user * n_item)}")

#user: 9838676
#item: 756489
sparsity: 0.9999971792589499


In [5]:
from data.preprocess import run_multicore, run_multicore_asymetric
# multicore_data = run_multicore(df[["UserID", "ItemID", "Response", "Timestamp"]], n_core = domain[2])
multicore_data = run_multicore_asymetric(df[["UserID", "ItemID", "Response", "Timestamp"]], n_core_user = domain[2], n_core_item = domain[3])
# multicore_data = run_multicore_asymetric(multicore_data, n_core_user = 30, n_core_item = 5)

N-core is set to [5,100]
Filtering (30,5)-core data
Iteration 1


100%|██████████| 20994353/20994353 [00:34<00:00, 611521.82it/s]


Number of removed record: 20144710
Iteration 2


100%|██████████| 849643/849643 [00:01<00:00, 501379.56it/s]


Number of removed record: 297216
Iteration 3


100%|██████████| 552427/552427 [00:00<00:00, 662498.12it/s]


Number of removed record: 159187
Iteration 4


100%|██████████| 393240/393240 [00:00<00:00, 669477.16it/s]


Number of removed record: 45574
Iteration 5


100%|██████████| 347666/347666 [00:00<00:00, 625140.41it/s]


Number of removed record: 22762
Iteration 6


100%|██████████| 324904/324904 [00:00<00:00, 695395.37it/s]


Number of removed record: 10307
Iteration 7


100%|██████████| 314597/314597 [00:00<00:00, 622652.61it/s]


Number of removed record: 4622
Iteration 8


100%|██████████| 309975/309975 [00:00<00:00, 564589.51it/s]


Number of removed record: 2445
Iteration 9


100%|██████████| 307530/307530 [00:00<00:00, 563904.87it/s]


Number of removed record: 982
Iteration 10


100%|██████████| 306548/306548 [00:00<00:00, 591193.81it/s]


Number of removed record: 408
Iteration 11


100%|██████████| 306140/306140 [00:00<00:00, 622085.22it/s]


Number of removed record: 350
Iteration 12


100%|██████████| 305790/305790 [00:00<00:00, 713670.34it/s]


Number of removed record: 181
Iteration 13


100%|██████████| 305609/305609 [00:00<00:00, 648776.56it/s]


Number of removed record: 103
Iteration 14


100%|██████████| 305506/305506 [00:00<00:00, 650880.42it/s]


Number of removed record: 134
Iteration 15


100%|██████████| 305372/305372 [00:00<00:00, 674131.56it/s]


Number of removed record: 93
Iteration 16


100%|██████████| 305279/305279 [00:00<00:00, 668284.41it/s]


Number of removed record: 53
Iteration 17


100%|██████████| 305226/305226 [00:00<00:00, 525748.46it/s]


Number of removed record: 58
Iteration 18


100%|██████████| 305168/305168 [00:00<00:00, 624659.53it/s]


Number of removed record: 47
Iteration 19


100%|██████████| 305121/305121 [00:00<00:00, 567724.75it/s]


Number of removed record: 26
Iteration 20


100%|██████████| 305095/305095 [00:00<00:00, 600341.34it/s]


Number of removed record: 17
Iteration 21


100%|██████████| 305078/305078 [00:00<00:00, 687115.92it/s]


Number of removed record: 0
Size change: 20994353 --> 305078


In [6]:
n_user, n_item = len(multicore_data.UserID.unique()), len(multicore_data.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"#record: {len(multicore_data)}")
print(f"sparsity: {1.0 - len(multicore_data) / (n_user * n_item)}")

#user: 6527
#item: 20080
#record: 305078
sparsity: 0.9976722648245687


In [7]:
items = {iid: False for iid in multicore_data['ItemID'].unique()}
print(len(items))

20080


In [8]:
# import pandas as pd
from tqdm import tqdm
# pd.read_csv(data_path + "meta/filtered_meta.csv", sep = '\t', nrows = 3)
item_meta = {}
with open(data_path + "meta/filtered_meta.csv", 'r') as fin:
    fin.readline()
    for i,line in tqdm(enumerate(fin)):
        meta_info = line.strip().split("\t")
        item_id = meta_info[0]
        if item_id in items:
            item_meta[item_id] = meta_info
            del items[item_id]
            if len(items) == 0:
                break
print("Item meta info of items in data set:")
print(f"Found: {len(item_meta)}")
print(f"Missing: {len(items)}")

15138777it [00:28, 528966.78it/s]

20060/20
#missing item meta: 20





In [9]:
selected_rows = [True] * len(multicore_data)
for i,iid in enumerate(multicore_data["ItemID"]):
    if iid in items:
        selected_rows[i] = False
multicore_data = multicore_data[selected_rows]
multicore_data = run_multicore_asymetric(multicore_data, domain[2], domain[3])

N-core is set to [5,100]
Filtering (30,5)-core data
Iteration 1


100%|██████████| 304717/304717 [00:00<00:00, 666371.42it/s]


Number of removed record: 362
Iteration 2


100%|██████████| 304355/304355 [00:00<00:00, 895454.60it/s] 


Number of removed record: 117
Iteration 3


100%|██████████| 304238/304238 [00:00<00:00, 996410.97it/s] 


Number of removed record: 135
Iteration 4


100%|██████████| 304103/304103 [00:00<00:00, 843322.80it/s]


Number of removed record: 13
Iteration 5


100%|██████████| 304090/304090 [00:00<00:00, 682652.40it/s]


Number of removed record: 0
Size change: 304717 --> 304090


In [10]:
n_user, n_item = len(multicore_data.UserID.unique()), len(multicore_data.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"#record: {len(multicore_data)}")
print(f"sparsity: {1.0 - len(multicore_data) / (n_user * n_item)}")

#user: 6510
#item: 20026
#record: 304090
sparsity: 0.9976674716110224


In [11]:
multicore_data = multicore_data.sort_values(by=['UserID','Timestamp'])
multicore_data[:3]

Unnamed: 0,UserID,ItemID,Response,Timestamp
13378,A100UD67AHFODS,B0001D3K8A,5.0,1150588800
16818,A100UD67AHFODS,B0002SQ2P2,5.0,1150588800
36655,A100UD67AHFODS,B000U0S304,5.0,1349568000


In [12]:
from utils import set_random_seed
set_random_seed(9)
from data.preprocess import holdout_data_sequential, recheck_exist
trainset, valset, testset = holdout_data_sequential(multicore_data, holdout_type = "warm", ratio = [0.8,0.1,0.1])
trainset = trainset.reset_index(drop = True)
valset = valset.reset_index(drop = True)
testset = testset.reset_index(drop = True)
# recheck if there is any unseen item in val or test, if there is move corresponding user history into train
trainset, valset, testset = recheck_exist(trainset, valset, testset, field_name = "ItemID")

Build user history


304090it [00:00, 405038.22it/s]


Holdout user histories


100%|██████████| 6510/6510 [00:00<00:00, 7372.73it/s]


Move unseen ItemID from val to train
0/27946, finish in 1136015176.8s.   Moving user data
Before moving: Target DataFrame: 248198, Source Data Frame: 27946


100%|██████████| 315/315 [00:00<00:00, 3831.15it/s]
100%|██████████| 6195/6195 [00:01<00:00, 4085.66it/s]


#user moved: 315
After moving: Target DataFrame: 250151, Source Data Frame: 25993
Move unseen ItemID from test to train, this may also move users in val to train
0/27946, finish in 915474987.0s.   Val --> Train
Moving user data
Before moving: Target DataFrame: 250151, Source Data Frame: 25993


100%|██████████| 321/321 [00:00<00:00, 5796.16it/s]
100%|██████████| 5931/5931 [00:01<00:00, 5699.22it/s]


#user moved: 321
After moving: Target DataFrame: 251667, Source Data Frame: 24477
Test --> Train
Moving user data
Before moving: Target DataFrame: 251667, Source Data Frame: 27946


100%|██████████| 321/321 [00:00<00:00, 3846.13it/s]
100%|██████████| 6189/6189 [00:01<00:00, 4250.58it/s]


#user moved: 321
After moving: Target DataFrame: 253715, Source Data Frame: 25898


In [13]:
from utils import setup_path
save_path = target_path + "tsv_data/"
setup_path(save_path, is_dir = True)
trainset.to_csv(save_path + "train.tsv", sep = '\t', index = False)
valset.to_csv(save_path + "val.tsv", sep = '\t', index = False)
testset.to_csv(save_path + "test.tsv", sep = '\t', index = False)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics/tsv_data" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics/tsv_data/" existed


In [14]:
len(trainset["UserID"].unique()), len(valset["UserID"].unique()), len(testset["UserID"].unique())

(6510, 5931, 6189)

In [15]:
len(trainset["ItemID"].unique()), len(valset["ItemID"].unique()), len(testset["ItemID"].unique())

(20026, 9841, 9049)

### Meta Data

See fields description [here](https://nijianmo.github.io/amazon/index.html)

15023058 lines

Fields: 'category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'image', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat', 'similar_item', 'date', 'price', 'asin'

Selected fields: "asin", "category", "price", "brand"


In [13]:
# selected_fields = ["asin", "category", "price", "also_buy", "also_view", "brand"]
selected_fields = ["asin", "category", "price", "brand"]
# import numpy as np

# def price_to_id(price):
#     if price == '':
#         return 0
#     p = price.strip().replace('$','').split('.')
#     l,v = len(p[0]), int(p[0][0])
# #     return (l-1) * 3 + (0 if v<3 else 1 if v < 6 else 2) + 1
#     return (l-1) * 10 + v + 1

# def clean(textline):
#     return textline.replace(',','_').replace(' ','').replace('&','_').replace('\t','')

# with open(data_path + "meta/filtered_meta.csv", 'w') as fout:
#     fout.write("ItemID\tCategory\tMin_Price\tMax_Price\tAlso_Buy\tAlso_View\tBrand\n")
#     with open(data_path + "meta/All_Amazon_Meta.json", 'r') as fin:
#         for i,line in enumerate(fin):
#             if i % 100000 == 0:
#                 print(f"#line: {i}", end = '\r')
#             info = eval(line)
#             fout.write(info['asin'] + "\t") # ItemID
#             fout.write(",".join([clean(c) for c in info['category'] \
#                                  if len(c) < 30]) + "\t") # Category
#             try:
#                 if len(info['price']) > 20:
#                     fout.write("0\t0\t")
#                 else:
#                     price = info['price'].split("-") # Min_Price and Max_Price
#                     if len(price) == 2:
#                         fout.write(f"{price_to_id(price[0])}\t{price_to_id(price[1])}\t")
#                     else:
#                         pidx = price_to_id(price[0])
#                         fout.write(f"{pidx}\t{pidx}\t")
#             except:
#                 fout.write("0\t0\t")
#             fout.write(",".join(info['also_buy']) + "\t")
#             fout.write(",".join(info['also_view']) + "\t")
#             fout.write(clean(info['brand'][:30]) + "\n")
#     print(f"#line: {i}", end = '\r')

In [16]:
import pandas as pd
pd.read_csv(data_path + "meta/filtered_meta.csv", sep = '\t', nrows = 3)

Unnamed: 0,ItemID,Category,Min_Price,Max_Price,Also_Buy,Also_View,Brand
0,6305121869,"Clothing_Shoes_Jewelry,Women,Clothing,Tops_Tee...",10,12,,,Ninasill_Blouse
1,6318708057,"Clothing_Shoes_Jewelry,Traditional_CulturalWea...",12,12,,,Coolred-Women
2,6342506256,"Clothing_Shoes_Jewelry,Men,Clothing,Shorts,Car...",13,13,,"B07CRJ95M7,B008AHISU4,B07B8F98W2,B07DD98Q7R,B0...",Gaok


In [18]:
items = {iid: False for iid in multicore_data['ItemID'].unique()}
print(len(items))

20026


In [19]:
from tqdm import tqdm
item_meta = {}
with open(data_path + "meta/filtered_meta.csv", 'r') as fin:
    fin.readline()
    for i,line in tqdm(enumerate(fin)):
        meta_info = line.strip().split("\t")
        item_id = meta_info[0]
        if item_id in items:
            item_meta[item_id] = meta_info
            del items[item_id]
            if len(items) == 0:
                break
print("Item meta info of items in data set:")
print(f"Found: {len(item_meta)}")
print(f"Missing: {len(items)}")

8225386it [00:15, 533517.71it/s]

Item meta info of items in data set:
Found: 20026
Missing: 0





In [20]:
import pandas as pd
item_meta_df = pd.DataFrame.from_dict(item_meta, orient = "index", columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"])
item_meta_df = item_meta_df[["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"]]
item_meta_df = item_meta_df.reset_index(drop = True)
# item_meta_df = pd.DataFrame.from_dict(item_meta, orient = "index", columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"])
# item_meta_df.insert(0, 'ItemID', item_meta_df.index)
item_meta_df[:3]

Unnamed: 0,ItemID,Category,MinPrice,MaxPrice,Brand
0,0789743035,"Electronics,Computers_amp;Accessories,Computer...",0,0,VisitAmazon'sJohnRayPage
1,0972683275,"Electronics,Accessories_amp;Supplies,Audio_amp...",14,14,VideoSecu
2,106171327X,"Electronics,Computers_amp;Accessories,MemoryCa...",12,12,SanDisk


In [21]:
from utils import setup_path
save_path = target_path + "meta_data/item.meta"
setup_path(save_path, is_dir = False)
item_meta_df.to_csv(save_path, sep = '\t', index = False)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics/meta_data" existed


In [22]:
from data.preprocess import build_vocab
from utils import setup_path
save_path = target_path + "meta_data/item_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(item_meta_df, save_path, ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"])

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics/meta_data" existed
Vocab file saved to: /home/sl1471/workspace/experiments/amz_Electronics/meta_data/item_fields.vocab


In [23]:
import pandas as pd
# item_fields_meta = pd.DataFrame({"field_name": ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"], 
#                                  "field_type": ["nominal", "nominal", "ordinal", "ordinal", "nominal", "nominal", "nominal"], 
#                                  "value_type": ["int", "str", "int", "int", "int", "int", "str"], 
#                                  "field_enc": ["v2id", "v2onehot", "v2id", "v2id", "v2multid", "v2multid", "v2id"], 
#                                  "vocab_key": ["ItemID", "Category", "MinPrice", "MaxPrice", "ItemID", "ItemID", "Brand"]})
item_fields_meta = pd.DataFrame({"field_name": ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"], 
                                 "field_type": ["nominal", "nominal", "ordinal", "ordinal", "nominal"], 
                                 "value_type": ["str", "str", "int", "int", "str"], 
                                 "field_enc": ["v2id", "v2onehot", "v2id", "v2id", "v2id"], 
                                 "vocab_key": ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"]})
item_fields_meta.to_csv(target_path + "meta_data/item_fields.meta", 
                        sep = '\t', index = False)

### User Meta

In [24]:
users = {uid: uid for uid in multicore_data['UserID'].unique()}
user_meta_df = pd.DataFrame.from_dict(users, orient = "index", columns = ["UserID"])
user_meta_df = user_meta_df.reset_index(drop = True)
user_meta_df[:3]

Unnamed: 0,UserID
0,A100UD67AHFODS
1,A100WO06OQR8BQ
2,A1013Q9SD2KIE1


In [25]:
from utils import setup_path
save_path = target_path + "meta_data/user.meta"
setup_path(save_path, is_dir = False)
user_meta_df.to_csv(save_path, sep = '\t', index = False)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics/meta_data" existed


In [26]:
from data.preprocess import build_vocab
from utils import setup_path
save_path = target_path + "meta_data/user_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(user_meta_df, save_path, ["UserID"])

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics" existed
dir "/home/sl1471/workspace/experiments/amz_Electronics/meta_data" existed
Vocab file saved to: /home/sl1471/workspace/experiments/amz_Electronics/meta_data/user_fields.vocab


In [27]:
import pandas as pd
user_fields_meta = pd.DataFrame({"field_name": ["UserID"], 
                          "field_type": ["nominal"], 
                          "value_type": ["str"], 
                          "field_enc": ["v2id"], 
                          "vocab_key": ["UserID"]})
user_fields_meta.to_csv(target_path + "meta_data/user_fields.meta", sep = '\t', index = False)