## Preprocess Steps

* Build vocabulary files for users and items; each text line has the format: field  value  idx
* Filter the rating data down to its multi-core (n-core) subset
* Hold out train/validation/test splits

In [1]:
import os,sys,inspect
# Put the directory above this notebook on sys.path so that project modules
# (e.g. `data.preprocess`, `utils`) can be imported from the cells below.
_this_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(_this_file)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

# Input / output roots; every other path in the notebook is derived from these.
DATA_ROOT = "/home/sl1471/public/"
PROCESSED_DATA_ROOT = "/home/sl1471/workspace/experiments/"
data_path = f"{DATA_ROOT}amazon_rating_only/"

In [2]:
# Active dataset configuration. Slots of the list:
#   [0] data file name (also used to name the output directory)
#   [1] main_cat field in meta
#   [2] value passed as n_core_user to the multicore filter
#   [3] value passed as n_core_item to the multicore filter
# Keep exactly one `domain` line uncommented.
# domain = ["Books", "Books", 80, 30]
# domain = ["Clothing_Shoes_and_Jewelry", "Clothing_Shoes_Jewelry", 25, 5]
# domain = ["Home_and_Kitchen", "Home_and_Kitchen", 25, 5]
# domain = ["Electronics", "Electronics", 30, 5]
# domain = ["Sports_and_Outdoors", "Sports", 10, 5]
domain = ["Movies_and_TV", "Movies_and_TV", 40, 10]
# domain = ["Video_Games", "Video_Games", 15,  5]


# All processed artifacts for the selected domain are written under this directory.
target_path = f"{PROCESSED_DATA_ROOT}amz_{domain[0]}/"

### Rating Data

In [3]:
import pandas as pd
# Load the header-less rating file for the selected domain.
# ItemID / UserID are identifiers, not numbers: force them to `str` so that
# dtype inference can never turn all-digit ids into integers (which would drop
# leading zeros and would not match the string ids read from the metadata file
# in later cells).
df = pd.read_csv(
    data_path + domain[0] + ".csv",
    sep=",",
    names=["ItemID", "UserID", "Response", "Timestamp"],
    dtype={"ItemID": str, "UserID": str},
)
df.head(3)

Unnamed: 0,ItemID,UserID,Response,Timestamp
0,1527665,A3478QRKQDOPQ2,5.0,1362960000
1,1527665,A2VHSG6TZHU1OB,5.0,1361145600
2,1527665,A23EJWOW1TLENE,5.0,1358380800


In [4]:
# Size of the raw rating data: distinct users / items and the share of empty
# cells in the user-item interaction matrix.
n_user = len(df["UserID"].unique())
n_item = len(df["ItemID"].unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"sparsity: {1.0 - len(df) / (n_user * n_item)}")

#user: 3826085
#item: 182032
sparsity: 0.9999874142903511


In [5]:
from data.preprocess import run_multicore, run_multicore_asymetric
# Run the asymmetric multicore filter with the user / item thresholds taken
# from the active `domain` entry.
# multicore_data = run_multicore(df[["UserID", "ItemID", "Response", "Timestamp"]], n_core = domain[2])
multicore_data = run_multicore_asymetric(
    df[["UserID", "ItemID", "Response", "Timestamp"]],
    n_core_user = domain[2],
    n_core_item = domain[3],
)
# multicore_data = run_multicore_asymetric(multicore_data, n_core_user = 40, n_core_item = 10)

N-core is set to [5,100]
Filtering (40,10)-core data
Iteration 1


100%|██████████| 8765568/8765568 [00:15<00:00, 573537.91it/s]


Number of removed record: 8031261
Iteration 2


100%|██████████| 734307/734307 [00:00<00:00, 802205.09it/s]


Number of removed record: 167037
Iteration 3


100%|██████████| 567270/567270 [00:01<00:00, 552201.80it/s]


Number of removed record: 56148
Iteration 4


100%|██████████| 511122/511122 [00:00<00:00, 738266.72it/s]


Number of removed record: 16655
Iteration 5


100%|██████████| 494467/494467 [00:00<00:00, 503451.53it/s]


Number of removed record: 5829
Iteration 6


100%|██████████| 488638/488638 [00:00<00:00, 540113.36it/s]


Number of removed record: 2257
Iteration 7


100%|██████████| 486381/486381 [00:00<00:00, 809299.26it/s]


Number of removed record: 1041
Iteration 8


100%|██████████| 485340/485340 [00:00<00:00, 822322.52it/s]


Number of removed record: 560
Iteration 9


100%|██████████| 484780/484780 [00:00<00:00, 913801.66it/s]


Number of removed record: 332
Iteration 10


100%|██████████| 484448/484448 [00:00<00:00, 773351.75it/s]


Number of removed record: 216
Iteration 11


100%|██████████| 484232/484232 [00:00<00:00, 915767.31it/s]


Number of removed record: 88
Iteration 12


100%|██████████| 484144/484144 [00:00<00:00, 892795.81it/s]


Number of removed record: 3
Iteration 13


100%|██████████| 484141/484141 [00:00<00:00, 712919.59it/s]


Number of removed record: 0
Size change: 8765568 --> 484141


In [6]:
# Same size / sparsity summary as for the raw data, now on the filtered frame.
n_user = len(multicore_data["UserID"].unique())
n_item = len(multicore_data["ItemID"].unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"#record: {len(multicore_data)}")
print(f"sparsity: {1.0 - len(multicore_data) / (n_user * n_item)}")

#user: 5515
#item: 13509
#record: 484141
sparsity: 0.9935016493151505


In [7]:
# Lookup table of every item id still in the data; the metadata scan in the
# next cell removes ids from it as they are found.
items = dict.fromkeys(multicore_data['ItemID'].unique(), False)
print(len(items))

13509


In [8]:
# import pandas as pd
from tqdm import tqdm
# pd.read_csv(data_path + "meta/filtered_meta.csv", sep = '\t', nrows = 3)
# Single pass over the filtered metadata file, keeping the rows of items that
# occur in the data. Each matched id is removed from `items`, so after the loop
# `items` holds exactly the ids for which no metadata row was found.
item_meta = {}
with open(data_path + "meta/filtered_meta.csv", 'r') as fin:
    fin.readline()  # skip the header row
    for i, line in tqdm(enumerate(fin)):
        meta_info = line.strip().split("\t")
        item_id = meta_info[0]
        if item_id not in items:
            continue
        item_meta[item_id] = meta_info
        items.pop(item_id)
        if not items:
            # Every item has been matched; no need to read the rest of the file.
            break
print("Item meta info of items in data set:")
print(f"Found: {len(item_meta)}")
print(f"Missing: {len(items)}")

15138777it [00:30, 498931.12it/s]

Item meta info of items in data set:
Found: 13493
Missing: 16





In [9]:
# Keep only the interactions whose item is NOT among the ids still left in
# `items`, then run the multicore filter again on the reduced data.
selected_rows = [iid not in items for iid in multicore_data["ItemID"]]
multicore_data = multicore_data[selected_rows]
multicore_data = run_multicore_asymetric(multicore_data, domain[2], domain[3])

N-core is set to [5,100]
Filtering (40,10)-core data
Iteration 1


100%|██████████| 483529/483529 [00:00<00:00, 724512.47it/s]


Number of removed record: 352
Iteration 2


100%|██████████| 483177/483177 [00:00<00:00, 628045.56it/s]


Number of removed record: 58
Iteration 3


100%|██████████| 483119/483119 [00:00<00:00, 1093326.27it/s]


Number of removed record: 89
Iteration 4


100%|██████████| 483030/483030 [00:00<00:00, 922976.79it/s] 


Number of removed record: 259
Iteration 5


100%|██████████| 482771/482771 [00:00<00:00, 875666.39it/s]


Number of removed record: 77
Iteration 6


100%|██████████| 482694/482694 [00:00<00:00, 904587.07it/s]


Number of removed record: 56
Iteration 7


100%|██████████| 482638/482638 [00:00<00:00, 782859.15it/s] 


Number of removed record: 14
Iteration 8


100%|██████████| 482624/482624 [00:00<00:00, 641878.12it/s]


Number of removed record: 47
Iteration 9


100%|██████████| 482577/482577 [00:00<00:00, 809491.27it/s]


Number of removed record: 58
Iteration 10


100%|██████████| 482519/482519 [00:00<00:00, 640432.09it/s]


Number of removed record: 8
Iteration 11


100%|██████████| 482511/482511 [00:00<00:00, 803271.60it/s]


Number of removed record: 0
Size change: 483529 --> 482511


In [10]:
# Size / sparsity summary of the data after the second filtering pass.
n_user = len(multicore_data["UserID"].unique())
n_item = len(multicore_data["ItemID"].unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"#record: {len(multicore_data)}")
print(f"sparsity: {1.0 - len(multicore_data) / (n_user * n_item)}")

#user: 5497
#item: 13457
#record: 482511
sparsity: 0.9934772125159124


In [11]:
# Order each user's interactions by time before the sequential holdout.
multicore_data = multicore_data.sort_values(['UserID', 'Timestamp'])

In [12]:
from utils import set_random_seed
from data.preprocess import holdout_data_sequential, recheck_exist

# Fix the seed first so the split is reproducible.
set_random_seed(9)

# 80/10/10 "warm" holdout; each split gets a fresh 0..n-1 index.
# NOTE(review): presumably a per-user split of the time-ordered history — confirm in data.preprocess.
trainset, valset, testset = (
    split.reset_index(drop = True)
    for split in holdout_data_sequential(multicore_data, holdout_type = "warm", ratio = [0.8,0.1,0.1])
)

# recheck if there is any unseen item in val or test, if there is move corresponding user history into train
trainset, valset, testset = recheck_exist(trainset, valset, testset, field_name = "ItemID")

Build user history


482511it [00:01, 381098.46it/s]


Holdout user histories


100%|██████████| 5497/5497 [00:00<00:00, 8266.28it/s]


Move unseen ItemID from val to train
0/46010, finish in 1230793476.1s.   Moving user data
Before moving: Target DataFrame: 390491, Source Data Frame: 46010


100%|██████████| 5/5 [00:00<00:00, 2944.20it/s]
100%|██████████| 5492/5492 [00:01<00:00, 5459.45it/s]


#user moved: 5
After moving: Target DataFrame: 390599, Source Data Frame: 45902
Move unseen ItemID from test to train, this may also move users in val to train
0/46010, finish in 1468834638.6s.   Val --> Train
Moving user data
Before moving: Target DataFrame: 390599, Source Data Frame: 45902


100%|██████████| 12/12 [00:00<00:00, 2815.91it/s]
100%|██████████| 5480/5480 [00:00<00:00, 6634.93it/s]


#user moved: 12
After moving: Target DataFrame: 390698, Source Data Frame: 45803
Test --> Train
Moving user data
Before moving: Target DataFrame: 390698, Source Data Frame: 46010


100%|██████████| 12/12 [00:00<00:00, 3999.97it/s]
100%|██████████| 5485/5485 [00:01<00:00, 5172.36it/s]


#user moved: 12
After moving: Target DataFrame: 390797, Source Data Frame: 45911


In [13]:
from utils import setup_path
# Write the three splits as tab-separated files under <target_path>/tsv_data/.
save_path = target_path + "tsv_data/"
setup_path(save_path, is_dir = True)
for split_name, split_df in (("train", trainset), ("val", valset), ("test", testset)):
    split_df.to_csv(f"{save_path}{split_name}.tsv", sep = '\t', index = False)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV/tsv_data" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV/tsv_data/" existed


In [14]:
# Number of distinct users in (train, val, test).
tuple(len(split["UserID"].unique()) for split in (trainset, valset, testset))

(5497, 5480, 5485)

In [15]:
# Number of distinct items in (train, val, test).
tuple(len(split["ItemID"].unique()) for split in (trainset, valset, testset))

(13457, 11112, 10994)

### Meta Data

See fields description [here](https://nijianmo.github.io/amazon/index.html)

15023058 lines

Fields: 'category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'image', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat', 'similar_item', 'date', 'price', 'asin'

Selected fields: "asin", "category", "price", "brand"


In [16]:
# selected_fields = ["asin", "category", "price", "also_buy", "also_view", "brand"]
# Raw metadata fields kept as item features.
# NOTE(review): `selected_fields` is not referenced by any other cell visible in this notebook — confirm it is still needed.
selected_fields = ["asin", "category", "price", "brand"]
# import numpy as np

# def price_to_id(price):
#     if price == '':
#         return 0
#     p = price.strip().replace('$','').split('.')
#     l,v = len(p[0]), int(p[0][0])
# #     return (l-1) * 3 + (0 if v<3 else 1 if v < 6 else 2) + 1
#     return (l-1) * 10 + v + 1

# def clean(textline):
#     return textline.replace(',','_').replace(' ','').replace('&','_').replace('\t','')

# with open(data_path + "meta/filtered_meta.csv", 'w') as fout:
#     fout.write("ItemID\tCategory\tMin_Price\tMax_Price\tAlso_Buy\tAlso_View\tBrand\n")
#     with open(data_path + "meta/All_Amazon_Meta.json", 'r') as fin:
#         for i,line in enumerate(fin):
#             if i % 100000 == 0:
#                 print(f"#line: {i}", end = '\r')
#             info = eval(line)
#             fout.write(info['asin'] + "\t") # ItemID
#             fout.write(",".join([clean(c) for c in info['category'] \
#                                  if len(c) < 30]) + "\t") # Category
#             try:
#                 if len(info['price']) > 20:
#                     fout.write("0\t0\t")
#                 else:
#                     price = info['price'].split("-") # Min_Price and Max_Price
#                     if len(price) == 2:
#                         fout.write(f"{price_to_id(price[0])}\t{price_to_id(price[1])}\t")
#                     else:
#                         pidx = price_to_id(price[0])
#                         fout.write(f"{pidx}\t{pidx}\t")
#             except:
#                 fout.write("0\t0\t")
#             fout.write(",".join(info['also_buy']) + "\t")
#             fout.write(",".join(info['also_view']) + "\t")
#             fout.write(clean(info['brand'][:30]) + "\n")
#     print(f"#line: {i}", end = '\r')

In [17]:
import pandas as pd
# Peek at the first rows of the tab-separated filtered metadata file.
pd.read_csv(f"{data_path}meta/filtered_meta.csv", sep = '\t', nrows = 3)

Unnamed: 0,ItemID,Category,Min_Price,Max_Price,Also_Buy,Also_View,Brand
0,6305121869,"Clothing_Shoes_Jewelry,Women,Clothing,Tops_Tee...",10,12,,,Ninasill_Blouse
1,6318708057,"Clothing_Shoes_Jewelry,Traditional_CulturalWea...",12,12,,,Coolred-Women
2,6342506256,"Clothing_Shoes_Jewelry,Men,Clothing,Shorts,Car...",13,13,,"B07CRJ95M7,B008AHISU4,B07B8F98W2,B07DD98Q7R,B0...",Gaok


In [18]:
# Rebuild the id lookup from the final filtered data; the scan in the next cell
# removes ids from it as their metadata rows are found.
items = dict.fromkeys(multicore_data['ItemID'].unique(), False)
print(len(items))

13457


In [19]:
from tqdm import tqdm
# Collect the metadata row of every item in the final data set. Matched ids are
# removed from `items`; the scan stops early once `items` is empty.
item_meta = {}
with open(data_path + "meta/filtered_meta.csv", 'r') as fin:
    fin.readline()  # skip the header row
    for i, line in tqdm(enumerate(fin)):
        meta_info = line.strip().split("\t")
        item_id = meta_info[0]
        if item_id not in items:
            continue
        item_meta[item_id] = meta_info
        items.pop(item_id)
        if not items:
            break
print("Item meta info of items in data set:")
print(f"Found: {len(item_meta)}")
print(f"Missing: {len(items)}")

12178103it [00:26, 467909.88it/s]

Item meta info of items in data set:
Found: 13457
Missing: 0





In [20]:
import pandas as pd
# One row per item from the collected metadata; the AlsoBuy / AlsoView columns
# are dropped and the item-id index is replaced by a plain 0..n-1 index.
item_meta_df = (
    pd.DataFrame.from_dict(
        item_meta,
        orient = "index",
        columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"],
    )[["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"]]
    .reset_index(drop = True)
)
# item_meta_df = pd.DataFrame.from_dict(item_meta, orient = "index", columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"])
# item_meta_df.insert(0, 'ItemID', item_meta_df.index)
item_meta_df.head(3)

Unnamed: 0,ItemID,Category,MinPrice,MaxPrice,Brand
0,5019281,"Movies_TV,Movies",0,0,
1,5119367,"Movies_TV,StudioSpecials,WarnerHomeVideo,AllTi...",0,0,BenKingsley
2,6486576,"Movies_TV,ChristianVideo,General",5,5,BrianDeacon


In [21]:
from utils import setup_path
# Write the item metadata table as a tab-separated file.
save_path = f"{target_path}meta_data/item.meta"
setup_path(save_path, is_dir = False)
item_meta_df.to_csv(save_path, sep = '\t', index = False)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV/meta_data" existed


In [22]:
from data.preprocess import build_vocab
from utils import setup_path
# Build the vocab file covering every item field.
save_path = f"{target_path}meta_data/item_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(
    item_meta_df,
    save_path,
    ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"],
)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV/meta_data" existed
Vocab file saved to: /home/sl1471/workspace/experiments/amz_Movies_and_TV/meta_data/item_fields.vocab


In [23]:
import pandas as pd
# item_fields_meta = pd.DataFrame({"field_name": ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"], 
#                                  "field_type": ["nominal", "nominal", "ordinal", "ordinal", "nominal", "nominal", "nominal"], 
#                                  "value_type": ["int", "str", "int", "int", "int", "int", "str"], 
#                                  "field_enc": ["v2id", "v2onehot", "v2id", "v2id", "v2multid", "v2multid", "v2id"], 
#                                  "vocab_key": ["ItemID", "Category", "MinPrice", "MaxPrice", "ItemID", "ItemID", "Brand"]})
# Schema of the item fields, one row per field:
# (field_name, field_type, value_type, field_enc, vocab_key)
item_fields_meta = pd.DataFrame(
    [
        ("ItemID",   "nominal", "str", "v2id",     "ItemID"),
        ("Category", "nominal", "str", "v2onehot", "Category"),
        ("MinPrice", "ordinal", "int", "v2id",     "MinPrice"),
        ("MaxPrice", "ordinal", "int", "v2id",     "MaxPrice"),
        ("Brand",    "nominal", "str", "v2id",     "Brand"),
    ],
    columns = ["field_name", "field_type", "value_type", "field_enc", "vocab_key"],
)
# Store the item field schema as a tab-separated file in meta_data/.
item_fields_meta.to_csv(f"{target_path}meta_data/item_fields.meta", sep = '\t', index = False)

### User Meta

In [24]:
# The only user "metadata" is the id itself: a one-column frame holding every
# distinct user in the final data.
user_ids = multicore_data['UserID'].unique()
users = dict(zip(user_ids, user_ids))
user_meta_df = (
    pd.DataFrame.from_dict(users, orient = "index", columns = ["UserID"])
    .reset_index(drop = True)
)
user_meta_df.head(3)

Unnamed: 0,UserID
0,A100WO06OQR8BQ
1,A10175AMUHOQC4
2,A101IGU6UDKW3X


In [25]:
from utils import setup_path
# Write the user metadata table as a tab-separated file.
save_path = f"{target_path}meta_data/user.meta"
setup_path(save_path, is_dir = False)
user_meta_df.to_csv(save_path, sep = '\t', index = False)

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV/meta_data" existed


In [26]:
from data.preprocess import build_vocab
from utils import setup_path
# Build the vocab file for the single user field.
save_path = f"{target_path}meta_data/user_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(user_meta_df, save_path, ["UserID"])

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV" existed
dir "/home/sl1471/workspace/experiments/amz_Movies_and_TV/meta_data" existed
Vocab file saved to: /home/sl1471/workspace/experiments/amz_Movies_and_TV/meta_data/user_fields.vocab


In [27]:
import pandas as pd
# Schema of the user fields, one row per field:
# (field_name, field_type, value_type, field_enc, vocab_key)
user_fields_meta = pd.DataFrame(
    [("UserID", "nominal", "str", "v2id", "UserID")],
    columns = ["field_name", "field_type", "value_type", "field_enc", "vocab_key"],
)
# Store the user field schema as a tab-separated file in meta_data/.
user_fields_meta.to_csv(f"{target_path}meta_data/user_fields.meta", sep = '\t', index = False)