## Preprocess Steps

* Build vocab files for user and item, textline format: field  value  idx
* Filter multicore rating data
* Holdout train-val-test dataset

In [1]:
import os,sys,inspect
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

DATA_ROOT = "/home/sl1471/public/"
PROCESSED_DATA_ROOT = "/home/sl1471/workspace/experiments/"
data_path = DATA_ROOT + "amazon_rating_only/"

In [2]:
# data file name, main_cat field in meta
# domain = ["Books", "Books", 50, 50]
# domain = ["Clothing_Shoes_and_Jewelry", "Clothing_Shoes_Jewelry", 25, 5]
# domain = ["Home_and_Kitchen", "Home_and_Kitchen", 25, 5]
# domain = ["Electronics", "Electronics", 24, 10]
# domain = ["Sports_and_Outdoors", "Sports", 10, 5]
# domain = ["Movies_and_TV", "Movies_and_TV", 30, 10]
domain = ["Video_Games", "Video_Games", 15,  5]


target_path = PROCESSED_DATA_ROOT + "amz_" + domain[0] + "/"

### Rating Data

In [3]:
import pandas as pd
df = pd.read_table(data_path + domain[0] + ".csv", sep=",", names = ["ItemID", "UserID", "Response", "Timestamp"])
df[:3]

Unnamed: 0,ItemID,UserID,Response,Timestamp
0,439381673,A21ROB4YDOZA5P,1.0,1402272000
1,439381673,A3TNZ2Q5E7HTHD,3.0,1399680000
2,439381673,A1OKRM3QFEATQO,4.0,1391731200


In [4]:
n_user, n_item = len(df.UserID.unique()), len(df.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"sparsity: {1.0 - len(df) / (n_user * n_item)}")

#user: 1540618
#item: 71982
sparsity: 0.9999768672332068


In [None]:
from data.preprocess import run_multicore, run_multicore_asymetric
# multicore_data = run_multicore(df[["UserID", "ItemID", "Response", "Timestamp"]], n_core = domain[2])
multicore_data = run_multicore_asymetric(df[["UserID", "ItemID", "Response", "Timestamp"]], n_core_user = domain[2], n_core_item = domain[3])
# multicore_data = run_multicore_asymetric(multicore_data, n_core_user = 30, n_core_item = 10)

In [None]:
n_user, n_item = len(multicore_data.UserID.unique()), len(multicore_data.ItemID.unique())
print(f"#user: {n_user}")
print(f"#item: {n_item}")
print(f"#record: {len(multicore_data)}")
print(f"sparsity: {1.0 - len(multicore_data) / (n_user * n_item)}")

In [None]:
multicore_data = multicore_data.sort_values(by=['UserID','Timestamp'])
multicore_data[:3]

In [None]:
from utils import set_random_seed
set_random_seed(9)
from data.preprocess import holdout_data_sequential, recheck_exist
trainset, valset, testset = holdout_data_sequential(multicore_data, holdout_type = "warm", ratio = [0.8,0.1,0.1])
trainset = trainset.reset_index(drop = True)
valset = valset.reset_index(drop = True)
testset = testset.reset_index(drop = True)
# recheck if there is any unseen item in val or test, if there is move corresponding user history into train
trainset, valset, testset = recheck_exist(trainset, valset, testset, field_name = "ItemID")

In [None]:
from utils import setup_path
save_path = target_path + "tsv_data/"
setup_path(save_path, is_dir = True)
trainset.to_csv(save_path + "train.tsv", sep = '\t', index = False)
valset.to_csv(save_path + "val.tsv", sep = '\t', index = False)
testset.to_csv(save_path + "test.tsv", sep = '\t', index = False)

In [None]:
len(trainset["UserID"].unique()), len(valset["UserID"].unique()), len(testset["UserID"].unique())

In [None]:
len(trainset["ItemID"].unique()), len(valset["ItemID"].unique()), len(testset["ItemID"].unique())

### Meta Data

See fields description [here](https://nijianmo.github.io/amazon/index.html)

15023058 lines

Fields: 'category', 'tech1', 'description', 'fit', 'title', 'also_buy', 'image', 'tech2', 'brand', 'feature', 'rank', 'also_view', 'details', 'main_cat', 'similar_item', 'date', 'price', 'asin'

Selected fields: "asin", "category", "price", "brand"


In [None]:
# selected_fields = ["asin", "category", "price", "also_buy", "also_view", "brand"]
selected_fields = ["asin", "category", "price", "brand"]
# import numpy as np

# def price_to_id(price):
#     if price == '':
#         return 0
#     p = price.strip().replace('$','').split('.')
#     l,v = len(p[0]), int(p[0][0])
# #     return (l-1) * 3 + (0 if v<3 else 1 if v < 6 else 2) + 1
#     return (l-1) * 10 + v + 1

# def clean(textline):
#     return textline.replace(',','_').replace(' ','').replace('&','_').replace('\t','')

# with open(data_path + "meta/filtered_meta.csv", 'w') as fout:
#     fout.write("ItemID\tCategory\tMin_Price\tMax_Price\tAlso_Buy\tAlso_View\tBrand\n")
#     with open(data_path + "meta/All_Amazon_Meta.json", 'r') as fin:
#         for i,line in enumerate(fin):
#             if i % 100000 == 0:
#                 print(f"#line: {i}", end = '\r')
#             info = eval(line)
#             fout.write(info['asin'] + "\t") # ItemID
#             fout.write(",".join([clean(c) for c in info['category'] \
#                                  if len(c) < 30]) + "\t") # Category
#             try:
#                 if len(info['price']) > 20:
#                     fout.write("0\t0\t")
#                 else:
#                     price = info['price'].split("-") # Min_Price and Max_Price
#                     if len(price) == 2:
#                         fout.write(f"{price_to_id(price[0])}\t{price_to_id(price[1])}\t")
#                     else:
#                         pidx = price_to_id(price[0])
#                         fout.write(f"{pidx}\t{pidx}\t")
#             except:
#                 fout.write("0\t0\t")
#             fout.write(",".join(info['also_buy']) + "\t")
#             fout.write(",".join(info['also_view']) + "\t")
#             fout.write(clean(info['brand'][:30]) + "\n")
#     print(f"#line: {i}", end = '\r')

In [None]:
import pandas as pd
from tqdm import tqdm
pd.read_csv(data_path + "meta/filtered_meta.csv", sep = '\t', nrows = 3)

In [None]:
items = {iid: False for iid in multicore_data['ItemID'].unique()}
print(len(items))

In [None]:
item_meta = {}
with open(data_path + "meta/filtered_meta.csv", 'r') as fin:
    fin.readline()
    for i,line in tqdm(enumerate(fin)):
        meta_info = line.strip().split("\t")
        item_id = meta_info[0]
        if item_id in items:
            item_meta[item_id] = meta_info
            del items[item_id]
            if len(items) == 0:
                break
print(f"{len(item_meta)}/{len(items)}")
print(f"#missing item meta: {len(items)}")
for iid in items:
    item_meta[iid] = [iid, domain[1], 0, 0, '', '', '']

In [None]:
import pandas as pd
item_meta_df = pd.DataFrame.from_dict(item_meta, orient = "index", columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"])
item_meta_df = item_meta_df[["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"]]
item_meta_df = item_meta_df.reset_index(drop = True)
# item_meta_df = pd.DataFrame.from_dict(item_meta, orient = "index", columns = ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"])
# item_meta_df.insert(0, 'ItemID', item_meta_df.index)
item_meta_df[:3]

In [None]:
from utils import setup_path
save_path = target_path + "meta_data/item.meta"
setup_path(save_path, is_dir = False)
item_meta_df.to_csv(save_path, sep = '\t', index = False)

In [None]:
from data.preprocess import build_vocab
from utils import setup_path
save_path = target_path + "meta_data/item_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(item_meta_df, save_path, ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"])

In [None]:
import pandas as pd
# item_fields_meta = pd.DataFrame({"field_name": ["ItemID", "Category", "MinPrice", "MaxPrice", "AlsoBuy", "AlsoView", "Brand"], 
#                                  "field_type": ["nominal", "nominal", "ordinal", "ordinal", "nominal", "nominal", "nominal"], 
#                                  "value_type": ["int", "str", "int", "int", "int", "int", "str"], 
#                                  "field_enc": ["v2id", "v2onehot", "v2id", "v2id", "v2multid", "v2multid", "v2id"], 
#                                  "vocab_key": ["ItemID", "Category", "MinPrice", "MaxPrice", "ItemID", "ItemID", "Brand"]})
item_fields_meta = pd.DataFrame({"field_name": ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"], 
                                 "field_type": ["nominal", "nominal", "ordinal", "ordinal", "nominal"], 
                                 "value_type": ["str", "str", "int", "int", "str"], 
                                 "field_enc": ["v2id", "v2onehot", "v2id", "v2id", "v2id"], 
                                 "vocab_key": ["ItemID", "Category", "MinPrice", "MaxPrice", "Brand"]})
item_fields_meta.to_csv(target_path + "meta_data/item_fields.meta", 
                        sep = '\t', index = False)

### User Meta

In [None]:
users = {uid: uid for uid in multicore_data['UserID'].unique()}
user_meta_df = pd.DataFrame.from_dict(users, orient = "index", columns = ["UserID"])
user_meta_df = user_meta_df.reset_index(drop = True)
user_meta_df[:3]

In [None]:
from utils import setup_path
save_path = target_path + "meta_data/user.meta"
setup_path(save_path, is_dir = False)
user_meta_df.to_csv(save_path, sep = '\t', index = False)

In [None]:
from data.preprocess import build_vocab
from utils import setup_path
save_path = target_path + "meta_data/user_fields.vocab"
setup_path(save_path, is_dir = False)
build_vocab(user_meta_df, save_path, ["UserID"])

In [None]:
import pandas as pd
user_fields_meta = pd.DataFrame({"field_name": ["UserID"], 
                          "field_type": ["nominal"], 
                          "value_type": ["str"], 
                          "field_enc": ["v2id"], 
                          "vocab_key": ["UserID"]})
user_fields_meta.to_csv(target_path + "meta_data/user_fields.meta", sep = '\t', index = False)