### Generate Transfer Learning Data

Each line of a domain's user embedding data contains:
* UserID
* Embedding

In [1]:
import os,sys,inspect
# Make the project root (the parent of the notebook's working directory)
# importable, so that project-local packages can be imported by later cells.
#
# The working directory is read directly with os.getcwd(). The previous
# inspect.getfile(inspect.currentframe()) approach only yielded the working
# directory because the kernel assigned cells a pseudo-filename that abspath()
# resolved against the cwd; kernels that compile cells under a temp-directory
# filename would put an unrelated directory on sys.path instead.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir)

# Location of the processed experiment data used throughout the notebook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_key = 'amz_subset/'
PROCESSED_DATA_ROOT = "/home/sl1471/workspace/experiments/"
target_path = PROCESSED_DATA_ROOT + data_key

In [17]:
# Number of leading entries of the domain list to use in this run.
n_domain = 18
target_path = PROCESSED_DATA_ROOT + "amz_subset/"
# domains = ["Books", "Kindle_Store", "Sports_and_Outdoors"]
all_domains = [
    "Books",
    "Clothing_Shoes_and_Jewelry",
    "Home_and_Kitchen",
    "Electronics",
    "Sports_and_Outdoors",
    "Cell_Phones_and_Accessories",
    "Tools_and_Home_Improvement",
    "Movies_and_TV",
    "Toys_and_Games",
    "Automotive",
    "Pet_Supplies",
    "Kindle_Store",
    "Office_Products",
    "Patio_Lawn_and_Garden",
    "Grocery_and_Gourmet_Food",
    "CDs_and_Vinyl",
    "Arts_Crafts_and_Sewing",
    "Video_Games",
]
# Keep only the first n_domain names; the list order also fixes the 1-based
# domain index that later cells derive from the position in `domains`.
domains = all_domains[:n_domain]

In [7]:
from model.rec_env.GRU4Rec import GRU4Rec
import argparse
# Let GRU4Rec register its own command-line options on a fresh parser, then
# parse a fixed argument list (the notebook has no real command line).
model_cli_args = ['--model_path', target_path + 'models/env/',
                  '--loss', 'softmax']
parser = GRU4Rec.parse_model_args(argparse.ArgumentParser())
args = parser.parse_args(model_cli_args)

In [8]:
def load_vocab(file_path):
    """Read a tab-separated vocabulary file into per-field lookup tables.

    The first line of the file is skipped. Every remaining non-blank line is
    stripped and split on tabs: column 0 names the field ("UserID" or
    "ItemID"), column 1 is the raw ID and column 2 is parsed as an integer.

    Returns:
        dict of the form {"UserID": {raw_id: int}, "ItemID": {raw_id: int}}.

    Raises:
        KeyError: if a line names a field other than "UserID" / "ItemID".
        IndexError: if a non-blank line has fewer than three columns.
        ValueError: if the third column is not an integer.
    """
    vocab = {"UserID": {}, "ItemID": {}}
    with open(file_path, 'r') as fin:
        next(fin, None)  # discard the first line
        for raw_line in fin:
            record = raw_line.strip()
            if not record:
                continue
            cols = record.split('\t')
            field, raw_id, encoded = cols[0], cols[1], cols[2]
            vocab[field][raw_id] = int(encoded)
    return vocab

In [9]:
import pandas as pd
# model_paths = pd.read_csv(target_path + "meta_data/domain_models.txt", sep = '\t')
# For every domain, build a CPU GRU4Rec, restore its "warm" checkpoint
# (weights only, no optimizer state) and read the matching vocabulary file.
domain_models = {}
domain_vocabs = {}
for d in domains:
    checkpoint_file = target_path + 'models/env/GRU4Rec_' + d + '_warm.pkl'
    vocab_file = target_path + 'meta_data/' + d + '/vocab.txt'
    env_model = GRU4Rec(args, None, "cpu")
    domain_models[d] = env_model
    env_model.load_from_checkpoint(checkpoint_file, with_optimizer = False)
    domain_vocabs[d] = load_vocab(vocab_file)

Load (checkpoint) from /home/sl1471/workspace/experiments/amz_subset/models/env/GRU4Rec_Books_warm.pkl
Load (checkpoint) from /home/sl1471/workspace/experiments/amz_subset/models/env/GRU4Rec_Clothing_Shoes_and_Jewelry_warm.pkl
Load (checkpoint) from /home/sl1471/workspace/experiments/amz_subset/models/env/GRU4Rec_Home_and_Kitchen_warm.pkl


In [10]:
# For each unordered pair of domains, report both user-vocabulary sizes and
# how many user IDs occur in both vocabularies.
for i, d_from in enumerate(domains[:-1]):
    vocab_from = domain_vocabs[d_from]
    users_from = vocab_from["UserID"]
    for d_towards in domains[i + 1:]:
        vocab_towards = domain_vocabs[d_towards]
        users_towards = vocab_towards["UserID"]
        shared_users = [u for u in users_from if u in users_towards]
        summary = f"{d_from}({len(users_from)})<-->{d_towards}({len(users_towards)}):"
        summary += f" n_common_user = {len(shared_users)}"
        print(summary)

Books(78108)<-->Clothing_Shoes_and_Jewelry(197948): n_common_user = 4744
Books(78108)<-->Home_and_Kitchen(127974): n_common_user = 5312
Clothing_Shoes_and_Jewelry(197948)<-->Home_and_Kitchen(127974): n_common_user = 37303


In [11]:
# combined_df = pd.read_csv(target_path + "tsv_data/combined_18/warm_train.tsv", sep = '\t')
# Build one interaction table per phase by stacking every domain's warm split,
# tagging each row with its 1-based domain index (position in `domains`),
# and ordering the result by user and then by timestamp.
combined_data = {}
for phase in ["train", "val", "test"]:
    df_dict = {}
    for domain_idx, d in enumerate(domains, start = 1):
        phase_file = target_path + "tsv_data/" + d + "/warm_" + phase + ".tsv"
        df = pd.read_csv(phase_file, sep = '\t')
        df.insert(2, "Domain", [domain_idx] * len(df))
        df_dict[d] = df
    stacked = pd.concat(df_dict.values(), axis = 0)
    combined_data[phase] = stacked.sort_values(by = ["UserID", "Timestamp"])

In [12]:
# Row counts of the combined frames, in the order the phases were inserted.
print([len(combined_data[phase]) for phase in combined_data])

[5109524, 483759, 487945]


In [13]:
# Sanity check: print every training row of the second distinct user.
train_df = combined_data["train"]
users = list(train_df["UserID"].unique())
second_user_rows = train_df[train_df["UserID"] == users[1]]
for row in second_user_rows.values:
    print(row)

['A0039616ADOZ0KMWQRNX' 'B01CZMQCPC' 2 5.0 1475539200]
['A0039616ADOZ0KMWQRNX' 'B0007KPP7G' 2 5.0 1477440000]
['A0039616ADOZ0KMWQRNX' 'B0007KPPAS' 2 5.0 1477440000]
['A0039616ADOZ0KMWQRNX' 'B00C0ZEE2S' 2 5.0 1477440000]
['A0039616ADOZ0KMWQRNX' 'B01B5DLI88' 2 4.0 1477440000]
['A0039616ADOZ0KMWQRNX' 'B01B5DLG7G' 2 4.0 1477440000]
['A0039616ADOZ0KMWQRNX' 'B00TNJI6T6' 2 4.0 1479945600]
['A0039616ADOZ0KMWQRNX' 'B00IITT30I' 2 5.0 1485648000]
['A0039616ADOZ0KMWQRNX' 'B00NKY99F0' 2 5.0 1485648000]
['A0039616ADOZ0KMWQRNX' 'B00TF1U834' 2 4.0 1491264000]


In [14]:
from tqdm import tqdm
from et_utils import setup_path
# Attach to every interaction row the embedding of its item (looked up in the
# model of the row's domain) and write each phase to
# tsv_data/combined_<number of domains>/warm_<phase>.tsv.
output_dir = target_path + "tsv_data/combined_" + str(len(domains))
setup_path(output_dir, is_dir = True)

# Convert each domain's embedding weight to NumPy once, instead of calling
# .detach().numpy() for every single row inside the loop below.
domain_emb_tables = {d: domain_models[d].pred_embeddings.weight.detach().numpy() for d in domains}

for phase in ["train", "val", "test"]:
    print(phase)
    # Work on a frame without an "emb" column: DataFrame.insert raises
    # ValueError when the column already exists, which made this cell fail
    # whenever it was re-run. drop() returns a new frame, so the frame stored
    # in combined_data is only replaced once the embeddings are ready.
    df = combined_data[phase].drop(columns = "emb", errors = "ignore")
    embeddings = []
    for iid, d in tqdm(zip(df["ItemID"], df["Domain"]), total = len(df)):
        domain_name = domains[d-1]  # "Domain" holds a 1-based index into `domains`
        encoded_iid = domain_vocabs[domain_name]["ItemID"][iid]
        embeddings.append([v for v in domain_emb_tables[domain_name][encoded_iid]])
    # Position 5 assumes the five columns UserID, ItemID, Domain, Response,
    # Timestamp precede it, i.e. "emb" becomes the last column — TODO confirm
    # against the input TSV schema.
    df.insert(5, "emb", embeddings)
    combined_data[phase] = df
    df.to_csv(output_dir + "/warm_" + phase + ".tsv", index = False, sep = '\t')

error when creating ""
dir "/home" existed
dir "/home/sl1471" existed
dir "/home/sl1471/workspace" existed
dir "/home/sl1471/workspace/experiments" existed
dir "/home/sl1471/workspace/experiments/amz_subset" existed
dir "/home/sl1471/workspace/experiments/amz_subset/tsv_data" existed
dir "/home/sl1471/workspace/experiments/amz_subset/tsv_data/combined_3" existed
train


5109524it [01:45, 48482.84it/s]


val


483759it [00:13, 36526.20it/s]


test


487945it [00:08, 54637.83it/s]


In [15]:
import pandas as pd
# Read the exported combined test split back and preview its first three rows.
test_tsv = target_path + "tsv_data/combined_" + str(len(domains)) + "/warm_test.tsv"
testset = pd.read_csv(test_tsv, sep = '\t')
testset.head(3)

Unnamed: 0,UserID,ItemID,Domain,Response,Timestamp,emb
0,A0039616ADOZ0KMWQRNX,B011T2S93W,2,5.0,1535414400,"[0.21389227, 0.10835785, -0.17878664, -0.01884..."
1,A00463782V7TKAP9EMNL,1548868086,1,5.0,1506297600,"[-0.748047, 0.6860984, 0.47165033, 0.09554045,..."
2,A00463782V7TKAP9EMNL,0997422580,1,4.0,1506297600,"[-0.783454, 0.7996321, 0.7069402, -0.004004198..."


### Cross-domain Meta Data

In [19]:
# Write one tab-separated row per domain recording which model class was used
# and where its checkpoint and vocabulary files live.
model_class = "GRU4Rec"
with open(target_path + "meta_data/domain_env.meta", 'w') as fout:
    fout.write("domain_name\tmodel_class\tmodel_path\tvocab_path\n")
    for d in domains:
        model_file = target_path + "models/env/" + model_class + "_" + d + "_warm.pkl"
        vocab_file = target_path + "meta_data/" + d + "/vocab.txt"
        fout.write(d + "\t" + model_class + "\t" + model_file + "\t" + vocab_file + "\n")

In [25]:
import pandas as pd
# Load the meta file keyed by its first column (domain name) and show one entry.
meta_table = pd.read_table(target_path + "meta_data/domain_env.meta", sep = '\t', index_col = 0)
domain_meta = meta_table.to_dict(orient = 'index')
domain_meta["Books"]

{'model_class': 'GRU4Rec',
 'model_path': '/home/sl1471/workspace/experiments/amz_subset/models/env/GRU4Rec_Books_warm.pkl',
 'vocab_path': '/home/sl1471/workspace/experiments/amz_subset/meta_data/Books/vocab.txt'}