## Preprocess

In [None]:
import os,sys,inspect
# Put the parent of the directory that holds the running code on sys.path, so
# that `preprocess` (imported below) can be found there.
# NOTE(review): inside a notebook, inspect.getfile(inspect.currentframe()) is not
# a real file next to the notebook; with some kernel versions it may point into a
# temp directory, in which case `parentdir` would be wrong. Confirm on the target
# environment -- os.getcwd() may be the more robust anchor.
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 

from preprocess import setup_path, DOMAINS, N_CORES, ROOT_PATH
# Folder the raw per-domain rating files (`<domain>.csv`) are read from.
data_path = ROOT_PATH + "public/amz_rating/"
# Root folder for everything this notebook writes
# (multicore_data/, meta_data/, tsv_data/).
target_path = ROOT_PATH + "public/fedct/domain_data/"
# presumably creates the directory when it is missing -- confirm in preprocess.setup_path
setup_path(target_path, is_dir = True)
# Notebook-level aliases used by the later cells.
# `n_cores` is consumed as {domain: [n_core_user, n_core_item]}.
domains = DOMAINS
n_cores = N_CORES

In [None]:

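# Reference only: the live `domains` / `n_cores` values come from `preprocess`
# (DOMAINS / N_CORES) in the first cell. The commented block below shows the
# expected shape of those values -- presumably a snapshot of them; verify
# against `preprocess` before relying on the numbers.
# domains = ["Books", "Clothing_Shoes_and_Jewelry", "Home_and_Kitchen", 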
#            "Electronics", "Sports_and_Outdoors", "Tools_and_Home_Improvement", 
#            "Movies_and_TV", "Toys_and_Games", "Automotive", "Pet_Supplies", 
#            "Kindle_Store", "Office_Products", "Patio_Lawn_and_Garden", 
#            "Grocery_and_Gourmet_Food", "CDs_and_Vinyl", "Video_Games"]

# n_cores = {
#             "Books": [20,30],
#             "Kindle_Store": [5,5],
#             "Home_and_Kitchen": [10,10],
#             "Grocery_and_Gourmet_Food": [5,5],
#             "Clothing_Shoes_and_Jewelry": [10,10],
#             "Office_Products": [5,5],
#             "Pet_Supplies": [5,5],
#             "Tools_and_Home_Improvement": [5,5],
#             "Electronics": [10,10],
#             "Automotive": [5,5],
#             "Sports_and_Outdoors": [5,5],
#             "Patio_Lawn_and_Garden": [5,5],
#             "Toys_and_Games": [5,5],
#             "Movies_and_TV": [10,5],
#             "CDs_and_Vinyl": [5,5],
#             "Video_Games": [5,5]}


### 1. Multicore

In [None]:
import pandas as pd
from tqdm import tqdm
from preprocess import *

def filter_items_with_meta(df, meta_file_path):
    """Keep only the rows of `df` whose item has a line in the meta file.

    The meta file is read as tab-separated text; its first line is skipped as a
    header and the first field of every other line is taken as the item ID.
    When an ID occurs on several lines, only the first one is kept.

    Args:
        df: DataFrame with an "ItemID" column.
        meta_file_path: path of the tab-separated item meta file.

    Returns:
        (item_meta, df): `item_meta` maps each found item ID to the list of
        fields of its meta line; `df` is the input restricted to rows whose
        item was found (all columns are preserved, also for an empty input).
    """
    # Item IDs that still have no meta line; whatever remains at the end is missing.
    missing = set(df['ItemID'].unique())
    item_meta = {}
    with open(meta_file_path, 'r') as fin:
        fin.readline()  # skip the header line
        for line in tqdm(fin):
            if not missing:
                # Every item is resolved: no need to scan the rest of the file.
                break
            meta_info = line.strip().split("\t")
            item_id = meta_info[0]
            if item_id in missing:
                item_meta[item_id] = meta_info
                missing.discard(item_id)
    print("Item meta info of items in data set:")
    print(f"Found: {len(item_meta)}")
    print(f"Missing: {len(missing)}")
    # A boolean Series mask (instead of a Python list) stays a row filter even
    # when `df` is empty; an empty list would be read as "select no columns".
    df = df[~df["ItemID"].isin(missing)]
    return item_meta, df

def _write_id_field_meta(data, id_field, path_prefix):
    """Write the meta table, vocabulary and field description of one ID column.

    Creates `<path_prefix>.meta`, `<path_prefix>_fields.vocab` (through
    `build_vocab`) and `<path_prefix>_fields.meta`.
    """
    # The only feature of a user/item here is its own ID, one row per distinct ID.
    meta_df = pd.DataFrame({id_field: data[id_field].unique()})
    meta_df.to_csv(path_prefix + ".meta", sep = '\t', index = False)
    build_vocab(meta_df, path_prefix + "_fields.vocab", [id_field])
    fields_meta = pd.DataFrame({
        "field_name": [id_field], 
        "field_type": ["nominal"], 
        "value_type": ["str"], 
        "field_enc": ["v2id"], 
        "vocab_key": [id_field]})
    fields_meta.to_csv(path_prefix + "_fields.meta", sep = '\t', index = False)

def multi_domain_multicore(domain_n_cores, target_dir):
    """Filter every domain's ratings with `run_multicore_asymetric` and write the results.

    For each domain this reads `<data_path><domain>.csv` (no header, columns
    ItemID, UserID, Response, Timestamp; `data_path` is a notebook-level
    global), filters it, and writes:
      * `<target_dir>multicore_data/<domain>.tsv` -- the filtered records
      * `<target_dir>meta_data/<domain>_{item,user}.meta` -- distinct IDs
      * `<target_dir>meta_data/<domain>_{item,user}_fields.vocab`
      * `<target_dir>meta_data/<domain>_{item,user}_fields.meta`
    Summary statistics are printed per domain.

    Args:
        domain_n_cores: {domain: [n_core_user, n_core_item]}.
        target_dir: output root, ending with a path separator; the
            `multicore_data/` and `meta_data/` sub-folders must already exist.
    """
    for domain, cores in domain_n_cores.items():
        print(domain)
        print(cores)
        df = pd.read_table(data_path + domain + ".csv", sep=",", 
                           names = ["ItemID", "UserID", "Response", "Timestamp"])
        multicore_data = run_multicore_asymetric(df[["UserID", "ItemID", "Response", "Timestamp"]], 
                                                 n_core_user = cores[0], 
                                                 n_core_item = cores[1])
        # The filter is deliberately kept as a second pass over its own output,
        # as in the original code.
        # NOTE(review): presumably one pass may leave users/items that fell below
        # the threshold -- confirm against run_multicore_asymetric.
        multicore_data = run_multicore_asymetric(multicore_data, cores[0], cores[1])
        multicore_data.to_csv(target_dir + "multicore_data/" + domain + ".tsv", sep = '\t', index = False)
        n_user, n_item = len(multicore_data.UserID.unique()), len(multicore_data.ItemID.unique())
        print(f"#user: {n_user}")
        print(f"#item: {n_item}")
        print(f"#record: {len(multicore_data)}")
        # Sparsity is undefined when the filter removed everything; report NaN
        # instead of raising ZeroDivisionError and aborting the remaining domains.
        n_cells = n_user * n_item
        sparsity = 1.0 - len(multicore_data) / n_cells if n_cells else float("nan")
        print(f"sparsity: {sparsity}")
        # item meta, vocabulary and feature description
        _write_id_field_meta(multicore_data, "ItemID", target_dir + "meta_data/" + domain + "_item")
        # user meta, vocabulary and feature description
        _write_id_field_meta(multicore_data, "UserID", target_dir + "meta_data/" + domain + "_user")
        
        

In [None]:
# Prepare the two output folders multi_domain_multicore writes into, then
# process every domain listed in `n_cores` ({domain: [n_core_user, n_core_item]}).
setup_path(target_path + "meta_data/", is_dir = True)
setup_path(target_path + "multicore_data/", is_dir = True)
multi_domain_multicore(n_cores, target_path)

### 2. Cross-Domain Statistics

**Note**: Run the first cell and Section 1 before running this section; it reads the `multicore_data/<domain>.tsv` files that Section 1 writes.

In [None]:
from tqdm import tqdm
import pandas as pd
def get_cross_domain_frequencies(domains, target_dir):
    """Count, for every user, how many records they have in each domain.

    Reads `<target_dir>multicore_data/<domain>.tsv` for each entry of `domains`.

    Returns:
        {UserID: [count in domains[0], count in domains[1], ...]}; a domain in
        which the user has no record keeps the value 0.
    """
    user_frequencies = {}
    for domain_pos, domain in enumerate(domains):
        print(domain)
        records = pd.read_csv(target_dir + "multicore_data/" + domain + ".tsv", sep = '\t')
        per_user = records['UserID'].value_counts()
        user_ids, user_counts = list(per_user.index), list(per_user.values)
        for user_id, n_records in tqdm(zip(user_ids, user_counts)):
            # First time this user is seen: start with a zero row for all domains.
            if user_id not in user_frequencies:
                user_frequencies[user_id] = [0] * len(domains)
            user_frequencies[user_id][domain_pos] += n_records
    return user_frequencies

In [None]:
# {UserID: [record count in each domain]}, domains ordered as the keys of `n_cores`.
UF = get_cross_domain_frequencies(list(n_cores.keys()), target_path)

In [None]:
def get_confusion_matrix(user_freq, domains):
    """Build the record-count overlap matrix between domains.

    Cell [i][j] is the total number of domain-j records that belong to users
    who are active (count > 0) in both domain i and domain j.

    Args:
        user_freq: {user: [record count per domain]}.
        domains: list of domains; only its length is used.
    """
    size = len(domains)
    matrix = [[0] * size for _ in range(size)]
    for row in tqdm(range(size)):
        for col in range(size):
            matrix[row][col] += sum(
                counts[col]
                for counts in user_freq.values()
                if counts[row] > 0 and counts[col] > 0
            )
    return matrix

In [None]:
def get_cross_domain_matrix(user_freq, domains):
    """Build the shared-user count matrix between domains.

    Cell [i][j] is the number of users that are active (count > 0) in both
    domain i and domain j, so the diagonal holds each domain's user count.

    Args:
        user_freq: {user: [record count per domain]}.
        domains: list of domains; only its length is used.
    """
    size = len(domains)
    matrix = [[0] * size for _ in range(size)]
    for row in tqdm(range(size)):
        for col in range(size):
            matrix[row][col] += sum(
                1
                for counts in user_freq.values()
                if counts[row] > 0 and counts[col] > 0
            )
    return matrix

In [None]:
# Alternative: record-count overlap instead of shared-user counts.
# M = get_confusion_matrix(UF, list(n_cores.keys()))
# M[i][j] = number of users active in both domain i and domain j; the later
# cells read M[i][i] as the number of users of domain i.
M = get_cross_domain_matrix(UF, list(n_cores.keys()))

In [None]:
import matplotlib.pyplot as plt
import numpy as np
def plot_heatmap(x_labels, y_labels, matrix, title = "confusion matrix", include_diagonal = True):
    """Draw `matrix` as a heatmap coloured by the natural log of each cell.

    Every cell is annotated with its raw (non-log) value.

    Args:
        x_labels: column labels, one per matrix column.
        y_labels: row labels, one per matrix row.
        matrix: non-empty 2-D sequence of counts, shape (len(y_labels), len(x_labels)).
        title: figure title.
        include_diagonal: when False, cells with row index == column index are
            coloured as 0 instead of their log value.
    """
    assert len(matrix) == len(y_labels) and len(matrix[0]) == len(x_labels)
    fig, ax = plt.subplots(figsize = (len(x_labels),len(y_labels)))
    counts = np.asarray(matrix, dtype = float)
    # Take the log of positive cells only. Cells that are 0 stay NaN, which
    # avoids the divide-by-zero RuntimeWarning and the -inf that np.log(0) gives.
    log_counts = np.full(counts.shape, np.nan)
    positive = counts > 0
    log_counts[positive] = np.log(counts[positive])
    if not include_diagonal:
        row_idx, col_idx = np.indices(counts.shape)
        log_counts[row_idx == col_idx] = 0
    ax.imshow(log_counts)

    ax.set_xticks(np.arange(len(x_labels)))
    ax.set_yticks(np.arange(len(y_labels)))
    ax.set_xticklabels(x_labels)
    ax.set_yticklabels(y_labels)
    
    # Rotate the column labels so long domain names do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")
    
    # Write the raw value into every cell.
    for i in range(len(y_labels)):
        for j in range(len(x_labels)):
            ax.text(j, i, matrix[i][j], fontsize = 'small',
                    ha="center", va="center", color="w")
            
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

In [None]:
import matplotlib.pyplot as plt
# Use a larger font for all following figures.
plt.rcParams.update({'font.size': 16})
# Domain names in the same order as the rows/columns of M.
D = list(n_cores.keys())
plot_heatmap(D,D,M)

In [None]:
# For every domain i: the mean, over all other domains j, of M[i][j] / M[i][i],
# i.e. the share of domain i's users that are also active in domain j.
sharing_prob = []
for i in range(len(D)):
    n_domain_users = M[i][i]
    if n_domain_users == 0:
        # A domain without users has no overlap to measure; record 0 instead of
        # failing with ZeroDivisionError.
        sharing_prob.append(0.0)
        continue
    from_d_sharing = [float(M[i][j])/n_domain_users for j in range(len(D)) if i != j]
    sharing_prob.append(np.mean(from_d_sharing))
print("Average common user prob: ", np.mean(sharing_prob))
plt.figure(figsize = (8,6))
plt.barh(D,sharing_prob)
plt.show()

In [None]:
def get_domain_unique_prop(user_freq, domains):
    """Compute, per domain, the share of its users that appear in no other domain.

    A user belongs to a domain when their record count there is > 0, and is
    unique to it when that count equals the sum of their counts over all domains.

    Args:
        user_freq: {user: [record count per domain]}, aligned with `domains`.
        domains: list of domain names.

    Returns:
        {domain: unique users / all users of the domain}. A domain without any
        user gets 0.0 (the ratio is undefined there) instead of raising
        ZeroDivisionError.
    """
    unique_freq = {d: 0. for d in domains}
    domain_freq = {d: 0. for d in domains}
    for freq in tqdm(user_freq.values()):
        sum_freq = sum(freq)
        for i,domain in enumerate(domains):
            if freq[i] > 0:
                domain_freq[domain] += 1
                # All of the user's records are in this domain.
                if freq[i] == sum_freq:
                    unique_freq[domain] += 1
    return {d: unique_freq[d]/domain_freq[d] if domain_freq[d] else 0.0 for d in domains}

In [None]:
# {domain: share of the domain's users that are active in no other domain}
unique_prop = get_domain_unique_prop(UF, D)

In [None]:
import numpy as np
# Unique-user share per domain, in the same order as unique_prop's keys.
V = list(unique_prop.values())
print("Average unique user prob: ", np.mean(V))
# Horizontal bar chart: one bar per domain.
plt.figure(figsize = (8,6))
plt.barh(list(unique_prop.keys()),V)
plt.show()

### 3. Holdout Cross-domain Cold-start Users

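**Note**: Run the first cell and Section 2 before running this section: it uses `UF`, `M` and `D` defined there, and reads the `multicore_data/<domain>.tsv` files written by Section 1.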

In [None]:
holdout_prob = 0.1 # fraction of each domain's users that may be held out
min_holdout_freq = 40 # minimum number of records (summed over all domains) a user needs to be considered
max_holdout_bound = 0.2 # at most this fraction of a user's total records may be held out

# {domain: [UserID, ...]} users whose whole history in that domain will be held out.
holdout_candidates = {d: [] for d in D}
# Per-domain cap on the number of held-out users; M[i][i] is read as the
# number of users of domain i (true when M comes from get_cross_domain_matrix).
holdout_number = {d: holdout_prob * M[i][i] for i,d in enumerate(D)}
# Users are taken in the iteration order of UF until a domain's cap is reached;
# the selection is order-dependent, not randomly sampled.
for uid, freq in tqdm(UF.items()):
    # total number of records of this user over all domains
    total_freq = sum(freq)
    if total_freq < min_holdout_freq:
        continue
    # {domain_name: number of records of the user in that domain}, active domains only
    valid_domain_freq = {D[i]:f for i,f in enumerate(freq) if f > 0}
    # remaining budget of records that may still be held out for this user
    available_freq = max_holdout_bound * total_freq
    # Hold the user out of domain d when
    # * the user's records in d fit strictly inside the remaining budget, and
    # * the domain has not yet reached its cap (holdout_prob of its users).
    for d,f in valid_domain_freq.items():
        if f < available_freq and len(holdout_candidates[d]) < holdout_number[d]:
            holdout_candidates[d].append(uid)
            # the held-out records are charged against the user's budget
            available_freq -= f
print("#user candidates to holdout")
print({d: len(candidates) for d,candidates in holdout_candidates.items()})

In [None]:
'''
Cold holdout will split users by 9-1:
train: random 90% of user
test: random 10% of user
'''
from preprocess import move_user_data
def recheck_exist(trainset, testset, field_name):
    """Move test users that touch values of `field_name` unseen in train.

    Every user with at least one test row whose `field_name` value does not
    occur in `trainset` is handed to `move_user_data`, together with the row
    positions of each test user, to be moved from the test set to the train set.

    NOTE: Section 4 of this notebook imports a three-set `recheck_exist` from
    `preprocess`, which rebinds this name; re-run this cell before using the
    two-set version again.

    Returns:
        (trainset, testset) as returned by `move_user_data`.
    """
    print("Move unseen " + field_name + " from test to train, this may also move users in val to train")
    # values of the field that the train set already contains
    seen_values = set(trainset[field_name].unique())
    test_user_hist = {}   # {uid: [row positions in testset]}
    users_to_move = {}    # dict used as an ordered set of user ids
    for row_pos, (uid, value) in enumerate(zip(testset["UserID"], testset[field_name])):
        test_user_hist.setdefault(uid, list()).append(row_pos)
        if value not in seen_values:
            users_to_move[uid] = 1
    print("Test --> Train")
    trainset, testset = move_user_data(from_df = testset, to_df = trainset,
                                       moving_user = list(users_to_move.keys()),
                                       user_hist = test_user_hist, field_name = field_name)
    return trainset, testset
        
def holdout_cross_domain_cold_start(target_dir, holdout_candidates):
    """Split every domain into a local set and a cold-start test set.

    For each domain, all records of the users listed in `holdout_candidates`
    go to the cold-start test set and the rest stay local. The two-set
    `recheck_exist` then moves test users with items unseen locally back.

    NOTE: Section 4 of this notebook imports a different (three-set)
    `recheck_exist` from `preprocess`; re-run the cell defining the two-set
    version before calling this function again afterwards.

    Args:
        target_dir: data root, ending with a path separator. Reads
            `multicore_data/<domain>.tsv`; writes `tsv_data/<domain>_local.tsv`
            and `tsv_data/<domain>_test_cold.tsv` (folder must already exist).
        holdout_candidates: {domain: [UserID, ...]} users to hold out.
    """
    for domain,uids in holdout_candidates.items():
        # data
        df = pd.read_csv(target_dir + "multicore_data/" + domain + ".tsv", sep = '\t')
        # holdout cold-start
        print("Holdout cold-start user histories")
        # Select by the named column in one vectorised pass; this does not rely
        # on UserID being the first column of the file.
        test_indices = df["UserID"].isin(uids)
        testset = df[test_indices]
        trainset = df[~test_indices]
        # recheck exist
        trainset = trainset.reset_index(drop = True)
        testset = testset.reset_index(drop = True)
        trainset, testset = recheck_exist(trainset, testset, field_name = "ItemID")
        # save data
        trainset.to_csv(target_dir + "tsv_data/" + domain + "_local.tsv", sep = '\t', index = False)
        testset.to_csv(target_dir + "tsv_data/" + domain + "_test_cold.tsv", sep = '\t', index = False)
        

In [None]:
# Prepare the output folder, then write `<domain>_local.tsv` and
# `<domain>_test_cold.tsv` for every domain in `holdout_candidates`.
setup_path(target_path + "tsv_data/", is_dir = True)
holdout_cross_domain_cold_start(target_path, holdout_candidates)

### 4. Holdout Each Domain

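Split each domain's local data (`<domain>_local.tsv`) into train, validation and test sets, using either the `warm` or the `leave_one_out` holdout strategy.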

**Note**: Run the first cell and Section 3 before running this section; it reads the `tsv_data/<domain>_local.tsv` files that Section 3 writes.

In [None]:
from preprocess import holdout_data_sequential, recheck_exist
import pandas as pd

def holdout_single_domain_data(target_dir, domains, holdout_type = 'leave_one_out', min_hist_len = 10):
    """Split each domain's local data into train / validation / test files.

    For every domain this reads `tsv_data/<domain>_local.tsv`, sorts it by
    user and timestamp, splits it with `holdout_data_sequential`, passes the
    three sets through the three-set `recheck_exist` imported from
    `preprocess`, and writes `<domain>_train.tsv`, `<domain>_val.tsv` and
    `<domain>_test.tsv` next to the input. User, item and record counts of
    the three sets are printed.

    Args:
        target_dir: data root, ending with a path separator.
        domains: list of domain names to process.
        holdout_type: strategy forwarded to `holdout_data_sequential`
            (the notebook uses "warm" or "leave_one_out").
        min_hist_len: forwarded to `holdout_data_sequential`; previously
            hard-coded to 10, which remains the default.
    """
    for domain in domains:
        print(domain, ": ")
        # data
        df = pd.read_csv(target_dir + "tsv_data/" + domain + "_local.tsv", sep = '\t')
        # holdout: each user's records must be in chronological order first
        df = df.sort_values(by=['UserID','Timestamp'])
        trainset, valset, testset = holdout_data_sequential(df, holdout_type, min_hist_len = min_hist_len)
        trainset = trainset.reset_index(drop = True)
        valset = valset.reset_index(drop = True)
        testset = testset.reset_index(drop = True)
        trainset, valset, testset = recheck_exist(trainset, valset, testset, field_name = "ItemID")
        # save
        trainset.to_csv(target_dir + "tsv_data/" + domain + "_train.tsv", sep = '\t', index = False)
        valset.to_csv(target_dir + "tsv_data/" + domain + "_val.tsv", sep = '\t', index = False)
        testset.to_csv(target_dir + "tsv_data/" + domain + "_test.tsv", sep = '\t', index = False)
        # statistics are printed in the order train, val, test
        print("#User: ", len(trainset["UserID"].unique()), len(valset["UserID"].unique()), len(testset["UserID"].unique()))
        print("#Item: ", len(trainset["ItemID"].unique()), len(valset["ItemID"].unique()), len(testset["ItemID"].unique()))
        print("#Record: ", len(trainset), len(valset), len(testset))

In [None]:
# Holdout strategy for the per-domain split; supported values: "warm", "leave_one_out".
holdout_type = "leave_one_out" # "warm" "leave_one_out"
# Writes <domain>_train.tsv / _val.tsv / _test.tsv for every domain in `n_cores`.
holdout_single_domain_data(target_path, list(n_cores.keys()), holdout_type)