In [None]:
import numpy as np
import pandas as pd
from apyori import apriori
from itertools import product
from collections import defaultdict

In [None]:
# Directory holding the pre-processed .npy files for this dataset.
data_dir = 'data/food/'
# Each file is read with allow_pickle=True and unwrapped with .item(), i.e. it
# stores a single pickled Python object; the cells below use all three as dicts.
# NOTE(review): allow_pickle=True unpickles arbitrary objects and can execute
# code embedded in the file -- only load files from a trusted source.
training_data = np.load(f'{data_dir}train.npy', allow_pickle=True).item()
# Swap the next two lines to evaluate on the validation split instead of test.
# test_data = np.load(f'{data_dir}validate.npy', allow_pickle=True).item()
test_data = np.load(f'{data_dir}test.npy', allow_pickle=True).item()
# Iterated below as {bundle: aspects} pairs.
bundle_aspects = np.load(f'{data_dir}bundle_aspects.npy', allow_pickle=True).item()

In [None]:
# Number of users (top-level keys) in the training split.
len(training_data)

In [None]:
# Keep only users with at least two training bundles. The values are the same
# list objects as in training_data (no copy is made).
filter_data = {
    user: bundles
    for user, bundles in training_data.items()
    if len(bundles) >= 2
}

In [None]:
# Number of users that survived the "more than one bundle" filter.
len(filter_data)

In [None]:
# Remove placeholder bundles whose only aspect is the empty string.
#
# The previous version called bundles.remove(['']) while looping over the same
# list. Removing during iteration makes the loop skip the element that follows
# each removal, so two consecutive [''] entries left one behind; it could also
# raise ValueError when a bundle merely contained '' next to other aspects.
# Here each list is filtered in one pass and written back with slice
# assignment, so the list object itself is updated in place (it is shared with
# training_data, exactly as with the original in-place remove()).
#
# Note: a user can end up with fewer than two bundles after this step.
for user, bundles in filter_data.items():
    kept = [b for b in bundles if b != ['']]
    if len(kept) != len(bundles):
        print(user)
        bundles[:] = kept

In [None]:
# Histogram: number of bundles per user -> number of users with that count.
statistic_bundles = defaultdict(int)
for n_bundles in map(len, filter_data.values()):
    statistic_bundles[n_bundles] += 1

In [None]:
# Build candidate aspect patterns: for every user, take one aspect from each of
# their bundles (Cartesian product across the bundles).
#
# product(*bundles) replaces the four hand-written branches for 2, 3, 4 and 5
# bundles. The same range is kept on purpose so results are unchanged: users
# with fewer than 2 or more than 5 bundles contribute no patterns.
all_patterns = []
for bundles in filter_data.values():
    if 2 <= len(bundles) <= 5:
        all_patterns.extend(product(*bundles))

In [None]:
# Turn each pattern tuple into a list and keep only those with at least two
# items, then report how many remain.
filter_patterns = [list(pattern) for pattern in all_patterns if len(pattern) > 1]
print(len(filter_patterns))

In [None]:
# Run apriori over the aspect patterns with the support / confidence / lift
# thresholds given below.
association_rules = apriori(filter_patterns, min_support=0.0001, min_confidence=0.1, min_lift=3)
# Materialise the result into a list so it can be counted here and iterated
# again in the next cell.
association_results = list(association_rules)
print(len(association_results))

In [None]:
# Group the mined itemsets by their smallest item: head -> list of "tails",
# where each tail is the remaining items of one itemset in sorted order.
# NOTE(review): only the smallest item of each itemset becomes a key, so a
# lookup by any other member of the same itemset will not find it -- confirm
# this one-directional keying is intended.
lookup_table = defaultdict(list)
for record in association_results:
    # Index 0 of a result record is used as the collection of associated items.
    itemset = record[0]
    if len(itemset) == 1:
        # A single item carries no association.
        continue
    ordered = sorted(itemset)
    lookup_table[ordered[0]].append(ordered[1:])

In [None]:
# Flatten every head's list of tails into one flat list of associated aspects.
# `rules` starts as a shallow copy of lookup_table so the grouped form is left
# untouched; duplicates across tails are kept.
rules = lookup_table.copy()
for head, tail_groups in lookup_table.items():
    rules[head] = [aspect for group in tail_groups for aspect in group]

In [None]:
# Score candidate bundles for every test user.
#
# For each aspect of each training bundle of the user that has mined rules,
# every catalogue bundle is checked for aspects that appear in those rules.
# A bundle's score is the largest number of matched aspects (plus the
# triggering aspect) seen over all triggers; the matched lists themselves are
# kept in user_rec_rules as the supporting evidence.
#
# NOTE(review): a catalogue bundle is skipped only when its aspects equal the
# training bundle currently being processed, not any of the user's training
# bundles. The popularity fallback for users without any matching rule
# (planned as "step 3") is not implemented in this cell.
user_rec_candidate = defaultdict(dict)
user_rec_rules = defaultdict(dict)

for user in test_data:
    for seen_aspects in training_data[user]:
        for aspect in seen_aspects:
            if aspect not in rules:
                continue
            partial_rules = rules[aspect]
            for bundle, b_aspects in bundle_aspects.items():
                if seen_aspects == b_aspects:
                    continue
                matched = [a for a in b_aspects if a in partial_rules]
                if not matched:
                    continue
                matched.append(aspect)
                scores = user_rec_candidate[user]
                evidence = user_rec_rules[user]
                if bundle in scores:
                    evidence[bundle].append(matched)
                    scores[bundle] = max(scores[bundle], len(matched))
                else:
                    scores[bundle] = len(matched)
                    evidence[bundle] = [matched]

In [None]:
# Sanity check: users with at least one candidate, users in the test split,
# and bundles in the catalogue.
len(user_rec_candidate), len(test_data), len(bundle_aspects)

In [None]:
# Rank each user's candidates by (score, bundle) in descending order and keep
# the first ten bundles. Despite its name, `top20` therefore holds at most ten
# recommendations per user.
top20 = {}
count = 0  # not used in the visible cells
for user, scores in user_rec_candidate.items():
    ranked = sorted(scores.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
    top20[user] = [bundle for bundle, _ in ranked[:10]]

In [None]:
# Number of users that received a recommendation list.
len(top20)

In [None]:
# Turn every recommendation list into a 0/1 hit list against the user's test
# items, then report HR@k and NDCG@k for k in {1, 5, 10}.
#
# NOTE: hr_at_k and ndcg_at_k are defined in a later cell of this notebook;
# that cell has to be executed before this one on a fresh kernel.
# Only users present in `top20` are evaluated.
print('recommending')
u_prediction = {
    u: [int(i in test_data[u]) for i in recs]
    for u, recs in top20.items()
}
print('evaluating')
for k in [1, 5, 10]:
    tmp_preds = {key: rank_list[:k] for key, rank_list in u_prediction.items()}

    ndcg_k = np.mean([ndcg_at_k(r, k) for r in tmp_preds.values()])
    hr_k = hr_at_k(tmp_preds, test_data)

    print(f'HR@{k}: {hr_k:.4f}')
    print(f'NDCG@{k}: {ndcg_k:.4f}')

In [None]:
def hr_at_k(rs, test_ur):
    """
    Hit Ratio calculation method
    Parameters
    ----------
    rs : Dict, {user : hit list} where each hit list holds the relevance
         (non-zero = hit) of that user's ranked recommendations
    test_ur : (Deprecated) Dict, {user : items} for test set ground truth;
              accepted for backward compatibility but not used

    Returns
    -------
    hr : float, fraction of users in `rs` whose hit list sums to a non-zero
         value; 0. when `rs` is empty
    """
    # Guard the division below: with no users there is nothing to average.
    if not rs:
        return 0.

    # A user counts as a hit when the relevance values do not sum to zero.
    uhr = sum(1 for r in rs.values() if np.sum(r) != 0)
    hr = uhr / len(rs)

    return hr

def dcg_at_k(r, k):
    """
    Discounted Cumulative Gain calculation method
    Parameters
    ----------
    r : List, Relevance scores (list or numpy) in rank order
                (first element is the first item). Scores are binarised:
                any non-zero value counts as a gain of 1.
    k : int, top-K number
    Returns
    -------
    dcg : float, DCG value (0. when `r` is empty)
    """
    assert k >= 1
    # np.asfarray was removed in NumPy 2.0; np.asarray(..., dtype=float) is the
    # documented replacement and behaves the same on older releases.
    hits = np.asarray(r, dtype=float)[:k] != 0
    if hits.size:
        # Work on floats so the power below does not depend on how a boolean
        # array is promoted.
        gains = hits.astype(float)
        # Position i (0-based) is discounted by log2(i + 2).
        discounts = np.log2(np.arange(2, gains.size + 2))
        dcg = np.sum((np.power(2.0, gains) - 1.0) / discounts)
        return dcg
    return 0.

def ndcg_at_k(r, k):
    """
    Normalized Discounted Cumulative Gain calculation method
    Parameters
    ----------
    r : List, Relevance scores (list or numpy) in rank order
            (first element is the first item)
    k : int, top-K number
    Returns
    -------
    ndcg : float, DCG of `r` divided by the DCG of `r` sorted in descending
           order; 0. when that ideal DCG is zero
    """
    assert k >= 1
    # Ideal ordering: the same scores with the highest ones ranked first.
    ideal = dcg_at_k(sorted(r, reverse=True), k)
    return dcg_at_k(r, k) / ideal if ideal else 0.