In [None]:
import pickle
import pandas as pd
import io

In [None]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Average precision at k (AP@k) for one ranked prediction list.

    Parameters
    ----------
    actual : list
        The relevant items; their order is irrelevant.
    predicted : list
        Ranked predictions, best first. Only the first ``k`` are scored.
    k : int, optional
        Cut-off rank.

    Returns
    -------
    score : float
        Sum of precision@rank over every rank that holds a relevant item
        not already seen earlier in the list, divided by
        ``min(len(actual), k)``. ``0.0`` when ``actual`` is empty.
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hit_count = 0.0
    precision_total = 0.0
    for rank, item in enumerate(top_k, start=1):
        # A repeated prediction is only credited the first time it appears.
        if item in actual and item not in top_k[:rank - 1]:
            hit_count += 1.0
            precision_total += hit_count / rank

    if not actual:
        return 0.0

    return precision_total / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Mean average precision at k (MAP@k) over many prediction lists.

    Parameters
    ----------
    actual : list
        One list of relevant items per query (order inside each list is
        irrelevant).
    predicted : list
        One ranked list of predictions per query, paired positionally with
        ``actual``.
    k : int, optional
        Cut-off rank forwarded to ``apk``.

    Returns
    -------
    score : float
        ``np.mean`` of the per-query ``apk`` scores.
    """
    per_query_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_query_scores.append(apk(relevant_items, ranked_items, k))
    return np.mean(per_query_scores)

In [None]:
def _ark(actual: list, predicted: list, k=10) -> int:
    """
    Computes the average recall at k.
    Parameters
    ----------
    actual : list
        A list of actual items to be predicted
    predicted : list
        An ordered list of predicted items
    k : int, default = 10
        Number of predictions to consider
    Returns:
    -------
    score : int
        The average recall at k.
    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / len(actual)

def mark(actual: list, predicted: list, k=10):
    """
    Mean average recall at k (MAR@k) over many prediction lists.

    Parameters
    ----------
    actual : a list of lists
        Actual items to be predicted, one list per user
        example: [['A', 'B', 'X'], ['A', 'B', 'Y']]
    predicted : a list of lists
        Ordered predictions, paired positionally with ``actual``
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    k : int, default = 10
        Number of predictions to consider, forwarded to ``_ark``

    Returns
    -------
    mark : float
        ``np.mean`` of the per-user ``_ark`` scores.
    """
    per_user_scores = []
    for relevant_items, ranked_items in zip(actual, predicted):
        per_user_scores.append(_ark(relevant_items, ranked_items, k))
    return np.mean(per_user_scores)

In [None]:
def novelty(predicted: list, pop: dict, u: int, n: int):
    """
    Computes the novelty for a list of recommendations.

    Parameters
    ----------
    predicted : a list of lists
        Ordered predictions
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    pop : dictionary
        A dictionary of all items alongside their occurrence counts in the
        training data
        example: {1198: 893, 1270: 876, 593: 876, 2762: 867}
    u : integer
        The number of users in the training data
    n : integer
        The length of recommended lists per user

    Returns
    -------
    novelty : float
        Mean over all recommendation lists of
        ``sum(-log2(pop[item] / u)) / n``. Items whose count is zero, or
        that are missing from ``pop``, contribute nothing. ``0.0`` when
        ``predicted`` is empty.

    Notes
    -----
    ``-log2(pop[item] / u)`` is negative whenever ``pop[item] > u``, so
    passing a ``u`` smaller than the item counts yields a negative score.

    Metric Definition:
    Zhou, T., Kuscsik, Z., Liu, J. G., Medo, M., Wakeling, J. R., & Zhang, Y. C. (2010).
    Solving the apparent diversity-accuracy dilemma of recommender systems.
    Proceedings of the National Academy of Sciences, 107(10), 4511-4515.
    """
    mean_self_information = []
    for recommendations in predicted:
        self_information = 0
        for item in recommendations:
            # Unseen items are treated like zero-count items: log2(0) is
            # undefined, so they are skipped rather than scored.
            count = pop.get(item, 0)
            if count != 0:
                self_information += -np.log2(count / u)
        mean_self_information.append(self_information / n)

    # Nothing to average: avoid dividing by zero.
    if not mean_self_information:
        return 0.0

    return sum(mean_self_information) / len(mean_self_information)

In [None]:
def prediction_coverage(predicted: list, catalog: list):
    """
    Share of the catalog that appears in at least one recommendation list.

    Parameters
    ----------
    predicted : a list of lists
        Ordered predictions
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    catalog : list
        A list of all unique items in the training data
        example: ['A', 'B', 'C', 'X', 'Y', 'Z']

    Returns
    -------
    prediction_coverage : float
        ``distinct predicted items / len(catalog)`` as a percentage,
        rounded to 2 decimal places.

    Metric Definition:
    Ge, M., Delgado-Battenfeld, C., & Jannach, D. (2010, September).
    Beyond accuracy: evaluating recommender systems by coverage and serendipity.
    In Proceedings of the fourth ACM conference on Recommender systems (pp. 257-260). ACM.
    """
    distinct_items = {item for recommendations in predicted for item in recommendations}
    covered_fraction = len(distinct_items) / float(len(catalog))
    return round(covered_fraction * 100, 2)

In [None]:
def catalog_coverage(predicted: list, catalog: list, k: int) -> float:
    """
    Catalog coverage measured on a random sample of k recommendation lists.

    Parameters
    ----------
    predicted : a list of lists
        Ordered predictions
        example: [['X', 'Y', 'Z'], ['X', 'Y', 'Z']]
    catalog : list
        A list of all unique items in the training data
        example: ['A', 'B', 'C', 'X', 'Y', 'Z']
    k : integer
        The number of recommendation lists to sample. Sampling uses
        ``random.choices``, i.e. with replacement, so the same list can be
        drawn more than once.

    Returns
    -------
    catalog_coverage : float
        ``distinct items in the sampled lists / len(catalog)`` as a
        percentage, rounded to 2 decimal places.

    Metric Definition:
    Ge, M., Delgado-Battenfeld, C., & Jannach, D. (2010, September).
    Beyond accuracy: evaluating recommender systems by coverage and serendipity.
    In Proceedings of the fourth ACM conference on Recommender systems (pp. 257-260). ACM.
    """
    sampled_lists = random.choices(predicted, k=k)

    observed_items = set()
    for recommendations in sampled_lists:
        observed_items.update(recommendations)

    covered_fraction = len(observed_items) / float(len(catalog))
    return round(covered_fraction * 100, 2)

In [None]:
# Load the ground-truth and predicted recommendation lists.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# Context managers make sure the file handles are closed after reading.
with open('./tmp/Challenge_Dataset/real_list.dat', 'rb') as real_file:
    real_list = pickle.load(real_file)
with open('./tmp/Challenge_Dataset/pred_list.dat', 'rb') as pred_file:
    pred_list = pickle.load(pred_file)

In [None]:
# MAP@10 of the predicted lists against the ground-truth lists.
mapk(real_list, pred_list, k=10)

0.10686607242162798

In [None]:
# MAR@10 of the predicted lists against the ground-truth lists.
mark(real_list, pred_list, k=10)

0.10686607242162798

In [None]:
# Article table; later cells read its tcm_id, topic, vstopic and product columns.
articles = pd.read_csv('data/articles.csv')

In [None]:
# Training interactions; later cells read its tcm_id and ip columns.
df = pd.read_csv('data/train.csv')

In [None]:
# Distinct article identifiers taken from the articles table.
article_id=articles['tcm_id'].unique()

In [None]:
def index_set(s):
    """
    Map every element of ``s`` to its position, stored as a string.

    Parameters
    ----------
    s : iterable
        Hashable keys to index.

    Returns
    -------
    dict
        ``{key: str(position)}``. If a key occurs more than once, the
        position of its last occurrence wins.
    """
    return {key: str(position) for position, key in enumerate(s)}

In [None]:
# Map each article identifier to its position in article_id (index_set stores it as a string).
article_map = index_set(article_id)

In [None]:
# Number of distinct articles that received an index.
len(article_map)

71

In [None]:
# Attach the integer article index to every training row
# (article_map stores indices as strings, hence the int()).
idx = [int(article_map[tcm_id]) for tcm_id in df['tcm_id']]
df['article_idx'] = idx

In [None]:
# Popularity table: how often each article index occurs in the training data.
ids = df.article_idx.value_counts()
print(len(ids))
pop = dict(ids)
# Articles that never occur in the training data get an explicit zero count.
# The range is derived from article_map instead of a hard-coded catalog size,
# so it stays correct if the articles table changes.
for article_idx in range(len(article_map)):
    pop.setdefault(article_idx, 0)

66


In [None]:
# `u` is the number of users in the training data (see the novelty docstring),
# not the number of articles; users are identified by the `ip` column.
novelty(pred_list, pop, df['ip'].nunique(), 10)

-3.316021570538282

In [None]:
# Catalog = distinct article indices that occur in the training interactions.
catalog = df['article_idx'].unique().tolist()

In [None]:
# Percentage of the training catalog that shows up in any recommendation list.
prediction_coverage(predicted=pred_list, catalog=catalog)

93.94

In [None]:
import random

# catalog_coverage samples recommendation lists at random; seed the generator
# so the reported coverage is reproducible across runs.
random.seed(42)
catalog_coverage(pred_list, catalog, 10)

57.58

### Convert user and article indices back to real user and article IDs

In [None]:
import pickle

# NOTE: pickle.load can execute arbitrary code; only load files you trust.
# The context manager makes sure the file handle is closed after reading.
with open('./tmp/Challenge_Dataset/best_pred_path.dat', 'rb') as path_file:
    best_pred_path = pickle.load(path_file)

In [None]:
def index_set(s):
    """
    Map every element of ``s`` to its position, stored as a string.

    Parameters
    ----------
    s : iterable
        Hashable keys to index.

    Returns
    -------
    dict
        ``{key: str(position)}``. If a key occurs more than once, the
        position of its last occurrence wins.
    """
    # NOTE(review): this duplicates the index_set defined earlier in this
    # notebook (same behavior); consider keeping a single definition.
    return {key: str(position) for position, key in enumerate(s)}

In [None]:
def _unique_labels(column):
    """
    Collect the distinct labels found in a column of list-like strings.

    Each value is stripped of surrounding '[' / ']' characters and
    whitespace, split on ',', and every piece is stripped again and has its
    single quotes removed. Labels are returned in first-seen order.
    A value that is empty after stripping (e.g. '[]') contributes an
    empty-string label.
    """
    labels = []
    seen = set()  # O(1) membership test; `labels` keeps the order
    for raw_value in column:
        for piece in raw_value.strip('[').strip(']').strip().split(','):
            label = piece.strip().replace("'", '')
            if label not in seen:
                seen.add(label)
                labels.append(label)
    return labels


# Distinct users (identified by ip) and distinct articles.
user_id = df['ip'].unique()
article_id = articles['tcm_id'].unique()

# Distinct topics, topic tags and products mentioned in the articles table.
topics = articles['topic'].dropna()
topic_list = _unique_labels(topics)

topic_tags = articles['vstopic'].dropna()
topic_tag_list = _unique_labels(topic_tags)

products = articles['product'].dropna()
product_list = _unique_labels(products)

In [None]:
# Forward maps: original identifier -> position (as a string), one per entity type.
user_map, article_map, topic_map, tag_map, product_map = (
    index_set(values)
    for values in (user_id, article_id, topic_list, topic_tag_list, product_list)
)

In [None]:
# Inverse maps: position (as a string) -> original identifier.
(user_map_trans,
 article_map_trans,
 product_map_trans,
 topic_map_trans,
 tag_map_trans) = (
    {index: original for original, index in forward_map.items()}
    for forward_map in (user_map, article_map, product_map, topic_map, tag_map)
)

In [None]:
# Translate the indices stored in every predicted path back to the original
# user / article / product / topic / tag identifiers.
#
# NOTE: the node lists inside best_pred_path are updated in place, so running
# this cell twice without reloading best_pred_path would try to translate
# already-translated identifiers (KeyError, or a wrong lookup if an identifier
# happens to look like an index).
index_to_id = {
    'user': user_map_trans,
    'article': article_map_trans,
    'product': product_map_trans,
    'topic': topic_map_trans,
    'tag': tag_map_trans,
}

best_pred_path_trans = {}
for user_idx, user_paths in best_pred_path.items():
    for entry in user_paths:
        # The last element of each entry is the mutable list of path nodes.
        # NOTE(review): assumes each node is a sequence whose item 1 is the
        # entity type and item 2 the entity index - confirm against the
        # producer of best_pred_path.dat.
        path_nodes = entry[len(entry) - 1]
        for position, node in enumerate(path_nodes):
            lookup = index_to_id.get(node[1])
            if lookup is None:
                continue  # unknown entity type: leave the node untouched
            translated = list(node)
            translated[2] = lookup[str(node[2])]
            # Every translated node is stored as a tuple, whatever its type.
            path_nodes[position] = tuple(translated)
    best_pred_path_trans[user_map_trans[str(user_idx)]] = user_paths

In [None]:
# Show only a few users: the full mapping is too large to render usefully.
dict(list(best_pred_path_trans.items())[:3])

Output hidden; open in https://colab.research.google.com to view.