In [1]:
library(tidyverse)

Loading tidyverse: ggplot2
Loading tidyverse: tibble
Loading tidyverse: tidyr
Loading tidyverse: readr
Loading tidyverse: purrr
Loading tidyverse: dplyr
Conflicts with tidy packages ---------------------------------------------------
filter(): dplyr, stats
lag():    dplyr, stats


In [2]:
# Randomly split the rows of `ratings` into `nparts` near-equal partitions.
#
# Args:
#   ratings: a data frame; only its number of rows is used.
#   nparts:  number of partitions, a single whole number >= 1.
#
# Returns:
#   A tibble with one row per row of `ratings` and two columns:
#   `part` (partition id in 1..nparts) and `rowNumber` (a row index into
#   `ratings`). Partition sizes differ by at most one; the first
#   `nrow(ratings) %% nparts` partitions receive the extra row. Partitions
#   may be empty when there are fewer rows than partitions.
partition_ratings <- function(ratings, nparts = 5) {
  stopifnot(length(nparts) == 1, nparts >= 1)
  n_rows <- nrow(ratings)
  # sample.int() instead of sample(): sample(0) returns 0, whereas
  # sample.int(0) correctly returns an empty permutation.
  shuffled <- sample.int(n_rows)
  # Every part gets the integer quotient; the first `extra` parts get one more.
  base_size <- n_rows %/% nparts
  extra <- n_rows %% nparts
  part_sizes <- base_size + (seq_len(nparts) <= extra)
  # Label the shuffled rows directly. A zero-sized part simply contributes no
  # rows, which avoids the backwards range that `start:end` yields when
  # end < start.
  tibble(
    part = rep(seq_len(nparts), times = part_sizes),
    rowNumber = shuffled
  )
}

In [3]:
# Assign users to `nparts` random partitions and hold out test rows per user.
#
# Args:
#   ratings: a data frame whose first two columns are the user id and the
#            item id (they are renamed to `userId` / `itemId` internally);
#            any further columns are left untouched.
#   nparts:  number of user partitions, a single whole number >= 1.
#   holdout: maximum number of randomly chosen rows kept per user.
#
# Returns:
#   A tibble with columns `part` (partition of the row's user) and
#   `rowNumber` (row index into `ratings`) for every held-out row.
partition_users <- function(ratings, nparts = 5, holdout = 5) {
  stopifnot(ncol(ratings) >= 2, length(nparts) == 1, nparts >= 1)
  # Rename positionally, but only the two columns that are actually used.
  colnames(ratings)[1:2] <- c("userId", "itemId")
  # For each user, keep up to `holdout` rows picked uniformly at random.
  test_set <- ratings %>%
    mutate(rowNumber = row_number()) %>%
    group_by(userId) %>%
    mutate(index = sample.int(n())) %>%
    filter(index <= holdout) %>%
    ungroup() %>%
    select(-index)
  users <- unique(ratings$userId)
  # Shuffle by index: sample(x) on a single numeric id k would return a
  # permutation of 1:k instead of the id itself.
  users <- users[sample.int(length(users))]
  n_users <- length(users)
  # Every part gets the integer quotient; the first `extra` parts get one more.
  base_size <- n_users %/% nparts
  extra <- n_users %% nparts
  part_sizes <- base_size + (seq_len(nparts) <= extra)
  # Zero-sized parts contribute no users (no backwards `start:end` ranges).
  user_partitions <- tibble(
    part = rep(seq_len(nparts), times = part_sizes),
    userId = users
  )
  # Attach each held-out row to its user's partition.
  user_partitions %>%
    inner_join(test_set, by = "userId") %>%
    select(part, rowNumber)
}

In [4]:
# "Oracle" recommender: ranks each user's candidates using the ground truth.
#
# Args:
#   candidates:  data frame of candidate rows; must contain `userId` and
#                share its join columns with `groundtruth`.
#   groundtruth: data frame of known positives.
#   topN:        number of recommendations kept per user.
#
# Returns:
#   The candidate rows with added `score` (1 when the row matches
#   `groundtruth`, 0 otherwise) and `rank` columns, limited to
#   `rank <= topN` and ordered by `userId`, then `rank`.
recommend_oracle <- function(candidates, groundtruth, topN) {
  # Tag every ground-truth row with a perfect prediction score.
  positives <- mutate(groundtruth, score = 1)
  scored <- candidates %>%
    left_join(positives) %>%
    # Candidates without a ground-truth match get score 0.
    mutate(score = replace(score, is.na(score), 0))
  scored %>%
    group_by(userId) %>%
    # Highest score first; ties keep their order of appearance.
    mutate(rank = row_number(desc(score))) %>%
    ungroup() %>%
    filter(rank <= topN) %>%
    arrange(userId, rank)
}

In [5]:
# Popularity recommender: ranks each user's candidates by how often the item
# occurs in `observation`.
#
# Args:
#   candidates:  data frame with at least `userId` and `itemId`.
#   observation: data frame with an `itemId` column; one row per interaction.
#   topN:        number of recommendations kept per user.
#
# Returns:
#   The candidate rows with added `score` (item count divided by the largest
#   item count; 0 for items that never occur in `observation`) and `rank`
#   columns, limited to `rank <= topN` and ordered by `userId`, then `rank`.
recommend_popular <- function(candidates, observation, topN) {
  # Popularity score per item, normalised to (0, 1].
  popular_score <- observation %>%
    group_by(itemId) %>%
    summarize(n = n()) %>%
    mutate(score = n / max(n)) %>%
    select(-n)
  candidates %>%
    left_join(popular_score, by = "itemId") %>%
    # An unobserved item has count 0. Without this its score would be NA, its
    # rank NA, and the final filter would silently drop it, leaving the user
    # with fewer than topN recommendations.
    mutate(score = replace(score, is.na(score), 0)) %>%
    group_by(userId) %>%
    mutate(rank = rank(-score, ties.method = "first")) %>%
    ungroup() %>%
    arrange(userId, rank) %>%
    filter(rank <= topN)
}

In [6]:
# Random recommender: ranks each user's candidates by a uniform random score.
#
# Args:
#   candidates: data frame of candidate rows with a `userId` column.
#   topN:       number of recommendations kept per user.
#
# Returns:
#   The candidate rows with added `score` (a runif() draw) and `rank`
#   columns, limited to `rank <= topN` and ordered by `userId`, then `rank`.
recommend_random <- function(candidates, topN) {
  ranked <- candidates %>%
    group_by(userId) %>%
    # One independent uniform draw per candidate row ...
    mutate(score = runif(n())) %>%
    # ... then rank within the user, largest score first.
    mutate(rank = row_number(desc(score))) %>%
    ungroup()
  ranked %>%
    filter(rank <= topN) %>%
    arrange(userId, rank)
}

"All items excluding train.items" means all of a user's unrated items. The selected test items are also treated as unrated.

In [32]:
# Normalised discounted cumulative gain (NDCG@topN) per user.
#
# Args:
#   recommendations: data frame with columns userId, itemId, score, rank.
#   ideal:           data frame with columns userId, itemId, rel
#                    (all rated items of each user and their relevance).
#   topN:            cut-off rank.
#
# Returns:
#   A tibble with columns `userId` and `ndcg`, for users that appear in both
#   `recommendations` (within the cut-off) and `ideal`.
#
# The gain at rank 1 is `rel`; at rank r > 1 it is `rel / log2(r)`.
compute_ndcg <- function(recommendations, ideal, topN) {
  # Only the columns in the documented schema take part, so the join keys
  # stay explicit and no stray column of `ideal` can clash with `rank`.
  ideal <- ideal %>% select(userId, itemId, rel)
  dcg <- recommendations %>%
    # Truncate at the same cut-off as the ideal ranking; otherwise DCG sums
    # more positions than IDCG and the ratio can exceed 1.
    filter(rank <= topN) %>%
    left_join(ideal, by = c("userId", "itemId")) %>%
    # Recommended items without a relevance judgement contribute no gain.
    mutate(rel = ifelse(is.na(rel), 0, rel),
           dg = ifelse(rank == 1, rel, rel / log2(rank))) %>%
    group_by(userId) %>%
    summarize(dcg = sum(dg)) %>%
    ungroup()
  idcg <- ideal %>%
    # The ideal ranking lists each user's items by decreasing relevance,
    # regardless of the row order in which `ideal` was supplied.
    arrange(userId, desc(rel)) %>%
    group_by(userId) %>%
    mutate(rank = row_number()) %>%
    ungroup() %>%
    filter(rank <= topN) %>%
    group_by(userId) %>%
    mutate(idg = ifelse(rank == 1, rel, rel / log2(rank))) %>%
    summarize(idcg = sum(idg)) %>%
    ungroup()

  dcg %>%
    inner_join(idcg, by = "userId") %>%
    mutate(ndcg = dcg / idcg) %>%
    select(userId, ndcg)
}

In [None]:
# Precision@topN per user: the mean of `rel` over the user's recommendations
# with rank <= topN, counting items absent from `ideal` as 0.
#
# Args:
#   recommendations: data frame with `userId`, `rank` and the columns it
#                    shares with `ideal`.
#   ideal:           data frame carrying a `rel` column.
#   topN:            cut-off rank.
#
# Returns:
#   A tibble with columns `userId` and `precision`.
compute_precision <- function(recommendations, ideal, topN) {
  shortlist <- filter(recommendations, rank <= topN)
  judged <- left_join(shortlist, ideal)
  judged %>%
    # Unjudged recommendations count as not relevant.
    mutate(rel = replace(rel, is.na(rel), 0)) %>%
    group_by(userId) %>%
    summarize(precision = mean(rel)) %>%
    ungroup()
}

In [None]:
# Recall@topN per user: summed `rel` of the user's recommendations with
# rank <= topN, divided by the user's number of rows in `ideal`.
#
# Args:
#   recommendations: data frame with `userId`, `rank` and the columns it
#                    shares with `ideal`.
#   ideal:           data frame with `userId` and `rel`.
#   topN:            cut-off rank.
#
# Returns:
#   A tibble with columns `userId` and `recall`, restricted to users that
#   occur in both inputs.
compute_recall <- function(recommendations, ideal, topN) {
  shortlist <- filter(recommendations, rank <= topN)
  # Relevance mass retrieved per user; unmatched rows (NA rel) are ignored.
  hits <- shortlist %>%
    left_join(ideal) %>%
    group_by(userId) %>%
    summarize(tp = sum(rel, na.rm = TRUE)) %>%
    ungroup()
  # Number of ideal rows per user, used as the denominator.
  per_user_total <- ideal %>%
    group_by(userId) %>%
    summarize(relCount = n()) %>%
    ungroup()
  hits %>%
    inner_join(per_user_total) %>%
    mutate(recall = tp / relCount) %>%
    select(userId, recall)
}

In [None]:
# Reciprocal rank per user: 1 / (best rank, within topN, of a recommended
# item that has a non-missing `rel` in `ideal`).
#
# Args:
#   recommendations: data frame with `userId`, `rank` and the columns it
#                    shares with `ideal`.
#   ideal:           data frame carrying a `rel` column.
#   topN:            cut-off rank.
#
# Returns:
#   A tibble with one row per distinct `userId` in `recommendations` and a
#   `reciprocal.rank` column, which is NA for users without any match.
compute_reciprocal_rank <- function(recommendations, ideal, topN) {
  matched <- recommendations %>%
    filter(rank <= topN) %>%
    left_join(ideal) %>%
    # Keep matched rows only; users without a match disappear here ...
    filter(!is.na(rel))
  rr_by_user <- matched %>%
    group_by(userId) %>%
    summarize(rankFirstRel = min(rank)) %>%
    ungroup() %>%
    mutate(reciprocal.rank = 1 / rankFirstRel) %>%
    select(userId, reciprocal.rank)

  # ... and are restored (with NA) by joining onto the full user list.
  all_users <- distinct(select(recommendations, userId))
  left_join(all_users, rr_by_user)
}

In [None]:
# Average precision per user over the top `topN` recommendations: the mean,
# taken over rows with non-zero `rel`, of cumsum(rel) / rank at that row.
#
# Args:
#   recommendations: data frame with `userId`, `rank` and the columns it
#                    shares with `ideal`.
#   ideal:           data frame carrying a `rel` column.
#   topN:            cut-off rank.
#
# Returns:
#   A tibble with one row per distinct `userId` in `recommendations` and an
#   `avg.precision` column, which is NA for users without a non-zero `rel`.
compute_average_precision <- function(recommendations, ideal, topN) {
  judged <- recommendations %>%
    filter(rank <= topN) %>%
    left_join(ideal) %>%
    # Unjudged recommendations count as not relevant.
    mutate(rel = replace(rel, is.na(rel), 0))
  precision_rows <- judged %>%
    # Order by rank so the running sum below follows the ranking.
    arrange(rank) %>%
    group_by(userId) %>%
    mutate(precisionAtRank = cumsum(rel) / rank) %>%
    ungroup()
  ap_by_user <- precision_rows %>%
    # Only relevant positions enter the average; users without one drop out.
    filter(rel != 0) %>%
    group_by(userId) %>%
    summarize(avg.precision = mean(precisionAtRank)) %>%
    ungroup()

  # Bring back every recommended user; those without a hit get NA.
  all_users <- distinct(select(recommendations, userId))
  left_join(all_users, ap_by_user)
}

In [None]:
# Hit indicator per user: TRUE when at least one recommendation with
# rank <= topN has a non-missing `rel` after joining with `ideal`.
#
# Args:
#   recommendations: data frame with `userId`, `rank` and the columns it
#                    shares with `ideal`.
#   ideal:           data frame carrying a `rel` column.
#   topN:            cut-off rank.
#
# Returns:
#   A tibble with columns `userId` and `hit` (logical).
compute_hit_rate <- function(recommendations, ideal, topN) {
  shortlist <- filter(recommendations, rank <= topN)
  shortlist %>%
    left_join(ideal) %>%
    group_by(userId) %>%
    # A user scores a hit unless every shortlisted row is unmatched.
    summarize(hit = !all(is.na(rel))) %>%
    ungroup()
}