
# Forced Balance Analysis

In [None]:
library(rstan)
library(tidyverse)
library(ggplot2)
library(modelr)
library(xtable)

In [None]:
# Notebook-wide settings: plot height of 5 for rendered figures, use every
# detected core for parallel work, and turn on rstan's auto_write option.
options(repr.plot.height = 5,
        mc.cores = parallel::detectCores())
rstan_options(auto_write = TRUE)

In [None]:
# Load project-local helpers from utils.R.
# NOTE(review): presumably this is where drawplot() and theme_paper(), used by
# the plotting cell further down, are defined — confirm in utils.R.
source("utils.R")

Load eval user results

In [None]:
# Per-user evaluation results for the BX runs (explicit and implicit feedback).
exp.users.bx <- read_csv("build/eval-users-explicit-genderBalance-bx.csv")
imp.users.bx <- read_csv("build/eval-users-implicit-genderBalance-bx.csv")

In [None]:
# Per-user evaluation results for the Amazon runs (explicit and implicit feedback).
exp.users.az <- read_csv("build/eval-users-explicit-genderBalance-amazon.csv")
imp.users.az <- read_csv("build/eval-users-implicit-genderBalance-amazon.csv")

In [None]:
# Show the distinct algorithm labels present in the explicit Amazon results.
unique(exp.users.az[["Algorithm"]])

In [None]:
# Stack the four per-user result tables (tagged by Run), normalise the algorithm
# labels, and derive DataSet, DataMode, AlgoFamily and Mode for every row.
results <- list(`BX-E` = exp.users.bx,
                `BX-I` = imp.users.bx,
                `AZ-E` = exp.users.az,
                `AZ-I` = imp.users.az) %>%
    bind_rows(.id = 'Run') %>%
    # Labels starting with "FunkSVD" are renamed to start with "MF".
    mutate(Algorithm = gsub("^FunkSVD", "MF", Algorithm)) %>%
    # Drop every algorithm whose label ends in "Optimized".
    filter(!endsWith(Algorithm, "Optimized")) %>%
    # Run is "<data set>-<E|I>": strip the suffix for DataSet, expand it for DataMode.
    mutate(DataSet = gsub('-(E|I)', '', Run),
           DataMode = recode(Run,
                             `BX-E` = 'Explicit', `BX-I` = 'Implicit',
                             `AZ-E` = 'Explicit', `AZ-I` = 'Implicit')) %>%
    # Collapse the trailing "BalanceForced" marker (and a preceding "ity"/"-") to "-B".
    mutate(Algorithm = gsub("(ity)?-?BalanceForced$", "-B", Algorithm)) %>%
    # AlgoFamily is the label without "ity", "-Implicit" and "-B" suffixes;
    # Mode records whether the row comes from a "-B" (forced-balance) variant.
    mutate(AlgoFamily = as.factor(gsub("(ity)?(-Implicit)?(-B)?$", "", Algorithm)),
           Mode = as.factor(if_else(endsWith(Algorithm, "-B"), "Balanced", "Natural")))
summary(select(results, Run, AlgoFamily, Mode, nDCG))

In [None]:
# Show the combined results table.
results %>% print()

In [None]:
# One row per (Run, AlgoFamily, User) with the Natural and Balanced nDCG side by
# side. Penalty is Natural - Balanced, and Instance is the Run/AlgoFamily pair
# joined into a single factor.
reclist.ndcg <- results %>%
    mutate(nDCG = ifelse(is.nan(nDCG), NA, nDCG)) %>%  # NaN becomes NA
    select(Run, AlgoFamily, User, Mode, nDCG) %>%
    spread(key = Mode, value = nDCG) %>%
    mutate(Penalty = Natural - Balanced) %>%
    unite(col = Instance, Run, AlgoFamily, remove = FALSE) %>%
    mutate(Instance = as.factor(Instance),
           Run = as.factor(Run))
reclist.ndcg %>% print()

In [None]:
# Column-wise summary of the per-user natural/balanced table.
reclist.ndcg %>% summary()

Let's summarize some statistics:

In [None]:
# Mean natural (not force-balanced) nDCG per algorithm family, one column per run.
# na.rm = TRUE is required: reclist.ndcg stores undefined nDCG values as NA, so
# without it a single missing user turns the whole cell into NA. This matches
# how AvgPenalty is averaged.
reclist.ndcg %>%
    group_by(Run, AlgoFamily) %>%
    summarize(Nat = mean(Natural, na.rm = TRUE)) %>%
    spread(Run, Nat)

In [None]:
# Mean force-balanced nDCG per algorithm family, one column per run.
# na.rm = TRUE is required: reclist.ndcg stores undefined nDCG values as NA, so
# without it a single missing user turns the whole cell into NA. This matches
# how AvgPenalty is averaged.
reclist.ndcg %>%
    group_by(Run, AlgoFamily) %>%
    summarize(Bal = mean(Balanced, na.rm = TRUE)) %>%
    spread(Run, Bal)

In [None]:
# Average nDCG penalty per algorithm family and run, formatted to four decimals
# and laid out with one column per run.
penalties <- reclist.ndcg %>%
    group_by(Run, AlgoFamily) %>%
    summarize(AvgPenalty = sprintf("%0.4f", mean(Penalty, na.rm = TRUE))) %>%
    spread(key = Run, value = AvgPenalty)
penalties

In [None]:
# Emit the penalty table through xtable, booktabs style, without row names.
penalties %>%
    xtable() %>%
    print(booktabs = TRUE, include.rownames = FALSE)

And plot the mess!

In [None]:
# Mean nDCG for every run / algorithm family / mode combination. `high` flags
# values above half of the largest mean within the same DataMode x DataSet
# group; the plot uses it to place each bar label.
ndcg.stats <- results %>%
    mutate(nDCG = ifelse(is.nan(nDCG), NA, nDCG)) %>%  # NaN becomes NA
    group_by(Run, AlgoFamily, Mode, DataSet, DataMode) %>%
    summarize(nDCG = mean(nDCG, na.rm = TRUE)) %>%
    group_by(DataMode, DataSet) %>%
    mutate(high = nDCG > 0.5 * max(nDCG))

In [None]:
# Dodged bar chart of mean nDCG per algorithm family (Balanced vs. Natural),
# faceted by DataMode x DataSet. Each bar carries its value as a rotated label:
# white and right-justified when `high`, black and left-justified otherwise.
drawplot(file = 'build/figures/force-balance-results', width = 5, height = 3.5, {
    ggplot(ndcg.stats,
           aes(x = AlgoFamily, y = nDCG, fill = Mode,
               label = sprintf("%0.3f", nDCG))) +
        geom_bar(stat = 'identity', position = 'dodge') +
        geom_text(aes(hjust = if_else(high, 1.05, -0.05),
                      color = if_else(high, "white", "black")),
                  position = position_dodge(width = 1),
                  size = 3, angle = 90) +
        facet_grid(DataMode ~ DataSet, scales = "free") +
        scale_fill_manual(values = c(Balanced = "firebrick4",
                                     Natural = "indianred2")) +
        # The colour aesthetic already holds literal colour names.
        scale_color_identity() +
        xlab("Algorithm") +
        theme_paper() +
        theme(axis.text.x = element_text(angle = 45, hjust = 1),
              legend.position = 'bottom')
})

## Inference for Penalties

In [None]:
# Summarise the per-user table, adding a flag for users with a positive nDCG
# in at least one of the two modes.
reclist.ndcg %>%
    mutate(Usable = Natural > 0 | Balanced > 0) %>%
    select(-Instance) %>%
    summary()

We will infer the average penalty for each algorithm using a simple Bayesian model: $\bar l_a \sim \mathrm{Normal}(\mu_a, \sigma_a)$.

In [None]:
# Compile the Stan program for the penalty model and display it.
pen_model <- stan_model(file = 'models/penalty-means.stan')
pen_model

```
# Disabled cell (kept inside a code fence): sample from the compiled penalty
# model with 4 chains of 5000 iterations each. The data list passes the number
# of Instance levels (`NA`), the row count (NU), the integer Instance code of
# each row (algo), and the Natural / Balanced nDCG vectors.
# NOTE(review): Natural and Balanced can contain NA at this point; presumably
# Stan will reject them — confirm before re-enabling this cell.
pen_fit = sampling(pen_model,
                   data=list(`NA`=length(levels(reclist.ndcg$Instance)),
                             NU=nrow(reclist.ndcg),
                             algo=as.integer(reclist.ndcg$Instance),
                             natAcc=reclist.ndcg$Natural,
                             balAcc=reclist.ndcg$Balanced),
                   chains=4, iter=5000)
```

# Evaluation Results

**Explicit Data BX**

Summarize the mean of each metric.

In [None]:
# Mean of each metric and each TopN.num* count per algorithm in the BX explicit
# results, skipping labels that end in "Optimized" and ignoring missing values.
exp.users.bx %>%
    filter(!endsWith(Algorithm, "Optimized")) %>%
    group_by(DataSet, Algorithm) %>%
    summarise(
        nDCG = mean(nDCG, na.rm = TRUE),
        MRR = mean(RecipRank, na.rm = TRUE),
        MAP = mean(AvgPrec, na.rm = TRUE),
        TopN.numFemale = mean(TopN.numFemale, na.rm = TRUE),
        TopN.numUnknown = mean(TopN.numUnknown, na.rm = TRUE),
        TopN.numMale = mean(TopN.numMale, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    arrange(Algorithm)

Statistical test for each algorithm

In [None]:
# BX explicit: two-sample Wilcoxon test on RecipRank, Bias vs. its forced-balance
# variant. NOTE(review): the samples are not paired by User — confirm intended.
wilcox.test(
    x = filter(exp.users.bx, Algorithm == "Bias")$RecipRank,
    y = filter(exp.users.bx, Algorithm == "BiasBalanceForced")$RecipRank
)

In [None]:
# BX explicit: two-sample Wilcoxon test on RecipRank, Popular vs. its
# forced-balance variant. NOTE(review): samples are not paired by User — confirm.
wilcox.test(
    x = filter(exp.users.bx, Algorithm == "Popular")$RecipRank,
    y = filter(exp.users.bx, Algorithm == "PopularityBalanceForced")$RecipRank
)

In [None]:
# BX explicit: two-sample Wilcoxon test on RecipRank, UserUser vs. its
# forced-balance variant. NOTE(review): samples are not paired by User — confirm.
wilcox.test(
    x = filter(exp.users.bx, Algorithm == "UserUser")$RecipRank,
    y = filter(exp.users.bx, Algorithm == "UserUserBalanceForced")$RecipRank
)

In [None]:
# BX explicit: two-sample Wilcoxon test on RecipRank, ItemItem vs. its
# forced-balance variant. NOTE(review): samples are not paired by User — confirm.
wilcox.test(
    x = filter(exp.users.bx, Algorithm == "ItemItem")$RecipRank,
    y = filter(exp.users.bx, Algorithm == "ItemItemBalanceForced")$RecipRank
)

In [None]:
# BX explicit: two-sample Wilcoxon test on RecipRank, MF vs. its forced-balance
# variant. NOTE(review): the natural arm is looked up as "MF" while the balanced
# arm uses the "FunkSVD" prefix — verify that "MF" exists in the raw file,
# otherwise the first sample is empty. Samples are also not paired by User.
wilcox.test(
    x = filter(exp.users.bx, Algorithm == "MF")$RecipRank,
    y = filter(exp.users.bx, Algorithm == "FunkSVDBalanceForced")$RecipRank
)

In [None]:
# BX explicit: two-sample Wilcoxon test on RecipRank, PF vs. its forced-balance
# variant. NOTE(review): samples are not paired by User — confirm intended.
wilcox.test(
    x = filter(exp.users.bx, Algorithm == "PF")$RecipRank,
    y = filter(exp.users.bx, Algorithm == "PFBalanceForced")$RecipRank
)

**Implicit Data BX**

Summarize the mean of each metric.

In [None]:
# Mean of each metric and each TopN.num* count per algorithm in the BX implicit
# results, skipping labels that end in "Optimized" and ignoring missing values.
imp.users.bx %>%
    filter(!endsWith(Algorithm, "Optimized")) %>%
    group_by(DataSet, Algorithm) %>%
    summarise(
        nDCG = mean(nDCG, na.rm = TRUE),
        MRR = mean(RecipRank, na.rm = TRUE),
        MAP = mean(AvgPrec, na.rm = TRUE),
        TopN.numFemale = mean(TopN.numFemale, na.rm = TRUE),
        TopN.numUnknown = mean(TopN.numUnknown, na.rm = TRUE),
        TopN.numMale = mean(TopN.numMale, na.rm = TRUE)
    ) %>%
    ungroup() %>%
    arrange(DataSet, Algorithm)

In [None]:
# BX implicit: two-sample Wilcoxon test on RecipRank, Popular vs. its
# forced-balance variant. NOTE(review): samples are not paired by User — confirm.
wilcox.test(
    x = filter(imp.users.bx, Algorithm == "Popular-Implicit")$RecipRank,
    y = filter(imp.users.bx, Algorithm == "Popularity-Implicit-BalanceForced")$RecipRank
)

In [None]:
# BX implicit: two-sample Wilcoxon test on RecipRank, UserUser vs. its
# forced-balance variant. NOTE(review): samples are not paired by User — confirm.
wilcox.test(
    x = filter(imp.users.bx, Algorithm == "UserUser-Implicit")$RecipRank,
    y = filter(imp.users.bx, Algorithm == "UserUser-Implicit-BalanceForced")$RecipRank
)

In [None]:
# BX implicit: two-sample Wilcoxon test on RecipRank, ItemItem vs. its
# forced-balance variant. NOTE(review): samples are not paired by User — confirm.
wilcox.test(
    x = filter(imp.users.bx, Algorithm == "ItemItem-Implicit")$RecipRank,
    y = filter(imp.users.bx, Algorithm == "ItemItem-Implicit-BalanceForced")$RecipRank
)

In [None]:
# BX implicit: two-sample Wilcoxon test on RecipRank, MF vs. its forced-balance
# variant. NOTE(review): the natural arm is looked up as "MF-Implicit" while the
# balanced arm uses the "FunkSVD" prefix — verify that "MF-Implicit" exists in
# the raw file. Samples are also not paired by User.
wilcox.test(
    x = filter(imp.users.bx, Algorithm == "MF-Implicit")$RecipRank,
    y = filter(imp.users.bx, Algorithm == "FunkSVD-Implicit-BalanceForced")$RecipRank
)

In [None]:
# BX implicit: two-sample Wilcoxon test on RecipRank, PF vs. its forced-balance
# variant. NOTE(review): samples are not paired by User — confirm intended.
wilcox.test(
    x = filter(imp.users.bx, Algorithm == "PF-Implicit")$RecipRank,
    y = filter(imp.users.bx, Algorithm == "PF-Implicit-BalanceForced")$RecipRank
)

## Linear Regression

Fit a linear regression model for each algorithm pair, using the unbalanced nDCG as the feature and the force-balanced nDCG as the label.

Transform data to the required format.

In [None]:
# Explicit-feedback algorithm pairs, each as c(<natural label>, <forced-balance label>).
# The list names become the Algorithm value in the reshaped data.
explicit.algorithm.list <- list(
    Bias = c("Bias", "BiasBalanceForced"),
    Popular = c("Popular", "PopularityBalanceForced"),
    UserUser = c("UserUser", "UserUserBalanceForced"),
    ItemItem = c("ItemItem", "ItemItemBalanceForced"),
    FunkSVD = c("MF", "FunkSVDBalanceForced"),
    PF = c("PF", "PFBalanceForced")
)

In [None]:
# For every explicit algorithm pair, build one row per user holding the natural
# ("original") and forced-balance ("forcebalanced") nDCG in separate columns.
# Rows with a missing nDCG are dropped first; the pair's list name is stored in
# the Algorithm column via .id.
ndcg.explicit.bx <- explicit.algorithm.list %>%
    map_dfr(.id = "Algorithm", .f = function(pair) {
        exp.users.bx %>%
            select(DataSet, Algorithm, User, nDCG) %>%
            filter(!is.na(nDCG), Algorithm %in% pair) %>%
            # The first label of the pair is the natural algorithm.
            mutate(Algorithm = ifelse(Algorithm == pair[[1]], 'original', 'forcebalanced')) %>%
            spread(key = Algorithm, value = nDCG)
    })
ndcg.explicit.bx %>% head()

In [None]:
# Implicit-feedback algorithm pairs, each as c(<natural label>, <forced-balance label>).
# The list names become the Algorithm value in the reshaped data.
implicit.algorithm.list <- list(
    Popular = c("Popular-Implicit", "Popularity-Implicit-BalanceForced"),
    UserUser = c("UserUser-Implicit", "UserUser-Implicit-BalanceForced"),
    ItemItem = c("ItemItem-Implicit", "ItemItem-Implicit-BalanceForced"),
    FunkSVD = c("MF-Implicit", "FunkSVD-Implicit-BalanceForced"),
    PF = c("PF-Implicit", "PF-Implicit-BalanceForced")
)

In [None]:
# For every implicit algorithm pair, build one row per user holding the natural
# ("original") and forced-balance ("forcebalanced") nDCG in separate columns.
# Rows with a missing nDCG are dropped first; the pair's list name is stored in
# the Algorithm column via .id.
ndcg.implicit.bx <- implicit.algorithm.list %>%
    map_dfr(.id = "Algorithm", .f = function(pair) {
        imp.users.bx %>%
            select(DataSet, Algorithm, User, nDCG) %>%
            filter(!is.na(nDCG), Algorithm %in% pair) %>%
            # The first label of the pair is the natural algorithm.
            mutate(Algorithm = ifelse(Algorithm == pair[[1]], 'original', 'forcebalanced')) %>%
            spread(key = Algorithm, value = nDCG)
    })
ndcg.implicit.bx %>% head()

In [None]:
# Explicit and implicit per-user nDCG pairs in one table, with the value columns
# renamed to Unbalanced (original) and Balanced (forcebalanced).
user.ndcg.bx <- ndcg.explicit.bx %>%
    bind_rows(ndcg.implicit.bx) %>%
    select(DataSet, Algorithm, User,
           Unbalanced = original,
           Balanced = forcebalanced)
user.ndcg.bx %>% head()

Plot it

Explicit ratings

In [None]:
# Per-user scatter of balanced vs. unbalanced nDCG (with marginal rugs) for the
# bx-explicit rows, one panel per algorithm.
user.ndcg.bx %>%
    filter(DataSet == 'bx-explicit') %>%
    ggplot(aes(x = Unbalanced, y = Balanced)) +
    geom_point() +
    geom_rug() +
    facet_wrap(~ Algorithm)

Implicit ratings

In [None]:
# Per-user scatter of balanced vs. unbalanced nDCG (with marginal rugs) for the
# bx-implicit rows, one panel per algorithm.
user.ndcg.bx %>%
    filter(DataSet == 'bx-implicit') %>%
    ggplot(aes(x = Unbalanced, y = Balanced)) +
    geom_point() +
    geom_rug() +
    facet_wrap(~ Algorithm)

Fit a linear model for each algorithm in each data set

BX Explicit

In [None]:
# Algorithm names present in the bx-explicit rows, in order of first appearance.
bxe.algorithms <- user.ndcg.bx %>%
    filter(DataSet == 'bx-explicit') %>%
    .[["Algorithm"]] %>%
    unique()
bxe.algorithms

In [None]:
# bx-explicit / Bias: regress the balanced nDCG on the unbalanced nDCG.
bxe.bias.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-explicit', Algorithm == 'Bias')
)
summary(bxe.bias.ndcg.model)

In [None]:
# bx-explicit / Popular: regress the balanced nDCG on the unbalanced nDCG.
bxe.popular.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-explicit', Algorithm == 'Popular')
)
summary(bxe.popular.ndcg.model)

In [None]:
# bx-explicit / UserUser: regress the balanced nDCG on the unbalanced nDCG.
bxe.useruser.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-explicit', Algorithm == 'UserUser')
)
summary(bxe.useruser.ndcg.model)

In [None]:
# bx-explicit / ItemItem: regress the balanced nDCG on the unbalanced nDCG.
bxe.itemitem.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-explicit', Algorithm == 'ItemItem')
)
summary(bxe.itemitem.ndcg.model)

In [None]:
# bx-explicit / FunkSVD: regress the balanced nDCG on the unbalanced nDCG.
bxe.funksvd.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-explicit', Algorithm == 'FunkSVD')
)
summary(bxe.funksvd.ndcg.model)

In [None]:
# bx-explicit / PF: regress the balanced nDCG on the unbalanced nDCG.
bxe.pf.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-explicit', Algorithm == 'PF')
)
summary(bxe.pf.ndcg.model)

Combine the predictions and the learned parameters (intercept and coefficient ($\alpha$))

In [None]:
# Fitted bx-explicit models. They are looked up by position later, so the order
# must match the order of bxe.algorithms.
bxe.ndcg.models <- list(
    bxe.bias.ndcg.model,
    bxe.popular.ndcg.model,
    bxe.useruser.ndcg.model,
    bxe.itemitem.ndcg.model,
    bxe.funksvd.ndcg.model,
    bxe.pf.ndcg.model
)

In [None]:
# For each bx-explicit algorithm, attach the fitted model's prediction, the
# residual (Balanced - pred) and the model's intercept and slope to every user
# row. Algorithms and models are paired by position.
# seq_along() replaces 1:length(): on an empty algorithm vector the latter
# iterates over c(1, 0) and fails with a subscript error instead of returning
# an empty result.
bxe.ndcg.preds <- map_dfr(seq_along(bxe.algorithms), function(modelID) {
    fit <- bxe.ndcg.models[[modelID]]
    user.ndcg.bx %>%
        filter(DataSet == 'bx-explicit', Algorithm == bxe.algorithms[[modelID]]) %>%
        add_predictions(fit) %>%
        mutate(resid = Balanced - pred,
               intercept = fit$coefficients[['(Intercept)']],
               coefficient = fit$coefficients[['Unbalanced']])
})
head(bxe.ndcg.preds)

BX implicit

In [None]:
# Algorithm names present in the bx-implicit rows, in order of first appearance.
bxi.algorithms <- user.ndcg.bx %>%
    filter(DataSet == 'bx-implicit') %>%
    .[["Algorithm"]] %>%
    unique()
bxi.algorithms

In [None]:
# bx-implicit / Popular: regress the balanced nDCG on the unbalanced nDCG.
bxi.popular.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-implicit', Algorithm == 'Popular')
)
summary(bxi.popular.ndcg.model)

In [None]:
# bx-implicit / UserUser: regress the balanced nDCG on the unbalanced nDCG.
bxi.useruser.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-implicit', Algorithm == 'UserUser')
)
summary(bxi.useruser.ndcg.model)

In [None]:
# bx-implicit / ItemItem: regress the balanced nDCG on the unbalanced nDCG.
bxi.itemitem.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-implicit', Algorithm == 'ItemItem')
)
summary(bxi.itemitem.ndcg.model)

In [None]:
# bx-implicit / FunkSVD: regress the balanced nDCG on the unbalanced nDCG.
bxi.funksvd.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-implicit', Algorithm == 'FunkSVD')
)
summary(bxi.funksvd.ndcg.model)

In [None]:
# bx-implicit / PF: regress the balanced nDCG on the unbalanced nDCG.
bxi.pf.ndcg.model <- lm(
    formula = Balanced ~ Unbalanced,
    data = filter(user.ndcg.bx, DataSet == 'bx-implicit', Algorithm == 'PF')
)
summary(bxi.pf.ndcg.model)

Combine models

In [None]:
# Fitted bx-implicit models. They are looked up by position later, so the order
# must match the order of bxi.algorithms.
bxi.ndcg.models <- list(
    bxi.popular.ndcg.model,
    bxi.useruser.ndcg.model,
    bxi.itemitem.ndcg.model,
    bxi.funksvd.ndcg.model,
    bxi.pf.ndcg.model
)

In [None]:
# For each bx-implicit algorithm, attach the fitted model's prediction, the
# residual (Balanced - pred) and the model's intercept and slope to every user
# row. Algorithms and models are paired by position.
# seq_along() replaces 1:length(): on an empty algorithm vector the latter
# iterates over c(1, 0) and fails with a subscript error instead of returning
# an empty result.
bxi.ndcg.preds <- map_dfr(seq_along(bxi.algorithms), function(modelID) {
    fit <- bxi.ndcg.models[[modelID]]
    user.ndcg.bx %>%
        filter(DataSet == 'bx-implicit', Algorithm == bxi.algorithms[[modelID]]) %>%
        add_predictions(fit) %>%
        mutate(resid = Balanced - pred,
               intercept = fit$coefficients[['(Intercept)']],
               coefficient = fit$coefficients[['Unbalanced']])
})
head(bxi.ndcg.preds)

Bind predictions of all data sets

In [None]:
# Explicit and implicit prediction tables stacked into one.
bx.ndcg.preds <- bxe.ndcg.preds %>%
    bind_rows(bxi.ndcg.preds)
bx.ndcg.preds %>% head()

In [None]:
# Show the last rows of the combined prediction table.
bx.ndcg.preds %>% tail()