## Rating Stats For BookCrossing and Amazon Datasets

### Setup

In [None]:
library(tidyverse)

In [None]:
# Notebook display settings: cap printed table rows and set the plot height.
options(
    repr.plot.height = 4.5,
    repr.matrix.max.rows = 15
)

In [None]:
source("utils.R")

Set the database name here; other connection parameters (e.g. the password) are set in a local shell file.

In [None]:
# Open the PostgreSQL connection used by every query in this notebook.
db <- DBI::dbConnect(
    RPostgreSQL::PostgreSQL(),
    dbname = 'openlib'
)

In [None]:
# Lazy reference to the cluster_first_author_gender table; preview a few rows.
book_genders <- tbl(db, 'cluster_first_author_gender')
book_genders %>% head()

In [None]:
# Number of rows in the gender table for each gender value, pulled into R.
all_gender_stats <- book_genders %>%
    group_by(gender) %>%
    summarize(count = n()) %>%
    collect()
all_gender_stats

How many book clusters are in the gender table?

In [None]:
# Total across every gender value.
all_gender_stats %>%
    pull(count) %>%
    sum()

How many known-gender books?

In [None]:
# Total restricted to the rows whose gender is "female" or "male".
all_gender_stats %>%
    filter(gender %in% c("female", "male")) %>%
    pull(count) %>%
    sum()

In [None]:
# Share of female vs. male among the rows with one of those two values.
all_gender_stats %>%
    filter(gender %in% c("female", "male")) %>%
    mutate(frac = count / sum(count))

How many ISBNs are known?

In [None]:
# Row count of the isbn_id table.
tbl(db, 'isbn_id') %>%
    summarize(n = n())

How many books are known?

In [None]:
# Count distinct book identifiers: the cluster when an ISBN has one,
# otherwise the value of bc_of_isbn(isbn_id).
DBI::dbGetQuery(
    db,
    'SELECT COUNT(DISTINCT COALESCE(cluster, bc_of_isbn(isbn_id))) FROM isbn_id LEFT JOIN isbn_cluster USING (isbn_id)'
)

## LOC Data

Let's load the table of books, with gender if available:

In [None]:
# Distinct clusters reachable from loc_rec_isbn, joined to their gender rows.
# Both joins use the columns the tables have in common (natural joins).
loc_books <- db %>%
    tbl('loc_rec_isbn') %>%
    inner_join(tbl(db, 'isbn_cluster')) %>%
    select(cluster) %>%
    distinct() %>%
    inner_join(book_genders)
loc_books %>% head()

How many books do we have?

In [None]:
# Number of rows in loc_books, computed in the database.
collect(summarize(loc_books, nbooks = n()))

What's the distribution of gender statuses?

In [None]:
# Per-gender row counts for loc_books, plus each count's share of the total.
loc_gender_stats <- loc_books %>%
    group_by(gender) %>%
    summarize(count = n()) %>%
    collect() %>%
    mutate(fraction = count / sum(count))
loc_gender_stats

How many books in LOC?

In [None]:
# Total over all gender values; reused as the denominator of the plot's
# secondary axis below.
loc_book_count <- loc_gender_stats %>%
    pull(count) %>%
    sum()
loc_book_count

How many ISBNs in the LOC data?

In [None]:
# Number of distinct isbn_id values in loc_rec_isbn.
db %>%
    tbl('loc_rec_isbn') %>%
    summarize(n = n_distinct(isbn_id))

How many known-gender?

In [None]:
# Total of the "female" and "male" rows only.
loc_gender_stats %>%
    filter(gender %in% c("female", "male")) %>%
    pull(count) %>%
    sum()

In [None]:
# Bar chart of counts per gender value; the secondary axis rescales the
# counts by loc_book_count and labels them as percentages.
ggplot(loc_gender_stats, aes(x = gender, y = count)) +
    geom_col() +
    scale_y_continuous(
        sec.axis = sec_axis(~ . / loc_book_count, labels = scales::percent)
    )

## BX Data

Let's grab the ratings

In [None]:
# Lazy reference to the bx_all_ratings table; preview a few rows.
bx_all_ratings <- tbl(db, 'bx_all_ratings')
bx_all_ratings %>% head()

In [None]:
# Lazy reference to the bx_explicit_ratings table; preview a few rows.
bx_explicit_ratings <- tbl(db, 'bx_explicit_ratings')
bx_explicit_ratings %>% head()

### How often do we need to compute medians?

All ratings (including implicit):

In [None]:
# Number of rows at each value of nratings, with each count's share of all
# rows. Count is coerced with as.numeric() before summing.
bx.all.stats <- bx_all_ratings %>%
    group_by(nratings) %>%
    summarize(Count = n()) %>%
    collect() %>%
    ungroup() %>%
    mutate(Frac = Count / sum(as.numeric(Count)))
arrange(bx.all.stats, nratings)

Explicit ratings

In [None]:
# Number of rows at each value of nratings in the explicit-rating table,
# with each count's share of all rows.
bx.explicit.stats <- bx_explicit_ratings %>%
    group_by(nratings) %>%
    summarize(Count = n()) %>%
    collect() %>%
    ungroup() %>%
    mutate(Frac = Count / sum(as.numeric(Count)))
arrange(bx.explicit.stats, nratings)

Not very often.

Let's look at more statistics:

In [None]:
# One-row summary of bx_ratings: rating rows, distinct ISBNs, books, users,
# books with a cluster, and books whose gender is male or female.
bxa_stats <- db %>%
    tbl('bx_ratings') %>%
    inner_join(tbl(db, 'isbn_id'), by = "isbn") %>%
    left_join(tbl(db, 'isbn_cluster'), by = "isbn_id") %>%
    left_join(book_genders, by = "cluster") %>%
    summarize(
        ratings = n(),
        isbns = n_distinct(isbn),
        # NOTE(review): the raw-SQL book count earlier in this notebook wraps
        # isbn_id in bc_of_isbn() before coalescing with cluster; here the raw
        # isbn_id is used. Confirm cluster and ISBN ids cannot collide.
        books = n_distinct(coalesce(cluster, isbn_id)),
        users = n_distinct(user_id),
        matched_books = n_distinct(cluster),
        # Cluster id only when the gender is male or female, NULL otherwise.
        gender_books = n_distinct(
            case_when(gender == "male" | gender == "female" ~ cluster)
        )
    ) %>%
    collect()
bxa_stats

In [None]:
# Same one-row summary as for all BX ratings, restricted to rows with
# rating > 0.
bxe_stats <- db %>%
    tbl('bx_ratings') %>%
    filter(rating > 0) %>%
    inner_join(tbl(db, 'isbn_id'), by = "isbn") %>%
    left_join(tbl(db, 'isbn_cluster'), by = "isbn_id") %>%
    left_join(book_genders, by = "cluster") %>%
    summarize(
        ratings = n(),
        isbns = n_distinct(isbn),
        # NOTE(review): the raw-SQL book count earlier in this notebook wraps
        # isbn_id in bc_of_isbn() before coalescing with cluster; here the raw
        # isbn_id is used. Confirm cluster and ISBN ids cannot collide.
        books = n_distinct(coalesce(cluster, isbn_id)),
        users = n_distinct(user_id),
        matched_books = n_distinct(cluster),
        # Cluster id only when the gender is male or female, NULL otherwise.
        gender_books = n_distinct(
            case_when(gender == "male" | gender == "female" ~ cluster)
        )
    ) %>%
    collect()
bxe_stats

In [None]:
# Rows and distinct books per gender value for bx_all_ratings; books with
# no gender row are labelled 'no-record'.
bxa_gender_stats <- bx_all_ratings %>%
    left_join(book_genders, by = c("book_id" = "cluster")) %>%
    mutate(gender = coalesce(gender, 'no-record')) %>%
    group_by(gender) %>%
    summarize(
        count = n(),
        count_books = n_distinct(book_id)
    ) %>%
    collect() %>%
    mutate(fraction = count / sum(count))
bxa_gender_stats

In [None]:
# Rows and distinct books per gender value for bx_explicit_ratings; books
# with no gender row are labelled 'no-record'.
bxe_gender_stats <- bx_explicit_ratings %>%
    left_join(book_genders, by = c("book_id" = "cluster")) %>%
    mutate(gender = coalesce(gender, 'no-record')) %>%
    group_by(gender) %>%
    summarize(
        count = n(),
        count_books = n_distinct(book_id)
    ) %>%
    collect() %>%
    mutate(fraction = count / sum(count))
bxe_gender_stats

## Amazon Data

Load database views

In [None]:
# Lazy reference to the az_export_ratings table; preview a few rows.
az_ratings <- tbl(db, 'az_export_ratings')
az_ratings %>% head()

How often do we need to compute medians?

In [None]:
# Number of rows at each value of nratings in az_export_ratings, with each
# count's share of all rows.
az.ratings.stats <- az_ratings %>%
    group_by(nratings) %>%
    summarize(Count = n()) %>%
    collect() %>%
    ungroup() %>%
    mutate(Frac = Count / sum(as.numeric(Count)))
arrange(az.ratings.stats, nratings)

Again, not often.

Let's collect numeric stats on our rating table.

In [None]:
# One-row summary of az_ratings: rating rows, distinct ASINs, books, users,
# books with a cluster, and books whose gender is male or female.
# ASINs are matched against the isbn column of isbn_id.
az_stats <- tbl(db, 'az_ratings') %>%
    inner_join(tbl(db, 'isbn_id'), by = c("asin" = "isbn")) %>%
    left_join(tbl(db, 'isbn_cluster'), by = "isbn_id") %>%
    left_join(book_genders, by = "cluster") %>%
    summarize(
        ratings = n(),
        isbns = n_distinct(asin),
        # NOTE(review): the raw-SQL book count earlier in this notebook wraps
        # isbn_id in bc_of_isbn() before coalescing with cluster; here the raw
        # isbn_id is used. Confirm cluster and ISBN ids cannot collide.
        books = n_distinct(coalesce(cluster, isbn_id)),
        users = n_distinct(user_key),
        matched_books = n_distinct(cluster),
        # Cluster id only when the gender is male or female, NULL otherwise.
        gender_books = n_distinct(
            case_when(gender == "male" | gender == "female" ~ cluster)
        )
    ) %>%
    collect()
az_stats

In [None]:
# Rows and distinct books per gender value for az_export_ratings; books
# with no gender row are labelled 'no-record'.
az_gender_stats <- az_ratings %>%
    left_join(book_genders, by = c("book_id" = "cluster")) %>%
    mutate(gender = coalesce(gender, 'no-record')) %>%
    group_by(gender) %>%
    summarize(
        count = n(),
        count_books = n_distinct(book_id)
    ) %>%
    collect() %>%
    mutate(fraction = count / sum(count))
az_gender_stats

## Integrated Statistics

In [None]:
# Stack the three one-row summaries, tagging each with its data set name.
all_stats <- bind_rows(
    BXA = bxa_stats,
    BXE = bxe_stats,
    AZ = az_stats,
    .id = "DataSet"
)
all_stats

In [None]:
# Combine the per-gender tables of every data set into one long table with
# columns Set, Scope ("Books" or "Ratings"), gender, Count and fraction.
# NOTE: this reassigns all_gender_stats, replacing the table built near the
# top of the notebook.
all_gender_stats <- bind_rows(
    # LOC only has book counts.
    LOC = loc_gender_stats %>%
        select(gender, Count = count) %>%
        mutate(Scope = "Books"),
    # The rating data sets contribute both a Ratings and a Books count.
    BXA = bxa_gender_stats %>%
        select(gender, Ratings = count, Books = count_books) %>%
        gather("Scope", "Count", Ratings, Books),
    BXE = bxe_gender_stats %>%
        select(gender, Ratings = count, Books = count_books) %>%
        gather("Scope", "Count", Ratings, Books),
    AZ = az_gender_stats %>%
        select(gender, Ratings = count, Books = count_books) %>%
        gather("Scope", "Count", Ratings, Books),
    .id = "Set"
) %>%
    mutate(
        Set = recode_factor(
            Set,
            LOC = "LOC", AZ = "AZ", BXA = "BXA", BXE = "BXE",
            .ordered = TRUE
        ),
        # Collapse the three "no link" codes into a single 'unlinked' level.
        gender = recode_factor(
            gender,
            female = "female",
            male = "male",
            ambiguous = "ambiguous",
            unknown = "unknown",
            "no-loc-author" = "unlinked",
            "no-viaf-author" = "unlinked",
            "no-record" = "unlinked",
            .ordered = TRUE
        )
    ) %>%
    # Re-aggregate because several source codes now share one level.
    group_by(Set, Scope, gender) %>%
    summarize(Count = sum(Count)) %>%
    group_by(Set, Scope) %>%
    mutate(fraction = Count / sum(Count))
print(all_gender_stats)

In [None]:
# Dodged bars of fraction per gender level, one facet per data set, coloured
# by Scope. drawplot() and theme_paper() are not defined in this notebook
# (presumably they come from utils.R).
drawplot(file = "build/figures/link-coverage", width = 5, height = 2.8, {
    ggplot(
        all_gender_stats,
        aes(
            x = gender,
            y = fraction,
            fill = Scope,
            label = sprintf("%.1f%%", fraction * 100)
        )
    ) +
        geom_bar(stat = 'identity', position = 'dodge') +
        scale_y_continuous(labels = scales::percent) +
        scale_fill_brewer(palette = "Dark2") +
        scale_color_identity() +
        facet_wrap(~ Set, scales = "free_y") +
        ylab("Coverage Percent") +
        xlab("Linking Result") +
        theme_paper() +
        theme(
            axis.text.x = element_text(angle = 45, hjust = 1),
            legend.margin = margin()
        )
})

## Test Agreement

We want to see how much the two gender assignments for each book cluster (LOC author gender vs. first-author gender) disagree.

In [None]:
# Count clusters for every (gender, loc_gender) pair. The LOC table's gender
# column is renamed first so it does not collide in the natural join.
gender_matrix <- db %>%
    tbl('cluster_loc_author_gender') %>%
    rename(loc_gender = gender) %>%
    inner_join(tbl(db, 'cluster_first_author_gender')) %>%
    group_by(gender, loc_gender) %>%
    summarize(nclusters = n()) %>%
    collect()

In [None]:
# Cross-tabulate: one row per loc_gender, one column per gender.
spread(gender_matrix, gender, nclusters)

## Rating and Gender over Time

In [None]:
# Earliest pub_year per cluster (lazy query); then show its query plan.
cluster_year <- db %>%
    tbl('loc_pub_year') %>%
    inner_join(tbl(db, 'loc_rec_isbn')) %>%
    inner_join(tbl(db, 'isbn_cluster')) %>%
    group_by(cluster) %>%
    summarize(pub_year = min(pub_year))
cluster_year %>% explain()

In [None]:
# Clusters per (pub_year, gender), reshaped to one row per year and one
# column per gender value; missing combinations become 0.
cluster_year_gender <- cluster_year %>%
    left_join(tbl(db, 'cluster_first_author_gender')) %>%
    group_by(pub_year, gender) %>%
    summarize(count = n()) %>%
    arrange(pub_year) %>%
    collect() %>%
    spread(gender, count, fill = 0)

In [None]:
# Same per-year, per-gender table, restricted to clusters that appear as a
# book_id in az_export_ratings.
year_az_count <- cluster_year %>%
    inner_join(
        tbl(db, 'az_export_ratings') %>%
            select(cluster = book_id) %>%
            distinct()
    ) %>%
    left_join(tbl(db, 'cluster_first_author_gender')) %>%
    group_by(pub_year, gender) %>%
    summarize(nrated = n()) %>%
    arrange(pub_year) %>%
    collect() %>%
    spread(gender, nrated, fill = 0)
print(year_az_count)

In [None]:
# Total clusters per year: melt the gender columns back to long form and sum.
cluster_year_totals <- cluster_year_gender %>%
    gather("gender", "count", -pub_year) %>%
    group_by(pub_year) %>%
    summarize(total = sum(count))
head(cluster_year_totals)

In [None]:
# Total Amazon-rated clusters per year, summed over the gender columns.
year_az_totals <- year_az_count %>%
    gather("gender", "count", -pub_year) %>%
    group_by(pub_year) %>%
    summarize(total = sum(count))
head(year_az_totals)

Let us look at the number of books, and then the % female, over time.

In [None]:
# Line plot of total clusters per year, for years strictly between 1960
# and 2015.
cluster_year_totals %>%
    mutate(pub_year = as.integer(pub_year)) %>%
    filter(pub_year > 1960, pub_year < 2015) %>%
    ggplot(aes(x = pub_year, y = total)) +
    geom_line() +
    ylab("# of books")

In [None]:
# Female share of the female + male clusters per year, for years strictly
# between 1960 and 2015, plotted as a line.
pct_female <- cluster_year_gender %>%
    ungroup() %>%
    filter(pub_year > 1960, pub_year < 2015) %>%
    mutate(
        female_prop = female / (female + male),
        pub_year = as.integer(pub_year)
    )
ggplot(pct_female, aes(x = pub_year, y = female_prop)) +
    geom_line() +
    ylab("% Female Books")

Percent rated in Amazon?

In [None]:
# Share of each year's clusters that are rated on Amazon. Years with no
# Amazon row get rated = 0 rather than NA.
pct_rated <- cluster_year_totals %>%
    ungroup() %>%
    filter(pub_year > 1960, pub_year < 2015) %>%
    left_join(rename(year_az_totals, rated = total)) %>%
    replace_na(list(rated = 0)) %>%
    mutate(
        rated_prop = rated / total,
        pub_year = as.integer(pub_year)
    )
ggplot(pct_rated, aes(x = pub_year, y = rated_prop)) +
    geom_line() +
    ylab("% Rated")

In [None]:
# Per-year odds of a book being rated on Amazon, for female-authored,
# male-authored, and all known-gender books, plus the female/male log odds
# ratio. Covers years strictly between 1960 and 2015.
#
# Changes from the previous version:
#   * `rated` is now zero-filled together with female_rated / male_rated, so
#     years with no Amazon match give rate_odds = 0 instead of NA (this
#     matches how the "% Rated" cell treats missing years).
#   * The join key is explicit (`by = "year"`), and year_az_count is
#     ungrouped first so transmute() does not carry pub_year along as a
#     retained grouping column.
odds_ratios <- cluster_year_gender %>%
    ungroup() %>%
    filter(pub_year > 1960, pub_year < 2015) %>%
    mutate(year = as.integer(pub_year)) %>%
    left_join(
        year_az_count %>%
            ungroup() %>%
            transmute(
                year = as.integer(pub_year),
                female_rated = female,
                male_rated = male,
                rated = male + female
            ),
        by = "year"
    ) %>%
    replace_na(list(female_rated = 0, male_rated = 0, rated = 0)) %>%
    # odds = rated / not rated; a year where every book is rated gives Inf.
    mutate(
        female_odds = female_rated / (female - female_rated),
        male_odds = male_rated / (male - male_rated),
        rate_odds = rated / (male + female - rated)
    ) %>%
    mutate(odds_ratio = log(female_odds) - log(male_odds))
# Primary axis is the log odds ratio; the secondary axis shows the plain ratio.
ggplot(odds_ratios) +
    aes(x = year, y = odds_ratio) +
    geom_line() +
    ylab("Female/Male Rated Log Odds Ratio") +
    scale_y_continuous(sec.axis = sec_axis(~ exp(.)))

In [None]:
# Rated odds per year for all, male and female books, one line per group.
odds_ratios %>%
    select(
        year,
        All = rate_odds,
        Male = male_odds,
        Female = female_odds
    ) %>%
    gather("group", "Odds", -year) %>%
    ggplot(aes(x = year, y = Odds, color = group)) +
    geom_line()

In [None]:
# Same odds-per-year plot, with a log10 y axis.
odds_ratios %>%
    select(
        year,
        All = rate_odds,
        Male = male_odds,
        Female = female_odds
    ) %>%
    gather("group", "Odds", -year) %>%
    ggplot(aes(x = year, y = Odds, color = group)) +
    geom_line() +
    scale_y_log10()

## Representation by rating values

In [None]:
# For Amazon: round each book's mean rating, keep male/female books, and
# count books per (gender, rounded rating).
az_rg <- az_ratings %>%
    group_by(book_id) %>%
    summarize(avg_rating = round(mean(rating))) %>%
    left_join(book_genders, by = c("book_id" = "cluster")) %>%
    filter(gender == 'male' | gender == 'female') %>%
    group_by(gender, avg_rating) %>%
    summarize(count = n()) %>%
    collect()

In [None]:
# For BX explicit ratings: round each book's mean rating, keep male/female
# books, and count books per (gender, rounded rating).
bx_rg <- bx_explicit_ratings %>%
    group_by(book_id) %>%
    summarize(avg_rating = round(mean(rating))) %>%
    left_join(book_genders, by = c("book_id" = "cluster")) %>%
    filter(gender == 'male' | gender == 'female') %>%
    group_by(gender, avg_rating) %>%
    summarize(count = n()) %>%
    collect()

In [None]:
# Stack the Amazon and BX per-(gender, rating) book counts and compute each
# gender's share at every rating value.
#
# The share is normalised within (Set, rating). Grouping by rating alone
# pooled the AZ and BXE counts into one denominator, so the bars in each
# facet of the plot (which is faceted by Set) did not sum to 1 wherever both
# data sets have books at the same rounded rating.
all_rg <- bind_rows(AZ = az_rg, BXE = bx_rg, .id = "Set") %>%
    group_by(Set, rating = avg_rating) %>%
    mutate(frac = count / sum(count)) %>%
    ungroup()

In [None]:
# Dodged bars of frac per rating value, coloured by gender, one facet per
# data set with free scales.
ggplot(all_rg, aes(x = rating, y = frac, fill = gender)) +
    geom_bar(stat = 'identity', position = 'dodge') +
    facet_grid(~ Set, scales = "free")