# Gender Coverage

## Setup

In [None]:
library(tidyverse)
library(dbplyr)

In [None]:
# Open a dplyr source for the PostgreSQL database named "openlib"; every
# connection setting other than the database name is left at its default.
# NOTE(review): src_postgres() may be deprecated in newer dplyr releases --
# confirm against the installed version before upgrading.
db <- src_postgres(dbname = "openlib")

In [None]:
# Notebook display settings: cap tabular output at 15 rows and set the
# plot height to 4.5.
options(
  repr.matrix.max.rows = 15,
  repr.plot.height = 4.5
)

## Basic Resolution Stats

Let's code our authors by different resolution statuses.

In [None]:
# Count the rows of author_resolution_summary per resolution status. The
# grouping and counting run in the database; only the per-status totals are
# pulled into R.
reso_stats <- tbl(db, "author_resolution_summary") %>%
  group_by(status) %>%
  summarise(Count = n()) %>%
  collect() %>%
  ungroup() %>%
  # Share of each status; the total is summed as a double rather than as an
  # integer type.
  mutate(Frac = Count / sum(as.numeric(Count)))

# Show the resulting status / Count / Frac table.
reso_stats

In [None]:
# Bar chart of the fraction of rows in each resolution status, with the
# y axis labelled as percentages.
ggplot(reso_stats, aes(x = status, y = Frac)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent)

## Statistics for BookCrossing

In [None]:
# Resolution-status breakdown of BookCrossing ratings.
# No `by` is given to any join, so each one matches on whatever column names
# the two tables share (dplyr reports the chosen keys in a message).
bx_isbn_book_ids <- tbl(db, "isbn_book_id")
bx_book_info <- tbl(db, "bx_book_info")
bx_resolution <- tbl(db, "author_resolution_summary")

bx_rate_stats <- tbl(db, "bx_ratings") %>%
  inner_join(bx_isbn_book_ids) %>%
  inner_join(bx_book_info) %>%
  # Left join: ratings without a matching resolution row are kept, with a
  # missing status.
  left_join(bx_resolution) %>%
  group_by(status) %>%
  summarise(Count = n()) %>%
  collect() %>%
  ungroup() %>%
  # Share of ratings per status; the total is summed as a double.
  mutate(Frac = Count / sum(as.numeric(Count)))

In [None]:
# Display the per-status rating counts and fractions for BookCrossing.
bx_rate_stats

In [None]:
# Bar chart of the fraction of BookCrossing ratings in each resolution
# status, with the y axis labelled as percentages.
ggplot(bx_rate_stats, aes(x = status, y = Frac)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent)

73.5% of ratings are for books whose author gender is unambiguously resolved.

## Statistics for Amazon

In [None]:
# Resolution-status breakdown of Amazon ratings.
# No `by` is given to any join, so each one matches on whatever column names
# the two tables share (dplyr reports the chosen keys in a message).
az_isbn_book_ids <- tbl(db, "isbn_book_id")
az_book_info <- tbl(db, "az_book_info")
az_resolution <- tbl(db, "author_resolution_summary")

az_rate_stats <- tbl(db, "az_ratings") %>%
  # Keep only the asin column, renamed to isbn -- presumably so it lines up
  # with the key used by isbn_book_id; verify against the table schema.
  select(isbn = asin) %>%
  inner_join(az_isbn_book_ids) %>%
  inner_join(az_book_info) %>%
  # Left join: ratings without a matching resolution row are kept, with a
  # missing status.
  left_join(az_resolution) %>%
  group_by(status) %>%
  summarise(Count = n()) %>%
  collect() %>%
  ungroup() %>%
  # Share of ratings per status; the total is summed as a double.
  mutate(Frac = Count / sum(as.numeric(Count)))

In [None]:
# Display the per-status rating counts and fractions for Amazon.
az_rate_stats

In [None]:
# Bar chart of the fraction of Amazon ratings in each resolution status,
# with the y axis labelled as percentages.
ggplot(az_rate_stats, aes(x = status, y = Frac)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent)