Skip to content

Commit

Permalink
calculate jaccard for grouped inputs
Browse files Browse the repository at this point in the history
- fixes #216, closes #260
  • Loading branch information
jayhesselberth committed Jul 23, 2017
1 parent e4d1419 commit ddc93c1
Show file tree
Hide file tree
Showing 6 changed files with 116 additions and 28 deletions.
8 changes: 7 additions & 1 deletion NEWS.md
@@ -1,3 +1,9 @@
# valr 0.3.1.9000

## Minor changes

* `bed_jaccard()` now works with grouped inputs (#216)

# valr 0.3.1

## Enhancements
Expand All @@ -10,7 +16,7 @@

* Improve documentation of interval statistics with more complex examples.

## Minor
## Minor changes

* `bed_sort()` has been de-deprecated to reduce `arrange` calls in library code.

Expand Down
61 changes: 48 additions & 13 deletions R/bed_jaccard.r
Expand Up @@ -20,11 +20,14 @@
#'
#' @return
#' [tbl_interval()] with the following columns:
#' - `len_i` length of the intersection
#' - `len_u` length of the union
#' - `jaccard` jaccard statistic
#'
#' - `len_i` length of the intersection in base-pairs
#' - `len_u` length of the union in base-pairs
#' - `jaccard` value of jaccard statistic
#' - `n_int` number of intersecting intervals between `x` and `y`
#'
#' If inputs are grouped, the return value will contain one set of values per group.
#'
#' @seealso
#' \url{http://bedtools.readthedocs.org/en/latest/content/tools/jaccard.html}
#'
Expand All @@ -36,16 +39,29 @@
#'
#' bed_jaccard(x, y)
#'
#' # calculate jaccard per chromosome
#' bed_jaccard(group_by(x, chrom), group_by(y, chrom))
#'
#' @export
bed_jaccard <- function(x, y) {

if (!is.tbl_interval(x)) x <- as.tbl_interval(x)
if (!is.tbl_interval(y)) y <- as.tbl_interval(y)

groups_shared <- shared_groups(x, y)

x <- bed_merge(x)
y <- bed_merge(y)

res_intersect <- bed_intersect(x, y)

if (!is.null(groups_shared)) {
x <- group_by(x, !!! syms(groups_shared))
y <- group_by(y, !!! syms(groups_shared))

res_intersect <- group_by(res_intersect, !!! syms(groups_shared))
}

res_intersect <- summarize(res_intersect,
sum_overlap = sum(as.numeric(.overlap)),
n_int = as.numeric(n()))
Expand All @@ -56,20 +72,39 @@ bed_jaccard <- function(x, y) {
res_y <- mutate(y, .size = end - start)
res_y <- summarize(res_y, sum_y = sum(as.numeric(.size)))

n_i <- res_intersect$sum_overlap
n <- res_intersect$n_int
if (!is.null(groups_shared)) {

res <- left_join(res_intersect, res_x, by = as.character(groups_shared))
res <- left_join(res, res_y, by = as.character(groups_shared))

res <- mutate(res, sum_xy = sum_x + sum_y)
group_cols <- select(res, !!! syms(groups_shared))

res <- transmute(res,
len_i = sum_overlap,
len_u = sum_xy,
jaccard = sum_overlap / (sum_xy - sum_overlap),
n = n_int)

res <- bind_cols(group_cols, res)

} else {

n_i <- res_intersect$sum_overlap
n <- res_intersect$n_int

n_x <- res_x$sum_x
n_y <- res_y$sum_y
n_x <- res_x$sum_x
n_y <- res_y$sum_y

n_u <- n_x + n_y
n_u <- n_x + n_y

jaccard <- n_i / (n_u - n_i)
jaccard <- n_i / (n_u - n_i)

res <- tibble::tribble(
~len_i, ~len_u, ~jaccard, ~n,
n_i, n_u, jaccard, n
)
res <- tibble(len_i = n_i,
len_u = n_u,
jaccard = jaccard,
n = n)
}

res
}
29 changes: 21 additions & 8 deletions docs/news/index.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

22 changes: 19 additions & 3 deletions docs/reference/bed_jaccard.html

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 8 additions & 3 deletions man/bed_jaccard.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 13 additions & 0 deletions tests/testthat/test_jaccard.r
Expand Up @@ -25,3 +25,16 @@ test_that("jaccard coeff is calc'd for large data sets", {
res <- bed_jaccard(x, y)
expect_equal(round(res$jaccard, 3), 0.016)
})

test_that("jaccard with grouped inputs are calculated", {
genome <- read_genome(valr_example("hg19.chrom.sizes.gz"))

x <- bed_random(genome, n = 1e5, seed = 10000)
y <- bed_random(genome, n = 1e5, seed = 20000)

res <- bed_jaccard(group_by(x, chrom),
group_by(y, chrom))

expect_equal(nrow(res), 24)
expect_true('chrom' %in% names(res))
})

0 comments on commit ddc93c1

Please sign in to comment.