Skip to content
Browse files

Update to VariantAnnotation 1.5.24. Thanks to Chris Wallace for the a…

…ddition of snpSummary methods.
  • Loading branch information...
1 parent 285823f commit a2a7e4524644732cc40ba90b2394922b88b78237 vobencha committed Dec 19, 2012
View
11 VariantAnnotation/DESCRIPTION
@@ -3,16 +3,17 @@ Type: Package
Title: Annotation of Genetic Variants
Description: Annotate variants, compute amino acid coding changes,
predict coding outcomes
-Version: 1.5.19
-Author: Valerie Obenchain, Martin Morgan, Michael Lawrence
+Version: 1.5.24
+Author: Valerie Obenchain, Martin Morgan, Michael Lawrence with
+ contributions from Stephanie Gogarten.
Maintainer: Valerie Obenchain <vobencha@fhcrc.org>
License: Artistic-2.0
biocViews: DataImport, Sequencing, HighThroughputSequencing, SNP, Annotation,
Genetics, Homo_sapiens
-Depends: R (>= 2.8.0), methods, GenomicRanges (>= 1.9.56),
+Depends: R (>= 2.8.0), methods, BiocGenerics, GenomicRanges (>= 1.9.56),
Rsamtools (>= 1.11.7), IRanges (>= 1.17.4)
-Imports: methods, BiocGenerics (>= 0.1.0), IRanges, Biobase (>= 2.15.1),
- Rsamtools (>= 1.9.4), AnnotationDbi (>= 1.17.11), Biostrings (>= 2.25.2),
+Imports: methods, BiocGenerics, IRanges, Biostrings, Biobase,
+ Rsamtools (>= 1.9.4), AnnotationDbi (>= 1.17.11),
zlibbioc, BSgenome, GenomicFeatures (>= 1.9.35), DBI, utils
Suggests: RUnit, BSgenome.Hsapiens.UCSC.hg19, TxDb.Hsapiens.UCSC.hg19.knownGene,
SNPlocs.Hsapiens.dbSNP.20110815, SNPlocs.Hsapiens.dbSNP.20101109,
View
24 VariantAnnotation/NAMESPACE
@@ -1,43 +1,33 @@
useDynLib(VariantAnnotation, .registration=TRUE)
import(methods)
+import(BiocGenerics)
import(Rsamtools)
import(GenomicRanges)
import(zlibbioc)
-import(BiocGenerics)
-
-importClassesFrom(Biostrings, DNAStringSet, DNAStringSetList)
-importFrom(Biostrings, AAStringSet, DNAStringSet)
-importMethodsFrom(Biostrings,
- as.list, as.matrix, duplicated,
- end, head, intersect, match, nchar, setdiff,
- start, "subseq<-", substr, substring, summary,
- translate, unlist, width)
-
-importClassesFrom(Biobase, ScalarCharacter, AssayData)
-importFrom(Biobase, mkScalar, selectSome)
+importClassesFrom(Biobase, AssayData)
importClassesFrom(AnnotationDbi, AnnotationDb)
importMethodsFrom(AnnotationDbi, colnames, cols, exists, keys, ncol,
nrow, select)
+importFrom(GenomicFeatures, extractTranscriptsFromGenome)
importClassesFrom(GenomicFeatures, TranscriptDb)
importMethodsFrom(GenomicFeatures, cdsBy, exons, transcripts,
fiveUTRsByTranscript, threeUTRsByTranscript,
isActiveSeq, "isActiveSeq<-")
-importFrom(GenomicFeatures, extractTranscriptsFromGenome)
+importFrom(IRanges, CharacterList, DataFrame, IRanges, isSingleString,
+ PartitioningByWidth, successiveViews, subsetByFilter, expand)
importClassesFrom(IRanges, DataFrame, Ranges, RangesList, RangedData)
importMethodsFrom(IRanges, "%in%", append, as.matrix, as.vector,
countOverlaps, elementLengths, end, eval, findOverlaps,
follow, gsub, lapply, match, "metadata<-", narrow, order,
precede, queryHits, rev, Rle, runValue, sapply, shift,
split, start, "start<-", subjectHits, "subseq<-", substring,
- unique, unlist, values, "values<-", which, width,
+ unlist, values, "values<-", which, width,
expand)
-importFrom(IRanges, CharacterList, DataFrame, IRanges, isSingleString,
- PartitioningByWidth, successiveViews, subsetByFilter, expand)
importMethodsFrom(DBI, dbCommit, dbConnect, dbDisconnect, dbExistsTable,
dbGetQuery, dbReadTable, dbWriteTable, dbListTables,
@@ -64,7 +54,7 @@ exportMethods(filterVcf,
scanVcf, scanVcfHeader, ScanVcfParam,
readVcf, readVcfLongForm, writeVcf,
predictCoding, getTranscriptSeqs, refLocsToLocalLocs,
- MatrixToSnpMatrix, genotypeToSnpMatrix,
+ MatrixToSnpMatrix, genotypeToSnpMatrix, snpSummary,
locateVariants, summarizeVariants,
fixed, "fixed<-", ref, "ref<-", alt, "alt<-", qual, "qual<-",
View
6 VariantAnnotation/R/AllGenerics.R
@@ -101,8 +101,8 @@ setGeneric("scanVcf", signature = c("file", "param"),
setGeneric("filterVcf", signature = "file",
function(file, genome, destination, ..., verbose = FALSE,
- index = FALSE, filters = FilterRules(),
- param = ScanVcfParam())
+ index = FALSE, prefilters = FilterRules(),
+ filters = FilterRules(), param = ScanVcfParam())
standardGeneric("filterVcf")
)
@@ -207,3 +207,5 @@ setGeneric("genotypeToSnpMatrix", signature = "x",
standardGeneric("genotypeToSnpMatrix")
)
+setGeneric("snpSummary", function(x, ...) standardGeneric("snpSummary") )
+
View
173 VariantAnnotation/R/methods-CollapsedVCF-class.R
@@ -22,105 +22,100 @@ setReplaceMethod("alt", c("CollapsedVCF", "DNAStringSetList"),
})
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-### expand
+### show
###
-setMethod("expand", "CollapsedVCF",
- function(x, ...)
- {
- elt <- elementLengths(alt(x))
- if (all(elt == 1L) || (is(alt(x), "CharacterList"))) {
- fxd <- fixed(x)
- fxd$ALT <- unlist(alt(x), use.names=FALSE)
- return(VCF(rowData=rowData(x), colData=colData(x),
- exptData=exptData(x), fixed=fxd,
- info=info(x), geno=geno(x),
- ..., collapsed=FALSE))
- }
- idx <- rep.int(seq_len(nrow(x)), elt)
- hdr <- exptData(x)$header
- ## info
- iexp <- .expandInfo(x, hdr, elt, idx)
- ## fixed
- fexp <- fixed(x)[idx, ]
- fexp$ALT <- unlist(alt(x), use.names=FALSE)
- ## geno
- gexp <- .expandGeno(x, hdr, elt, idx)
- ## rowData
- rdexp <- rowData(x)[idx, "paramRangeID"]
+setMethod(show, "CollapsedVCF",
+ function(object)
+{
+ .showVCFSubclass(object)
+})
- ## exptData, colData untouched
- VCF(rowData=rdexp, colData=colData(x), exptData=exptData(x),
- fixed=fexp, info=iexp, geno=gexp, ..., collapsed=FALSE)
- }
-)
+### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+### snpSummary
+###
-.expandGeno <- function(x, hdr, elt, idx)
+setMethod("snpSummary", "CollapsedVCF",
+ function(x, ...)
{
- isA <- geno(hdr)$Number == "A"
- geno <- geno(x)
- if (any(isA)) {
- gnms <- rownames(geno(hdr))[geno(hdr)$Number == "A"]
- gelt <- sapply(gnms, function(i)
- elt - elementLengths(geno[[i]]))
- ## elementLengths same as ALT
- csums <- colSums(gelt) == 0L
- if (any(csums))
- geno[gnms[csums]] <- endoapply(geno[gnms[csums]], function(i)
- matrix(unlist(i, use.names=FALSE)))
- ## elementLengths shorter than ALT
- if (any(!csums)) {
- nms <- names(geno) %in% names(csums)[!csums]
- reps <- lapply(list(gelt[!csums] + 1L), rep.int,
- x=seq_len(nrow(x)))
- geno[nms] <- mendoapply(geno[nms], function(d, r)
- unlist(d[r], use.names=FALSE),
- r=reps)
- }
- geno
+ alt <- alt(x)
+ if (is(alt,"CompressedCharacterList")) {
+ warning("ALT must be a DNAStringSetList.")
+ return(.emptySnpSummary())
+ }
+ gt <- geno(x)$GT
+ if (is.null(gt)) {
+ warning("No genotype data found in VCF.")
+ return(.emptySnpSummary())
}
- geno[!isA] <- endoapply(geno[!isA], function(i) {
- if (is(i, "matrix"))
- matrix(i[idx, ], ncol=ncol(x))
- else
- i[idx, , ]})
- geno
+ if (ncol(gt) == 0L) {
+ warning("No genotype data found in VCF.")
+ return(.emptySnpSummary())
+ }
+
+ ## Genotype count
+ snv <- .isSNV(ref(x), alt)
+ gmap <- .genotypeToIntegerSNV(FALSE)
+ gmat <- matrix(gmap[gt], nrow=nrow(gt))
+ gcts <- matrix(NA_integer_, nrow(gmat), 3)
+ gcts[snv,] <- sapply(1:3, function(i) {
+ rowSums(gmat[snv,] == i)
+ })
+ gcts <- matrix(as.integer(gcts), nrow=nrow(gcts),
+ dimnames=list(NULL, c("g00", "g01", "g11")))
+
+ ## Allele frequency
+ acts <- gcts %*% matrix(c(2,1,0,0,1,2), nrow=3)
+ afrq <- acts/rowSums(acts)
+ colnames(afrq) <- c("a0Freq", "a1Freq")
+
+ ## HWE
+ HWEzscore <- .HWEzscore(gcts, acts, afrq)
+ HWEpvalue <- pchisq(HWEzscore^2, 1, lower.tail=FALSE)
+
+ data.frame(gcts, afrq, HWEzscore, HWEpvalue,
+ row.names=rownames(x))
+})
+
+.HWEzscore <- function(genoCounts, alleleCounts, alleleFrequency)
+{
+ a0 <- alleleFrequency[,"a0Freq"]
+ a1 <- alleleFrequency[,"a1Freq"]
+ expt <- rowSums(alleleCounts) * cbind(a0^2/2, a0*a1, a1^2/2)
+ chisq <- rowSums((genoCounts - expt)^2 / expt)
+ z <- rep(NA, nrow(genoCounts))
+ zsign <- sign(genoCounts[,"g01"] - expt[,2] * rowSums(genoCounts))
+ zsign * sqrt(chisq)
}
-.expandInfo <- function(x, hdr, elt, idx)
+## Maps diploid genotypes, phased or unphased.
+.genotypeToIntegerSNV <- function(raw=TRUE)
{
- icol <- info(x)
- inms <- rownames(info(hdr))[info(hdr)$Number == "A"]
- if (length(inms) > 0L) {
- ielt <- sapply(inms, function(i)
- elt - elementLengths(info(x)[[i]]))
- ## elementLengths same as ALT
- csums <- colSums(ielt) == 0L
- if (any(csums))
- res <- expand(icol, inms[csums], TRUE)
- else
- res <- icol[idx, ]
- ## elementLengths shorter than ALT
- if (any(!csums)) {
- nms <- colnames(icol) %in% names(csums)[!csums]
- reps <- lapply(list(ielt[!csums] + 1L), rep.int,
- x=seq_len(nrow(x)))
- res[nms] <- Map(function(d, r)
- unlist(d[r], use.names=FALSE),
- icol[nms], reps)
- }
- res
- } else {
- icol[idx, ]
- }
+ name <- c(".|.", "0|0", "0|1", "1|0", "1|1",
+ "./.", "0/0", "0/1", "1/0", "1/1")
+ value <- rep(c(0, 1, 2, 2, 3), 2)
+ if (raw)
+ setNames(sapply(value, as.raw), name)
+ else
+ setNames(value, name)
}
-### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-### show
-###
+## Detects valid SNVs based on having a ref allele and
+## single alt allele both of length 1.
+## ref = DNAStringSet
+## alt = DNASTringSetList
+.isSNV <- function(ref, alt)
+{
+ altelt <- elementLengths(alt) == 1L
+ altseq <- logical(length(alt))
+ idx <- rep(altelt, elementLengths(alt))
+ altseq[altelt] = width(unlist(alt, use.names=FALSE)[idx]) == 1L
+ altseq & (width(ref) == 1L)
+}
-setMethod(show, "CollapsedVCF",
- function(object)
+.emptySnpSummary <- function()
{
- .showVCFSubclass(object)
-})
+ data.frame(g00=integer(), g01=integer(), g11=integer(),
+ a0Freq=numeric(), a1Freq=numeric(),
+ HWEzscore=numeric(), HWEpvalue=numeric())
+}
View
11 VariantAnnotation/R/methods-ExpandedVCF-class.R
@@ -22,17 +22,6 @@ setReplaceMethod("alt", c("ExpandedVCF", "DNAStringSet"),
})
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-### expand
-###
-
-setMethod("expand", "ExpandedVCF",
- function(x, ...)
- {
- x
- }
-)
-
-### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### show
###
View
10 VariantAnnotation/R/methods-VCF-class.R
@@ -365,13 +365,15 @@ setMethod(show, "VCF",
cat("info(vcf):\n")
printSmallDataTable(info(object), margin=margin)
cat("info(header(vcf)):\n")
- info <- as.data.frame(info(exptData(object)$header))
- headerrec(info, "info")
+ info <- info(exptData(object)$header)[colnames(info(object)),]
+ if (nrow(info) > 0)
+ headerrec(as.data.frame(info), "info")
cat("geno(vcf):\n")
printSimpleList(geno(object), margin=margin)
cat("geno(header(vcf)):\n")
- geno <- as.data.frame(geno(exptData(object)$header))
- headerrec(geno, "geno")
+ geno <- geno(exptData(object)$header)[names(geno(object)),]
+ if (nrow(geno) > 0)
+ headerrec(as.data.frame(geno), "geno")
}
### - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
View
101 VariantAnnotation/R/methods-expand.R
@@ -0,0 +1,101 @@
+### =========================================================================
+### expand methods
+### =========================================================================
+
+setMethod("expand", "CollapsedVCF",
+ function(x, ...)
+ {
+ elt <- elementLengths(alt(x))
+ if (all(elt == 1L) || (is(alt(x), "CharacterList"))) {
+ fxd <- fixed(x)
+ fxd$ALT <- unlist(alt(x), use.names=FALSE)
+ return(VCF(rowData=rowData(x), colData=colData(x),
+ exptData=exptData(x), fixed=fxd,
+ info=info(x), geno=geno(x),
+ ..., collapsed=FALSE))
+ }
+ idx <- rep.int(seq_len(nrow(x)), elt)
+ hdr <- exptData(x)$header
+ ## info
+ iexp <- .expandInfo(x, hdr, elt, idx)
+ ## fixed
+ fexp <- fixed(x)[idx, ]
+ fexp$ALT <- unlist(alt(x), use.names=FALSE)
+ ## geno
+ gexp <- .expandGeno(x, hdr, elt, idx)
+ ## rowData
+ rdexp <- rowData(x)[idx, "paramRangeID"]
+
+ ## exptData, colData untouched
+ VCF(rowData=rdexp, colData=colData(x), exptData=exptData(x),
+ fixed=fexp, info=iexp, geno=gexp, ..., collapsed=FALSE)
+ }
+)
+
+.expandGeno <- function(x, hdr, elt, idx)
+{
+ isA <- geno(hdr)$Number == "A"
+ geno <- geno(x)
+ if (any(isA)) {
+ gnms <- rownames(geno(hdr))[geno(hdr)$Number == "A"]
+ gelt <- sapply(gnms, function(i)
+ elt - elementLengths(geno[[i]]))
+ ## elementLengths same as ALT
+ csums <- colSums(gelt) == 0L
+ if (any(csums))
+ geno[gnms[csums]] <- endoapply(geno[gnms[csums]], function(i)
+ matrix(unlist(i, use.names=FALSE)))
+ ## elementLengths shorter than ALT
+ if (any(!csums)) {
+ nms <- names(geno) %in% names(csums)[!csums]
+ reps <- lapply(list(gelt[!csums] + 1L), rep.int,
+ x=seq_len(nrow(x)))
+ geno[nms] <- mendoapply(geno[nms], function(d, r)
+ unlist(d[r], use.names=FALSE),
+ r=reps)
+ }
+ geno
+ }
+ geno[!isA] <- endoapply(geno[!isA], function(i) {
+ if (is(i, "matrix"))
+ matrix(i[idx, ], ncol=ncol(x))
+ else
+ i[idx, , ]})
+ geno
+}
+
+.expandInfo <- function(x, hdr, elt, idx)
+{
+ icol <- info(x)
+ inms <- rownames(info(hdr))[info(hdr)$Number == "A"]
+ if (length(inms) > 0L) {
+ ielt <- sapply(inms, function(i)
+ elt - elementLengths(info(x)[[i]]))
+ ## elementLengths same as ALT
+ csums <- colSums(ielt) == 0L
+ if (any(csums))
+ res <- expand(icol, inms[csums], TRUE)
+ else
+ res <- icol[idx, ]
+ ## elementLengths shorter than ALT
+ if (any(!csums)) {
+ nms <- colnames(icol) %in% names(csums)[!csums]
+ reps <- lapply(list(ielt[!csums] + 1L), rep.int,
+ x=seq_len(nrow(x)))
+ res[nms] <- Map(function(d, r)
+ unlist(d[r], use.names=FALSE),
+ icol[nms], reps)
+ }
+ res
+ } else {
+ icol[idx, ]
+ }
+}
+
+setMethod("expand", "ExpandedVCF",
+ function(x, ...)
+ {
+ x
+ }
+)
+
View
116 VariantAnnotation/R/methods-filterVcf.R
@@ -1,19 +1,78 @@
setMethod("filterVcf", "character",
function(file, genome, destination, ..., verbose = TRUE,
- index = FALSE, filters = FilterRules(),
- param = ScanVcfParam())
+ index = FALSE, prefilters = FilterRules(),
+ filters = FilterRules(), param = ScanVcfParam())
{
- tbx <- open(TabixFile(file, yieldSize=10000))
+ tbx <- open(TabixFile(file, yieldSize=100000))
on.exit(close(tbx))
filterVcf(tbx, genome = genome, destination=destination, ...,
- verbose = verbose, index = index, filters = filters,
- param = param)
+ verbose = verbose, index = index, prefilters = prefilters,
+ filters = filters, param = param)
})
+.unlistScan <- function(...)
+ unlist(scanTabix(...), use.names=FALSE)
+
+.prefilter <-
+ function(tbxFile, verbose, index, prefilters, param, ...)
+{
+ if (!isOpen(tbxFile)) {
+ open(tbxFile)
+ on.exit(close(tbxFile), add=TRUE)
+ }
+
+ prefilteredFilename <- tempfile()
+ prefiltered <- file(prefilteredFilename, "w")
+ needsClosing <- TRUE
+ on.exit(if (needsClosing) close(prefiltered))
+
+ ## copy header
+ writeLines(headerTabix(tbxFile)$header, prefiltered)
+
+ ## prefilter
+ param <- vcfWhich(param)
+ while (length(tbxChunk <- .unlistScan(tbxFile, ..., param=param))) {
+ tbxChunk <- subsetByFilter(tbxChunk, prefilters)
+ writeLines(tbxChunk, prefiltered)
+ }
+ close(prefiltered)
+ needsClosing <- FALSE
+
+ if (index) {
+ prefilteredFilename <- bgzip(prefilteredFilename, overwrite = TRUE)
+ indexTabix(prefilteredFilename, format = "vcf")
+ }
+
+ TabixFile(prefilteredFilename, yieldSize=yieldSize(tbxFile))
+}
+
+.filter <-
+ function(tbxFile, genome, destination, verbose, filters, param, ...)
+{
+ if (!isOpen(tbxFile)) {
+ open(tbxFile)
+ on.exit(close(tbxFile), add=TRUE)
+ }
+
+ filtered <- file(destination, open="a")
+ needsClosing <- TRUE
+ on.exit(if (needsClosing) close(filtered))
+
+ while (nrow(vcfChunk <- readVcf(tbxFile, genome, ..., param=param))) {
+ vcfChunk <- subsetByFilter(vcfChunk, filters)
+ writeVcf(vcfChunk, filtered)
+ }
+ close(filtered)
+ needsClosing <- FALSE
+
+ destination
+}
+
setMethod("filterVcf", "TabixFile",
function(file, genome, destination, ..., verbose = TRUE,
- index = FALSE, filters = FilterRules(),
+ index = FALSE,
+ prefilters = FilterRules(), filters = FilterRules(),
param = ScanVcfParam())
{
if (!isSingleString(destination))
@@ -23,41 +82,28 @@ setMethod("filterVcf", "TabixFile",
if (!isTRUEorFALSE(index))
stop("'index' must be TRUE or FALSE")
- ## desination: character(1) file path
- con <- file(destination, open="a")
- needsClosing <- TRUE
- on.exit(if (needsClosing) close(con))
+ if (!length(prefilters) && !length(filters))
+ stop("no 'prefilters' or 'filters' specified")
- if (!isOpen(file)) {
- open(file)
- on.exit(close(file), add=TRUE)
+ ## prefilters
+ if (length(prefilters)) {
+ doIndex <- index || (length(filters) != 0)
+ file <- .prefilter(file, verbose, doIndex, prefilters, param,
+ ...)
}
- if (verbose) {
- wd <- options()$width
- pbi <- 0L
- pb <- txtProgressBar(max=wd)
- }
- while (nrow(vcfChunk <- readVcf(file, genome, ..., param=param))) {
- if (verbose) {
- pbi <- (pbi + 1L) %% wd
- if (pbi == 0)
- message()
- setTxtProgressBar(pb, pbi)
- }
- vcfChunk <- subsetByFilter(vcfChunk, filters)
- writeVcf(vcfChunk, con)
- }
- if (verbose)
- message()
- close(con) # so bgzip is on a closed file
- needsClosing <- FALSE
+
+ ## filters
+ if (length(filters))
+ file <- .filter(file, genome, destination, verbose, filters,
+ param, ...)
+ ## desination: character(1) file path
if (index) {
- filenameGZ <- bgzip(destination, overwrite = TRUE)
+ filenameGZ <- bgzip(file, overwrite = TRUE)
indexTabix(filenameGZ, format = "vcf")
- unlink(destination)
+ unlink(file)
invisible(filenameGZ)
} else {
- invisible(destination)
+ invisible(file)
}
})
View
8 VariantAnnotation/R/methods-predictCoding.R
@@ -16,8 +16,12 @@ setMethod("predictCoding", c("CollapsedVCF", "TranscriptDb", "ANY", "missing"),
if (!is(alt, "DNAStringSetList"))
stop("alt(query) must be a DNAStringSetList")
rd <- rep(rowData(query), elementLengths(alt))
- callGeneric(rd, subject, seqSource, unlist(alt, use.names=FALSE), ...,
- ignore.strand=ignore.strand)
+ res <- callGeneric(rd, subject, seqSource, unlist(alt, use.names=FALSE),
+ ..., ignore.strand=ignore.strand)
+ ## adjust QUERYID for expansion of rowData
+ res$QUERYID <- rep(seq_len(length(alt)),
+ elementLengths(alt))[res$QUERYID]
+ res
})
setMethod("predictCoding", c("ExpandedVCF", "TranscriptDb", "ANY", "missing"),
View
2 VariantAnnotation/R/methods-writeVcf.R
@@ -65,7 +65,7 @@ setMethod(writeVcf, c("VCF", "connection"),
FILTER[is.na(FILTER)] <- "."
INFO <- .makeVcfInfo(info(obj))
ans <- paste(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, sep = "\t")
- if (length(geno(obj)) > 0L) {
+ if (nrow(colData(obj)) > 0L) {
GENO <- .makeVcfGeno(geno(obj))
FORMAT <- GENO[,1]
GENO <- GENO[,-1,drop=FALSE]
View
178 VariantAnnotation/inst/doc/VariantAnnotation.Rnw
@@ -43,14 +43,15 @@ options(width=72)
This vignette outlines a work flow for annotating and filtering
genetic variants using the \VariantAnnotation package. Sample data
are in VariantCall Format (VCF) and are a subset of chromosome 22
-from 1000 Genomes,
-\url{ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/}.
+from
+\href{ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/20110521/}{1000
+Genomes}.
VCF text files contain meta-information lines, a header line with
column names, data lines with information about a position in the
genome, and optional genotype information on samples for each
-position. A full description of the VCF format is on the 1000 Genomes
-page,
-\url{http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41}
+position. The 1000 Genomes page describes the
+\href{http://www.1000genomes.org/wiki/Analysis/Variant\%20Call\%20Format/vcf-variant-call-format-version-41}{VCF
+format} in detail.
Data are read in from a VCF file and variants identified according
to region such as \Rcode{coding}, \Rcode{intron}, \Rcode{intergenic},
@@ -59,79 +60,153 @@ non-synonymous variants and SIFT and PolyPhen databases provide predictions
of how severly the coding changes affect protein function.
\section{Variant Call Format (VCF) files}
-\subsection{Import complete files}
+\subsection{Data import and exploration}
Data are parsed into a \Robject{VCF} object with \Rfunction{readVcf}.
<<readVcf>>=
library(VariantAnnotation)
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
-@
-
-The show method gives an overview of how the data are organized.
-<<show>>=
vcf
@
-Extract the header information.
+\subsubsection{Header information}
+Header information can be extracted from the VCF with
+\Rfunction{header()}. We see there are 5 samples, 1 piece
+of meta information, 22 info fields and 3 geno fields.
<<readVcf_showheader>>=
-hdr <- header(vcf)
-hdr
+header(vcf)
@
-Data can be further extracted with \Rcode{fixed}, \Rcode{info},
-\Rcode{geno}, \Rcode{samples} and \Rcode{meta} accessors.
+Data can be further extracted using the named accessors.
<<headeraccessors>>=
-samples(hdr)
-geno(hdr)
-head(info(hdr), 3)
+samples(header(vcf))
+geno(header(vcf))
@
-\Rcode{rowData} contains information in the CHROM, POS, and ID
+\subsubsection{Genomic positions}
+\Rcode{rowData} contains information from the CHROM, POS, and ID
fields of the VCF file, represented as a \Robject{GRanges}.
The \Rcode{paramRangeID} column is meaningful when reading
subsets of data and is discussed further below.
<<readVcf_rowData>>=
-head(rowData(vcf))
+head(rowData(vcf), 3)
@
-The REF, ALT, QUAL and FILTER fields can be accessed with
-\Rcode{ref}, \Rcode{alt}, \Rcode{qual} and \Rcode{filt}.
+Individual fields can be pulled out with named accessors. Here
+we see \Rcode{REF} is stored as a \Robject{DNAStringSet} and
+\Rcode{qual} is a numeric vector.
<<readVcf_fixed>>=
ref(vcf)[1:5]
qual(vcf)[1:5]
@
-\Robject{ALT} is a \Robject{DNAStringSetList} or \Rcode{DNAStringSet}.
-In the case of structural variants, \Robject{ALT} will be a
+\Robject{ALT} is a \Robject{DNAStringSetList} (allows for
+multiple alternate alleles per variant) or a \Rcode{DNAStringSet}.
+When structural variants are present it will be a
\Robject{CharacterList}.
<<readVcf_ALT>>=
-alt(vcf)
+alt(vcf)[1:5]
@
-Genotype data described in the \Rcode{FORMAT} field are parsed
-into matrices or arrays and can be accessed with the \Rcode{geno}
-accessor.
+\subsubsection{Genotype data}
+Genotype data described in the \Rcode{FORMAT} fields are parsed
+into the geno slot. The data are unique to each sample and
+each sample may have multiple values variable. Because of this,
+the data are parsed into matrices or arrays where the rows
+represent the variants and the columns the samples. Multidimentional
+arrays indicate multiple values per sample. In this file all
+variables are matrices.
<<geno_hdr>>=
-geno(hdr)
geno(vcf)
-geno(vcf)$GT[1:3,1:5]
+sapply(geno(vcf), class)
+@
+
+Let's take a closer look at the genotype dosage (DS) variable.
+The header provides the variable definition and type.
+<<explore_geno>>=
+geno(header(vcf))["DS",]
+@
+
+These data are stored as a 10376 x 5 matrix. Each of the five
+samples (columns) has a single value per variant location (row).
+<<dim_geno>>=
+DS <-geno(vcf)$DS
+dim(DS)
+DS[1:3,]
+@
+
+DS is also known as 'posterior mean genotypes' and range in value
+from [0, 2]. To get a sense of variable distribution, we compute
+a five number summary of the minimum, lower-hinge (first quartile),
+median, upper-hinge (third quartile) and maximum.
+<<fivenum>>=
+fivenum(DS)
@
-The \Rcode{geno} data are unique for each sample. In contrast,
-the \Rcode{info} data are unique to the variant and the same
-across all samples. These data are stored in a \Rcode{DataFrame}.
+The majority of these values (86\%) are zero.
+<<DS_zero>>=
+length(which(DS==0))/length(DS)
+@
+
+View the distribution of the non-zero values.
+<<DS_hist, fig=TRUE>>=
+hist(DS[DS != 0], breaks=seq(0, 2, by=0.05),
+ main="DS non-zero values", xlab="DS")
+@
+
+\subsubsection{Info data}
+In contrast to the genotype data, the info data are unique to
+the variant and the same across samples. All info variables
+are represented in a single \Rcode{DataFrame}.
<<info>>=
info(vcf)[1:4, 1:5]
@
+We will use the info data to compare quality measures between
+novel (i.e., not in dbSNP) and known (i.e., in dbSNP) variants
+and the variant type present in the file. Variants with
+membership in dbSNP can be identified by using the appropriate
+SNPlocs package for hg19.
+<<examine_dbSNP>>=
+library(SNPlocs.Hsapiens.dbSNP.20101109)
+dbsnprd <- renameSeqlevels(rowData(vcf), c("22"="ch22"))
+ch22snps <- getSNPlocs("ch22")
+dbsnpchr22 <- sub("rs", "", names(dbsnprd)) %in% ch22snps$RefSNP_id
+table(dbsnpchr22)
+@
+
+Info variables of interest are 'VT', 'LDAF' and 'RSQ'.
+The header offers more details on these variables.
+<<header_info>>=
+info(header(vcf))[c("VT", "LDAF", "RSQ"),]
+@
+
+Create a data frame of quality measures of interest ...
+<<examine_quality>>=
+metrics <- data.frame(QUAL=qual(vcf), inDbSNP=dbsnpchr22,
+ VT=info(vcf)$VT, LDAF=info(vcf)$LDAF, RSQ=info(vcf)$RSQ)
+@
+
+and visualize the distribution of qualities using \Rcode{ggplot2}.
+For instance, genotype imputation quality is higher for the known
+variants in dbSNP.
+<<examine_ggplot2, fig=TRUE>>=
+library(ggplot2)
+ggplot(metrics, aes(x=RSQ, fill=inDbSNP)) +
+ geom_density(alpha=0.5) +
+ scale_x_continuous(name="MaCH / Thunder Imputation Quality") +
+ scale_y_continuous(name="Density") +
+ theme(legend.position="top")
+@
+
\subsection{Import data subsets}
When working with large VCF files it may be more efficient to
read in subsets of the data. This can be accomplished by
selecting genomic coordinates (ranges) or by specific fields
from the VCF file.
-\subsubsection{Genomic coordinates}
+\subsubsection{Select genomic coordinates}
To read in a portion of chromosome 22, create a \Robject{GRanges}
with the regions of interest.
<<subset_ranges>>=
@@ -155,7 +230,7 @@ from which param range.
head(rowData(vcf_rng), 3)
@
-\subsubsection{VCF fields}
+\subsubsection{Select VCF fields}
Data import can also be defined by the \Rcode{fixed}, \Rcode{info}
and \Rcode{geno} fields. Fields available for import are
described in the header information. To view the header before
@@ -185,37 +260,6 @@ svp_all <- ScanVcfParam(info="LDAF", geno="GT", which=rng)
svp_all
@
-\subsection{Examine VCF contents}
-It is easy to examine the contents of a VCF file. Let's compare
-quality measure between novel (i.e., not in dbSNP) and known
-(i.e., in dbSNP) variants and the variant type present in the
-file. Variants with membership in dbSNP can be identified by
-using the appropriate SNPlocs package for hg19.
-<<examine_dbSNP>>=
-library(SNPlocs.Hsapiens.dbSNP.20101109)
-dbsnprd <- renameSeqlevels(rowData(vcf), c("22"="ch22"))
-ch22snps <- getSNPlocs("ch22")
-dbsnpchr22 <- sub("rs", "", names(dbsnprd)) %in% ch22snps$RefSNP_id
-table(dbsnpchr22)
-@
-
-Create a data frame of quality measures of interest ...
-<<examine_quality>>=
-metrics <- data.frame(QUAL=qual(vcf), inDbSNP=dbsnpchr22,
- VT=info(vcf)$VT, LDAF=info(vcf)$LDAF, RSQ=info(vcf)$RSQ)
-@
-
-and visualize the distribution of qualities using \Rcode{ggplot2}.
-For instance, genotype imputation quality is higher for the known
-variants in dbSNP.
-<<examine_ggplot2, fig=TRUE>>=
-library(ggplot2)
-ggplot(metrics, aes(x=RSQ, fill=inDbSNP)) +
- geom_density(alpha=0.5) +
- scale_x_continuous(name="MaCH / Thunder Imputation Quality") +
- scale_y_continuous(name="Density") +
- theme(legend.position="top")
-@
\section{Locating variants in and around genes}
Variant location with respect to genes can be identified with the
@@ -414,7 +458,7 @@ allele2 <- res$map[["allele.2"]]
unique(elementLengths(allele2))
@
-\subsection{Expand "flatten" a VCF}
+\subsection{Expand a VCF}
Coming soon ...
CollapsedVCF and ExpandedVCF classes and expand,CollapsedVCF-method.
View
3 VariantAnnotation/inst/unitTests/test_filterVcf.R
@@ -3,7 +3,8 @@ test_filterVcf_TabixFile <- function()
fl <- system.file("extdata", "chr22.vcf.gz", package="VariantAnnotation")
tbx <- TabixFile(fl, yieldSize=5000)
dest <- tempfile()
- ans <- filterVcf(tbx, "hg19", dest)
+ filt <- FilterRules(list(fun=function(...) TRUE))
+ ans <- filterVcf(tbx, "hg19", dest, filters=filt)
checkIdentical(dest, ans)
vcf0 <- readVcf(fl, "hg19")
View
40 VariantAnnotation/inst/unitTests/test_snpSummary.R
@@ -0,0 +1,40 @@
+test_snpSummary_empty <- function()
+{
+ cnames <- c("g00", "g01", "g11", "a0Freq", "a1Freq",
+ "HWEzscore", "HWEpvalue")
+ ## no genotype
+ fl <- system.file("unitTests", "cases",
+ "FORMAT_header_no_SAMPLEs.vcf",
+ package="VariantAnnotation")
+ res <- suppressWarnings(snpSummary(readVcf(fl, "hg19")))
+ checkTrue(all(dim(res) == c(0L, 7L)))
+ checkEquals(names(res), cnames)
+
+ ## structural
+ fl <- system.file("extdata", "structural.vcf",
+ package="VariantAnnotation")
+ res <- suppressWarnings(snpSummary(readVcf(fl, "hg19")))
+ checkTrue(all(dim(res) == c(0L, 7L)))
+ checkEquals(names(res), cnames)
+}
+
+test_snpSummary_output <- function()
+{
+ fl <- system.file("extdata", "ex2.vcf",
+ package="VariantAnnotation")
+ vcf <- readVcf(fl, "hg19")
+ res <- snpSummary(vcf)
+
+ checkTrue(is.integer(res$g00))
+ checkTrue(is.integer(res$g01))
+ checkTrue(is.integer(res$g11))
+
+ checkEquals(nrow(res), nrow(vcf))
+ checkEquals(rownames(res), rownames(vcf))
+
+ checkEquals(as.integer(res[1,c("g00", "g01", "g11")]), c(1L, 1L, 1L))
+ checkEquals(as.integer(res[2,c("g00", "g01", "g11")]), c(2L, 1L, 0L))
+ checkEquals(round(res[["a0Freq"]], 2), c(0.50, 0.83, NA, NA, NA))
+ checkEquals(round(res[["a1Freq"]], 2), c(0.50, 0.17, NA, NA, NA))
+}
+
View
38 VariantAnnotation/man/filterVcf-methods.Rd
@@ -9,10 +9,12 @@
\usage{
\S4method{filterVcf}{character}(file, genome, destination, ..., verbose = TRUE,
- index = FALSE, filters = FilterRules(), param = ScanVcfParam())
+ index = FALSE, prefilters = FilterRules(), filters = FilterRules(),
+ param = ScanVcfParam())
\S4method{filterVcf}{TabixFile}(file, genome, destination, ..., verbose = TRUE,
- index = FALSE, filters = FilterRules(), param = ScanVcfParam())
+ index = FALSE, prefilters = FilterRules(), filters = FilterRules(),
+ param = ScanVcfParam())
}
\arguments{
@@ -34,8 +36,11 @@
should be compressed and indexed (using \code{\link{bgzip}} and
\code{indexTabix}).}
+ \item{prefilters}{A \code{\link{FilterRules}} instance contains rules for
+ filtering un-parsed lines of the VCF file.}
+
\item{filters}{A \code{\link{FilterRules}} instance contains rules for
- filtering.}
+ filtering fully parsed VCF objects.}
\item{param}{A \code{\link{ScanVcfParam}} instance restricting input
of particular \code{info} or \code{geno} fileds, or genomic
@@ -44,12 +49,22 @@
}
\details{
+
This function transfers content of one VCF file to another, removing
- records that fail to satisfy \code{filters}. Filtering is done in a
- memory efficient manner, iterating over the input VCF file in chunks
- of default size 10,000 (when invoked with \code{character(1)} for
- \code{file}) or as specified by the \code{yieldSize} argument of
- \code{TabixFile} (when invoked with \code{TabixFile}).
+ records that fail to satisfy \code{prefilters} and
+ \code{filters}. Filtering is done in a memory efficient manner,
+ iterating over the input VCF file in chunks of default size 100,000
+ (when invoked with \code{character(1)} for \code{file}) or as
+ specified by the \code{yieldSize} argument of \code{TabixFile} (when
+ invoked with \code{TabixFile}).
+
+ There are up to two passes. In the first pass, unparsed lines are
+ passed to \code{prefilters} for filtering, e.g., searching for a fixed
+ character string. In the second pass lines successfully passing
+ \code{prefilters} are parsed into \code{VCF} instances and made
+ available for further filtering. One or both of \code{prefilter} and
+ \code{filter} can be present.
+
}
\value{The destination file path as a \code{character(1)}.}
@@ -67,9 +82,12 @@
fl <- system.file(package="VariantAnnotation", "extdata",
"chr22.vcf.gz")
destination <- tempfile()
+pre <- FilterRules(list(isLowCoverageExomeSnp = function(x) {
+ grepl("LOWCOV,EXOME", x, fixed=TRUE)
+}))
filt <- FilterRules(list(isSNP = function(x) info(x)$VT == "SNP"))
-filtered <- filterVcf(fl, "hg19", destination, filters=filt)
-readVcf(filtered, "hg19")
+filtered <- filterVcf(fl, "hg19", destination, prefilters=pre, filters=filt)
+vcf <- readVcf(filtered, "hg19")
}
\keyword{manip}
View
99 VariantAnnotation/man/snpSummary.Rd
@@ -0,0 +1,99 @@
+\name{snpSummary}
+
+\alias{snpSummary}
+\alias{snpSummary,CollapsedVCF-method}
+
+
+\title{Counts and distribution statistics for SNPs in a VCF object}
+
+\description{
+ Counts and distribution statistics for SNPs in a VCF object
+}
+
+\usage{
+ \S4method{snpSummary}{CollapsedVCF}(x, ...)
+}
+
+\arguments{
+ \item{x}{
+ A \link{CollapsedVCF} object.
+ }
+ \item{\dots}{
+ Additional arguments to methods.
+ }
+}
+
+\details{
+ Genotype counts, allele counts and Hardy Weinberg equilibrium
+ (HWE) statistics are calculated for single nucleotide variants
+ in a \link{CollapsedVCF} object. HWE has been established as a
+ useful quality filter on genotype data. This equilibrium should
+ be attained in a single generation of random mating. Departures
+ from HWE are indicated by small p values and are almost invariably
+ indicative of a problem with genotype calls.
+
+ The following caveats apply:
+ \itemize{
+ \item No distinction is made between phased and unphased genotypes.
+ \item Only diploid calls are included.
+ \item Only `valid' SNPs are included. A `valid' SNP is defined
+ as having a reference allele of length 1 and a single
+ alternate allele of length 1.
+ }
+ Variants that do not meet these criteria are set to NA.
+}
+
+\value{
+ The object returned is a \code{data.frame} with seven columns.
+ \describe{
+ \item{g00}{
+ Counts for genotype 00 (homozygous reference).
+ }
+ \item{g01}{
+ Counts for genotype 01 or 10 (heterozygous).
+ }
+ \item{g11}{
+ Counts for genotype 11 (homozygous alternate).
+ }
+ \item{a0Freq}{
+ Frequency of the reference allele.
+ }
+ \item{a1Freq}{
+ Frequency of the alternate allele.
+ }
+ \item{HWEzscore}{
+ Z-score for departure from a null hypothesis of Hardy Weinberg equilibrium.
+ }
+ \item{HWEpvalue}{
+ p-value for departure from a null hypothesis of Hardy Weinberg equilibrium.
+ }
+ }
+}
+
+\author{
+ Chris Wallace <cew54@cam.ac.uk>
+}
+
+\seealso{
+ \link{genotypeToSnpMatrix},
+ \link{probabilityToSnpMatrix}
+}
+
+\examples{
+ fl <- system.file("extdata", "ex2.vcf", package="VariantAnnotation")
+ vcf <- readVcf(fl, "hg19")
+
+ ## The return value is a data.frame with genotype counts
+ ## and allele frequencies.
+ df <- snpSummary(vcf)
+ df
+
+ ## Compare to ranges in the VCF object:
+ rowData(vcf)
+
+ ## No statistics were computed for the variants in rows 3, 4
+ ## and 5. They were omitted because row 3 has two alternate
+ ## alleles, row 4 has none and row 5 is not a SNP.
+}
+
+\keyword{manip}

0 comments on commit a2a7e45

Please sign in to comment.
Something went wrong with that request. Please try again.