Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Update to VariantAnnotation 1.5.26. Thanks to Stephanie for additiona…

…l work on genotypeToSnpMatrix and friends. Pull request has been merged.
  • Loading branch information...
commit f8a269f51cd38d9ebba09097f102d043abb4f88a 1 parent a2a7e45
vobencha vobencha authored
2  VariantAnnotation/DESCRIPTION
View
@@ -3,7 +3,7 @@ Type: Package
Title: Annotation of Genetic Variants
Description: Annotate variants, compute amino acid coding changes,
predict coding outcomes
-Version: 1.5.24
+Version: 1.5.26
Author: Valerie Obenchain, Martin Morgan, Michael Lawrence with
contributions from Stephanie Gogarten.
Maintainer: Valerie Obenchain <vobencha@fhcrc.org>
2  VariantAnnotation/R/methods-MatrixToSnpMatrix.R
View
@@ -11,6 +11,8 @@
setMethod("MatrixToSnpMatrix", c("matrix", "DNAStringSet", "DNAStringSetList"),
function(callMatrix, ref, alt, ...)
{
+ .Deprecated("genotypeToSnpMatrix")
+
ok <- suppressWarnings(require("snpStats", quietly=TRUE,
character.only=TRUE))
ok || stop("'snpStats' required; try biocLite('snpStats')", call.=FALSE)
99 VariantAnnotation/R/methods-VAFilter-class.R
View
@@ -1,99 +0,0 @@
-### =========================================================================
-### vaFilter methods
-### =========================================================================
-
-setMethod(.vaValidity, "VAFilter", function (object) {
- msg <- NULL
- fmls <- formals(object)
- if (length(fmls) == 0 || names(fmls)[[1]] != "x")
- msg <- c(msg, paste("'filter' must have at least one argument, 'x'"))
- if (is.null(msg)) TRUE else msg
-})
-
-setMethod(vaFilter, "missing", function(fun, name, ...) {
- vaFilter(function(x) !logical(length(x)), name=name, ...)
-})
-
-setMethod(vaFilter, "function",
- function(fun, name, ...)
-{
- name <- mkScalar(as.character(name))
- fmls <- formals(fun)
- if (length(fmls) == 0 || names(fmls)[[1]] != "x")
- stop("UserArgumentMismatch",
- "'filter' must have at least one argument, 'x'")
-
- env <- new.env(parent=environment(fun))
- env[[".stats"]] <- NULL
- fun <- eval(substitute(function(x, subset=TRUE) {
- res <- FUN(x)
- VAFilterResult(x, res, NAME, subset)
- }, list(FUN=fun, NAME=name)))
- environment(fun) <- env
-
- new("VAFilter", fun, name=name, ...)
-})
-
-setMethod(vaFilter, "VAFilter", function(fun, name, ...) {
- slot(fun, ".Data")
-})
-
-setMethod(name, "VAFilter", function(x, ...) slot(x, "name"))
-
-setMethod(show, "VAFilter", function(object) {
- cat("class:", class(object), "\n")
- cat("name:", name(object), "\n")
- cat("use vaFilter(object) to see filter\n")
-})
-
-dbSNPFilter <-
- function(dbSNP=character(0), .name="dbSNPFilter")
-{
- #.check_type_and_length(regex, "character", 0:1)
- require(dbSNP, quietly=TRUE, character.only=TRUE)
- vaFilter(function(x) {
- .idx <- logical(length(x))
- nms <- rep(runValue(seqnames(x)), runLength(seqnames(x)))
- df <- data.frame(chrom=nms, start=start(x), index=seq_len(length(x)))
- snps <- df[width(x) == 1,]
- chr <- names(getSNPcount())[names(getSNPcount()) %in% snps$chrom]
- res <- lapply(chr, function(i, snps) {
- s <- getSNPlocs(i)
- q <- snps[snps$chrom %in% i,]
- dbsnp <- q$start %in% s$loc
- q$index[dbsnp]
- }, snps)
- .idx[do.call(rbind, res)] <- TRUE
- .idx
- }, name = .name)
-}
-
-regionFilter <-
- function(txdb, region="coding", .name="regionFilter")
-{
- #.check_type_and_length(min, "numeric", 1)
- vaFilter(function(x) {
- loc <- locateVariants(x, txdb)
- res <- logical(length(x))
- res[unique(loc$queryID[loc$location %in% region])] <- TRUE
- res
- }, name=.name)
-}
-
-compose <-
- function(filt, ..., .name)
-{
- lst <- if (missing(filt)) list(...) else list(filt, ...)
- #for (`filt, ...` in lst)
- # .check_type_and_length(`filt, ...`, "VAFilter", NA)
- if (missing(.name))
- .name <- paste(sapply(lst, name), collapse=" o ")
- vaFilter(function(x) {
- .idx <- VAFilterResult(x, !logical(length(x)), subset=FALSE)
- for (elt in rev(lst)){
- .idx <- .idx & elt(x, subset=FALSE)@.Data
- }
- .idx
- }, name = .name)
-}
-
58 VariantAnnotation/R/methods-VAFilterResult-class.R
View
@@ -1,58 +0,0 @@
-### =========================================================================
-### VAFilterResult methods
-### =========================================================================
-
-VAFilterResult <-
- function(data=GRanges(), x=logical(), name=NA_character_,
- subset=TRUE, input=length(x), passing=sum(x), op=NA_character_)
-{
- name=mkScalar(as.character(name)[length(name)])
- stats <- data.frame(Name=as.character(name), Input=input, Passing=passing,
- Op=op, stringsAsFactors=FALSE)
- if (subset) {
- res <- data[x]
- metadata(res) <- list(name=name, stats=stats)
- res
- } else {
- new("VAFilterResult", x, name=name, stats=stats)
- }
-}
-
-setMethod(name, "VAFilterResult", function(x, ...) slot(x, "name"))
-
-setMethod(stats, "VAFilterResult", function(x, ...) slot(x, "stats"))
-
-setMethod(Logic, c("VAFilterResult", "VAFilterResult"),
- function(e1, e2)
-{
- x <- callNextMethod()
- s1 <- stats(e1); s2 <- stats(e2)
- op <- as.character(.Generic)
- name <- sprintf("(%s %s %s)", name(e1), op, name(e2))
- s <- rbind(stats(e1), stats(e2),
- data.frame(Name=name, Input=length(x), Passing=sum(x),
- Op=op, stringsAsFactors=FALSE))
- VAFilterResult(x, s$Name, s$Input, s$Passing, s$Op)
-})
-
-setMethod("!", "VAFilterResult",
- function(x)
-{
- name <- sprintf("!(%s)", name(x))
- y <- callNextMethod()
- s <- rbind(stats(x),
- data.frame(Name=name, Input=length(y), Passing=sum(y),
- Op="!", stringsAsFactors=FALSE))
- VAFilterResult(y, s$Name, s$Input, s$Passing, s$Op)
-})
-
-setMethod(show, "VAFilterResult",
- function(object)
-{
- cat("class:", class(object), "\n")
- cat("name:", name(object), "\n")
- cat("output:", selectSome(object), "\n")
- cat("stats:\n")
- print(stats(object))
-})
-
40 VariantAnnotation/R/methods-filterVcf.R
View
@@ -15,7 +15,7 @@ setMethod("filterVcf", "character",
unlist(scanTabix(...), use.names=FALSE)
.prefilter <-
- function(tbxFile, verbose, index, prefilters, param, ...)
+ function(tbxFile, verbose, prefilters, param, ...)
{
if (!isOpen(tbxFile)) {
open(tbxFile)
@@ -25,7 +25,7 @@ setMethod("filterVcf", "character",
prefilteredFilename <- tempfile()
prefiltered <- file(prefilteredFilename, "w")
needsClosing <- TRUE
- on.exit(if (needsClosing) close(prefiltered))
+ on.exit(if (needsClosing) close(prefiltered), add=TRUE)
## copy header
writeLines(headerTabix(tbxFile)$header, prefiltered)
@@ -39,12 +39,13 @@ setMethod("filterVcf", "character",
close(prefiltered)
needsClosing <- FALSE
- if (index) {
- prefilteredFilename <- bgzip(prefilteredFilename, overwrite = TRUE)
- indexTabix(prefilteredFilename, format = "vcf")
- }
+ ## TabixFile needs to be bgzipped and indexed
+ ## FIXME: all records are read at next stage, so no need to index?
+ gzFilename<- bgzip(prefilteredFilename, overwrite = TRUE)
+ indexTabix(gzFilename, format = "vcf")
+ unlink(prefilteredFilename)
- TabixFile(prefilteredFilename, yieldSize=yieldSize(tbxFile))
+ TabixFile(gzFilename, yieldSize=yieldSize(tbxFile))
}
.filter <-
@@ -57,7 +58,7 @@ setMethod("filterVcf", "character",
filtered <- file(destination, open="a")
needsClosing <- TRUE
- on.exit(if (needsClosing) close(filtered))
+ on.exit(if (needsClosing) close(filtered), add=TRUE)
while (nrow(vcfChunk <- readVcf(tbxFile, genome, ..., param=param))) {
vcfChunk <- subsetByFilter(vcfChunk, filters)
@@ -85,25 +86,22 @@ setMethod("filterVcf", "TabixFile",
if (!length(prefilters) && !length(filters))
stop("no 'prefilters' or 'filters' specified")
- ## prefilters
- if (length(prefilters)) {
- doIndex <- index || (length(filters) != 0)
- file <- .prefilter(file, verbose, doIndex, prefilters, param,
- ...)
- }
-
- ## filters
+ if (length(prefilters))
+ file <- .prefilter(file, verbose, prefilters, param, ...)
+
if (length(filters))
file <- .filter(file, genome, destination, verbose, filters,
param, ...)
- ## desination: character(1) file path
if (index) {
- filenameGZ <- bgzip(file, overwrite = TRUE)
- indexTabix(filenameGZ, format = "vcf")
+ gzFilename <- sprintf("%s.gz", destination)
+ gzFilename <- bgzip(file, gzFilename, overwrite = TRUE)
+ destination <- indexTabix(gzFilename, format = "vcf")
unlink(file)
- invisible(filenameGZ)
} else {
- invisible(file)
+ if (file != destination)
+ file.rename(file, destination)
}
+
+ invisible(destination)
})
37 VariantAnnotation/R/methods-genotypeToSnpMatrix.R
View
@@ -8,6 +8,15 @@
## 2 = heterozygous (0|1 or 1|0)
## 3 = homozygous alternate (risk) allele (1|1)
+## empty matrix to return if conditions not met
+.emptySnpMatrix <- function() {
+ list(genotype=new("SnpMatrix"),
+ map=DataFrame(snp.names=character(),
+ allele.1=DNAStringSet(),
+ allele.2=DNAStringSetList(),
+ ignore=character()))
+}
+
setMethod("genotypeToSnpMatrix", "CollapsedVCF",
function(x, uncertain=FALSE, ...)
{
@@ -17,12 +26,15 @@ setMethod("genotypeToSnpMatrix", "CollapsedVCF",
alt <- alt(x)
if (is(alt, "CompressedCharacterList")) {
- warning(paste0("structural variants detected and not supported ",
- "by SnpMatrix; returning NULL"))
- return(NULL)
+ warning("ALT must be DNAStringSetList")
+ return(.emptySnpMatrix())
}
ref <- ref(x)
+ if (ncol(x) == 0) {
+ warning("no samples in VCF")
+ }
+
if (!uncertain) {
gt <- geno(x)$GT
} else {
@@ -40,7 +52,7 @@ setMethod("genotypeToSnpMatrix", "CollapsedVCF",
gt <- GLtoGP(gt)
} else {
warning("uncertain=TRUE requires GP or GL; returning NULL")
- return(NULL)
+ return(.emptySnpMatrix())
}
}
@@ -56,10 +68,11 @@ setMethod("genotypeToSnpMatrix", "array",
stop("'alt' must be a DNAStringSetList")
# query ref and alt alleles for valid SNPs
altelt <- elementLengths(alt) == 1L
- altseq <- logical(length(alt))
- idx <- rep(altelt, elementLengths(alt))
- altseq[altelt] = width(unlist(alt))[idx] == 1L
- snv <- altseq & (width(ref) == 1L)
+ #altseq <- logical(length(alt))
+ #idx <- rep(altelt, elementLengths(alt))
+ #altseq[altelt] = width(unlist(alt))[idx] == 1L
+ #snv <- altseq & (width(ref) == 1L)
+ snv <- .isSNV(ref, alt)
# if x is a matrix, we have GT with a single value for each snp
if (is.matrix(x)) {
@@ -72,10 +85,10 @@ setMethod("genotypeToSnpMatrix", "array",
warning("non-single nucleotide variations are set to NA")
x[!snv,] <- ".|."
}
-
- map <- setNames(sapply(rep(c(0, 1, 2, 2, 3), 2), as.raw),
- c(".|.", "0|0", "0|1", "1|0", "1|1",
- "./.", "0/0", "0/1", "1/0", "1/1"))
+ map <- .genotypeToIntegerSNV(TRUE)
+ #map <- setNames(sapply(rep(c(0, 1, 2, 2, 3), 2), as.raw),
+ # c(".|.", "0|0", "0|1", "1|0", "1|1",
+ # "./.", "0/0", "0/1", "1/0", "1/1"))
diploid <- x %in% names(map)
if (!all(diploid)) {
warning("non-diploid variants are set to NA")
42 VariantAnnotation/inst/doc/VariantAnnotation.Rnw
View
@@ -425,11 +425,11 @@ head(pp[!is.na(pp$PREDICTION), ])
\subsection{Create a SnpMatrix}
The 'GT' element in the \Rcode{FORMAT} field of the VCF represents the
-genotype. These data can be converted into a \Robject{snpMatrix} object
+genotype. These data can be converted into a \Robject{SnpMatrix} object
which can then be used with the functions offered in \Rpackage{snpStats}
and other packages making use of the \Robject{SnpMatrix} class.
-The \Rfunction{MatrixToSnpMatrix} function converts the genotype calls in
+The \Rfunction{genotypeToSnpMatrix} function converts the genotype calls in
\Rcode{geno} to a \Robject{SnpMatrix}. No dbSNP package is used in this
computation. The return value is a named list where 'genotypes' is a
\Robject{SnpMatrix} and 'map' is a \Robject{DataFrame} with SNP names and
@@ -437,16 +437,15 @@ alleles at each loci. The \Rcode{ignore} column in 'map' indicates which
variants were set to NA (missing) because they met one or more of the following
criteria,
\begin{itemize}
-\item{only diploid calls are included; others are set to NA}
-\item{only single nucleotide variants are included; others are set to NA}
\item{variants with >1 ALT allele are set to NA}
+\item{only single nucleotide variants are included; others are set to NA}
+\item{only diploid calls are included; others are set to NA}
\end{itemize}
-See ?\Rfunction{MatrixToSnpMatrix} for more details.
+See ?\Rfunction{genotypeToSnpMatrix} for more details.
<<snpMatrix>>=
-calls <- geno(vcf)$GT
-res <- MatrixToSnpMatrix(calls, ref(vcf), alt(vcf))
+res <- genotypeToSnpMatrix(vcf)
res
@
@@ -458,6 +457,35 @@ allele2 <- res$map[["allele.2"]]
unique(elementLengths(allele2))
@
+In addition to the called genotypes, genotype likelihoods or
+probabilities can also be converted to a \Robject{SnpMatrix}, using
+the \Rpackage{snpStats} encoding of posterior probabilities as byte
+values. To use the values in the 'GL' or 'GP' \Rcode{FORMAT} field
+instead of the called genotypes, use the \Rcode{uncertain=TRUE} option
+in \Rcode{genotypeToSnpMatrix}.
+
+<<>>=
+fl.gl <- system.file("extdata", "gl_chr1.vcf", package="VariantAnnotation")
+vcf.gl <- readVcf(fl.gl, "hg19")
+geno(vcf.gl)
+
+## Convert the "GL" FORMAT field to a SnpMatrix
+res <- genotypeToSnpMatrix(vcf.gl, uncertain=TRUE)
+res
+t(as(res$genotype, "character"))[c(1,3,7), 1:5]
+
+## Compare to a SnpMatrix created from the "GT" field
+res.gt <- genotypeToSnpMatrix(vcf.gl, uncertain=FALSE)
+t(as(res.gt$genotype, "character"))[c(1,3,7), 1:5]
+
+## What are the original likelihoods for rs58108140?
+geno(vcf.gl)$GL["rs58108140", 1:5]
+@
+
+For variant rs58108140 in sample NA06989, the "A/B" genotype is much
+more likely than the others, so the \Rcode{SnpMatrix} object displays
+the called genotype.
+
\subsection{Expand a VCF}
Coming soon ...
0  .../inst/unitTests/test_summarizeVariaints-methods.R → ...n/inst/unitTests/test-summarizeVariants-methods.R
View
File renamed without changes
20 VariantAnnotation/inst/unitTests/test_genotypeToSnpMatrix.R
View
@@ -1,3 +1,5 @@
+library(snpStats) ## SnpMatrix class
+quiet <- suppressWarnings
test_gSM_array_GT <- function() {
mat <- matrix(c(".|.", "0|0", "0|1", "1|0", "1|1",
@@ -37,7 +39,7 @@ test_gSM_array_GT_2alt <- function() {
allele.1=ref,
allele.2=alt,
ignore=rep(TRUE,3))
- gtsm <- genotypeToSnpMatrix(mat, ref, alt)
+ gtsm <- quiet(genotypeToSnpMatrix(mat, ref, alt))
checkIdentical(sm, gtsm$genotypes)
checkIdentical(map, gtsm$map)
}
@@ -57,7 +59,7 @@ test_gSM_array_GT_nonsnv <- function() {
allele.1=ref,
allele.2=alt,
ignore=rep(TRUE,3))
- gtsm <- genotypeToSnpMatrix(mat, ref, alt)
+ gtsm <- quiet(genotypeToSnpMatrix(mat, ref, alt))
checkIdentical(sm, gtsm$genotypes)
checkIdentical(map, gtsm$map)
}
@@ -65,7 +67,7 @@ test_gSM_array_GT_nonsnv <- function() {
test_gSM_VCF_GL <- function() {
fl <- system.file("extdata", "gl_chr1.vcf", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
- gtsm <- genotypeToSnpMatrix(vcf, uncertain=TRUE)
+ gtsm <- quiet(genotypeToSnpMatrix(vcf, uncertain=TRUE))
checkIdentical(colnames(vcf), rownames(gtsm$genotypes))
checkIdentical(rownames(vcf), colnames(gtsm$genotypes))
checkIdentical(rownames(vcf), gtsm$map$snp.names)
@@ -78,7 +80,17 @@ test_gSM_VCF_GL <- function() {
test_gSM_VCF_structural <- function() {
fl <- system.file("extdata", "structural.vcf", package="VariantAnnotation")
vcf <- readVcf(fl, "hg19")
- checkIdentical(NULL, genotypeToSnpMatrix(vcf))
+ checkIdentical(VariantAnnotation:::.emptySnpMatrix(),
+ genotypeToSnpMatrix(vcf))
+}
+
+test_gSM_VCF_noSamples <- function() {
+ fl <- system.file("unitTests", "cases",
+ "FORMAT_header_no_SAMPLEs.vcf",
+ package="VariantAnnotation")
+ vcf <- readVcf(fl, "hg19")
+ gtsm <- quiet(genotypeToSnpMatrix(vcf))
+ checkEquals(0, nrow(gtsm$genotypes))
}
test_pSM_valid <- function() {
38 VariantAnnotation/man/MatrixToSnpMatrix-methods.Rd
View
@@ -1,11 +1,14 @@
\name{MatrixToSnpMatrix}
\alias{MatrixToSnpMatrix}
+\alias{MatrixToSnpMatrix-deprecated}
\alias{MatrixToSnpMatrix,matrix,DNAStringSet,DNAStringSetList-method}
\title{Convert genotype calls from a VCF file to a SnpMatrix object}
\description{
+ This method is deprecated. Use \link{genotypeToSnpMatrix} instead.
+
Convert a matrix of genotype calls from the "GT" FORMAT field of a VCF
file to a \link[snpStats:SnpMatrix-class]{SnpMatrix}.
}
@@ -73,40 +76,7 @@
}
\examples{
- ## Read a vcf file into a VCF object.
- vcfFile <- system.file("extdata", "ex2.vcf", package="VariantAnnotation")
- vcf <- readVcf(vcfFile, "hg19")
-
- ## Genotype calls are stored in the assays slot.
- ## Reference and alternate (variant) alleles are in ref and
- ## alt slots.
- geno(vcf)
- calls <- geno(vcf)$GT
- a0 <- ref(vcf)
- a1 <- alt(vcf)
- mat <- MatrixToSnpMatrix(calls, a0, a1)
-
- ## The result is a list of length 2.
- names(mat)
-
- ## Compare original coding in the VCF file to the SnpMatrix coding.
- geno(vcf)$GT
- t(as(mat$genotype, "character"))
-
- ## The 'ignore' column of the map element indicates which variants
- ## were set to missing as per the criteria stated in the details section.
- mat$map
-
- ## Variant rs6040355 was ignored because it has more than one alternate
- ## allele. The microsat1 variant has a reference allele with length
- ## greater than 1 and more than 1 alternate allele.
- ## Variant chr20:1230237 was ignored because the alternate allele is not
- ## of length 1.
-
- ## View the reference and alternate alleles.
- DataFrame(reference = ref(vcf),
- alternate = CharacterList(as.list(alt(vcf))),
- row.names = rownames(vcf))
+## see ?genotypeToSnpMatrix
}
\keyword{manip}
180 VariantAnnotation/man/VAFilter-class.Rd
View
@@ -1,180 +0,0 @@
-\name{VAFilter-class}
-\docType{class}
-
-% Class:
-\alias{VAFilter-class}
-\alias{class:VAFilter}
-\alias{VAFilter}
-
-% Constructors:
-\alias{vaFilter,VAFilter-method}
-\alias{vaFilter,missing-method}
-\alias{vaFilter,function-method}
-\alias{vaFilter}
-\alias{dbSNPFilter}
-\alias{regionFilter}
-\alias{compose}
-
-% Other methods:
-\alias{name,VAFilter-method}
-\alias{name}
-\alias{show,VAFilter-method}
-
-
-\title{VAFilter objects - under construction}
-
-\description{
- Objects of the \code{VAFilter} class are functions for filtering variants.
- A \code{VAFilter} instance applied to a \code{\linkS4class{GRanges}}
- returns either a subset of the records that passed the filter or a
- \code{\linkS4class{VAFilterResult}} object. The \code{VAFilterResult} is a
- logical vector indicating which records passed the filter and contains
- summary statistics on the records that passed.
-}
-
-\details{
- A \code{VAFilter} is constructed by using one of the built-in filters,
- \code{dbSNPFilter} or \code{regionFilter}, or a user can create their
- own with \code{vaFilter}. Once the filter is created it can be applied to a
- \code{\linkS4class{GRanges}} object. The format of the output is dictated by
- the optional\code{subset} argument. If \code{subset=TRUE} (the default) a
- \code{\linkS4class{GRanges}} is returned containing only the records that
- passed the filter. If \code{subset=FALSE} a \code{\linkS4class{VAFilterResult}}
- object is returned. This object contains a logical vector indicating which
- records passed the filter and summary statistics on the records that passed.
-}
-
-\section{Slots}{
- \describe{
- \item{\code{.Data}:}{Object of class \code{"function"} taking a
- single named argument \code{x} corresponding to the
- \code{\link{GRanges}} object that the filter will be applied to.
- The return value of the filter function is expected to be a logical
- vector that can be used to subset \code{x} to include those elements
- of \code{x} satisfying the filter.
- }
- \item{\code{name}:}{Object of class \code{"ScalarCharacter"}
- representing the name of the filter. The name is useful for
- suggesting the purpose of the filter, and for debugging failed
- filters.
- }
- }
-}
-
-\section{Constructors}{
- Built-in filters :
- \describe{
- \item{}{
- \code{dbSNPFilter(dbSNP=character(0), name="dbSNPFilter")}:
- Overlaps the records with a range width of 1 with snp locations in the
- specified SNPlocs package.
- }
- \item{}{
- \code{regionFilter(txdb, region="coding", name="regionFilter")}:
- Records will be filtered by the name of the region supplied. Possible
- regions are the same as those in the \code{Location} output from
- \code{locateVariants}: `coding', `intronic', `3UTR', `5UTR', `intergenic',
- `noGeneMatch' and `unknown'. This regions is specified when calling the
- filter on a \code{\linkS4class{GRanges}} object.
- }
- }
- Combining filters :
- \describe{
- \item{}{
- \code{compose(filt, ..., .name)}:
- The \code{compose} function constructs a new filter from one or more
- existing filters. The result is a filter that returns records that pass
- the criteria for all filters. If a name is not provided for the filter, a
- name will be constructed by joining the names of all component filters
- separated by \code{" o "}.
- }
- }
- User defined filters :
- A user can construct their own filter using the \code{vaFilter} function. The
- \code{fun} argument must be a function accepting a single argument \code{x}
- and returning a logical vector indicating which records passed the filter.
- \describe{
- \item{}{
- \code{vaFilter(fun="VAFilter", name=NA_character_, ...)}:
- Returns the function representing the underlying filter. This is primarily
- for viewing the filter function.
- }
- \item{}{
- \code{vaFilter(fun="missing", name=NA_character_, ...)}:
- Creates a default filter that returns a vector of \code{TRUE} values with
- length equal to \code{length(x)}.
- }
- }
-}
-
-\section{Other Methods}{
- \describe{
- \item{name}{\code{signature(x = "VAFilter")}: Return, as a
- \code{ScalarCharacter}, the name of the function.
- }
- \item{show}{\code{signature(object = "VAFilter")}: display a brief
- summary of the filter
- }
- }
-}
-
-\arguments{
- \item{fun}{An object of class \code{function} to be used as a
- filter. \code{fun} must accept a single named argument \code{x}, and
- is expected to return a logical vector such that \code{x[fun(x)]}
- selects only those elements of \code{x} satisfying the conditions of
- \code{fun}
- }
- \item{name}{A \code{character(1)} object to be used as the name of the
- filter. The \code{name} is useful for debugging and reference.
- }
- \item{filt}{A \code{\linkS4class{VAFilter}} object, to be used with
- additional arguments to create a composite filter.
- }
- \item{.name}{An optional \code{character(1)} object used to over-ride
- the name applied to default filters.
- }
- \item{dbSNP}{The \code{character(0)} name of the dbSNP package
- to be used. The default (\code{character(0)})
- performs no filtering
- }
- \item{txdb}{The \link[GenomicFeatures]{TranscriptDb} object used
- to identifying gene regions.
- }
- \item{region}{A \code{character(1)} object specifying the region
- on which to filter the results. Possible values are `coding',
- `intron', `3'UTR', `5'UTR', `intergenic', `noGeneMatch' and
- `unknown'. See ?\code{locateVariants} for regions descriptions.
- }
- \item{\dots}{Additional arguments to methods.
- }
-}
-
-\author{Valerie Obenchain <vobencha@fhcrc.org>}
-
-\seealso{
- \code{\link{VAFilterResult}}
-}
-
-\examples{
-# data(variants)
-#
-# ## Create a filter to select only records in introns :
-# library(TxDb.Hsapiens.UCSC.hg19.knownGene)
-# txdb19 <- TxDb.Hsapiens.UCSC.hg19.knownGene
-# regionFilt <- regionFilter(txdb19, region="intron")
-# ## View the filter with vaFilter()
-# vaFilter(regionFilt)
-# regionFilt(variants, subset=FALSE)
-#
-# ## Create a filter to identify variants present in dbSNP :
-# library("SNPlocs.Hsapiens.dbSNP.20110815")
-# snpFilt <- dbSNPFilter("SNPlocs.Hsapiens.dbSNP.20110815")
-# ## Modify the seqlevels to match the SNPlocs pacakge
-# variants <- renameSeqlevels(variants, c("chr16"="ch16"))
-# ## Apply the filter to chromosome 16 snps :
-# ## Optional subset argument is TRUE (default)
-# snpFilt(variants[seqnames(variants) == "ch16"], subset=TRUE)
-}
-
-\keyword{classes}
120 VariantAnnotation/man/VAFilterResult-class.Rd
View
@@ -1,120 +0,0 @@
-\name{VAFilterResult-class}
-\docType{class}
-
-\alias{VAFilterResult-class}
-\alias{VAFilterResult}
-\alias{Logic,VAFilterResult,VAFilterResult-method}
-\alias{!,VAFilterResult-method}
-\alias{name,VAFilterResult-method}
-\alias{show,VAFilterResult-method}
-\alias{stats}
-\alias{stats,VAFilterResult-method}
-
-\title{"VAFilterResult" for VAFilter output and statistics}
-
-\description{
- Objects of this class are logical vectors indicating records passing
- the applied filter, with an associated data frame summarizing the
- name, input number of records, records passing filter, and logical
- operation used for all filters in which the result participated.
-}
-
-\usage{
- VAFilterResult(data = GRanges(), x = logical(), name = NA_character_,
- subset = TRUE, input = length(x), passing = sum(x), op = NA_character_)
- \S4method{Logic}{VAFilterResult,VAFilterResult}(e1, e2)
- \S4method{name}{VAFilterResult}(x, ...)
- stats(x, ...)
- \S4method{show}{VAFilterResult}(object)
-}
-
-\arguments{
- \item{data}{A \code{linkS4class{GRanges}} of the data to be filtered.
- }
- \item{x, object, e1, e2}{For \code{VAFilterResult}, \code{logical()}
- indicating records that passed filter or, for others, an instance of
- \code{VAFilterResult} class.
- }
- \item{name}{\code{character()} indicating the name by which the filter
- is to be referred. Internally, \code{name}, \code{input},
- \code{passing}, and \code{op} may all be vectors representing
- columns of a \code{data.frame} summarizing the application of
- successive filters.
- }
- \item{subset}{\code{logical()} when TRUE, the function returns a
- subset of the data that passed the filter as a
- \code{linkS4class{GRanges}} object. The \code{name} and \code{stats}
- information are in the metadata slot. When FALSE, an object of
- \code{VAFilterResult} is returned.
- }
- \item{input}{\code{integer()} indicating the length of the original
- input.
- }
- \item{passing}{\code{integer()} indicating the number of records
- passing the filter.
- }
- \item{op}{\code{character()} indicating the logical operation, if any,
- associated with this filter.
- }
- \item{\dots}{Additional arguments, unused in methods documented on this
- page.
- }
-}
-
-\section{Objects from the Class}{
- Objects can be created through \code{\link{VAFilterResult}}, but these
- are automatically created by the application of \code{\link{vaFilter}}
- instances.
-}
-
-\section{Slots}{
- \describe{
-
- \item{\code{.Data}:}{Object of class \code{"logical"} indicating
- records that passed the filter. }
-
- \item{\code{name}:}{Object of class \code{"ScalarCharacter"}
- representing the name of the filter whose results are
- summarized. The name is either the actual name of the filter, or a
- combination of filter names and logical operations when the
- outcome results from application of several filters in a single
- logical expression. }
-
- \item{\code{stats}:}{Object of class \code{"data.frame"} summarizing
- the name, input number of records, records passing filter, and
- logical operation used for all filters in which the result
- participated. The \code{data.frame} rows correspond either to
- single filters, or to logical combinations of filters.}
- }
-}
-
-\section{Methods}{
- \describe{
-
- \item{Logic}{\code{signature(e1 = "VAFilterResult", e2 =
- "VAFilterResult")}: logic operations on filters.}
-
- \item{!}{\code{signature(x = "VAFilterResult")}: Negate the outcome
- of the current filter results }
-
- \item{name}{\code{signature(x = "VAFilterResult")}: The name of the
- filter that the results are based on.}
-
- \item{stats}{\code{signature(x = "VAFilterResult")}: a
- \code{data.frame} as described in the \sQuote{Slots} section of
- this page.}
-
- \item{show}{\code{signature(object = "VAFilterResult")}: summary of
- filter results.}
- }
-}
-
-\author{Valerie Obenchain <vobencha@fhcrc.org>}
-
-\seealso{\code{\link{VAFilter}}}
-
-\examples{
-## see ?VAFilter
-}
-
-\keyword{classes}
28 VariantAnnotation/man/VAUtil-class.Rd
View
@@ -1,28 +0,0 @@
-\name{VAUtil-class}
-\docType{class}
-\alias{.VAUtil-class}
-
-
-\title{".VAUtil" and potentially other related classes - in progress}
-
-\description{
- These classes provide important utility functions in the
- \pkg{VariantAnnotation} package, but may occasionally be seen by the user and
- are documented here for that reason.
-}
-
-\section{Objects from the Class}{
- Utility classes include:
- \itemize{
- \item \code{.VAUtil-class} a virtual base class from which all
- utility classes are derived.
- }
-}
-
-\author{Valerie Obenchain <vobencha@fhcrc.org>}
-
-\examples{
- getClass(".VAUtil", where=getNamespace("VariantAnnotation"))
-}
-
-\keyword{classes}
55 VariantAnnotation/man/genotypeToSnpMatrix-methods.Rd
View
@@ -4,7 +4,6 @@
\alias{genotypeToSnpMatrix,CollapsedVCF-method}
\alias{genotypeToSnpMatrix,array-method}
-
\title{Convert genotype calls from a VCF file to a SnpMatrix object}
\description{
@@ -20,7 +19,7 @@
\arguments{
\item{x}{
A \code{CollapsedVCF} object or a \code{array} of genotype data
- from the "GT" FORMAT field of a VCF file. This \code{array} is created
+ from the "GT", "GP", or "GL" FORMAT field of a VCF file. This \code{array} is created
with a call to \code{readVcf} and can be accessed with \code{geno(<VCF>)}.
}
\item{uncertain}{
@@ -60,16 +59,34 @@
and "B" is the risk allele. Equivalent statements to those made with 0 and 1
allele values would be 0 = missing, 1 = "A/A", 2 = "A/B" or "B/A" and
3 = "B/B".
+
+ The three genotype fields are defined as follows:
+ \itemize{
+ \item{GT : genotype, encoded as allele values separated by either of
+ "/" or "|". The allele values are 0 for the reference allele and 1
+ for the alternate allele.}
+ \item{GL : genotype likelihoods comprised of comma separated
+ floating point log10-scaled likelihoods for all possible
+ genotypes. In the case of a reference allele A and a single
+ alternate allele B, the likelihoods will be ordered "A/A", "A/B", "B/B".}
+ \item{GP : the phred-scaled genotype posterior probabilities for all
+ possible genotypes; intended to store
+ imputed genotype probabilities. The ordering of values is the
+ same as for the GL field.}
+ }
If \code{uncertain=TRUE}, the posterior probabilities of the three
genotypes ("A/A", "A/B", "B/B") are encoded (approximately) as byte
- values. See the \link[snpStats]{snpStats} documentation for more
- details.
+ values. This encoding allows uncertain genotypes to be used in
+ \link[snpStats]{snpStats} functions, which in some cases may be more
+ appropriate than using only the called genotypes. The byte encoding
+ conserves memory by allowing the uncertain genotypes to be stored in a
+ two-dimensional raw matrix.
+ See the \link[snpStats]{snpStats} documentation for more details.
}
\value{
A list with the following elements,
-
\item{genotypes}{
The output genotype data as an object of class
\code{"SnpMatrix"}. The columns are snps and the rows are the samples.
@@ -82,6 +99,10 @@
}
}
+\references{
+ \url{http://www.1000genomes.org/wiki/Analysis/Variant\%20Call\%20Format/vcf-variant-call-format-version-41}
+}
+
\author{
Stephanie Gogarten, Valerie Obenchain <vobencha@fhcrc.org>
}
@@ -129,13 +150,31 @@
## Convert the "GL" FORMAT field to a SnpMatrix
mat <- genotypeToSnpMatrix(vcf, uncertain=TRUE)
- ## Only 3 of the 9 variants passed the filters.
+ ## Only 3 of the 9 variants passed the filters. The
+ ## other 6 variants had no alternate alleles.
mat$map
- ## Compare original vs SnpMatrix coding for a subset of
- ## samples in variant rs180734498.
+ ## Compare genotype representations for a subset of
+ ## samples in variant rs180734498.
+ ## Original called genotype
+ geno(vcf)$GT["rs180734498", 14:16]
+
+ ## Original genotype likelihoods
geno(vcf)$GL["rs180734498", 14:16]
+
+ ## Posterior probability (computed inside genotypeToSnpMatrix)
+ GLtoGP(geno(vcf)$GL["rs180734498", 14:16, drop=FALSE])[1,]
+
+ ## SnpMatrix coding.
t(as(mat$genotype, "character"))["rs180734498", 14:16]
+ t(as(mat$genotype, "numeric"))["rs180734498", 14:16]
+
+ ## For samples NA11829 and NA11830, one probability is significantly
+ ## higher than the others, so SnpMatrix calls the genotype. These
+ ## calls match the original coding: "0|1" -> "A/B", "0|0" -> "A/A".
+ ## Sample NA11831 was originally called as "0|1" but the probability
+ ## of "0|0" is only a factor of 3 lower, so SnpMatrix calls it as
+ ## "Uncertain" with an appropriate byte-level encoding.
}
\keyword{manip}
54 VariantAnnotation/src/writevcf.c
View
@@ -1,54 +0,0 @@
-#include <stdio.h> /* sprintf */
-#include <Rinternals.h>
-#include "writevcf.h"
-
-SEXP code_allele_observations(SEXP options, SEXP observed)
-{
- /* options and observed are both vectors of character arrays where
- each character array is a collection of alleles at that
- position */
-
- SEXP obs_string, opt_string;
- const char *obs_char, *opt_char;
- SEXP coded = PROTECT(allocVector(STRSXP, length(options)));
- int max_num_alleles = 10; /* World will end if index more than
- * one character long */
- int buffer_index = 0;
- char coded_buffer[max_num_alleles * 2]; /* One int char and one
- * comma for each
- * observation at a
- * position */
-
- for (int i=0; i < length(options); i++) {
- opt_string = VECTOR_ELT(options, i);
- obs_string = VECTOR_ELT(observed, i);
- if (length(obs_string) > 10)
- error("no more than 10 alleles allowed\n");
- buffer_index = 0;
- for (int observed_index=0; observed_index < length(obs_string);
- observed_index++)
- {
- coded_buffer[ buffer_index ] = '.'; /* NA to begin with */
- coded_buffer[buffer_index + 1] = '/';
- obs_char = CHAR(STRING_ELT(obs_string,observed_index));
-
- for (int option_index=0; option_index < length(opt_string);
- option_index++)
- {
- opt_char = CHAR(STRING_ELT(opt_string,option_index));
- if (obs_char == opt_char) {
- sprintf(coded_buffer + buffer_index, "%i/",
- option_index);
- break;
- }
- }
- buffer_index += 2;
- }
- SET_STRING_ELT(coded, i,
- mkCharLenCE(coded_buffer, buffer_index-1,
- CE_UTF8));
- }
-
- UNPROTECT(1);
- return coded;
-}
9 VariantAnnotation/src/writevcf.h
View
@@ -1,9 +0,0 @@
-#ifndef _WRITEVCF_H_
-#define _WRITEVCF_H_
-
-#include <Rdefines.h>
-
-SEXP code_allele_observations(SEXP options, SEXP observed);
-
-#endif /* _WRITEVCF_H_ */
-
Please sign in to comment.
Something went wrong with that request. Please try again.