-
Notifications
You must be signed in to change notification settings - Fork 0
/
platjam-import-salmonquant.R
482 lines (464 loc) · 19.5 KB
/
platjam-import-salmonquant.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
#' Import Salmon quant.sf files to SummarizedExperiment
#'
#' Import Salmon quant.sf files to SummarizedExperiment
#'
#' This function is intended to automate the process of importing
#' a series of `quant.sf` files, then generating `SummarizedExperiment`
#' objects at the transcript and gene level. It optionally includes
#' sample annotation provided as a `data.frame` in argument `curation_txt`.
#' It also includes transcript and gene annotations through either
#' `data.frame` from argument `tx2gene`, or it derives `tx2gene`
#' from a GTF file from argument `gtf`. The GTF file option then calls
#' `splicejam::makeTx2geneFromGtf()`.
#'
#' This function can optionally process data that includes full length
#' gene body regions, annotated with `"gene_body"`. This option is specific
#' for Salmon quantitation where the transcripts include full length
#' gene body for multi-exon genes, for example to measure unspliced
#' transcript abundance.
#'
#' * `import_types="gene"` summarizes only the proper transcripts,
#' excluding `"gene_body"` entries.
#' * `import_types="gene_body"` summarizes all transcript
#' and full gene entries into one summary transcript abundance.
#' * `import_types="gene_tx"` summarizes proper transcript to gene level,
#' and separately represents `"gene_body"` entries for comparison.
#'
#' The current recommendation is to use default values for `import_types`
#' which imports all the following types of data:
#' 1. `TxSE` as transcripts per row
#' 2. `GeneSE` as genes per row, excluding unspliced `gene_body` transcripts.
#' 3. `GeneBodySE` as genes per row, summarizing unspliced
#' and spliced `"gene_body"` transcripts together for each gene.
#' 4. `GeneTxSE` as spliced and unspliced genes per row, so that
#' unspliced, spliced, or both can be analyzed together.
#'
#' We typically use `GeneBodySE` and define a subset of `rownames(GeneBodySE)`
#' to use only spliced transcripts during analysis. Optionally we may
#' run `limma::diffSplice()` to compare the unspliced:spliced ratio
#' across experiment groups.
#'
#' @return `list` with `SummarizedExperiment` objects, each of which
#' contain assay names `c("counts", "abundance", "length)`, where
#' `c("counts", "abundance")` are transformed with `log2(1 + x)`.
#' The transform can be reversed with `10^x - 1`.
#' The `SummarizedExperiment` objects by name:
#' * `"TxSE"`: transcript-level values imported from `quant.sf`.
#' * `"GeneSE"`: gene-level summary values, excluding
#' `"gene_body"` entries.
#' * `"GeneBodySE"`: gene-level summary values, summarizing unspliced
#' and spliced `"gene_body"` transcripts together for each gene.
#' * `"GeneTxSE"`: gene-level summary values, where transcripts are
#' combined to gene level, and `"gene_body"` entries are represented
#' separately, with suffix `"_gene_body"` added to the gene name.
#'
#' @family jam import functions
#'
#' @param salmonOut_paths `character` vectors to each individual folder
#' that contains the `"quant.sf"` output file for Salmon.
#' @param import_types `character` indicating which type or types of
#' data to return. Note that the distinction between `gene` and
#' `gene_body` is only relevant when there are transcript entries
#' defined with `transcript_type="gene_body"`. These entries specifically
#' represent unspliced transcribed regions for a gene locus, and
#' only for multi-exon genes.
#' * `tx`: transcript quantitation, direct import of `quant.sf` files.
#' * `gene`: gene quantitation after calling `tximport::summarizeToGene()`,
#' excluding `transcript_type="gene_body"`.
#' * `gene_body`: gene quantitation after calling `tximport::summarizeToGene()`,
#' including `transcript_type="gene_body"`.
#' @param tx2gene `character` path to file, or `data.frame` with at
#' least two columns matching `tx_colname` and `gene_colname` below.
#' When supplied, the `gtf` argument is ignored, unless the file
#' path is not accessible, or the data is not `data.frame`.
#' @param gtf `character` path to a GTF file, used only when `tx2gene`
#' is not supplied. When used, `splicejam::makeTx2geneFromGtf()` is
#' called to create a `data.frame` object `tx2gene`.
#' @param curation_txt `data.frame` whose first column should match the
#' sample column headers found in the PD abundance columns, and
#' subsequent columns contain associated sample annotations.
#' If `curation_txt` is not supplied, then values will be split into
#' columns by `_` underscore or `" "` whitespace characters.
#' @param tx_colname,gene_colname `character` strings indicating colnames
#' in `tx2gene` that should be used.
#' * `tx_colname` represents unique identifier for each transcript,
#' usually `"transcript_id"`.
#' * `gene_colname` represents a gene label associated with gene
#' summarized expression values, typically `"gene_name"`.
#' @param geneFeatureType,txFeatureType `character` arguments passed to
#' `splicejam::makeTx2geneFromGtf()` only when supplying argument
#' `gtf` with a path to a GTF file.
#' @param countsFromAbundance `character` string passed to
#' `tximport::summarizeToGene()` to define the method for calculating
#' abundance.
#' @param gene_body_ids `character` optional vector with specific row
#' identifiers that should be considered `transcript_type="gene_body"`
#' entries, relevant to argument `import_types` above. When `gene_body_ids`
#' is defined, these entries are used directly without using `tx2gene`.
#' When `gene_body_ids` is not defined, `tx2gene$transcript_type` is used
#' if present. If that column is not present, or does not contain any
#' entries with `"gene_body"`, then all transcripts are used for
#' `import_types="gene"`, and `import_types="gene_body"` is not valid
#' and therefore is not returned.
#' @param trim_tx_from,trim_tx_to `character` vector with one or more
#' regular expression patterns used to curate the values in `tx_colname`
#' prior to assigning them as `rownames()`, and back to `tx_colname`.
#' These values are joined to `tx2gene[[tx_colname]]` to assign
#' additional gene annotations.
#' * The default as of version 0.0.79.900 is to leave strand information
#' `"(-)"` and `"(+)"` without removing it, since in some rare cases a
#' gene's unspliced transcripts can be present on two strands.
#' * The previous default (version <= 0.0.78.900) was to remove
#' `"(-)"` and `"(+)"` from the transcript_id `tx_colname` column.
#' @param verbose `logical` indicating whether to print verbose output.
#' @param ... additional arguments are passed to supporting functions.
#'
#' @export
import_salmon_quant <- function
(salmonOut_paths,
import_types=c("tx",
"gene",
"gene_body",
"gene_tx"),
gtf=NULL,
tx2gene=NULL,
curation_txt=NULL,
tx_colname="transcript_id",
gene_colname="gene_name",
gene_body_colname="transcript_type",
geneFeatureType="exon",
txFeatureType="exon",
countsFromAbundance="lengthScaledTPM",
gene_body_ids=NULL,
trim_tx_from=NULL,#c("[(][-+][)]"),
trim_tx_to=NULL,
verbose=FALSE,
...)
{
#
import_types <- match.arg(import_types,
several.ok=TRUE);
if (!jamba::check_pkg_installed("SummarizedExperiment")) {
stop("The Bioconductor package SummarizedExperiment is required.")
}
# tx2gene
if (length(tx2gene) > 0) {
if (length(tx2gene) == 1 && file.exists(tx2gene)) {
tx2gene <- data.table::fread(tx2gene,
data.table=FALSE);
} else if (!"data.frame" %in% class(tx2gene)) {
warning(paste("tx2gene should be a file path, or data.frame.",
"Ignoring tx2gene."));
tx2gene <- NULL;
}
}
# gtf
if (length(tx2gene) == 0) {
if (length(gtf) == 1 && file.exists(gtf)) {
# create tx2gene data.frame
gtf_tx2gene <- gsub("[.]gtf(|[.].*)$",
".tx2gene.txt",
ignore.case=TRUE,
gtf);
if (file.exists(gtf_tx2gene)) {
tx2gene <- data.table::fread(gtf_tx2gene,
data.table=FALSE);
rownames(tx2gene) <- tx2gene$transcript_id;
} else {
tx2gene <- splicejam::makeTx2geneFromGtf(gtf,
geneFeatureType=geneFeatureType,
txFeatureType=txFeatureType,
...);
rownames(tx2gene) <- tx2gene[[tx_colname]];
# save for re-use
tryCatch({
data.table::fwrite(x=tx2gene,
file=gtf_tx2gene,
sep="\t",
row.names=FALSE);
}, error=function(e){
warning(paste("Could not write tx2gene to file for re-use: ",
gtf_tx2gene));
});
}
} else {
stop(paste("Input GTF must be a single accessible filename."));
}
}
# tximport
if (length(names(salmonOut_paths)) == 0) {
names(salmonOut_paths) <- jamba::makeNames(
gsub("_salmonOut$",
"",
basename(salmonOut_paths)));
}
salmon_files <- file.path(salmonOut_paths, "quant.sf")
names(salmon_files) <- names(salmonOut_paths);
if (any(!file.exists(salmon_files))) {
warning(paste("Note some files were not accessible: ",
jamba::cPaste(salmon_files[!file.exists(salmon_files)], sep=", ")));
salmon_files <- salmon_files[file.exists(salmon_files)];
}
if (length(salmon_files) == 0) {
stop("No salmon quant.sf files were accessible.");
}
# Import transcript quant.sf
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"importing tx data: ",
"TxSE");
if (verbose > 1) {
print(data.frame(salmon_files));
}
}
txiTx <- tximport::tximport(salmon_files,
type="salmon",
importer=data.table::fread,
countsFromAbundance=countsFromAbundance,
txOut=TRUE);
isamples <- colnames(txiTx[[1]]);
# optionally curate transcript rownames
if (length(trim_tx_from) > 0) {
if (length(trim_tx_to) == 0) {
trim_tx_to <- "";
}
trim_tx_to <- rep(trim_tx_to,
length.out=length(trim_tx_from));
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"trimming rownames using: ",
c("trim_tx_from", "trim_tx_to"));
}
for (itype in c("counts", "abundance", "length")) {
for (iseq in seq_along(trim_tx_from)) {
rownames(txiTx[[itype]]) <- gsub(trim_tx_from[[iseq]],
trim_tx_to[[iseq]],
rownames(txiTx[[itype]]))
}
}
}
rownames(tx2gene) <- tx2gene[[tx_colname]];
# import sample annotations
if (length(curation_txt) > 0) {
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"applying sample annotations via curation_txt.");
}
# sample_df
sample_df <- curate_to_df_by_pattern(
isamples,
df=curation_txt,
...);
isamples1 <- rownames(sample_df);
if (!all(isamples1 %in% isamples)) {
filename_colname <- head(jamba::vigrep("^filename$",
colnames(sample_df)),
1);
isamples_match <- match(sample_df[[filename_colname]],
isamples);
isamples_from <- isamples[isamples_match];
isamples_to <- isamples1;
if (verbose > 1) {
jamba::printDebug("import_salmon_quant(): ",
"renaming sample colnames to match sample_df:");
print(data.frame(isamples_from,
isamples_to));
}
for (i in names(txiTx)) {
if ("matrix" %in% class(txiTx[[i]])) {
txiTx[[i]] <- jamba::renameColumn(txiTx[[i]],
from=isamples_from,
to=isamples_to);
}
}
}
isamples <- isamples1;
if (verbose) {
if (verbose > 1) {
jamba::printDebug("import_salmon_quant(): ",
"sample_df:");
print(sample_df);
}
}
} else {
# create default
sample_df <- data.frame(check.names=FALSE,
stringsAsFactors=FALSE,
sample_id=isamples);
rownames(sample_df) <- isamples;
}
ret_list <- list();
if ("tx" %in% import_types) {
# rowData for transcripts
# table(rownames(txiTx[[1]]) %in% tx2gene$transcript_id)
rowData_tx <- tx2gene[match(rownames(txiTx[[1]]), tx2gene[[tx_colname]]),,drop=FALSE];
# TODO: debug workarounds when this step fails,
# usually occurs when rownames do not match tx2gene, which is
# often the version numbers ENST000012.1_1
rownames(rowData_tx) <- rowData_tx[[tx_colname]];
# Create tx SummarizedExperiment
TxSE <- SummarizedExperiment(
assays=list(
counts=log2(1+txiTx$counts)[,isamples, drop=FALSE],
abundance=log2(1+txiTx$abundance)[,isamples, drop=FALSE],
length=txiTx$length[,isamples, drop=FALSE]
),
rowData=rowData_tx,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiTx$countsFromAbundance)
);
ret_list$TxSE <- TxSE;
}
# optionally define gene_body_ids
if (any(c("gene", "gene_body", "gene_tx") %in% import_types)) {
if (length(gene_body_ids) == 0) {
gene_body_colname <- head(intersect(gene_body_colname,
colnames(tx2gene)), 1)
if (length(gene_body_colname) > 0) {
gene_body_ids <- subset(tx2gene, tx2gene[[gene_body_colname]] %in% "gene_body")[[tx_colname]];
}
}
gene_body_ids <- intersect(gene_body_ids,
rownames(txiTx$counts));
} else {
gene_body_ids <- NULL
}
# remove gene_body entries from transcript rows used here
tx_ids <- setdiff(rownames(txiTx$counts), gene_body_ids)
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"length(tx_ids): ",
jamba::formatInt(length(tx_ids)))
jamba::printDebug("import_salmon_quant(): ",
"length(gene_body_ids): ",
jamba::formatInt(length(gene_body_ids)))
}
if ("gene" %in% import_types) {
if (length(tx_ids) == 0) {
jamba::printDebug("import_salmon_quant(): ",
"no tx_ids exist for import_types='gene'")
} else {
# summarize transcript to gene level
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"summarizing tx data to gene level: ",
"GeneSE");
}
txiTx_use <- lapply(txiTx[c("abundance", "counts", "length")], function(i){
i[match(tx_ids, rownames(i)), , drop=FALSE]
});
tx2gene_use <- tx2gene[match(tx_ids, tx2gene[[tx_colname]]),
c(tx_colname, gene_colname), drop=FALSE]
txiGene <- tximport::summarizeToGene(
txiTx_use,
countsFromAbundance=countsFromAbundance,
tx2gene=tx2gene_use);
igenes <- rownames(txiGene[[1]]);
# create gene SummarizedExperiment
# rowData for genes
gene_colnames <- setdiff(c(gene_colname,
jamba::unvigrep("transcript|tx", colnames(tx2gene))),
tx_colname);
rowData_gene <- tx2gene[match(igenes, tx2gene[[gene_colname]]),
gene_colnames, drop=FALSE];
rownames(rowData_gene) <- igenes;
GeneSE <- SummarizedExperiment::SummarizedExperiment(
assays=list(
counts=log2(1+txiGene$counts)[, isamples, drop=FALSE],
abundance=log2(1+txiGene$abundance)[, isamples, drop=FALSE],
length=txiGene$length[, isamples, drop=FALSE]
),
rowData=rowData_gene,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiGene$countsFromAbundance)
);
ret_list$GeneSE <- GeneSE;
}
}
if ("gene_body" %in% import_types) {
if (length(gene_body_ids) == 0) {
jamba::printDebug("import_salmon_quant(): ",
"no gene_body_ids exist for import_types='gene_body'")
} else {
# summarize transcript to gene level
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"summarizing tx and gene_body data to gene level: ",
"GeneBodySE");
}
txiGeneBody <- tximport::summarizeToGene(
txiTx,
countsFromAbundance=countsFromAbundance,
tx2gene=tx2gene[,c(tx_colname, gene_colname), drop=FALSE]);
igenes <- rownames(txiGeneBody[[1]]);
# create gene SummarizedExperiment
# rowData for genes
gene_colnames <- setdiff(c(gene_colname,
jamba::unvigrep("transcript|tx", colnames(tx2gene))),
tx_colname);
rowData_gene <- tx2gene[match(igenes, tx2gene[[gene_colname]]),
gene_colnames, drop=FALSE];
rownames(rowData_gene) <- igenes;
GeneBodySE <- SummarizedExperiment::SummarizedExperiment(
assays=list(
counts=log2(1+txiGeneBody$counts)[, isamples, drop=FALSE],
abundance=log2(1+txiGeneBody$abundance)[, isamples, drop=FALSE],
length=txiGeneBody$length[, isamples, drop=FALSE]
),
rowData=rowData_gene,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiGeneBody$countsFromAbundance)
);
ret_list$GeneBodySE <- GeneBodySE;
}
}
if ("gene_tx" %in% import_types) {
if (length(gene_body_ids) == 0) {
jamba::printDebug("import_salmon_quant(): ",
"no gene_body_ids exist for import_types='gene_tx'")
} else {
# summarize transcript to gene level
if (verbose) {
jamba::printDebug("import_salmon_quant(): ",
"summarizing tx data to gene level, with gene_body data separately: ",
"GeneTxSE");
}
# prepare custom tx2gene
tx2gene_tx <- tx2gene;
tx2gene_tx[match(gene_body_ids, rownames(tx2gene_tx)), gene_colname] <- paste0(
tx2gene_tx[match(gene_body_ids, rownames(tx2gene_tx)), gene_colname],
"_gene_body")
txiGeneTx <- tximport::summarizeToGene(
txiTx,
countsFromAbundance=countsFromAbundance,
tx2gene=tx2gene_tx[,c(tx_colname, gene_colname), drop=FALSE]);
igenes <- rownames(txiGeneTx[[1]]);
# create gene SummarizedExperiment
# rowData for genes
gene_colnames <- setdiff(c(gene_colname,
jamba::unvigrep("transcript|tx", colnames(tx2gene_tx))),
tx_colname);
rowData_gene <- tx2gene_tx[match(igenes, tx2gene_tx[[gene_colname]]),
gene_colnames, drop=FALSE];
rownames(rowData_gene) <- igenes;
rowData_gene$gene_body <- ifelse(grepl("_gene_body$", igenes),
"gene_body",
"transcript");
rowData_gene$base_gene_name <- gsub("_gene_body$", "", igenes)
rowData_gene$has_gene_body <- igenes %in% c(tx2gene[gene_body_ids, gene_colname],
paste0(tx2gene[gene_body_ids, gene_colname], "_gene_body"));
GeneTxSE <- SummarizedExperiment::SummarizedExperiment(
assays=list(
counts=log2(1+txiGeneTx$counts)[, isamples, drop=FALSE],
abundance=log2(1+txiGeneTx$abundance)[, isamples, drop=FALSE],
length=txiGeneTx$length[, isamples, drop=FALSE]
),
rowData=rowData_gene,
colData=sample_df[isamples, , drop=FALSE],
metadata=list(countsFromAbundance=txiGeneTx$countsFromAbundance)
);
ret_list$GeneTxSE <- GeneTxSE;
}
}
return(ret_list);
}