**Author :** Rutendo F. Sigauke

**Input  :** 
1. Bedtools coverage files for samples in DBNascent (bedtoolscov/${SRR}$_filt.gene.antisense/sense_counts.bed)

**Output :**

2. Files with all sample coverages merged 

        a. bedtools_cov_genes.filt.gene.antisense_counts_fraction_bases_A.bed.gz 
        b. bedtools_cov_genes.filt.gene.antisense_counts_num_features_B.bed.gz  
        c. bedtools_cov_genes.filt.gene.sense_counts_num_bases_A.bed.gz
        d. bedtools_cov_genes.filt.gene.antisense_counts_length_A.bed.gz 
        e. bedtools_cov_genes.filt.gene.sense_counts_fraction_bases_A.bed.gz    
        f. bedtools_cov_genes.filt.gene.sense_counts_num_features_B.bed.gz
        g. bedtools_cov_genes.filt.gene.antisense_counts_num_bases_A.bed.gz     
        h. bedtools_cov_genes.filt.gene.sense_counts_length_A.bed.gz

# Summary

Counts from nascent RNA sequencing datasets in `DBNascent` are merged to a single text file.

The count data is in `/Shares/dbnascent/{paperYearID}/bedtools_cov_genes` 

Here only `filt.gene.antisense_counts.bed` and `filt.gene.sense_counts.bed` are merged into 4 separate files (see below) with all human counts.

https://bedtools.readthedocs.io/en/latest/content/tools/coverage.html

After each interval in A, bedtools coverage will report:

1. The number of features in B that overlapped (by at least one base pair) the A interval. (file extension `num_features_B`)
2. The number of bases in A that had non-zero coverage from features in B. (file extension `num_bases_A`)
3. The length of the entry in A. (file extension `length_A`)
4. The fraction of bases in A that had non-zero coverage from features in B. (file extension `fraction_bases_A`)


# Import libraries

In [1]:
library(dplyr) ##data.frame wrangling
library(tidyr) ##data.frame wrangling
library(data.table) ##load and save large data tables


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




# Annotations

## Refseq genes

In [2]:
##example bed 6 file from one of the bed6 files
transcript_beds <- data.table::fread('/Shares/dbnascent/Andrysik2017identification/bedtools_cov_genes/SRR4090098.filt.gene.antisense_counts.bed', 
                  drop = c('V7','V8','V9','V10'))
colnames(transcript_beds) <- c('chrom','start','end','gene_transcript','score','strand')
dim(transcript_beds)
head(transcript_beds, 3)

chrom,start,end,gene_transcript,score,strand
<chr>,<int>,<int>,<chr>,<chr>,<chr>
chr1,11873,14409,DDX11L1:NR_046018.2,.,+
chr1,14361,29370,WASH7P:NR_024540.1,.,-
chr1,17368,17436,MIR6859-1:NR_106918.1,.,-


# Functions

## Load all the files to one

In [3]:
merge_files <- function(file_directory, file_pattern, drop_list, ntranscripts, drop=FALSE){
    
    ##assign path for directory to variable
    file_dir <- file_directory
    
    ##list all files in that directory based on a pattern
    ##recursively check folder for file with extension
    file_paths <- list.files(path=file_dir,
                             pattern=file_pattern,
                             recursive = TRUE, 
                             full.names=TRUE)
    
    file_names <- base::basename(file_paths)
    
    print(paste0("Input files : ", as.character(length(file_paths))))
    
    ##load all files excluding the metadata columns (if you wish)
    if (drop == TRUE) {
        file_DT_list <- lapply(file_paths,
                               data.table::fread,
                               drop = drop_list)
        } else {
        file_DT_list <- lapply(file_paths,
                               data.table::fread)
        
    }
    
    ##name each list item by file
    names(file_DT_list) <- file_names

    ##make sure the counts are from human annotations with (human == 42224 transcripts)
    file_DT_list_checked <- file_DT_list[sapply(file_DT_list,
                                          function(x) nrow(x) == ntranscripts)]                                             
                                                
    ##merge all counts for human samples
    file_DT_features_one <- dplyr::bind_cols(file_DT_list_checked)
    colnames(file_DT_features_one) <- names(file_DT_list_checked)   
                                                
    ##rename columns by SRR/SRZ id
    file_DT_newnames <- setnames(file_DT_features_one, names(file_DT_features_one),
                               as.character(lapply(strsplit(names(file_DT_features_one), '\\.'), `[`, 1)))
                                                #setnames(file_DT_features_one, names(file_DT_features_one),
                        #           as.character(lapply(strsplit(file_names, '\\.'), `[`, 1)))
                              
    print(paste0("Loaded files matching # transcripts : ",
                 as.character(ncol(file_DT_newnames))))
                                                
    same_file <- ncol(file_DT_newnames)==length(file_paths)
                                                
    print(paste("Did the same number of files get loaded? : ",
                same_file))

    if (same_file == FALSE){
        print("The difference is due to human vs. mouse samples in each folder")
    }
    
    return(file_DT_newnames)
}

## Combine with RefSeq annotations

In [4]:
combine_anno_bedtoolscov <- function(file_directory, out_directory, file_pattern, ntranscripts, transcript_beds_DT, sense=TRUE){
    
    cols <- c('V7','V8','V9','V10')
    file_ids <- c('num_features_B', 'num_bases_A','length_A','fraction_bases_A')
    wd <- out_directory

    for (i in 1:4) {

        ##default column names for bedtools coverage bed files
        ##the first 6 columns are the strandard chrom, start...
        ##the following 4 are output from bedtools cov (see file_ids and Summary section of notebook)
        col_ids <-  c('V1','V2','V3','V4','V5', 
                    'V6','V7', 'V8','V9','V10')

        ##in each pass, we are getting each of the 4 columns from bedtools coverage
        ##and merging them across all samples
        col_ids_filt <- col_ids[col_ids != cols[i]]
        bedtoolscov <- merge_files(file_directory=file_directory, 
                                   file_pattern=file_pattern,
                                   drop_list=col_ids_filt, 
                                   ntranscripts=ntranscripts, 
                                   drop=TRUE)

        ##merge the bed6 file with annotations with the individual load columns
        bedtoolscov_bed <- cbind(transcript_beds_DT, bedtoolscov)

        ##saving the merged file
        if (sense==TRUE){
            data.table::fwrite(bedtoolscov_bed, sep='\t',
                          file = paste0(wd,'bedtools_cov_genes.filt.gene.sense_counts_',file_ids[i],'.bed.gz'))
        } else {

            data.table::fwrite(bedtoolscov_bed, sep='\t',
                          file = paste0(wd,'bedtools_cov_genes.filt.gene.antisense_counts_',file_ids[i],'.bed.gz'))
            
        }
    
    }
    
}

# Running the merge functions

In [5]:
shares <- '/Shares/dbnascent'
#shares <- '/Shares/dbnascent/Andrysik2017identification'
wd <- '/Users/rusi2317/projects/meta_analysis_qc/hg38/processed_data/counts/bedtoolscov/'
antisense <- 'filt.gene.antisense_counts.bed$'
sense <- 'filt.gene.sense_counts.bed$'

##antisense files
combine_anno_bedtoolscov(file_directory=shares, 
                         out_directory=wd, 
                         file_pattern=antisense, 
                         ntranscripts=nrow(transcript_beds),  
                         transcript_beds_DT=transcript_beds,
                        sense=FALSE)

##sense files
combine_anno_bedtoolscov(file_directory=shares,
                         out_directory=wd, 
                         file_pattern=sense, 
                         ntranscripts=nrow(transcript_beds),  
                         transcript_beds_DT=transcript_beds)

[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V7` -> `V7...1`
[36m•[39m `V7` -> `V7...2`
[36m•[39m `V7` -> `V7...3`
[36m•[39m `V7` -> `V7...4`
[36m•[39m `V7` -> `V7...5`
[36m•[39m `V7` -> `V7...6`
[36m•[39m `V7` -> `V7...7`
[36m•[39m `V7` -> `V7...8`
[36m•[39m `V7` -> `V7...9`
[36m•[39m `V7` -> `V7...10`
[36m•[39m `V7` -> `V7...11`
[36m•[39m `V7` -> `V7...12`
[36m•[39m `V7` -> `V7...13`
[36m•[39m `V7` -> `V7...14`
[36m•[39m `V7` -> `V7...15`
[36m•[39m `V7` -> `V7...16`
[36m•[39m `V7` -> `V7...17`
[36m•[39m `V7` -> `V7...18`
[36m•[39m `V7` -> `V7...19`
[36m•[39m `V7` -> `V7...20`
[36m•[39m `V7` -> `V7...21`
[36m•[39m `V7` -> `V7...22`
[36m•[39m `V7` -> `V7...23`
[36m•[39m `V7` -> `V7...24`
[36m•[39m `V7` -> `V7...25`
[36m•[39m `V7` -> `V7...26`
[36m•[39m `V7` -> `V7...27`
[36m•[39m `V7` -> `V7...28`
[36m•[39m `V7` -> `V7...29`
[36m•[39m `V7` -> `V7...30`
[36m•[39m `V7` -> `V7...31`
[36m•[39m `V7` -> `V7...32`
[36m•[39m `V7` -> `V7...33`

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"
[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V8` -> `V8...1`
[36m•[39m `V8` -> `V8...2`
[36m•[39m `V8` -> `V8...3`
[36m•[39m `V8` -> `V8...4`
[36m•[39m `V8` -> `V8...5`
[36m•[39m `V8` -> `V8...6`
[36m•[39m `V8` -> `V8...7`
[36m•[39m `V8` -> `V8...8`
[36m•[39m `V8` -> `V8...9`
[36m•[39m `V8` -> `V8...10`
[36m•[39m `V8` -> `V8...11`
[36m•[39m `V8` -> `V8...12`
[36m•[39m `V8` -> `V8...13`
[36m•[39m `V8` -> `V8...14`
[36m•[39m `V8` -> `V8...15`
[36m•[39m `V8` -> `V8...16`
[36m•[39m `V8` -> `V8...17`
[36m•[39m `V8` -> `V8...18`
[36m•[39m `V8` -> `V8...19`
[36m•[39m `V8` -> `V8...20`
[36m•[39m `V8` -> `V8...21`
[36m•[39m `V8` -> `V8...22`
[36m•[39m `V8` -> `V8...23`
[36m•[39m `V8` -> `V8...24`
[36m•[39m `V8` -> `V8...25`
[36m•[39m `V8` -> `V8...26`
[36m•[39m `V8` -> `V8...27`
[36m•[39m `V8` -> `V8...28`
[36m•[39m `V8` -> `V8...29`
[36m•[39m `V8` -> `V8...30`
[36m•[39m `V8` -> `V8...31`
[36m•[39m `V8` -> `V8...32`
[36m•[39m `V8` -> `V8...33`

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"
[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V9` -> `V9...1`
[36m•[39m `V9` -> `V9...2`
[36m•[39m `V9` -> `V9...3`
[36m•[39m `V9` -> `V9...4`
[36m•[39m `V9` -> `V9...5`
[36m•[39m `V9` -> `V9...6`
[36m•[39m `V9` -> `V9...7`
[36m•[39m `V9` -> `V9...8`
[36m•[39m `V9` -> `V9...9`
[36m•[39m `V9` -> `V9...10`
[36m•[39m `V9` -> `V9...11`
[36m•[39m `V9` -> `V9...12`
[36m•[39m `V9` -> `V9...13`
[36m•[39m `V9` -> `V9...14`
[36m•[39m `V9` -> `V9...15`
[36m•[39m `V9` -> `V9...16`
[36m•[39m `V9` -> `V9...17`
[36m•[39m `V9` -> `V9...18`
[36m•[39m `V9` -> `V9...19`
[36m•[39m `V9` -> `V9...20`
[36m•[39m `V9` -> `V9...21`
[36m•[39m `V9` -> `V9...22`
[36m•[39m `V9` -> `V9...23`
[36m•[39m `V9` -> `V9...24`
[36m•[39m `V9` -> `V9...25`
[36m•[39m `V9` -> `V9...26`
[36m•[39m `V9` -> `V9...27`
[36m•[39m `V9` -> `V9...28`
[36m•[39m `V9` -> `V9...29`
[36m•[39m `V9` -> `V9...30`
[36m•[39m `V9` -> `V9...31`
[36m•[39m `V9` -> `V9...32`
[36m•[39m `V9` -> `V9...33`

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"
[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V10` -> `V10...1`
[36m•[39m `V10` -> `V10...2`
[36m•[39m `V10` -> `V10...3`
[36m•[39m `V10` -> `V10...4`
[36m•[39m `V10` -> `V10...5`
[36m•[39m `V10` -> `V10...6`
[36m•[39m `V10` -> `V10...7`
[36m•[39m `V10` -> `V10...8`
[36m•[39m `V10` -> `V10...9`
[36m•[39m `V10` -> `V10...10`
[36m•[39m `V10` -> `V10...11`
[36m•[39m `V10` -> `V10...12`
[36m•[39m `V10` -> `V10...13`
[36m•[39m `V10` -> `V10...14`
[36m•[39m `V10` -> `V10...15`
[36m•[39m `V10` -> `V10...16`
[36m•[39m `V10` -> `V10...17`
[36m•[39m `V10` -> `V10...18`
[36m•[39m `V10` -> `V10...19`
[36m•[39m `V10` -> `V10...20`
[36m•[39m `V10` -> `V10...21`
[36m•[39m `V10` -> `V10...22`
[36m•[39m `V10` -> `V10...23`
[36m•[39m `V10` -> `V10...24`
[36m•[39m `V10` -> `V10...25`
[36m•[39m `V10` -> `V10...26`
[36m•[39m `V10` -> `V10...27`
[36m•[39m `V10` -> `V10...28`
[36m•[39m `V10` -> `V10...29`
[36m•[39m `V10` -> `V10...30`
[36m•[39m `V10` -> `V10...3

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"
[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V7` -> `V7...1`
[36m•[39m `V7` -> `V7...2`
[36m•[39m `V7` -> `V7...3`
[36m•[39m `V7` -> `V7...4`
[36m•[39m `V7` -> `V7...5`
[36m•[39m `V7` -> `V7...6`
[36m•[39m `V7` -> `V7...7`
[36m•[39m `V7` -> `V7...8`
[36m•[39m `V7` -> `V7...9`
[36m•[39m `V7` -> `V7...10`
[36m•[39m `V7` -> `V7...11`
[36m•[39m `V7` -> `V7...12`
[36m•[39m `V7` -> `V7...13`
[36m•[39m `V7` -> `V7...14`
[36m•[39m `V7` -> `V7...15`
[36m•[39m `V7` -> `V7...16`
[36m•[39m `V7` -> `V7...17`
[36m•[39m `V7` -> `V7...18`
[36m•[39m `V7` -> `V7...19`
[36m•[39m `V7` -> `V7...20`
[36m•[39m `V7` -> `V7...21`
[36m•[39m `V7` -> `V7...22`
[36m•[39m `V7` -> `V7...23`
[36m•[39m `V7` -> `V7...24`
[36m•[39m `V7` -> `V7...25`
[36m•[39m `V7` -> `V7...26`
[36m•[39m `V7` -> `V7...27`
[36m•[39m `V7` -> `V7...28`
[36m•[39m `V7` -> `V7...29`
[36m•[39m `V7` -> `V7...30`
[36m•[39m `V7` -> `V7...31`
[36m•[39m `V7` -> `V7...32`
[36m•[39m `V7` -> `V7...33`

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"
[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V8` -> `V8...1`
[36m•[39m `V8` -> `V8...2`
[36m•[39m `V8` -> `V8...3`
[36m•[39m `V8` -> `V8...4`
[36m•[39m `V8` -> `V8...5`
[36m•[39m `V8` -> `V8...6`
[36m•[39m `V8` -> `V8...7`
[36m•[39m `V8` -> `V8...8`
[36m•[39m `V8` -> `V8...9`
[36m•[39m `V8` -> `V8...10`
[36m•[39m `V8` -> `V8...11`
[36m•[39m `V8` -> `V8...12`
[36m•[39m `V8` -> `V8...13`
[36m•[39m `V8` -> `V8...14`
[36m•[39m `V8` -> `V8...15`
[36m•[39m `V8` -> `V8...16`
[36m•[39m `V8` -> `V8...17`
[36m•[39m `V8` -> `V8...18`
[36m•[39m `V8` -> `V8...19`
[36m•[39m `V8` -> `V8...20`
[36m•[39m `V8` -> `V8...21`
[36m•[39m `V8` -> `V8...22`
[36m•[39m `V8` -> `V8...23`
[36m•[39m `V8` -> `V8...24`
[36m•[39m `V8` -> `V8...25`
[36m•[39m `V8` -> `V8...26`
[36m•[39m `V8` -> `V8...27`
[36m•[39m `V8` -> `V8...28`
[36m•[39m `V8` -> `V8...29`
[36m•[39m `V8` -> `V8...30`
[36m•[39m `V8` -> `V8...31`
[36m•[39m `V8` -> `V8...32`
[36m•[39m `V8` -> `V8...33`

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"
[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V9` -> `V9...1`
[36m•[39m `V9` -> `V9...2`
[36m•[39m `V9` -> `V9...3`
[36m•[39m `V9` -> `V9...4`
[36m•[39m `V9` -> `V9...5`
[36m•[39m `V9` -> `V9...6`
[36m•[39m `V9` -> `V9...7`
[36m•[39m `V9` -> `V9...8`
[36m•[39m `V9` -> `V9...9`
[36m•[39m `V9` -> `V9...10`
[36m•[39m `V9` -> `V9...11`
[36m•[39m `V9` -> `V9...12`
[36m•[39m `V9` -> `V9...13`
[36m•[39m `V9` -> `V9...14`
[36m•[39m `V9` -> `V9...15`
[36m•[39m `V9` -> `V9...16`
[36m•[39m `V9` -> `V9...17`
[36m•[39m `V9` -> `V9...18`
[36m•[39m `V9` -> `V9...19`
[36m•[39m `V9` -> `V9...20`
[36m•[39m `V9` -> `V9...21`
[36m•[39m `V9` -> `V9...22`
[36m•[39m `V9` -> `V9...23`
[36m•[39m `V9` -> `V9...24`
[36m•[39m `V9` -> `V9...25`
[36m•[39m `V9` -> `V9...26`
[36m•[39m `V9` -> `V9...27`
[36m•[39m `V9` -> `V9...28`
[36m•[39m `V9` -> `V9...29`
[36m•[39m `V9` -> `V9...30`
[36m•[39m `V9` -> `V9...31`
[36m•[39m `V9` -> `V9...32`
[36m•[39m `V9` -> `V9...33`

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"
[1] "Input files : 2395"


[1m[22mNew names:
[36m•[39m `V10` -> `V10...1`
[36m•[39m `V10` -> `V10...2`
[36m•[39m `V10` -> `V10...3`
[36m•[39m `V10` -> `V10...4`
[36m•[39m `V10` -> `V10...5`
[36m•[39m `V10` -> `V10...6`
[36m•[39m `V10` -> `V10...7`
[36m•[39m `V10` -> `V10...8`
[36m•[39m `V10` -> `V10...9`
[36m•[39m `V10` -> `V10...10`
[36m•[39m `V10` -> `V10...11`
[36m•[39m `V10` -> `V10...12`
[36m•[39m `V10` -> `V10...13`
[36m•[39m `V10` -> `V10...14`
[36m•[39m `V10` -> `V10...15`
[36m•[39m `V10` -> `V10...16`
[36m•[39m `V10` -> `V10...17`
[36m•[39m `V10` -> `V10...18`
[36m•[39m `V10` -> `V10...19`
[36m•[39m `V10` -> `V10...20`
[36m•[39m `V10` -> `V10...21`
[36m•[39m `V10` -> `V10...22`
[36m•[39m `V10` -> `V10...23`
[36m•[39m `V10` -> `V10...24`
[36m•[39m `V10` -> `V10...25`
[36m•[39m `V10` -> `V10...26`
[36m•[39m `V10` -> `V10...27`
[36m•[39m `V10` -> `V10...28`
[36m•[39m `V10` -> `V10...29`
[36m•[39m `V10` -> `V10...30`
[36m•[39m `V10` -> `V10...3

[1] "Loaded files matching # transcripts : 1645"
[1] "Did the same number of files get loaded? :  FALSE"
[1] "The difference is due to human vs. mouse samples in each folder"


# Session Infromation

In [6]:
sessionInfo()

R version 3.6.0 (2019-04-26)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] data.table_1.14.2 tidyr_1.2.1       dplyr_1.0.10     

loaded via a namespace (and not attached):
 [1] magrittr_2.0.3   tidyselect_1.1.2 uuid_1.1-0       R6_2.5.1        
 [5] rlang_1.0.6      fastmap_1.1.0    fansi_1.0.3      tools_3.6.0     
 [9] utf8_1.2.2       DBI_1.1.3        cli_3.4.1        htmltools_0.5.2 
[13] asserttha