# Load libraries

In [1]:
library(dplyr) 
library(tidyr)
library(data.table)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last




# Load data



## Gene annotations

### Full length transcripts

In [2]:
hg38_refseq_gtf <- data.table::fread('/scratch/Shares/dowell/dbnascent/pipeline_assets/Bidirectional-Flow/assets/hg38_refseq_diff53prime.gtf',
                              sep='\t')
head(hg38_refseq_gtf)

V1,V2,V3,V4,V5,V6,V7,V8,V9
<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>
chr1,hg38_refseq,gene_length,100038095,100083373,0,+,.,"gene_id ""MFSD14A""; transcript_id ""NM_033055"""
chr1,hg38_refseq,gene_length,100083570,100132930,0,-,.,"gene_id ""SASS6""; transcript_id ""NM_194292"""
chr1,hg38_refseq,gene_length,100133150,100150498,0,+,.,"gene_id ""TRMT13""; transcript_id ""NR_135078"""
chr1,hg38_refseq,gene_length,100148448,100178273,0,-,.,"gene_id ""LRRC39""; transcript_id ""NM_001256385"""
chr1,hg38_refseq,gene_length,100186919,100249834,0,-,.,"gene_id ""DBT""; transcript_id ""NM_001918"""
chr1,hg38_refseq,gene_length,100264742,100266174,0,-,.,"gene_id ""RTCA-AS1""; transcript_id ""NR_110434"""


In [3]:
hg38_refseq <- hg38_refseq_gtf[,c(1,4,5,7,9)]
names(hg38_refseq) <- c('chromosome','start','end','strand','feature_id')
head(hg38_refseq)

chromosome,start,end,strand,feature_id
<chr>,<int>,<int>,<chr>,<chr>
chr1,100038095,100083373,+,"gene_id ""MFSD14A""; transcript_id ""NM_033055"""
chr1,100083570,100132930,-,"gene_id ""SASS6""; transcript_id ""NM_194292"""
chr1,100133150,100150498,+,"gene_id ""TRMT13""; transcript_id ""NR_135078"""
chr1,100148448,100178273,-,"gene_id ""LRRC39""; transcript_id ""NM_001256385"""
chr1,100186919,100249834,-,"gene_id ""DBT""; transcript_id ""NM_001918"""
chr1,100264742,100266174,-,"gene_id ""RTCA-AS1""; transcript_id ""NR_110434"""


### 5' truncated

In [4]:
hg38_refseq_trunc_gtf <- data.table::fread('/scratch/Shares/dowell/dbnascent/pipeline_assets/Bidirectional-Flow/assets/hg38_refseq_diff53prime_5ptrunc.gtf',
                              sep='\t')
head(hg38_refseq_trunc_gtf)

V1,V2,V3,V4,V5,V6,V7,V8,V9
<chr>,<chr>,<chr>,<int>,<int>,<int>,<chr>,<chr>,<chr>
chr1,hg38_refseq,gene_length,100038845,100083373,0,+,.,"gene_id ""MFSD14A""; transcript_id ""NM_033055"""
chr1,hg38_refseq,gene_length,100083570,100132180,0,-,.,"gene_id ""SASS6""; transcript_id ""NM_194292"""
chr1,hg38_refseq,gene_length,100133900,100150498,0,+,.,"gene_id ""TRMT13""; transcript_id ""NR_135078"""
chr1,hg38_refseq,gene_length,100148448,100177523,0,-,.,"gene_id ""LRRC39""; transcript_id ""NM_001256385"""
chr1,hg38_refseq,gene_length,100186919,100249084,0,-,.,"gene_id ""DBT""; transcript_id ""NM_001918"""
chr1,hg38_refseq,gene_length,100264742,100265424,0,-,.,"gene_id ""RTCA-AS1""; transcript_id ""NR_110434"""


In [5]:
hg38_refseq_trunc <- hg38_refseq_trunc_gtf[,c(1,4,5,7,9)]
names(hg38_refseq_trunc) <- c('chromosome','start','end','strand','feature_id')
head(hg38_refseq_trunc)

chromosome,start,end,strand,feature_id
<chr>,<int>,<int>,<chr>,<chr>
chr1,100038845,100083373,+,"gene_id ""MFSD14A""; transcript_id ""NM_033055"""
chr1,100083570,100132180,-,"gene_id ""SASS6""; transcript_id ""NM_194292"""
chr1,100133900,100150498,+,"gene_id ""TRMT13""; transcript_id ""NR_135078"""
chr1,100148448,100177523,-,"gene_id ""LRRC39""; transcript_id ""NM_001256385"""
chr1,100186919,100249084,-,"gene_id ""DBT""; transcript_id ""NM_001918"""
chr1,100264742,100265424,-,"gene_id ""RTCA-AS1""; transcript_id ""NR_110434"""


## Counts

### Get papers that pass QC filter

In [6]:
## get samples that pass the QC filters
qc_data_hg38 <- read.csv('/Users/rusi2317/projects/meta_analysis_qc/hg38/processed_data/qc_filtered_samples/qc_filtered_hg38.csv',
                          fill = TRUE, sep=',', header=TRUE)
head(qc_data_hg38)
dim(qc_data_hg38)

Unnamed: 0_level_0,identifier,srz
Unnamed: 0_level_1,<fct>,<fct>
1,Teppo2016genome,SRZ1950491
2,Teppo2016genome,SRZ1950493
3,Teppo2016genome,SRZ1950495
4,Teppo2016genome,SRZ1950497
5,Teppo2016genome,SRZ1950499
6,Teppo2016genome,SRZ1950501


### Get list of paths

#### Genes

In [7]:
author_lists <- unique(qc_data_hg38$identifier)
shares <- '/Shares/dbnascent/'
featurecounts <- '/featurecounts_genes'

author_path_lists <- paste0(shares, 
                            author_lists,
                            featurecounts)
length(author_path_lists)

In [8]:
extension <- '.sorted.stranded.gene_counts.txt'

counts_paths <- unlist(lapply(author_path_lists, 
                              list.files, 
                              pattern=extension,
                              full.names=TRUE))

length(counts_paths)
class(counts_paths)

In [9]:
trunc_extension <- '.sorted.stranded.5ptrunc_gene_counts.txt'

trunc_counts_paths <- unlist(lapply(author_path_lists, 
                              list.files, 
                              pattern=trunc_extension,
                              full.names=TRUE))

length(trunc_counts_paths)
class(trunc_counts_paths)

#### Bidirectionals 

In [10]:
featurecounts_bidir <- '/featurecounts_bidirs'

author_path_lists_bidir <- paste0(shares, 
                            author_lists,
                            featurecounts_bidir)
length(author_path_lists_bidir)

In [11]:
extension_pos <- '.sorted.pos.bidir_counts.txt'

counts_paths_pos <- unlist(lapply(author_path_lists_bidir, 
                              list.files, 
                              pattern=extension_pos,
                              full.names=TRUE))

length(counts_paths_pos)
class(counts_paths_pos)

In [12]:
extension_neg <- '.sorted.neg.bidir_counts.txt'

counts_paths_neg <- unlist(lapply(author_path_lists_bidir, 
                              list.files, 
                              pattern=extension_neg,
                              full.names=TRUE))

length(counts_paths_neg)
class(counts_paths_neg)

### Load counts

#### Genes

In [13]:
example_counts <- data.table::fread('/Shares/dbnascent/Andrysik2017identification/featurecounts_genes/SRR4090100.sorted.unstranded.gene_counts.txt')
head(example_counts)
dim(example_counts)

GeneID,TranscriptID,Length,SRR4090100.sorted.sorted.bam
<chr>,<chr>,<int>,<int>
MFSD14A,NM_033055,45279,7085
SASS6,NM_194292,49361,4815
TRMT13,NR_135078,17349,2604
LRRC39,NM_001256385,29826,1953
DBT,NM_001918,62916,4963
RTCA-AS1,NR_110434,1433,760


In [14]:
#load all counts excluding the metadata columns
counts_DT <- lapply(counts_paths, 
                    data.table::fread, 
                    drop = c('GeneID', 'TranscriptID', 'Length')) 

#make sure the counts are from human annotations with 41813 transcripts
counts_DT_hg38 <- counts_DT[sapply(counts_DT, function(x) nrow(x) == 41813)]
                         

In [15]:
counts_DT_features_one <- dplyr::bind_cols(counts_DT_hg38)
counts_DT_newnames <- setnames(counts_DT_features_one, names(counts_DT_features_one),
                               as.character(lapply(strsplit(names(counts_DT_features_one), '\\.'), `[`, 1)))
head(counts_DT_newnames)

SRZ1950491,SRZ1950493,SRZ1950495,SRZ1950497,SRZ1950499,SRZ1950501,SRZ1950503,SRZ1950505,SRZ1950507,SRZ1950509,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1187,1277,996,1021,1284,1319,1147,1071,1287,1016,⋯,3760,5652,5888,3746,438,573,1949,1639,453,1476
1088,1425,862,929,1133,1182,935,890,1221,1003,⋯,423,2257,882,564,208,304,1067,897,272,760
496,660,459,582,688,821,461,520,551,511,⋯,225,736,612,532,77,148,459,481,56,345
51,60,42,47,50,63,69,53,69,53,⋯,13,120,30,22,19,31,92,79,26,78
832,917,659,594,911,809,721,707,895,699,⋯,884,2979,1611,1729,254,353,1273,1257,765,1371
85,107,40,39,68,60,68,91,100,78,⋯,400,115,365,230,19,33,50,64,19,40


In [16]:
counts_DT_features <- cbind(hg38_refseq,
                            example_counts[,c(1,2,3)], 
                            counts_DT_features_one)
head(counts_DT_features)
dim(counts_DT_features)

chromosome,start,end,strand,feature_id,GeneID,TranscriptID,Length,SRZ1950491,SRZ1950493,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,100038095,100083373,+,"gene_id ""MFSD14A""; transcript_id ""NM_033055""",MFSD14A,NM_033055,45279,1187,1277,⋯,3760,5652,5888,3746,438,573,1949,1639,453,1476
chr1,100083570,100132930,-,"gene_id ""SASS6""; transcript_id ""NM_194292""",SASS6,NM_194292,49361,1088,1425,⋯,423,2257,882,564,208,304,1067,897,272,760
chr1,100133150,100150498,+,"gene_id ""TRMT13""; transcript_id ""NR_135078""",TRMT13,NR_135078,17349,496,660,⋯,225,736,612,532,77,148,459,481,56,345
chr1,100148448,100178273,-,"gene_id ""LRRC39""; transcript_id ""NM_001256385""",LRRC39,NM_001256385,29826,51,60,⋯,13,120,30,22,19,31,92,79,26,78
chr1,100186919,100249834,-,"gene_id ""DBT""; transcript_id ""NM_001918""",DBT,NM_001918,62916,832,917,⋯,884,2979,1611,1729,254,353,1273,1257,765,1371
chr1,100264742,100266174,-,"gene_id ""RTCA-AS1""; transcript_id ""NR_110434""",RTCA-AS1,NR_110434,1433,85,107,⋯,400,115,365,230,19,33,50,64,19,40


In [17]:
counts_DT_genes <- counts_DT_features[order(-GeneID, Length), 
                                      head(.SD, 1), by = GeneID]
dim(counts_DT_genes)
head(counts_DT_genes)

GeneID,chromosome,start,end,strand,feature_id,TranscriptID,Length,SRZ1950491,SRZ1950493,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
ZZZ3,chr1,77562416,77682658,-,"gene_id ""ZZZ3""; transcript_id ""NM_015534""",NM_015534,120243,3327,4046,⋯,1719,15138,3474,3910,613,862,3035,2741,636,2503
ZZEF1,chr17,4004445,4143030,-,"gene_id ""ZZEF1""; transcript_id ""NM_015113""",NM_015113,138586,1963,2068,⋯,1380,6963,2774,3190,545,715,2303,1906,649,1834
ZYX,chr7,143381345,143391111,+,"gene_id ""ZYX""; transcript_id ""NM_001362783""",NM_001362783,9767,71,42,⋯,857,3315,1185,2507,143,177,363,345,123,287
ZYG11B,chr1,52726453,52827336,+,"gene_id ""ZYG11B""; transcript_id ""NM_024646""",NM_024646,100884,1111,1087,⋯,2300,3676,4196,1716,375,500,1563,1332,449,1318
ZYG11A,chr1,52842760,52894995,+,"gene_id ""ZYG11A""; transcript_id ""NM_001004339""",NM_001004339,52236,6,0,⋯,632,892,1056,591,303,434,1354,1164,457,1239
ZXDC,chr3,126458901,126475919,-,"gene_id ""ZXDC""; transcript_id ""NM_001040653""",NM_001040653,17019,391,375,⋯,1961,1947,3550,2675,101,102,291,265,95,302


#### Genes : 5' truncated 

In [18]:
trunc_example_counts <- data.table::fread('/Shares/dbnascent/Andrysik2017identification/featurecounts_genes/SRR4090100.sorted.stranded.5ptrunc_gene_counts.txt')
head(trunc_example_counts)
dim(trunc_example_counts)

GeneID,TranscriptID,Length,SRR4090100.sorted.sorted.bam
<chr>,<chr>,<int>,<int>
MFSD14A,NM_033055,44529,4359
SASS6,NM_194292,48611,2705
TRMT13,NR_135078,16599,2034
LRRC39,NM_001256385,29076,416
DBT,NM_001918,62166,3843
RTCA-AS1,NR_110434,683,59


In [19]:
#load all counts excluding the metadata columns
trunc_counts_DT <- lapply(trunc_counts_paths, 
                    data.table::fread, 
                    drop = c('GeneID', 'TranscriptID', 'Length')) 

#make sure the counts are from human annotations with 41813 transcripts
trunc_counts_DT_hg38 <- trunc_counts_DT[sapply(trunc_counts_DT, function(x) nrow(x) == 41813)]
                         

In [20]:
trunc_counts_DT_features_one <- dplyr::bind_cols(trunc_counts_DT_hg38)
trunc_counts_DT_newnames <- setnames(trunc_counts_DT_features_one, names(trunc_counts_DT_features_one),
                               as.character(lapply(strsplit(names(trunc_counts_DT_features_one), '\\.'), `[`, 1)))
head(trunc_counts_DT_newnames)

SRZ1950491,SRZ1950493,SRZ1950495,SRZ1950497,SRZ1950499,SRZ1950501,SRZ1950503,SRZ1950505,SRZ1950507,SRZ1950509,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
898,1051,835,890,1060,1156,896,831,1018,853,⋯,822,5121,1487,1336,378,498,1843,1552,391,1374
1011,1361,810,883,1075,1140,874,834,1143,948,⋯,223,2226,430,369,200,297,1034,881,260,734
446,618,436,526,631,768,426,459,508,481,⋯,63,691,140,224,63,133,415,440,52,316
48,53,40,46,47,59,67,46,68,50,⋯,13,118,30,21,18,31,91,77,26,74
706,803,592,536,828,748,636,606,788,609,⋯,263,2822,514,744,205,290,965,908,219,878
3,8,8,5,9,13,8,9,7,8,⋯,22,31,8,20,5,7,26,24,0,24


In [21]:
trunc_counts_DT_features <- cbind(hg38_refseq_trunc,
                            trunc_example_counts[,c(1,2,3)], 
                            trunc_counts_DT_features_one)
head(trunc_counts_DT_features)
dim(trunc_counts_DT_features)

chromosome,start,end,strand,feature_id,GeneID,TranscriptID,Length,SRZ1950491,SRZ1950493,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
chr1,100038845,100083373,+,"gene_id ""MFSD14A""; transcript_id ""NM_033055""",MFSD14A,NM_033055,44529,898,1051,⋯,822,5121,1487,1336,378,498,1843,1552,391,1374
chr1,100083570,100132180,-,"gene_id ""SASS6""; transcript_id ""NM_194292""",SASS6,NM_194292,48611,1011,1361,⋯,223,2226,430,369,200,297,1034,881,260,734
chr1,100133900,100150498,+,"gene_id ""TRMT13""; transcript_id ""NR_135078""",TRMT13,NR_135078,16599,446,618,⋯,63,691,140,224,63,133,415,440,52,316
chr1,100148448,100177523,-,"gene_id ""LRRC39""; transcript_id ""NM_001256385""",LRRC39,NM_001256385,29076,48,53,⋯,13,118,30,21,18,31,91,77,26,74
chr1,100186919,100249084,-,"gene_id ""DBT""; transcript_id ""NM_001918""",DBT,NM_001918,62166,706,803,⋯,263,2822,514,744,205,290,965,908,219,878
chr1,100264742,100265424,-,"gene_id ""RTCA-AS1""; transcript_id ""NR_110434""",RTCA-AS1,NR_110434,683,3,8,⋯,22,31,8,20,5,7,26,24,0,24


In [22]:
trunc_counts_DT_genes <- trunc_counts_DT_features[order(-GeneID, Length), 
                                      head(.SD, 1), by = GeneID]
dim(trunc_counts_DT_genes)
head(trunc_counts_DT_genes)

GeneID,chromosome,start,end,strand,feature_id,TranscriptID,Length,SRZ1950491,SRZ1950493,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
ZZZ3,chr1,77562416,77681908,-,"gene_id ""ZZZ3""; transcript_id ""NM_015534""",NM_015534,119493,3207,3976,⋯,1447,14963,2733,3146,588,838,2977,2700,608,2435
ZZEF1,chr17,4004445,4142280,-,"gene_id ""ZZEF1""; transcript_id ""NM_015113""",NM_015113,137836,1882,2026,⋯,1015,6772,1756,2415,534,709,2286,1900,636,1826
ZYX,chr7,143382095,143391111,+,"gene_id ""ZYX""; transcript_id ""NM_001362783""",NM_001362783,9017,55,33,⋯,329,3052,714,1434,123,156,312,306,118,273
ZYG11B,chr1,52727203,52827336,+,"gene_id ""ZYG11B""; transcript_id ""NM_024646""",NM_024646,100134,1043,1040,⋯,2087,3632,3584,1112,344,462,1505,1301,421,1296
ZYG11A,chr1,52843510,52894995,+,"gene_id ""ZYG11A""; transcript_id ""NM_001004339""",NM_001004339,51486,6,0,⋯,183,802,193,209,275,380,1255,1091,400,1134
ZXDC,chr3,126458901,126475169,-,"gene_id ""ZXDC""; transcript_id ""NM_001040653""",NM_001040653,16269,332,334,⋯,292,1696,589,845,83,86,259,231,69,264


#### Bidirectionals 

In [23]:
example_pos <- data.table::fread('/Shares/dbnascent/Andrysik2017identification/featurecounts_bidirs/SRR4090100.sorted.pos.bidir_counts.txt')
head(example_pos)
dim(example_pos)

GeneID,Source,SRR4090100.sorted.sorted.bam
<chr>,<chr>,<int>
chr1:3730-7399,dreg,0
chr1:10152-11370,"tfit,dreg",0
chr1:12345-12711,dreg,0
chr1:13250-13756,dreg,0
chr1:15111-15407,dreg,0
chr1:16192-17624,"tfit,dreg",0


In [24]:
chromosome <- as.character(lapply(strsplit(example_pos$GeneID,':'), `[`, 1))
start_end <- as.character(lapply(strsplit(example_pos$GeneID,':'), `[`, 2))
start <- as.character(lapply(strsplit(start_end,'-'), `[`, 1))
end <- as.character(lapply(strsplit(start_end,'-'), `[`, 2))
Length <- as.numeric(end)-as.numeric(start)
Geneid <-paste0(chromosome,'-', start,'-',end,'-',example_pos$Source)

bidir_coord <- data.frame(chromosome, start, end, Length, Geneid)
head(bidir_coord)

Unnamed: 0_level_0,chromosome,start,end,Length,Geneid
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<dbl>,<fct>
1,chr1,3730,7399,3669,chr1-3730-7399-dreg
2,chr1,10152,11370,1218,"chr1-10152-11370-tfit,dreg"
3,chr1,12345,12711,366,chr1-12345-12711-dreg
4,chr1,13250,13756,506,chr1-13250-13756-dreg
5,chr1,15111,15407,296,chr1-15111-15407-dreg
6,chr1,16192,17624,1432,"chr1-16192-17624-tfit,dreg"


In [25]:
#load all counts excluding the metadata columns
counts_bidir_pos_DT <- lapply(counts_paths_pos, 
                    data.table::fread, 
                    drop = c('GeneID', 'Source')) 

#make sure the counts are from human annotations with 41813 transcripts
## Positive strand
counts_bidir_pos <- counts_bidir_pos_DT [sapply(counts_bidir_pos_DT,
                                                function(x) nrow(x) == 652281)]
                         

In [26]:
counts_bidir_pos_merge <- dplyr::bind_cols(counts_bidir_pos)
counts_bidir_pos_newnames <- setnames(counts_bidir_pos_merge, names(counts_bidir_pos_merge),
                               as.character(lapply(strsplit(names(counts_bidir_pos_merge), '\\.'), `[`, 1)))
counts_bidir_pos_newnames_txpts <- cbind(bidir_coord,
                                         example_pos[,c('GeneID', 'Source')],
                                        counts_bidir_pos_newnames)
                                         
dim(counts_bidir_pos_newnames_txpts)
head(counts_bidir_pos_newnames_txpts)


Unnamed: 0_level_0,chromosome,start,end,Length,Geneid,GeneID,Source,SRZ1950491,SRZ1950493,SRZ1950495,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<dbl>,<fct>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,chr1,3730,7399,3669,chr1-3730-7399-dreg,chr1:3730-7399,dreg,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,chr1,10152,11370,1218,"chr1-10152-11370-tfit,dreg",chr1:10152-11370,"tfit,dreg",0,0,0,⋯,1,2,5,16,0,0,0,0,0,0
3,chr1,12345,12711,366,chr1-12345-12711-dreg,chr1:12345-12711,dreg,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,chr1,13250,13756,506,chr1-13250-13756-dreg,chr1:13250-13756,dreg,0,0,0,⋯,5,38,5,12,6,7,8,10,0,5
5,chr1,15111,15407,296,chr1-15111-15407-dreg,chr1:15111-15407,dreg,0,0,0,⋯,3,22,3,2,0,1,3,0,0,2
6,chr1,16192,17624,1432,"chr1-16192-17624-tfit,dreg",chr1:16192-17624,"tfit,dreg",4,0,0,⋯,28,181,27,75,5,12,22,18,0,10


In [27]:
#load all counts excluding the metadata columns
counts_bidir_neg_DT <- lapply(counts_paths_neg, 
                    data.table::fread, 
                    drop = c('GeneID', 'Source')) 

#make sure the counts are from human annotations with 41813 transcripts
## Positive strand
counts_bidir_neg <- counts_bidir_neg_DT [sapply(counts_bidir_neg_DT,
                                                function(x) nrow(x) == 652281)]
                         

In [28]:
counts_bidir_neg_merge <- dplyr::bind_cols(counts_bidir_neg)
counts_bidir_neg_newnames <- setnames(counts_bidir_neg_merge, names(counts_bidir_neg_merge),
                               as.character(lapply(strsplit(names(counts_bidir_neg_merge), '\\.'), `[`, 1)))
counts_bidir_neg_newnames_txpts <- cbind(bidir_coord,
                                         example_pos[,c('GeneID', 'Source')],
                                         counts_bidir_neg_newnames)
dim(counts_bidir_neg_newnames_txpts)
head(counts_bidir_neg_newnames_txpts)


Unnamed: 0_level_0,chromosome,start,end,Length,Geneid,GeneID,Source,SRZ1950491,SRZ1950493,SRZ1950495,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<dbl>,<fct>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,chr1,3730,7399,3669,chr1-3730-7399-dreg,chr1:3730-7399,dreg,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,chr1,10152,11370,1218,"chr1-10152-11370-tfit,dreg",chr1:10152-11370,"tfit,dreg",3,1,2,⋯,0,0,1,0,0,0,0,0,0,0
3,chr1,12345,12711,366,chr1-12345-12711-dreg,chr1:12345-12711,dreg,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,chr1,13250,13756,506,chr1-13250-13756-dreg,chr1:13250-13756,dreg,7,2,6,⋯,0,0,0,0,0,0,0,0,0,0
5,chr1,15111,15407,296,chr1-15111-15407-dreg,chr1:15111-15407,dreg,0,0,0,⋯,1,0,0,0,0,0,0,0,0,0
6,chr1,16192,17624,1432,"chr1-16192-17624-tfit,dreg",chr1:16192-17624,"tfit,dreg",24,16,18,⋯,0,0,0,0,0,0,0,0,0,0


In [29]:
# sum positive and negative counts
## NOTE. data.table or matris does not allow for summing counts by sample and transcript
counts_bidir_neg_pos <- as.data.frame(counts_bidir_pos) + as.data.frame(counts_bidir_neg)
colnames(counts_bidir_neg_pos) <- as.character(lapply(strsplit(colnames(counts_bidir_neg_pos),
                                                               '\\.'), `[`, 1))
dim(counts_bidir_neg_pos)
head(counts_bidir_neg_pos)

Unnamed: 0_level_0,SRZ1950491,SRZ1950493,SRZ1950495,SRZ1950497,SRZ1950499,SRZ1950501,SRZ1950503,SRZ1950505,SRZ1950507,SRZ1950509,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,3,1,2,1,0,0,1,1,1,0,⋯,1,2,6,16,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,7,2,6,1,3,2,4,0,5,1,⋯,5,38,5,12,6,7,8,10,0,5
5,0,0,0,0,1,0,0,0,0,0,⋯,4,22,3,2,0,1,3,0,0,2
6,28,16,18,15,25,14,22,16,27,26,⋯,28,181,27,75,5,12,22,18,0,10


In [30]:
counts_bidir <- cbind(bidir_coord,
                      example_pos[,c('GeneID', 'Source')], 
                      counts_bidir_neg_pos)
head(counts_bidir)
dim(counts_bidir)

Unnamed: 0_level_0,chromosome,start,end,Length,Geneid,GeneID,Source,SRZ1950491,SRZ1950493,SRZ1950495,⋯,SRR9833433,SRZ9833428,SRZ9833431,SRZ9833434,SRR11793825,SRR11793826,SRR11793827,SRR11793828,SRR11793829,SRR11793830
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<dbl>,<fct>,<chr>,<chr>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,chr1,3730,7399,3669,chr1-3730-7399-dreg,chr1:3730-7399,dreg,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
2,chr1,10152,11370,1218,"chr1-10152-11370-tfit,dreg",chr1:10152-11370,"tfit,dreg",3,1,2,⋯,1,2,6,16,0,0,0,0,0,0
3,chr1,12345,12711,366,chr1-12345-12711-dreg,chr1:12345-12711,dreg,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
4,chr1,13250,13756,506,chr1-13250-13756-dreg,chr1:13250-13756,dreg,7,2,6,⋯,5,38,5,12,6,7,8,10,0,5
5,chr1,15111,15407,296,chr1-15111-15407-dreg,chr1:15111-15407,dreg,0,0,0,⋯,4,22,3,2,0,1,3,0,0,2
6,chr1,16192,17624,1432,"chr1-16192-17624-tfit,dreg",chr1:16192-17624,"tfit,dreg",28,16,18,⋯,28,181,27,75,5,12,22,18,0,10


# Save counts

## Genes

In [31]:
data.table::fwrite(counts_DT_genes,
           "/Users/rusi2317/projects/DBNascent_Analysis/data/final_counts/counts_genes.tsv.gz",
           sep='\t')

In [32]:
data.table::fwrite(trunc_counts_DT_genes,
           "/Users/rusi2317/projects/DBNascent_Analysis/data/final_counts/counts_genes_5ptrunc.tsv.gz",
           sep='\t')


## Bidirectionals 

In [33]:
data.table::fwrite(counts_bidir_pos_newnames_txpts,
           "/Users/rusi2317/projects/DBNascent_Analysis/data/final_counts/counts_bidirs_pos.tsv.gz",
           sep='\t')

data.table::fwrite(counts_bidir_neg_newnames_txpts,
           "/Users/rusi2317/projects/DBNascent_Analysis/data/final_counts/counts_bidirs_neg.tsv.gz",
           sep='\t')

data.table::fwrite(counts_bidir,
           "/Users/rusi2317/projects/DBNascent_Analysis/data/final_counts/counts_bidirs.tsv.gz",
           sep='\t')

# Session Information

In [34]:
sessionInfo()

R version 3.6.0 (2019-04-26)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] data.table_1.14.2 tidyr_1.2.1       dplyr_1.0.10     

loaded via a namespace (and not attached):
 [1] magrittr_2.0.3   tidyselect_1.1.2 uuid_1.1-0       R6_2.5.1        
 [5] rlang_1.0.6      fastmap_1.1.0    fansi_1.0.3      tools_3.6.0     
 [9] utf8_1.2.2       DBI_1.1.3        cli_3.4.1        htmltools_0.5.2 
[13] asserttha