**Author :** Rutendo F. Sigauke

**Input  :** 

1. Further filtered significant pairs

        - sig_inter_nobs_dist_filtered.txt.gz

2. Gene and bidirectional pairs that overlap GTEx eQTLs 
    
        - sig_pairs_path_DT_filt_gtex_all_INTER.txt.gz
        
3. GENIE3 ranked bidirectional gene pairs (`*chr*_GENIE3_ranks.tsv.gz`)


**Output :**

1. Final filtered significant pairs annotated with GTEx support and GENIE3 ranks

        - dbnascent_pairs_hg38.txt.gz


# Load libraries

In [1]:
library(data.table) 

# Import data

## Significant pairs

In [2]:
# Read the filtered significant pairs table.
sig_inter_nobs_dist_filtered <- data.table::fread(
  "/scratch/Shares/dowell/rutendo/projects/DBNascent_Analysis/data/gene_bidir_significant_pairs/sig_inter_nobs_dist_filtered.txt.gz"
)

# Add the two identifier columns used further down:
#   pair_id        = "<transcript_1>~<transcript_2>"
#   pair_id_tissue = "<transcript_1>~<transcript_2>~<tissue>"
sig_inter_nobs_dist_filtered[
  , pair_id := paste0(transcript_1, "~", transcript_2)
]
sig_inter_nobs_dist_filtered[
  , pair_id_tissue := paste0(transcript_1, "~", transcript_2, "~", tissue)
]

# Row count and a preview of the table.
nrow(sig_inter_nobs_dist_filtered)
head(sig_inter_nobs_dist_filtered)

transcript1_chrom,transcript1_start,transcript1_stop,transcript_1,transcript1_score,transcript1_strand,transcript2_chrom,transcript2_start,transcript2_stop,transcript_2,⋯,adj_p_BH,nObs,t,distance_tss,distance_tes,position,tissue,percent_transcribed_both,pair_id,pair_id_tissue
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,⋯,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<chr>
chr1,33306765,33321098,A3GALT2:NM_001080438.1,.,-,chr1,33036652,33037238,chr1:33036652-33037238,⋯,9.631487000000001e-23,215,11.984604,284153,269820,downstream,blood,96.846847,A3GALT2:NM_001080438.1~chr1:33036652-33037238,A3GALT2:NM_001080438.1~chr1:33036652-33037238~blood
chr1,33306765,33321098,A3GALT2:NM_001080438.1,.,-,chr1,33224694,33224966,chr1:33224694-33224966,⋯,0.002198101,16,5.028114,96268,81935,downstream,blood,7.207207,A3GALT2:NM_001080438.1~chr1:33224694-33224966,A3GALT2:NM_001080438.1~chr1:33224694-33224966~blood
chr1,33306765,33321098,A3GALT2:NM_001080438.1,.,-,chr1,33242850,33243170,chr1:33242850-33243170,⋯,0.002060092,22,4.606764,78088,63755,downstream,blood,9.90991,A3GALT2:NM_001080438.1~chr1:33242850-33243170,A3GALT2:NM_001080438.1~chr1:33242850-33243170~blood
chr1,93992833,94121148,ABCA4:NM_000350.3,.,-,chr1,93910534,93911046,chr1:93910534-93911046,⋯,6.731068e-11,86,8.504139,210358,82043,downstream,blood,38.738739,ABCA4:NM_000350.3~chr1:93910534-93911046,ABCA4:NM_000350.3~chr1:93910534-93911046~blood
chr1,93992833,94121148,ABCA4:NM_000350.3,.,-,chr1,93931066,93931248,chr1:93931066-93931248,⋯,0.0001088703,35,5.42922,189991,61676,downstream,blood,15.765766,ABCA4:NM_000350.3~chr1:93931066-93931248,ABCA4:NM_000350.3~chr1:93931066-93931248~blood
chr1,93992833,94121148,ABCA4:NM_000350.3,.,-,chr1,93946840,93947120,chr1:93946840-93947120,⋯,6.991417e-06,38,6.376853,174168,45853,downstream,blood,17.117117,ABCA4:NM_000350.3~chr1:93946840-93947120,ABCA4:NM_000350.3~chr1:93946840-93947120~blood


## GENIE3 results

In [3]:
# Directory that holds the GENIE3 rank tables (*GENIE3_ranks.tsv.gz).
genie_path <- '/Users/rusi2317/projects/meta_analysis_qc/hg38/processed_data/genie3_ranked_bidir_gene_pairs'

## get paths for the tables
# `pattern` is a regular expression, not a shell glob: escape the dots and
# anchor at the end so only files ending in "GENIE3_ranks.tsv.gz" are picked up.
genie_path_files <- list.files(
  path = genie_path,
  pattern = "GENIE3_ranks\\.tsv\\.gz$",
  full.names = TRUE
)

# Fail early with a clear message instead of an obscure error when binding
# an empty list later on.
if (length(genie_path_files) == 0L) {
  stop("No GENIE3 rank files found in: ", genie_path, call. = FALSE)
}

## load the GENIE3 rank tables (one data.table per file)
genie_path_DT_list <- lapply(genie_path_files, data.table::fread)

In [4]:
# Stack the per-file tables into one data.table, matching columns by name.
genie_path_DT <- data.table::rbindlist(genie_path_DT_list, use.names = TRUE)

## rename the 7th column to "genie3_ranks"
data.table::setnames(genie_path_DT, 7L, "genie3_ranks")

# Row count and a preview of the combined table.
nrow(genie_path_DT)
head(genie_path_DT)

regulatoryGene,targetGene,weight,pair_id,gtex,tissue,genie3_ranks
<chr>,<chr>,<dbl>,<chr>,<int>,<chr>,<int>
chr1:33036652-33037238,A3GALT2:NM_001080438.1,0.93466462,A3GALT2:NM_001080438.1~chr1:33036652-33037238,0,blood,1
chr1:33242850-33243170,A3GALT2:NM_001080438.1,0.03795462,A3GALT2:NM_001080438.1~chr1:33242850-33243170,0,blood,2
chr1:33224694-33224966,A3GALT2:NM_001080438.1,0.02738077,A3GALT2:NM_001080438.1~chr1:33224694-33224966,0,blood,3
chr1:32349126-32349590,A3GALT2:NM_001080438.1,0.28754345,A3GALT2:NM_001080438.1~chr1:32349126-32349590,0,breast,1
chr1:32992392-32992740,A3GALT2:NM_001080438.1,0.20546709,A3GALT2:NM_001080438.1~chr1:32992392-32992740,0,breast,2
chr1:32989317-32989605,A3GALT2:NM_001080438.1,0.15960173,A3GALT2:NM_001080438.1~chr1:32989317-32989605,0,breast,3


In [5]:
# Tissue-specific identifier "<pair_id>~<tissue>" for the GENIE3 table.
genie_path_DT[, pair_id_tissue := paste0(pair_id, "~", tissue)]
head(genie_path_DT)

regulatoryGene,targetGene,weight,pair_id,gtex,tissue,genie3_ranks,pair_id_tissue
<chr>,<chr>,<dbl>,<chr>,<int>,<chr>,<int>,<chr>
chr1:33036652-33037238,A3GALT2:NM_001080438.1,0.93466462,A3GALT2:NM_001080438.1~chr1:33036652-33037238,0,blood,1,A3GALT2:NM_001080438.1~chr1:33036652-33037238~blood
chr1:33242850-33243170,A3GALT2:NM_001080438.1,0.03795462,A3GALT2:NM_001080438.1~chr1:33242850-33243170,0,blood,2,A3GALT2:NM_001080438.1~chr1:33242850-33243170~blood
chr1:33224694-33224966,A3GALT2:NM_001080438.1,0.02738077,A3GALT2:NM_001080438.1~chr1:33224694-33224966,0,blood,3,A3GALT2:NM_001080438.1~chr1:33224694-33224966~blood
chr1:32349126-32349590,A3GALT2:NM_001080438.1,0.28754345,A3GALT2:NM_001080438.1~chr1:32349126-32349590,0,breast,1,A3GALT2:NM_001080438.1~chr1:32349126-32349590~breast
chr1:32992392-32992740,A3GALT2:NM_001080438.1,0.20546709,A3GALT2:NM_001080438.1~chr1:32992392-32992740,0,breast,2,A3GALT2:NM_001080438.1~chr1:32992392-32992740~breast
chr1:32989317-32989605,A3GALT2:NM_001080438.1,0.15960173,A3GALT2:NM_001080438.1~chr1:32989317-32989605,0,breast,3,A3GALT2:NM_001080438.1~chr1:32989317-32989605~breast


## GTEx pairs

In [6]:
# Read the table of pairs that overlap GTEx eQTLs.
sig_pairs_path_DT_filt_gtex_all <- data.table::fread(
  file = "/Users/rusi2317/projects/meta_analysis_qc/hg38/processed_data/gtex_bidir_significant_pairs/sig_pairs_path_DT_filt_gtex_all_INTER.txt.gz"
)

# Row count and a preview of the table.
nrow(sig_pairs_path_DT_filt_gtex_all)
head(sig_pairs_path_DT_filt_gtex_all)

transcript1_chrom,transcript1_start,transcript1_stop,transcript_1,transcript1_score,transcript1_strand,transcript2_chrom,transcript2_start,transcript2_stop,transcript_2,⋯,adj_p_BH,nObs,t,distance_tss,distance_tes,position,tissue,percent_transcribed_both,pair_id,gtex
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,⋯,<dbl>,<int>,<dbl>,<dbl>,<int>,<chr>,<chr>,<dbl>,<chr>,<int>
chr1,94418388,94478723,ABCD3:NM_001122674.2,.,+,chr1,94310249,94311127,chr1:94310249-94311127,⋯,4.657652e-06,21,8.106376,-107700,-168035,upstream,blood,9.459459,ABCD3:NM_001122674.2~chr1:94310249-94311127,1
chr1,94418388,94478723,ABCD3:NM_001122674.2,.,+,chr1,94347033,94348559,chr1:94347033-94348559,⋯,3.478977e-06,22,8.085675,-70592,-130927,upstream,blood,9.90991,ABCD3:NM_001122674.2~chr1:94347033-94348559,1
chr1,94418388,94478723,ABCD3:NM_001122674.2,.,+,chr1,94382060,94382668,chr1:94382060-94382668,⋯,0.0002548884,32,5.178511,-36024,-96359,upstream,blood,14.414414,ABCD3:NM_001122674.2~chr1:94382060-94382668,1
chr1,94418388,94478723,ABCD3:NM_001122674.2,.,+,chr1,94384921,94385259,chr1:94384921-94385259,⋯,1.933635e-07,78,6.659129,-33298,-93633,upstream,blood,35.135135,ABCD3:NM_001122674.2~chr1:94384921-94385259,1
chr1,94418388,94478723,ABCD3:NM_001122674.2,.,+,chr1,94566576,94567348,chr1:94566576-94567348,⋯,2.678685e-05,38,5.875028,148574,88239,downstream,blood,17.117117,ABCD3:NM_001122674.2~chr1:94566576-94567348,1
chr1,94418388,94478723,ABCD3:NM_001122674.2,.,+,chr1,94590531,94591849,chr1:94590531-94591849,⋯,0.001310936,34,4.445454,172802,112467,downstream,blood,15.315315,ABCD3:NM_001122674.2~chr1:94590531-94591849,1


# Merge datasets

## `1:` Merge pairs with GENIE3 ranks

In [7]:
# Left join on pair_id_tissue: every significant pair is kept, and
# genie3_ranks is NA where the GENIE3 table has no matching key.
sig_inter_nobs_dist_filtered_meta <- merge(
  x = sig_inter_nobs_dist_filtered,
  y = genie_path_DT[, .(pair_id_tissue, genie3_ranks)],
  by = "pair_id_tissue",
  all.x = TRUE
)

# Row count after the merge and a short preview.
nrow(sig_inter_nobs_dist_filtered_meta)
head(sig_inter_nobs_dist_filtered_meta, n = 3L)


pair_id_tissue,transcript1_chrom,transcript1_start,transcript1_stop,transcript_1,transcript1_score,transcript1_strand,transcript2_chrom,transcript2_start,transcript2_stop,⋯,adj_p_BH,nObs,t,distance_tss,distance_tes,position,tissue,percent_transcribed_both,pair_id,genie3_ranks
<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,⋯,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<int>
A1BG-AS1:NR_015380.2~chr19:57479710-57479990~uterus,chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57479710,57479990,⋯,0.0034719478,26,4.310259,-872119,-875333,upstream,uterus,44.82759,A1BG-AS1:NR_015380.2~chr19:57479710-57479990,5
A1BG-AS1:NR_015380.2~chr19:57480338-57480600~prostate,chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57480338,57480600,⋯,0.0017483223,18,4.871192,-871500,-874714,upstream,prostate,43.90244,A1BG-AS1:NR_015380.2~chr19:57480338-57480600,1
A1BG-AS1:NR_015380.2~chr19:57480338-57480600~uterus,chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57480338,57480600,⋯,0.0009361908,23,5.122232,-871500,-874714,upstream,uterus,39.65517,A1BG-AS1:NR_015380.2~chr19:57480338-57480600,1


In [8]:
# Split the merged table by whether a GENIE3 rank was attached.
# The rank column is named `genie3_ranks`; `$ranks` matches no column (`$`
# only partial-matches on prefixes), returns NULL, and left both subsets empty.
na_ranks <- sig_inter_nobs_dist_filtered_meta[is.na(genie3_ranks)]
no_na_ranks <- sig_inter_nobs_dist_filtered_meta[!is.na(genie3_ranks)]


In [9]:
# Number of distinct transcript_1 values present in both subsets
# (intersect() already returns unique values).
length(intersect(na_ranks[["transcript_1"]], no_na_ranks[["transcript_1"]]))

In [10]:
# Distinct transcript_1 values without a rank, then with a rank.
length(unique(na_ranks[["transcript_1"]]))
length(unique(no_na_ranks[["transcript_1"]]))

In [11]:
# Per-tissue count of pairs that have no GENIE3 rank.
table(na_ranks[["tissue"]])

< table of extent 0 >

In [12]:
##check recovery of pairs
## check recovery of pairs: distinct pair_id before and after the merge
length(unique(sig_inter_nobs_dist_filtered[["pair_id"]]))
length(unique(sig_inter_nobs_dist_filtered_meta[["pair_id"]]))

In [13]:
##check recovery of pairs by tissue
## check recovery of pairs by tissue: distinct (pair_id, tissue) combinations
## before and after the merge
nrow(unique(sig_inter_nobs_dist_filtered, by = c("pair_id", "tissue")))
nrow(unique(sig_inter_nobs_dist_filtered_meta, by = c("pair_id", "tissue")))

## `2:` Flag the GTEx-supported pairs

In [14]:
# Mark a pair "Yes" when its pair_id occurs in the GTEx overlap table and
# "No" otherwise. The match is on pair_id only; tissue is not compared.
sig_inter_nobs_dist_filtered_meta[
  , gtex_supported := ifelse(
    pair_id %in% sig_pairs_path_DT_filt_gtex_all$pair_id,
    "Yes",
    "No"
  )
]

# Row count and a short preview.
nrow(sig_inter_nobs_dist_filtered_meta)
head(sig_inter_nobs_dist_filtered_meta, n = 3L)

pair_id_tissue,transcript1_chrom,transcript1_start,transcript1_stop,transcript_1,transcript1_score,transcript1_strand,transcript2_chrom,transcript2_start,transcript2_stop,⋯,nObs,t,distance_tss,distance_tes,position,tissue,percent_transcribed_both,pair_id,genie3_ranks,gtex_supported
<chr>,<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,⋯,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<chr>,<int>,<chr>
A1BG-AS1:NR_015380.2~chr19:57479710-57479990~uterus,chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57479710,57479990,⋯,26,4.310259,-872119,-875333,upstream,uterus,44.82759,A1BG-AS1:NR_015380.2~chr19:57479710-57479990,5,No
A1BG-AS1:NR_015380.2~chr19:57480338-57480600~prostate,chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57480338,57480600,⋯,18,4.871192,-871500,-874714,upstream,prostate,43.90244,A1BG-AS1:NR_015380.2~chr19:57480338-57480600,1,No
A1BG-AS1:NR_015380.2~chr19:57480338-57480600~uterus,chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57480338,57480600,⋯,23,5.122232,-871500,-874714,upstream,uterus,39.65517,A1BG-AS1:NR_015380.2~chr19:57480338-57480600,1,No


In [15]:
# Tally GTEx-supported vs. unsupported pairs.
# Spell out the full column name: `$gtex` only resolved to `gtex_supported`
# through partial matching, which breaks if another "gtex*" column is added.
table(sig_inter_nobs_dist_filtered_meta[["gtex_supported"]])


     No     Yes 
1319758  121132 

In [16]:
# Remove the two helper identifier columns (by reference) before saving.
sig_inter_nobs_dist_filtered_meta[, `:=`(pair_id_tissue = NULL, pair_id = NULL)]

In [17]:
# Preview the final table after the helper columns were removed.
head(sig_inter_nobs_dist_filtered_meta, n = 3L)

transcript1_chrom,transcript1_start,transcript1_stop,transcript_1,transcript1_score,transcript1_strand,transcript2_chrom,transcript2_start,transcript2_stop,transcript_2,⋯,adj_p_BH,nObs,t,distance_tss,distance_tes,position,tissue,percent_transcribed_both,genie3_ranks,gtex_supported
<chr>,<int>,<int>,<chr>,<chr>,<chr>,<chr>,<int>,<int>,<chr>,⋯,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>,<chr>
chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57479710,57479990,chr19:57479710-57479990,⋯,0.0034719478,26,4.310259,-872119,-875333,upstream,uterus,44.82759,5,No
chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57480338,57480600,chr19:57480338-57480600,⋯,0.0017483223,18,4.871192,-871500,-874714,upstream,prostate,43.90244,1,No
chr19,58351969,58355183,A1BG-AS1:NR_015380.2,.,+,chr19,57480338,57480600,chr19:57480338-57480600,⋯,0.0009361908,23,5.122232,-871500,-874714,upstream,uterus,39.65517,1,No


# Save final pair file

In [19]:
# Write the final pair table as a tab-separated file.
# NOTE(review): the notebook header lists the output as
# dbnascent_pairs_hg38.txt.gz, but this writes dbnascent_pairs.txt.gz;
# confirm which name is intended.
data.table::fwrite(
  x = sig_inter_nobs_dist_filtered_meta,
  file = "/scratch/Shares/dowell/rutendo/projects/DBNascent_Analysis/data/gene_bidir_significant_pairs/dbnascent_pairs.txt.gz",
  sep = "\t"
)

# Session Information 

In [20]:
# Record the R version and package versions used for this run.
utils::sessionInfo()

R version 3.6.0 (2019-04-26)
Platform: x86_64-redhat-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] data.table_1.14.2

loaded via a namespace (and not attached):
 [1] fansi_1.0.3       crayon_1.5.1      digest_0.6.29     utf8_1.2.2       
 [5] R.methodsS3_1.8.2 IRdisplay_1.1     repr_1.1.4        lifecycle_1.0.3  
 [9] jsonlite_1.8.0    evaluate_0.16     pillar_1.8.1      rlang_1.0.6      
[13] cli_3.4.1         uuid_1.1-0     