# Comparing DEGs listed from DESeq2 between bam and fastq

The DEGs were generated by DESeq2 from STAR aligner counts (Reprocessed GTEx vs Reprocessed GDC). For the GDC samples, the starting files are either fastq files from the GDC Legacy Archive, or bam files from the current GDC Portal that were then converted to fastq using Biobambam.

This notebook compares the two DEG lists obtained from the bam and fastq inputs, to see how closely the rankings of the DEGs (based on padj values) agree between the two lists.

In [1]:
# Import libraries
library("dplyr")


Attaching package: 'dplyr'


The following objects are masked from 'package:stats':

    filter, lag


The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union




In [2]:
# Load the ranked DEG tables produced from the bam-derived and fastq-derived
# inputs, keeping only the first 10 columns of each file.
# `header = TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned, whereas `TRUE` is a reserved word.
degs.bam <- read.csv("Reprocessed vs Reprocessed DEGs ranked extended bam.csv", header = TRUE)[, seq_len(10)]
degs.fastq <- read.csv("Reprocessed vs Reprocessed DEGs ranked extended fastq.csv", header = TRUE)[, seq_len(10)]

# Display both tables
degs.bam
degs.fastq

gene,GTExN7MScountsREPROCESSED,GTExNFK9countsREPROCESSED,GTExO5YTcountsREPROCESSED,GDC2821countsREPROCESSED_bam,GDC2828countsREPROCESSED_bam,GDC2839countsREPROCESSED_bam,padj,symbol,rank
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<int>
ENSG00000163464,12193,20430,95467,14,10,18,6.83e-69,CXCR1,1
ENSG00000162747,9192,7385,35265,12,38,20,3.12e-58,FCGR3B,2
ENSG00000157551,1227,2227,7541,18,15,10,1.05e-43,KCNJ15,3
ENSG00000236438,935,549,1788,28,30,29,8.61e-43,FAM157A,4
ENSG00000135083,835,1353,2630,23,16,7,2.34e-39,CCNJL,5
ENSG00000184106,311,288,957,6,7,5,2.08e-35,TREML3P,6
ENSG00000198858,22229,12733,33248,1812,1724,1560,1.01e-34,R3HDM4,7
ENSG00000148346,7056,871,28306,11,15,14,2.03e-30,LCN2,8
ENSG00000204936,1636,10916,251070,1,0,2,2.27e-26,CD177,9
ENSG00000090238,7631,5230,10759,1211,847,984,1.14e-25,YPEL3,10


gene,GTExN7MScountsREPROCESSED,GTExNFK9countsREPROCESSED,GTExO5YTcountsREPROCESSED,GDC2821countsREPROCESSED_fastq,GDC2828countsREPROCESSED_fastq,GDC2839countsREPROCESSED_fastq,padj,symbol,rank
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<int>
ENSG00000163464,12193,20430,95467,14,7,14,1.33e-62,CXCR1,1
ENSG00000162747,9192,7385,35265,12,30,15,2.84e-58,FCGR3B,2
ENSG00000157551,1227,2227,7541,18,15,5,3.57e-39,KCNJ15,3
ENSG00000236438,935,549,1788,28,23,8,6.01e-32,FAM157A,4
ENSG00000148346,7056,871,28306,11,11,11,7.96e-31,LCN2,5
ENSG00000135083,835,1353,2630,23,13,2,1.28e-29,CCNJL,6
ENSG00000184106,311,288,957,6,6,2,3.29e-29,TREML3P,7
ENSG00000198858,22229,12733,33248,1812,1464,1156,2.87e-28,R3HDM4,8
ENSG00000143632,4158,1193,953,6,7,1,2.75e-26,ACTA1,9
ENSG00000204936,1636,10916,251070,1,0,1,8.20e-26,CD177,10


In [3]:
# Look up the position of a gene within another ranked gene list.
#
# Arguments:
#   gene      - the gene identifier to look for.
#   gene_list - vector of gene identifiers to search.
#
# Returns the 1-based index of the first element of `gene_list` equal to
# `gene`, or NA (integer) when `gene` is not present. The caller subtracts the
# gene's own position from this value to obtain the rank change.
#
# match() always yields exactly one value per gene, so results collected with
# lapply()/unlist() stay aligned with the input even when `gene_list` contains
# duplicated identifiers (which() would return every matching index).
rank_change <- function(gene, gene_list) {
    match(gene, gene_list)
}

# Get ranking changes and store as columns.
# For every gene, rank_change() gives its position in the *other* list (NA when
# the gene is absent there). Subtracting the gene's position in its own list
# gives the shift: positive values mean the gene sits further down the other
# list, negative values mean it sits higher.
# vapply() enforces exactly one integer per gene, and seq_along() keeps the
# arithmetic correct for an empty table, where 1:length(x) would iterate over
# c(1, 0).
degs.bam.rankchange <- vapply(
    degs.bam$gene, rank_change,
    FUN.VALUE = integer(1), gene_list = degs.fastq$gene, USE.NAMES = FALSE
)
degs.bam.rankchange <- degs.bam.rankchange - seq_along(degs.bam.rankchange)

degs.fastq.rankchange <- vapply(
    degs.fastq$gene, rank_change,
    FUN.VALUE = integer(1), gene_list = degs.bam$gene, USE.NAMES = FALSE
)
degs.fastq.rankchange <- degs.fastq.rankchange - seq_along(degs.fastq.rankchange)

degs.bam$rank_change <- degs.bam.rankchange
degs.fastq$rank_change <- degs.fastq.rankchange

In [4]:
# Display the bam-input ranking table, including the rank_change column
degs.bam

gene,GTExN7MScountsREPROCESSED,GTExNFK9countsREPROCESSED,GTExO5YTcountsREPROCESSED,GDC2821countsREPROCESSED_bam,GDC2828countsREPROCESSED_bam,GDC2839countsREPROCESSED_bam,padj,symbol,rank,rank_change
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<int>,<int>
ENSG00000163464,12193,20430,95467,14,10,18,6.83e-69,CXCR1,1,0
ENSG00000162747,9192,7385,35265,12,38,20,3.12e-58,FCGR3B,2,0
ENSG00000157551,1227,2227,7541,18,15,10,1.05e-43,KCNJ15,3,0
ENSG00000236438,935,549,1788,28,30,29,8.61e-43,FAM157A,4,0
ENSG00000135083,835,1353,2630,23,16,7,2.34e-39,CCNJL,5,1
ENSG00000184106,311,288,957,6,7,5,2.08e-35,TREML3P,6,1
ENSG00000198858,22229,12733,33248,1812,1724,1560,1.01e-34,R3HDM4,7,1
ENSG00000148346,7056,871,28306,11,15,14,2.03e-30,LCN2,8,-3
ENSG00000204936,1636,10916,251070,1,0,2,2.27e-26,CD177,9,1
ENSG00000090238,7631,5230,10759,1211,847,984,1.14e-25,YPEL3,10,7


In [5]:
# Display the fastq-input ranking table, including the rank_change column
degs.fastq

gene,GTExN7MScountsREPROCESSED,GTExNFK9countsREPROCESSED,GTExO5YTcountsREPROCESSED,GDC2821countsREPROCESSED_fastq,GDC2828countsREPROCESSED_fastq,GDC2839countsREPROCESSED_fastq,padj,symbol,rank,rank_change
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<int>,<int>
ENSG00000163464,12193,20430,95467,14,7,14,1.33e-62,CXCR1,1,0
ENSG00000162747,9192,7385,35265,12,30,15,2.84e-58,FCGR3B,2,0
ENSG00000157551,1227,2227,7541,18,15,5,3.57e-39,KCNJ15,3,0
ENSG00000236438,935,549,1788,28,23,8,6.01e-32,FAM157A,4,0
ENSG00000148346,7056,871,28306,11,11,11,7.96e-31,LCN2,5,3
ENSG00000135083,835,1353,2630,23,13,2,1.28e-29,CCNJL,6,-1
ENSG00000184106,311,288,957,6,6,2,3.29e-29,TREML3P,7,-1
ENSG00000198858,22229,12733,33248,1812,1464,1156,2.87e-28,R3HDM4,8,-1
ENSG00000143632,4158,1193,953,6,7,1,2.75e-26,ACTA1,9,14
ENSG00000204936,1636,10916,251070,1,0,1,8.20e-26,CD177,10,-1


### Where are the DEGs listed in the DESeq2 analysis within the rankings?

The DESeq2 analysis comparing counts from bam inputs against counts from fastq inputs lists 27 genes that differ between the two:
- ENSG00000230916
- ENSG00000151846
- ENSG00000227939
- ENSG00000237709
- ENSG00000270350
- ENSG00000271207
- ENSG00000232177
- ENSG00000249780
- ENSG00000259612
- ENSG00000268222
- ENSG00000270906
- ENSG00000228205
- ENSG00000255642
- ENSG00000183199
- ENSG00000240103
- ENSG00000240376
- ENSG00000225093
- ENSG00000249264
- ENSG00000198868
- ENSG00000223668
- ENSG00000247627
- ENSG00000248626
- ENSG00000223529
- ENSG00000270388
- ENSG00000224411
- ENSG00000250144
- ENSG00000233057

The next step is determining where these genes are ranked.

In [6]:
# Ensembl gene IDs of the 27 genes listed by the DESeq2 bam-vs-fastq
# comparison, in the order they are listed above.
deseq2_degs <- c(
    "ENSG00000230916", "ENSG00000151846", "ENSG00000227939",
    "ENSG00000237709", "ENSG00000270350", "ENSG00000271207",
    "ENSG00000232177", "ENSG00000249780", "ENSG00000259612",
    "ENSG00000268222", "ENSG00000270906", "ENSG00000228205",
    "ENSG00000255642", "ENSG00000183199", "ENSG00000240103",
    "ENSG00000240376", "ENSG00000225093", "ENSG00000249264",
    "ENSG00000198868", "ENSG00000223668", "ENSG00000247627",
    "ENSG00000248626", "ENSG00000223529", "ENSG00000270388",
    "ENSG00000224411", "ENSG00000250144", "ENSG00000233057"
)

# Locate a DEG within a ranked gene list.
#
# Arguments:
#   deg       - the gene identifier to look for.
#   gene_list - vector of gene identifiers to search.
#
# Returns the 1-based index of the first element of `gene_list` equal to
# `deg`, or NA (integer) when `deg` is not present.
#
# match() always yields exactly one value per gene, so the results stay
# aligned one-to-one with the queried genes even when `gene_list` contains
# duplicated identifiers (which() would return every matching index).
locate_deg <- function(deg, gene_list) {
    match(deg, gene_list)
}


In [7]:
# Position of each DESeq2-listed gene within the bam-input ranking
# (NA where the gene does not appear in that ranking)
lapply(deseq2_degs, function(deg) locate_deg(deg, gene_list = degs.bam$gene))

In [8]:
# Position of each DESeq2-listed gene within the fastq-input ranking
# (NA where the gene does not appear in that ranking)
lapply(deseq2_degs, function(deg) locate_deg(deg, gene_list = degs.fastq$gene))

In [9]:
# Write both ranking tables (with their rank_change columns) as csv files.
# `row.names = FALSE` is spelled out because `F` is an ordinary variable that
# can be reassigned, whereas `FALSE` is a reserved word.
write.csv(degs.bam, "Reprocessed vs Reprocessed DEGs ranked comparing GDC bam vs fastq inputs bam.csv", row.names = FALSE)
write.csv(degs.fastq, "Reprocessed vs Reprocessed DEGs ranked comparing GDC bam vs fastq inputs fastq.csv", row.names = FALSE)