# Differential gene expression
### Prepare environment
----

In [1]:
library('tximport', quietly=T)
library('DESeq2',quietly=T)
library('ashr',quietly=T)
library('tibble',quietly=T)
library('tidyverse',quietly=T)


Attaching package: ‘BiocGenerics’


The following objects are masked from ‘package:stats’:

    IQR, mad, sd, var, xtabs


The following objects are masked from ‘package:base’:

    anyDuplicated, aperm, append, as.data.frame, basename, cbind,
    colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
    get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
    match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
    Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
    table, tapply, union, unique, unsplit, which.max, which.min



Attaching package: ‘S4Vectors’


The following object is masked from ‘package:utils’:

    findMatches


The following objects are masked from ‘package:base’:

    expand.grid, I, unname



Attaching package: ‘MatrixGenerics’


The following objects are masked from ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
 

# This notebook will create and save two data types:
1. matrix of significant differentially expressed orf's from the results of the differential expression analysis
2. matrix of VSD normalized counts ordered by variance across samples
## Both of these files will be created for transcript-level data and "gene-level" data using only transcripts with Kegg annotations and summing by Kegg annotation. 
#### The second method will result in data with rownames as Kegg annotations, meaning each annotation appears once in the matrix. 

# 1. Read in Salmon Counts
----

In [2]:
#fixing tximport function 

pattern='[[:alpha:]]+([[:digit:]]{2}|_)(_9[[:alpha:]]|[[:alpha:]]*)'

# Import the Salmon quantification files of one isolate at transcript level.
#
# Args:
#   org: isolate directory name under the assembly root (e.g. '04').
# Returns:
#   The tximport() result (type='salmon', txOut=TRUE). Each file is named by the
#   sample id extracted from its path with the global `pattern` regex.
# Errors:
#   Stops when no quantification file is found or when a listed file is missing.
read.in <- function(org){
    dir <- paste("/work/nclab/lucy/SAB/Assembly/",org,"/salmon",sep='')
    files <- file.path(dir,list.files(dir,pattern=".sf",recursive=TRUE))

    # An empty or missing directory yields character(0); all(file.exists(character(0)))
    # is TRUE, so this case has to be checked explicitly.
    if (length(files) == 0) {
        stop("No Salmon '.sf' files found in directory: ", dir, call. = FALSE)
    }
    missing_files <- files[!file.exists(files)]
    if (length(missing_files) > 0) {
        stop("ERROR IN FILE NAMES, not all files exist\n",
             "Directory: ", dir, "\n",
             "Files:\n", paste(missing_files, collapse="\n"),
             call. = FALSE)
    }

    names(files) <- str_extract(files,pattern)
    names(files) <- str_replace(names(files),'oFe', 'pFe') #correct a sample for 08 from oFe to pFe

    tximport(files, type='salmon', txOut = TRUE)
}

# Build the sample table (id, isolate, treatment, rep) for one isolate from the
# names of its Salmon quantification files, print it, and return it.
#
# Args:
#   org: isolate directory name under the assembly root (e.g. '04').
# Returns:
#   (invisibly, via print) a data.frame with one row per quantification file.
create.metadata <- function(org) {
    salmon_dir <- paste0("/work/nclab/lucy/SAB/Assembly/", org, "/salmon")
    quant_files <- file.path(salmon_dir, list.files(salmon_dir, pattern=".sf", recursive=TRUE))

    # Sample id from the file path; 'oFe' is rewritten to 'pFe' as in read.in()
    sample_id <- str_replace(str_extract(quant_files, pattern), 'oFe', 'pFe')

    sample_table <- data.frame(
        'id' = sample_id,
        'isolate' = org,
        'treatment' = str_extract(sample_id, '[[:alpha:]]+(19|21_9|_back)'),
        'rep' = str_extract(sample_id, 'A|B|C')
    )
    # Translate the raw treatment codes into the labels used in the design
    treatment_labels <- c('pFe19'='High_Iron', 'pFe21_9'='Low_Iron','add_back'='Add_Back')
    sample_table$treatment <- str_replace_all(sample_table$treatment, treatment_labels)

    print(sample_table)
}
# Import the Salmon counts and build the sample table for each isolate.
counts_4 <- read.in('04')
metadata_4 <- create.metadata('04')

counts_8 <- read.in('08')
metadata_8 <- create.metadata('08')

counts_6 <- read.in('06')
metadata_6 <- create.metadata('06')

counts_13 <- read.in('13')
metadata_13 <- create.metadata('13')

# Preview the imported count matrix for isolate 08
head(counts_8[["counts"]])

reading in files with read_tsv

1 
2 
3 
4 
5 




         id isolate treatment rep
1 add_backA      04  Add_Back   A
2 add_backB      04  Add_Back   B
3  pFe21_9A      04  Low_Iron   A
4  pFe21_9B      04  Low_Iron   B
5  pFe21_9C      04  Low_Iron   C


reading in files with read_tsv

1 
2 
3 
4 
5 
6 
7 
8 




         id isolate treatment rep
1 add_backB      08  Add_Back   B
2 add_backC      08  Add_Back   C
3  pFe21_9A      08  Low_Iron   A
4    pFe19A      08 High_Iron   A
5    pFe19B      08 High_Iron   B
6    pFe19C      08 High_Iron   C
7  pFe21_9B      08  Low_Iron   B
8  pFe21_9C      08  Low_Iron   C


reading in files with read_tsv

1 
2 
3 
4 
5 
6 
7 
8 
9 




         id isolate treatment rep
1 add_backA      06  Add_Back   A
2 add_backB      06  Add_Back   B
3 add_backC      06  Add_Back   C
4    pFe19A      06 High_Iron   A
5    pFe19B      06 High_Iron   B
6    pFe19C      06 High_Iron   C
7  pFe21_9A      06  Low_Iron   A
8  pFe21_9B      06  Low_Iron   B
9  pFe21_9C      06  Low_Iron   C


reading in files with read_tsv

1 
2 
3 
4 
5 
6 
7 
8 
9 




         id isolate treatment rep
1 add_backA      13  Add_Back   A
2 add_backB      13  Add_Back   B
3 add_backC      13  Add_Back   C
4    pFe19A      13 High_Iron   A
5    pFe19B      13 High_Iron   B
6    pFe19C      13 High_Iron   C
7  pFe21_9A      13  Low_Iron   A
8  pFe21_9B      13  Low_Iron   B
9  pFe21_9C      13  Low_Iron   C


Unnamed: 0,add_backB,add_backC,pFe21_9A,pFe19A,pFe19B,pFe19C,pFe21_9B,pFe21_9C
NODE_10000_length_1734_cov_7.947622_g6638_i0.p1,34.0,15.0,9.0,5.0,7.0,15.0,10.0,1.0
NODE_10001_length_1733_cov_212.025904_g6639_i0.p1,677.0,263.878,49.0,37.0,30.0,58.0,108.0,141.237
NODE_10002_length_1733_cov_138.864458_g1003_i6.p1,165.923,88.899,28.753,14.298,16.228,44.13,12.042,26.511
NODE_10003_length_1733_cov_40.357229_g5975_i1.p2,6.055,1.363,1.001,1.098,0.0,3.387,9.037,4.809
NODE_10003_length_1733_cov_40.357229_g5975_i1.p1,4.0,2.0,4.0,1.0,0.0,7.0,4.0,1.0
NODE_10004_length_1733_cov_25.593373_g6640_i0.p1,70.0,40.0,11.0,11.0,5.0,9.0,34.0,17.0


# 2. Create DeSeq object
---------
## Create the DESeq2 object, normalize the counts for plotting later, and run the differential expression analysis
We should have four kinds of objects for each sample now, dds (deseq object), normalized dds, differential expression analysis object (de), and the differential expression results (res)
## 2.1 create dds

In [31]:
# Build a DESeqDataSet from a tximport object with design ~treatment, using
# "Low_Iron" as the reference level, and drop weakly expressed rows.
#
# Args:
#   raw_counts: tximport result.
#   metadata:   sample table passed as colData.
#   n:          a row is kept when at least n samples have a count >= 5.
# Returns:
#   The filtered DESeqDataSet. The row count is printed before and after filtering.
dds <- function(raw_counts, metadata, n){
    dataset <- DESeqDataSetFromTximport(raw_counts, colData=metadata, design=~treatment)
    dataset$treatment <- relevel(dataset$treatment, ref = "Low_Iron")

    print(nrow(dataset))
    expressed <- rowSums(counts(dataset) >= 5) >= n
    dataset <- dataset[expressed, ]
    print(nrow(dataset))

    dataset
}

# One filtered DESeqDataSet per isolate; the last argument is the minimum number
# of samples that must reach the count threshold.
dds4  <- dds(counts_4,  metadata_4,  2)
dds8  <- dds(counts_8,  metadata_8,  2)
dds6  <- dds(counts_6,  metadata_6,  3)
dds13 <- dds(counts_13, metadata_13, 3)

# List the assays stored in the isolate 13 object
head(assays(dds13))

“some variables in design formula are characters, converting to factors”
using counts and average transcript lengths from tximport



[1] 40907
[1] 27159


“some variables in design formula are characters, converting to factors”
using counts and average transcript lengths from tximport



[1] 37958
[1] 23714


“some variables in design formula are characters, converting to factors”
using counts and average transcript lengths from tximport



[1] 53342
[1] 35097


“some variables in design formula are characters, converting to factors”
using counts and average transcript lengths from tximport



[1] 48089
[1] 29014


List of length 2
names(2): counts avgTxLength

## Below are flavodoxin genes to look for with kegg annotations to make sure they are not lost later

In [32]:

#isip2a:

# Counts of isolate 13 as a data frame with the ORF ids in an 'orfs' column
d13 <- rownames_to_column(as.data.frame(assay(dds13)), 'orfs')

# Display the rows of the ORFs of interest, one table per ORF
d13 %>% filter(orfs == 'NODE_19011_length_1508_cov_60.425087_g9146_i1.p1')
d13 %>% filter(orfs == 'NODE_32738_length_942_cov_54.683544_g18313_i0.p1')
d13 %>% filter(orfs == 'NODE_34623_length_873_cov_1933.596250_g19622_i0.p1')

orfs,add_backA,add_backB,add_backC,pFe19A,pFe19B,pFe19C,pFe21_9A,pFe21_9B,pFe21_9C
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
NODE_19011_length_1508_cov_60.425087_g9146_i1.p1,42,60,3,116,103,64,38,35,22


orfs,add_backA,add_backB,add_backC,pFe19A,pFe19B,pFe19C,pFe21_9A,pFe21_9B,pFe21_9C
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
NODE_32738_length_942_cov_54.683544_g18313_i0.p1,0,0,0,0,0,0,102,121,97


orfs,add_backA,add_backB,add_backC,pFe19A,pFe19B,pFe19C,pFe21_9A,pFe21_9B,pFe21_9C
<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
NODE_34623_length_873_cov_1933.596250_g19622_i0.p1,0,0,0,0,0,0,3619,3036,2551


### 2.2 VSD Normalize counts 
---
Order rows by variance across treatment. Top rows will have highest variance in normalized counts between treatments. Save in vsd folder. 

In [33]:
# Apply DESeq2's vst() to a DESeqDataSet with blind=FALSE and return the result.
vsd.norm <- function(dds) {
    transformed <- vst(dds, blind=FALSE)
    transformed
}

# Full variance-stabilized objects, one per isolate
vsd4  <- vsd.norm(dds4)
vsd8  <- vsd.norm(dds8)
vsd6  <- vsd.norm(dds6)
vsd13 <- vsd.norm(dds13)

## Order the rows by decreasing variance (top rows have the highest variance
## between samples) and write the data frame to ./vsd_files/<org>vsd.csv.
##
## Args:
##   vsd: variance-stabilized object whose assay() is the normalized matrix.
##   org: isolate label used in the progress message and the file name.
write.vsd <- function(vsd, org){
    norm_mat <- assay(vsd)
    by_variance <- order(rowVars(norm_mat), decreasing=TRUE)
    sorted_mat <- norm_mat[by_variance, ]

    print(paste('saving', org))

    out <- rownames_to_column(as.data.frame(sorted_mat), "orfs")
    write.csv(out, paste0('./vsd_files/', org, "vsd.csv"), row.names=FALSE)
}

# Save the variance-ordered normalized counts of every isolate
write.vsd(vsd = vsd4,  org = "04")
write.vsd(vsd = vsd8,  org = "08")
write.vsd(vsd = vsd6,  org = "06")
write.vsd(vsd = vsd13, org = "13")

using 'avgTxLength' from assays(dds), correcting for library size

using 'avgTxLength' from assays(dds), correcting for library size

using 'avgTxLength' from assays(dds), correcting for library size

using 'avgTxLength' from assays(dds), correcting for library size



[1] "saving 04"
[1] "saving 08"
[1] "saving 06"
[1] "saving 13"


## 2.3 Differential Expression from DESeq
---
Run the differential expression. 

Extract the results using results() and specifying the contrast, i.e. which treatments we wish to compare.  

In [34]:
# Run the DESeq2 pipeline on each isolate's dataset
de4  <- DESeq(dds4)
de8  <- DESeq(dds8)
de6  <- DESeq(dds6)
de13 <- DESeq(dds13)

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [35]:

# Run the DESeq2 pipeline on each isolate's dataset
de4  <- DESeq(dds4)
de8  <- DESeq(dds8)
de6  <- DESeq(dds6)
de13 <- DESeq(dds13)

# Contrasts, given as c(design factor, numerator level, denominator level)
HvL <- c("treatment", "High_Iron", "Low_Iron")
AvL <- c("treatment", "Add_Back", "Low_Iron")

# Extract a tidy results table for one contrast and name its first column "orfs".
orf_results <- function(de, contrast) {
    res <- results(de, contrast=contrast, tidy=TRUE)
    colnames(res)[1] <- "orfs"
    res
}

# Add_Back vs Low_Iron
AvL4  <- orf_results(de4,  AvL)
AvL8  <- orf_results(de8,  AvL)
AvL6  <- orf_results(de6,  AvL)
AvL13 <- orf_results(de13, AvL)

# High_Iron vs Low_Iron (no such results table is extracted for isolate 4)
HvL8  <- orf_results(de8,  HvL)
HvL6  <- orf_results(de6,  HvL)
HvL13 <- orf_results(de13, HvL)

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

using 'avgTxLength' from assays(dds), correcting for library size

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [51]:
# Show the second sample-annotation column of de4 (the printed output labels it 'isolate')
colData(de4)[2]

DataFrame with 5 rows and 1 column
              isolate
          <character>
add_backA          04
add_backB          04
pFe21_9A           04
pFe21_9B           04
pFe21_9C           04

In [38]:
# Summarise, per isolate and contrast, the number of differentially expressed
# ORFs (padj < 0.05) and the percentage they represent of the rows in the
# fitted object; the table is displayed and written to ./de_res_files/de_output.csv.
contrast <- list('HvL' = c("treatment", "High_Iron", "Low_Iron"),
                 'AvL' = c("treatment", "Add_Back", "Low_Iron"))

# Empty template so the result has the expected columns even when nothing is added
de_output <- data.frame('Contrast'=as.character(), 'Organism'=as.character(),
                        'num.de'=as.numeric(), 'percent de'=as.numeric())

# Each fitted object is keyed by the label it is reported under, instead of
# recovering the label from positional colData(o)[1,2] comparisons.
organism <- list('C. closterium UGA4' = de4,
                 'C. closterium UGA8' = de8,
                 'G. oceanica'        = de6,
                 'G. huxleyi'         = de13)

de_rows <- list()
for (isolate in names(organism)) {
    o <- organism[[isolate]]
    sampled <- as.character(unique(colData(o)$treatment))
    for (con in contrast) {
        # Skip contrasts whose treatment levels were not sampled for this
        # isolate (e.g. High_Iron for C. closterium UGA4).
        if (!all(con[2:3] %in% sampled)) {
            next
        }
        de.c <- results(o, contrast=con, tidy=TRUE)
        # padj can be NA; those rows are not counted as significant
        de.num <- sum(de.c$padj < 0.05, na.rm=TRUE)
        de.percent <- de.num / nrow(o) * 100
        de_rows[[length(de_rows) + 1]] <- data.frame('Contrast'=con[2], 'Organism'=isolate,
                                                     'num.de'=de.num, 'percent de'=de.percent)
    }
}
# Bind once instead of growing the data frame inside the loop
de_output <- do.call(rbind, c(list(de_output), de_rows))

de_output
write.csv(de_output, './de_res_files/de_output.csv', row.names=FALSE)

Contrast,Organism,num.de,percent.de
<chr>,<chr>,<int>,<dbl>
Add_Back,C. closterium UGA4,880,3.240178
High_Iron,C. closterium UGA8,1649,6.953698
Add_Back,C. closterium UGA8,1910,8.054314
High_Iron,G. oceanica,3177,9.052056
Add_Back,G. oceanica,2202,6.274041
High_Iron,G. huxleyi,1783,6.145309
Add_Back,G. huxleyi,2032,7.003516


## Log2 fold change shrinkage

In [39]:
## log fold change shrinkage and save file

# Shrink the log2 fold changes of one contrast with 'ashr' and write the table
# to ./de_res_files/lfc<file_name>.csv, with the row names in a "ko_id" column.
#
# Args:
#   de:        fitted DESeq2 object.
#   contrast:  contrast passed to lfcShrink().
#   file_name: text inserted between "lfc" and ".csv" in the output file name.
# Returns:
#   The shrunken table, invisibly.
logFC <- function(de, contrast, file_name){
    lfc.de <- lfcShrink(de, contrast=contrast, type='ashr')
    lfc.de <- lfc.de %>% as.data.frame() %>% rownames_to_column("ko_id")
    # The extension needs its dot; "csv" alone produced names like "lfc4.AvLcsv"
    write.csv(lfc.de, paste0("./de_res_files/lfc", file_name, ".csv"), row.names=FALSE)
    invisible(lfc.de)
}

# Shrink the log2 fold changes of one contrast with 'ashr', move the row names
# into an "orfs" column, write the table to `path`, and return it.
shrink_orfs <- function(de, contrast, path) {
    shrunk <- lfcShrink(de, contrast=contrast, type='ashr')
    shrunk <- rownames_to_column(as.data.frame(shrunk), "orfs")
    write.csv(shrunk, path, row.names=FALSE)
    shrunk
}

lfc_AvL <- c("treatment", "Add_Back", "Low_Iron")
lfc_HvL <- c("treatment", "High_Iron", "Low_Iron")

# Add_Back vs Low_Iron
lfc4.AvL  <- shrink_orfs(de4,  lfc_AvL, "./de_res_files/lfc4.AvL.csv")
lfc8.AvL  <- shrink_orfs(de8,  lfc_AvL, "./de_res_files/lfc8.AvL.csv")
lfc6.AvL  <- shrink_orfs(de6,  lfc_AvL, "./de_res_files/lfc6.AvL.csv")
lfc13.AvL <- shrink_orfs(de13, lfc_AvL, "./de_res_files/lfc13.AvL.csv")

# High_Iron vs Low_Iron
lfc8.HvL  <- shrink_orfs(de8,  lfc_HvL, "./de_res_files/lfc8.HvL.csv")
lfc6.HvL  <- shrink_orfs(de6,  lfc_HvL, "./de_res_files/lfc6.HvL.csv")
lfc13.HvL <- shrink_orfs(de13, lfc_HvL, "./de_res_files/lfc13.HvL.csv")

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/

## 2.4 Subset results tables by significance and save
df's made into a list and looped through to order rows by adjusted p value and pull out significant rows. saving both files. Sorting the _res files positive or negative log2fold change will enable me to get the up regulated and down regulated genes.

In [13]:
# Collect the results tables that exist, keyed "<sample><contrast>" (e.g. "8HvL"),
# then save each one ordered by adjusted p-value, and its significant subset.
samples <- c('4','8','6','13')
contrasts <- c('AvL','HvL')

# Each table is looked up and named in the same step, so a name can never be
# attached to the wrong table. The previous version built the names
# sample-by-sample but listed the tables contrast-by-contrast, and referenced
# AvH* objects that are not created in this notebook.
res.ls <- list()
for (s in samples){
    for (con in contrasts){
        res.name <- paste0(con, s)
        if (!exists(res.name)){
            print(paste('contrast ', con, ' for ', s, ' not found'))
            next
        }
        res.ls[[paste0(s, con)]] <- get(res.name)
    }
}
df.names <- as.list(names(res.ls))

res.ls <- lapply(res.ls, function(df){   #order each df in list by p.value
    arrange(df, padj)
})

#save each df ordered by p.value
walk2(res.ls, paste0("./de_res_files/", names(res.ls), "_res.csv"), write.csv, row.names=FALSE)

res.ls.sig <- lapply(res.ls, function(df){   #pull out significant DE's
    filter(df, padj<=0.05)
})

#save significant de's for each df
walk2(res.ls.sig, paste0("./de_res_files/", names(res.ls), "sig_res.csv"), write.csv, row.names=FALSE)

[1] "contrast  HvL  for  4  not found"
[1] "contrast  AvH  for  4  not found"


# Make table of % kegg annotated orfs and other functional annotation info

In [40]:
# ORF-to-KO lists, one per isolate
ko4_ls  <- read.csv('../kegg_names/ko4_ls.csv')
ko8_ls  <- read.csv('../kegg_names/ko8_ls.csv')
ko6_ls  <- read.csv('../kegg_names/ko6_ls.csv')
ko13_ls <- read.csv('../kegg_names/ko13_ls.csv')

# Rows of a KO list with ko_iteration 'ko:_1', as a percentage of the rows of
# the fitted object.
percent_annotated <- function(ko_ls, de) {
    first_ko <- filter(ko_ls, ko_iteration=='ko:_1')
    nrow(first_ko) / nrow(de) * 100
}

#make sure to REMOVE ORFS NOT COUNTED BY SALMON!
ko_annotated <- data.frame(
    'Organism'=c('C. closterium 4', 'C. closterium 8', 'G. oceanica', 'G. huxleyi'),
    'Anotated orfs (%)'=c(
        percent_annotated(ko4_ls, de4),
        percent_annotated(ko8_ls, de8),
        percent_annotated(ko6_ls, de6),
        percent_annotated(ko13_ls, de13)))
head(ko_annotated)
write.csv(ko_annotated, './de_res_files/ko_anotat_de.csv', row.names=FALSE)

Unnamed: 0_level_0,Organism,Anotated.orfs....
Unnamed: 0_level_1,<chr>,<dbl>
1,C. closterium 4,45.27044
2,C. closterium 8,46.3355
3,G. oceanica,66.00849
4,G. huxleyi,70.26608


# Now repeat steps but summarize to 'kegg gene level'
## 1. Make df of raw counts which are summed to kegg level.
#### Read in the list of ORF-to-Ko's for each sample 
This list repeats orfs when multiple KOs were assigned to a single orf by eggnog. Using the 'orf' column from the organism-specific KO list, we want to map the counts to each row of the orf-to-KO list. This will automatically repeat the counts for each repeated orf in the list and allow us to sum orfs with matching KOs later. 

<b> Remember, not all rows from the counts table will have a ko assigned and thus will not appear in the orf-to-ko list. Additionally, not all orfs annotated by eggnog were counted by salmon. Thus we must remove any rows from the orf-to-ko table for which orfs are not found </b>

Because some orfs had multiple ko assignments the merging relationship will be many-to-one, many orf-ko rows matching to one counts row. "Each row in x (orf-to-ko list) matches at most 1 row in y (counts table)."

Once the two tables are merged, we can group by ko_id and sum the counts which have the same ko_id. The result is a new raw counts table (matrix) which we can use in DESeq2. 

# Split counts by Ko

In [3]:
ko.def <- read.csv('../kegg_names/ko_def.csv')

# Build a KO-level integer count matrix from transcript-level counts.
#
# The counts of an ORF assigned to several KOs are divided evenly between them
# (count / number of KOs for that ORF), then summed per KO.
#
# Args:
#   org:        isolate label used in the KO list file name ('../kegg_names/ko<org>_ls.csv').
#   counts.raw: tximport result; its $counts matrix has ORFs as row names.
# Returns:
#   Integer matrix with one row per ko_id and one column per sample.
split.sum.kegg <- function(org, counts.raw){
    ko_df <- read.csv(paste0('../kegg_names/ko', org, '_ls.csv'))
    ko_df$ko_iteration <- as.numeric(str_remove(ko_df$ko_iteration, 'ko:_'))
    # max_ko = number of KOs assigned to the ORF (highest iteration index)
    ko_df <- ko_df %>%
        group_by(orfs) %>%
        mutate(max_ko = max(ko_iteration)) %>%
        ungroup()

    # make raw counts matrix into a tibble
    counts <- as_tibble(counts.raw$counts, rownames = "orfs")
    sample_cols <- colnames(counts.raw$counts)

    # merge the two, so orfs and their counts are repeated when they match to multiple ko's
    b <- left_join(x=ko_df, y=counts, by='orfs', relationship="many-to-one")

    # split counts for each ko_id; the sample columns are selected by name
    # rather than by position, so the result does not depend on column order
    # or on whether the data are grouped
    split_count <- b %>% mutate(across(all_of(sample_cols), ~ .x / max_ko))

    # drop ORFs without counts (NA after the join), then group by ko and sum counts
    sum_count <- split_count %>%
        na.omit() %>%
        select(!c(orfs, ko_iteration, max_ko)) %>%
        group_by(ko_id) %>%
        summarise(across(everything(), sum)) %>%
        column_to_rownames("ko_id") %>%
        as.matrix()

    # Coercion to integer truncates fractional counts toward zero
    mode(sum_count) <- 'integer'
    sum_count
}

# KO-level count matrices, one per isolate
split_counts_4  <- split.sum.kegg(org = '4',  counts.raw = counts_4)
split_counts_8  <- split.sum.kegg(org = '8',  counts.raw = counts_8)
split_counts_6  <- split.sum.kegg(org = '6',  counts.raw = counts_6)
split_counts_13 <- split.sum.kegg(org = '13', counts.raw = counts_13)

In [4]:
## Create a DESeqDataSet from a KO-level count matrix with design ~treatment,
## using "Low_Iron" as the reference level, and drop weakly expressed rows.
##
## Args:
##   split.counts: integer count matrix (rows = ko_id, columns = samples).
##   metadata:     sample table passed as colData.
##   n:            lowest number of reps in any treatment; a row is kept when at
##                 least n samples have a count >= 10.
kegg.dds <- function(split.counts, metadata, n) {
    dataset <- DESeqDataSetFromMatrix(countData = split.counts,
                                      colData = metadata,
                                      design = ~treatment)
    dataset$treatment <- relevel(dataset$treatment, ref = "Low_Iron")

    expressed <- rowSums(counts(dataset) >= 10) >= n
    dataset[expressed, ]
}

# KO-level datasets; the last argument is the minimum number of samples that
# must reach the count threshold.
kegg.dds4  <- kegg.dds(split_counts_4,  metadata_4,  2)
kegg.dds8  <- kegg.dds(split_counts_8,  metadata_8,  2)
kegg.dds6  <- kegg.dds(split_counts_6,  metadata_6,  3)
kegg.dds13 <- kegg.dds(split_counts_13, metadata_13, 3)

“some variables in design formula are characters, converting to factors”
“some variables in design formula are characters, converting to factors”
“some variables in design formula are characters, converting to factors”
“some variables in design formula are characters, converting to factors”


In [5]:
## Variance-stabilize (vst with blind=FALSE); the results are saved ordered by
## variance further down.
vsd.norm <- function(dds) {
    transformed <- vst(dds, blind=FALSE)
    transformed
}
# Variance-stabilized KO-level objects, one per isolate
kegg.vsd4  <- vsd.norm(dds = kegg.dds4)
kegg.vsd8  <- vsd.norm(dds = kegg.dds8)
kegg.vsd6  <- vsd.norm(dds = kegg.dds6)
kegg.vsd13 <- vsd.norm(dds = kegg.dds13)

# Order the KO-level normalized matrix by decreasing row variance and write it
# to ./vsd_files/vsd.<org>.kegg.csv with the KO ids in a 'ko_id' column.
#
# NOTE(review): unlike write.vsd(), write.csv() is called without
# row.names=FALSE here, so the file also gets a leading row-number column —
# confirm whether downstream readers expect it.
write.vsd.kegg <- function(vsd, org){
    norm_mat <- assay(vsd)
    by_variance <- order(rowVars(norm_mat), decreasing=TRUE)
    sorted_mat <- norm_mat[by_variance, ]
    out <- rownames_to_column(as.data.frame(sorted_mat), 'ko_id')
    write.csv(out, paste0('./vsd_files/vsd.', org, ".kegg.csv"))
}

# Save the variance-ordered KO-level normalized counts of every isolate
write.vsd.kegg(vsd = kegg.vsd4,  org = '04')
write.vsd.kegg(vsd = kegg.vsd8,  org = '08')
write.vsd.kegg(vsd = kegg.vsd6,  org = '06')
write.vsd.kegg(vsd = kegg.vsd13, org = '13')

In [6]:
## Run the differential expression analysis on the KO-level datasets
de4.kegg  <- DESeq(kegg.dds4)
de8.kegg  <- DESeq(kegg.dds8)
de6.kegg  <- DESeq(kegg.dds6)
de13.kegg <- DESeq(kegg.dds13)

# Contrasts, given as c(design factor, numerator level, denominator level)
HvL <- c("treatment", "High_Iron", "Low_Iron")
AvL <- c("treatment", "Add_Back", "Low_Iron")

# Extract a tidy results table for one contrast and name its first column "ko_id".
ko_results <- function(de, contrast) {
    res <- results(de, contrast=contrast, tidy=TRUE)
    colnames(res)[1] <- "ko_id"
    res
}

# Add_Back vs Low_Iron
AvL4.kegg  <- ko_results(de4.kegg,  AvL)
AvL8.kegg  <- ko_results(de8.kegg,  AvL)
AvL6.kegg  <- ko_results(de6.kegg,  AvL)
AvL13.kegg <- ko_results(de13.kegg, AvL)

# High_Iron vs Low_Iron
HvL8.kegg  <- ko_results(de8.kegg,  HvL)
HvL6.kegg  <- ko_results(de6.kegg,  HvL)
HvL13.kegg <- ko_results(de13.kegg, HvL)

# Save the results; rows containing any NA are left out of the files only
write.csv(na.omit(AvL4.kegg),  './de_res_files/AvL4.kegg.csv',  row.names=FALSE)
write.csv(na.omit(AvL8.kegg),  './de_res_files/AvL8.kegg.csv',  row.names=FALSE)
write.csv(na.omit(AvL6.kegg),  './de_res_files/AvL6.kegg.csv',  row.names=FALSE)
write.csv(na.omit(AvL13.kegg), './de_res_files/AvL13.kegg.csv', row.names=FALSE)

write.csv(na.omit(HvL8.kegg),  './de_res_files/HvL8.kegg.csv',  row.names=FALSE)
write.csv(na.omit(HvL6.kegg),  './de_res_files/HvL6.kegg.csv',  row.names=FALSE)
write.csv(na.omit(HvL13.kegg), './de_res_files/HvL13.kegg.csv', row.names=FALSE)



estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing

estimating size factors

estimating dispersions

gene-wise dispersion estimates

mean-dispersion relationship

final dispersion estimates

fitting model and testing



In [7]:
## log fold change shrinkage and save file

# Shrink the log2 fold changes of one contrast with 'ashr' and write the table
# to ./de_res_files/lfc<file_name>.csv, with the row names in a "ko_id" column.
logFC <- function(de, contrast, file_name) {
    shrunk <- lfcShrink(de, contrast=contrast, type='ashr')
    shrunk <- rownames_to_column(as.data.frame(shrunk), "ko_id")
    out_path <- paste0("./de_res_files/lfc", file_name, ".csv")
    write.csv(shrunk, out_path, row.names=FALSE)
}

# Shrink and save every KO-level contrast (no HvL call is made for isolate 4)
logFC(de4.kegg,  AvL, '4.AvL.kegg')
logFC(de8.kegg,  AvL, '8.AvL.kegg')
logFC(de8.kegg,  HvL, '8.HvL.kegg')
logFC(de6.kegg,  AvL, '6.AvL.kegg')
logFC(de6.kegg,  HvL, '6.HvL.kegg')
logFC(de13.kegg, AvL, '13.AvL.kegg')
logFC(de13.kegg, HvL, '13.HvL.kegg')



using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/10.1093/biostatistics/kxw041

using 'ashr' for LFC shrinkage. If used in published research, please cite:
    Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
    https://doi.org/

In [27]:
# Number and percentage of KOs with padj < 0.05 for Add_Back vs Low_Iron in
# isolates 4 and 8 (rows with NA padj are not counted).
avl4 <- sum(AvL4.kegg$padj < 0.05, na.rm=TRUE)
avl4.perc <- (avl4/nrow(de4.kegg))*100

# Previously computed from HvL8.kegg (High_Iron vs Low_Iron) despite the avl8
# name; it now uses the Add_Back vs Low_Iron table like avl4.
avl8 <- sum(AvL8.kegg$padj < 0.05, na.rm=TRUE)
avl8.perc <- (avl8/nrow(de8.kegg))*100

avl4.perc
avl4

avl8.perc
avl8

In [10]:
# Summarise, per isolate and contrast, the number of differentially expressed
# KOs (padj < 0.05) and the percentage they represent of the rows in the fitted
# object; the table is written to ./de_res_files/de_ko_output.csv and displayed.
contrast <- list('HvL' = c("treatment", "High_Iron", "Low_Iron"),
                 'AvL' = c("treatment", "Add_Back", "Low_Iron"))

# Empty template so the result has the expected columns even when nothing is added
de_ko_output <- data.frame('Contrast'=as.character(), 'Organism'=as.character(),
                           'num.de.ko'=as.numeric(), 'percent.de.ko'=as.numeric())

# Each fitted object is keyed by the label it is reported under, instead of
# recovering the label from positional colData(o)[1,2] comparisons.
organism <- list('C. closterium UGA4' = de4.kegg,
                 'C. closterium UGA8' = de8.kegg,
                 'G. oceanica'        = de6.kegg,
                 'G. huxleyi'         = de13.kegg)

de_ko_rows <- list()
for (isolate in names(organism)) {
    o <- organism[[isolate]]
    sampled <- as.character(unique(colData(o)$treatment))
    for (con in contrast) {
        # Skip contrasts whose treatment levels were not sampled for this
        # isolate (e.g. High_Iron for C. closterium UGA4).
        if (!all(con[2:3] %in% sampled)) {
            next
        }
        de.c <- results(o, contrast=con, tidy=TRUE)
        # padj can be NA; those rows are not counted as significant
        de.num <- sum(de.c$padj < 0.05, na.rm=TRUE)
        de.percent <- de.num / nrow(o) * 100
        de_ko_rows[[length(de_ko_rows) + 1]] <- data.frame('Contrast'=con[2], 'Organism'=isolate,
                                                           'num.de.ko'=de.num, 'percent.de.ko'=de.percent)
    }
}
# Bind once instead of growing the data frame inside the loop
de_ko_output <- do.call(rbind, c(list(de_ko_output), de_ko_rows))

write.csv(de_ko_output,'./de_res_files/de_ko_output.csv')
de_ko_output

Contrast,Organism,num.de.ko,percent.de.ko
<chr>,<chr>,<int>,<dbl>
Add_Back,C. closterium UGA4,198,6.655462
High_Iron,C. closterium UGA8,254,9.000709
Add_Back,C. closterium UGA8,799,28.313253
High_Iron,G. oceanica,1186,25.159101
Add_Back,G. oceanica,1109,23.525668
High_Iron,G. huxleyi,1089,25.732514
Add_Back,G. huxleyi,1022,24.149338


## Now we have ordered vsd file and results file (for each comparison) for both transcript level and counts summarized at kegg annotation level. 
#### Taking these files we will create plots. 
## It might be best to use transcript level for ma plots and 'gene' level for clusterProfiler?