In [None]:
##########################################
# Filtering and normalization of ASV counts
##########################################


In [2]:
# Fix the random seed so that any stochastic step below is reproducible
set.seed(32426)

# Load the project helper and use it to make the required packages available
# (presumably it installs missing packages and attaches them -- see the
# sourced file for the exact behaviour)
source("scr/functions/general/load_abs_install_pkg.R")

load_abs_install_pkg(c("phyloseq", "metagenomeSeq", "tidyverse", "RColorBrewer", "gmodels"))

# Create the output directory.
# recursive = TRUE also creates a missing parent directory, and
# showWarnings = FALSE keeps re-runs quiet when the directory already exists.
dir.create("output/3_filt_norm_phyl", recursive = TRUE, showWarnings = FALSE)


In [3]:

# 1. Load the phyloseq object produced by the DADA2 step
########################################################

# ps.0 is the unfiltered starting point for every step below
ps.0 <- readRDS(file = "output/2_dada2/phyloseq0.rds")

In [4]:

# 2. Remove ASVs whose total count and/or prevalence is 3 or less
#################################################################

# Keep only ASVs with more than 3 observations in total.
# taxa_sums() is used instead of colSums() so that the filter does not depend
# on whether taxa are stored as rows or as columns of the ASV table.
ps.0f <- prune_taxa(taxa_sums(ps.0) > 3, ps.0)

# Extract otu table into otu_prev object
otu_prev <- otu_table(ps.0f)

# Convert otu_prev from count to presence/absence data
# (every positive count becomes 1, zeros stay 0)
otu_prev[otu_prev > 0] <- 1

# Prevalence = number of samples in which each ASV is present; sum over the
# sample dimension, whichever margin that is for this object
asv.prev <- if (taxa_are_rows(ps.0f)) rowSums(otu_prev) else colSums(otu_prev)

# Keep only ASVs that are present in more than 3 samples
ps_tf1 <- prune_taxa(asv.prev > 3, ps.0f)

In [5]:

# 3. Remove samples that have 1000 or fewer observations
########################################################

# Extract the ASV table of the taxa-filtered phyloseq object
o.tab.ps_tf1 <- otu_table(ps_tf1)

# Keep only samples with more than 1000 observations.
# sample_sums() gives the per-sample totals regardless of the orientation of
# the ASV table, so no assumption about rows/columns is needed here.
ps_tf2 <- prune_samples(sample_sums(ps_tf1) > 1000, ps_tf1)

In [6]:

# 4. Normalize ASV abundance using CSS as implemented in metagenomeSeq
######################################################################

# Convert the filtered phyloseq object into a metagenomeSeq object
mg.ps_tf2 <- phyloseq_to_metagenomeSeq(ps_tf2)

# Calculate the percentile used for cumulative sum scaling
p <- metagenomeSeq::cumNormStatFast(mg.ps_tf2)

# Calculate the normalization factors
mg.c.ps_tf2 <- metagenomeSeq::cumNorm(mg.ps_tf2, p = p)

# Extract the normalized, log-transformed counts as a plain matrix
# (features in rows, samples in columns)
css.counts <- metagenomeSeq::MRcounts(mg.c.ps_tf2, norm = TRUE, log = TRUE)

# Data frame copy of the normalized counts.
# as.data.frame() on a matrix leaves the column (sample) names untouched, so
# no "X" prefix or "-" -> "." substitution is introduced and nothing has to be
# repaired with gsub() afterwards (the previous repair removed every "X" and
# replaced every "." in the sample names, not only the mangled ones).
css.otu.all <- as.data.frame(css.counts)

# Samples x ASVs matrix with the original sample and ASV names
css.otu.all.m <- t(css.counts)

# Make a copy of the original phyloseq object
ps_tf2_css <- ps_tf2

# Replace the ASV count table with the CSS normalized table through the
# phyloseq accessor (instead of writing to the @.Data slot) so that sample
# and taxa names are validated against the other components
otu_table(ps_tf2_css) <- otu_table(css.otu.all.m, taxa_are_rows = FALSE)

# The accessor keeps only names shared by all components; fail loudly if any
# sample or ASV was lost because of a name mismatch
stopifnot(
  nsamples(ps_tf2_css) == nsamples(ps_tf2),
  ntaxa(ps_tf2_css) == ntaxa(ps_tf2)
)

# Write new phyloseq into RDS file
saveRDS(ps_tf2_css, "output/3_filt_norm_phyl/ps_tf2_css.RDS")

Default value being used.


In [7]:

# 5. Extract general information about sequencing
##################################################

# Number of samples per animal (one entry per value of CowN)
samp.per.cow <- table(ps_tf2_css@sam_data$CowN)

# Median number of samples per animal
med.samp.n <- median(samp.per.cow)

# Write information about number of samples per animal into a file.
# The output file is passed positionally because the `path` argument is
# deprecated in recent readr versions (renamed to `file`); the positional
# form works with both old and new versions.
write_csv(as.data.frame(samp.per.cow),
          "output/3_filt_norm_phyl/Table_S1.csv")

# Extract otu table from filtered and not normalized phyloseq object for
#         summary statistics
otus.tab <- otu_table(ps_tf2)

# Observations per sample, computed once and independent of the orientation
# of the ASV table
reads.per.sample <- sample_sums(ps_tf2)

# Total observations
reads.sum <- sum(otus.tab)

# Observations Median
reads.med <- median(reads.per.sample)

# Observations Min
reads.min <- min(reads.per.sample)

# Observations Max
reads.max <- max(reads.per.sample)

In [8]:

# 6. Visualize observations distribution
########################################

# Number of observations per sample (otus.tab is the filtered, not
# normalized ASV table extracted in section 5)
reads.per.samp <- rowSums(otus.tab)

# One row per sample: read count and sample ID
reads.pl.df <- data.frame(Reads = reads.per.samp,
                          SampleID = names(reads.per.samp),
                          stringsAsFactors = FALSE)

# Order samples by number of observations
reads.pl.df <- reads.pl.df[order(reads.pl.df$Reads), ]

# Fix the factor levels in that order so the bars are drawn from the
# smallest to the largest sample
reads.pl.df$SampleID <- factor(reads.pl.df$SampleID,
                               levels = unique(reads.pl.df$SampleID))

# Plot number of observations per sample; the blue line marks the mean
reads.pers.p <- ggplot(reads.pl.df, aes(x = SampleID, y = Reads)) +
  geom_bar(stat = "identity") +
  geom_hline(yintercept = mean(reads.pl.df$Reads), color = "blue") +
  ylab("Reads number") +
  xlab("Samples") +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank())

# Save the plot (stray double slash in the JPG path removed)
ggsave(filename = "output/3_filt_norm_phyl/Figure_S2.pdf",
       plot = reads.pers.p, width = 10, height = 6)
ggsave(filename = "output/3_filt_norm_phyl/Figure_S2.jpg",
       plot = reads.pers.p, width = 10, height = 6, dpi = 400)

In [9]:

# 7. Visualization of phylogenetic composition on Phylum level
##############################################################

# Prepare data for plotting
# Glom phyloseq to Phylum level
ps.tf2.plot <- tax_glom(ps_tf2, taxrank = "Phylum")

# Transform count to relative abundance (percent per sample)
ps.tf2.plot <- transform_sample_counts(ps.tf2.plot, function(x) x / sum(x) * 100)

# Extract otu table, transposed so that phyla are rows and samples columns
# NOTE(review): assumes samples are rows in the phyloseq object, as in the
# rest of this notebook -- confirm
phy.plot.d <- data.frame(t(otu_table(ps.tf2.plot)))

# Summed relative abundance of every phylum over all samples; must be
# calculated before the non-numeric Phylum column is added
phy.plot.rs <- rowSums(phy.plot.d)

# Add column with Phylum names
phy.plot.d$Phylum <- as.character(tax_table(ps.tf2.plot)[, "Phylum"])

# Melt dataframe into long format
phy.plot.dm <- gather(phy.plot.d, SampleID, Abundance, -Phylum)

# Phyla ordered from most to least abundant. The order is derived from the
# wide table (one row per phylum), which is what phy.plot.rs refers to,
# instead of relying on the row order of the long table.
phy.levels <- unique(phy.plot.d$Phylum[order(phy.plot.rs, decreasing = TRUE)])

# Order factors in column Phylum by abundance
phy.plot.dm$Phylum <- factor(phy.plot.dm$Phylum, levels = phy.levels)

# Prepare custom colors palette: 8 distinct colors for the most abundant
# phyla and grey for all remaining ones (at least 7 grey entries, more if
# there are more than 15 phyla)
n.grey <- max(7, length(phy.levels) - 8)
plot.col <- c(brewer.pal(n = 8, name = "Dark2"), rep("#666666", n.grey))

# Plot phylogenetic composition on Phylum level
phy.plot <- ggplot(phy.plot.dm, aes(x = SampleID, y = Abundance, fill = Phylum)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = plot.col) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        panel.grid = element_blank(),
        axis.ticks.x = element_blank())

# Save the plot; the plot object is passed explicitly instead of relying on
# last_plot()
ggsave(filename = "output/3_filt_norm_phyl/Figure_S3.pdf",
       plot = phy.plot, width = 10, height = 5.5)
ggsave(filename = "output/3_filt_norm_phyl/Figure_S3.jpg",
       plot = phy.plot, width = 10, height = 5.5, dpi = 400)
                                       

In [10]:

# 8. Summary table of phylogenetic composition on Phylum level
##############################################################

# Add phyla names as row names
rownames(phy.plot.d) <- phy.plot.d$Phylum

# Remove the Phylum column (only the numeric per-sample columns remain)
phy.plot.d2 <- phy.plot.d[, ! colnames(phy.plot.d) %in% "Phylum", drop = FALSE]

# Make summary statistics of the phyla relative abundance
# (samples in rows, phyla in columns, so summary() works per phylum)
sum.t <- as.data.frame(summary(t(phy.plot.d2)))

# Format summary statistics of the phyla relative abundance: summary()
# returns strings of the form "<statistic>:<value>", which are split into a
# statistic column (V2) and a value column (V3); V1 holds the phylum name
sum.t2 <- as.data.frame(cbind(as.character(sum.t$Var2),
              str_split(sum.t$Freq, pattern = ":", simplify = TRUE)))

# Convert Summary statistics into a wide table
sum.t3 <- spread(sum.t2, V2, V3)

# Reorder Summary statistics table for future bind with CI calculation results.
# summary() pads the phylum names with blanks, hence the trimming; base
# trimws() is used so that no additional package has to be attached.
sum.t3 <- sum.t3[order(trimws(as.character(sum.t3$V1))), ]

# Calculate CI per phylum
ci.t3 <- round(t(apply(as.matrix(phy.plot.d2), 1, ci)), 2)

# Reorder CI table to further combine with Summary statistics
# (drop = FALSE keeps the matrix shape when there is a single phylum)
ci.t3 <- ci.t3[order(rownames(ci.t3)), , drop = FALSE]

# The two tables are combined by position, so make sure they list the same
# phyla in the same order
if (!identical(trimws(as.character(sum.t3$V1)), rownames(ci.t3))) {
  stop("Summary and CI tables do not list the same phyla in the same order",
       call. = FALSE)
}

# Bind (wide) CI and Summary tables
summ.save <- cbind(sum.t3,  ci.t3)

# Write the combined table into file. The output file is passed positionally
# because the `path` argument is deprecated in recent readr versions.
write_csv(summ.save, "output/3_filt_norm_phyl/Table_S2.csv")

“No class or unkown class.  Using default calcuation.”