Skip to content

CBW 2024 Beginner Module 4: Functional prediction and additional analyses

Robyn Wright edited this page May 1, 2024 · 5 revisions

PICRUSt2

Get data from end of module 2

cd workspace
mkdir bmb_module4
cd bmb_module4/
ln -s ~/workspace/bmb_module2/deblur_output/ .
ln -s ~/workspace/bmb_module2/taxa/ .
ln -s ~/workspace/bmb_module2/Blueberry_metadata_reduced.tsv .
ls #to check everything copied over ok

Filter and export files

conda activate qiime2-amplicon-2024.2

qiime feature-table filter-features \
  --i-table deblur_output/deblur_table_final.qza \
  --p-min-frequency 50 \
  --p-min-samples 2 \
  --o-filtered-table final_table_filtered.qza
  
qiime feature-table filter-seqs \
  --i-data deblur_output/representative_sequences.qza \
  --i-table final_table_filtered.qza \
  --o-filtered-data  representative_sequences_final_filtered.qza
  
qiime tools export \
  --input-path representative_sequences_final_filtered.qza \
  --output-path exports_filtered

qiime tools export \
  --input-path final_table_filtered.qza \
  --output-path exports_filtered
  
biom convert \
  -i exports_filtered/feature-table.biom \
  -o exports_filtered/feature-table.txt \
  --to-tsv 
  
qiime tools export \
  --input-path taxa/classification.qza \
  --output-path taxa
  
sed -i -e '1 s/Feature/#Feature/' -e '1 s/Taxon/taxonomy/' taxa/taxonomy.tsv
  
conda deactivate

Run PICRUSt2

conda activate picrust2
picrust2_pipeline.py -s exports_filtered/dna-sequences.fasta -i exports_filtered/feature-table.biom -o picrust2_out_pipeline_filtered -p 4 -t sepp

Differential abundance

Read into Phyloseq

library(reticulate)
use_condaenv('/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/')
library(phyloseq)
library(Maaslin2, lib.loc="/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/lib/R/library")
library(ANCOMBC, lib.loc="/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/lib/R/library")
library(microbiome, lib.loc="/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/lib/R/library")
library(ALDEx2)
library(ggplot2)
library(tidyr)
library(stringr)
library(vegan)

folder = '/home/ubuntu/workspace/bmb_module4/'
asv_table <- read.csv(paste(folder, "exports_filtered/feature-table.txt", sep=""), sep='\t', skip=1, header=T)
asv_table_num = data.matrix(asv_table[,2:13]) #convert the ASV table to a numeric matric
rownames(asv_table_num) = asv_table[,1] #give the matrix row names

taxonomy <- read.csv(paste(folder, "taxa/taxonomy.tsv", sep=""), sep='\t')
taxonomy_split <- separate(data = taxonomy, col = taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = "\\; ") #separate the taxonomy table so each phylogenetic level is its own column
taxonomy_split <- taxonomy_split[,-c(1,9)] #remove the Feature.ID and Confidence columns from the taxonomy table
rownames(taxonomy_split) <- taxonomy[,1] #and now give the taxonomy table the OTU IDs as row names

metadata <- read.csv(paste(folder, "Blueberry_metadata_reduced.tsv", sep=""), sep='\t')
samples = metadata[,2:3] #get the metadata columns
rownames(samples) = metadata[,1] #and add the sample names as row names
samples = data.frame(samples, stringsAsFactors = FALSE) #convert this to a data frame

ASV = otu_table(asv_table_num, taxa_are_rows = TRUE)
TAX = tax_table(taxonomy_split)
taxa_names(TAX) <- rownames(taxonomy_split)
SAMPLE = sample_data(samples)
physeq = phyloseq(ASV, TAX, SAMPLE)
physeq
physeq_genus = tax_glom(physeq, taxrank="ta6")
all_tax = paste(tax_table(physeq_genus)[,2], tax_table(physeq_genus)[,3], tax_table(physeq_genus)[,4], tax_table(physeq_genus)[,5], tax_table(physeq_genus)[,6], sep=';')
taxa_names(physeq_genus) = all_tax
otu_table(physeq_genus)

MaAsLin2

#rarefy
physeq_rare = rarefy_even_depth(physeq_genus, sample.size = min(sample_sums(physeq_genus)), replace = TRUE, trimOTUs = TRUE, verbose = TRUE)

feat_table = data.frame(t(otu_table(physeq_rare)), check.rows=T, check.names=T, stringsAsFactors=T)
metadata = data.frame(sample_data(physeq_rare), stringsAsFactors = F)

results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_taxa_Description_1", sep=""), transform = "AST", fixed_effects = c("Description_1"), reference=paste("Description_1", "Bulk", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)
results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_taxa_Description_3", sep=""), transform = "AST", fixed_effects = c("Description_3"), reference=paste("Description_3", "Forest", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)

ANCOM2

ancom_out = ancombc(phyloseq=physeq_genus, formula="Description_1+Description_3", alpha=0.1)
w = ancom_out$res$W
q = ancom_out$res$q_val
p = ancom_out$res$p_val
rownames(w) = w[,1]
w = w[,-c(1:2)]
rownames(q) = q[,1]
q = q[,-c(1:2)]
rownames(p) = p[,1]
p = p[,-c(1:2)]
colnames(w) = c("Description_1 W", "Description_3 W")
colnames(q) = c("Description_1 q", "Description_3 q")
colnames(p) = c("Description_1 p", "Description_3 p")
a_out = merge(w, q, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
a_out = merge(a_out, p, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
write.csv(a_out, paste(folder, "ANCOM_taxa.csv", sep=""))

ALDEx2

x <- aldex.clr(otu_table(physeq_genus), sample_data(physeq)$Description_1, mc.samples = 128, verbose=F, denom="all")
kw.test.d1 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d1, paste(folder, "ALDEx2_taxa_Description_1.csv", sep=""))

x <- aldex.clr(otu_table(physeq_genus), sample_data(physeq)$Description_3, mc.samples = 128, verbose=F, denom="all")
kw.test.d3 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d3, paste(folder, "ALDEx2_taxa_Description_3.csv", sep=""))

Combine DA

Description_1:

maaslin = read.csv(paste(folder, "MaAsLin2_out_taxa_Description_1/significant_results.tsv", sep=""), sep="\t")
maaslin$feature = gsub("\\g__Burkholderia.Caballeronia.Paraburkholderia", "g__Burkholderia-Caballeronia-Paraburkholderia", maaslin$feature)
maaslin$feature = gsub("\\RCP2.54", "RCP2-54", maaslin$feature)
maaslin$feature = gsub("\\.", ";", maaslin$feature)
maaslin = maaslin[maaslin$qval <= 0.1, ]
aldex = read.csv(paste(folder, "ALDEx2_taxa_Description_1.csv", sep=""))
aldex = aldex[aldex$kw.eBH <= 0.1, ]
ancom = read.csv(paste(folder, "ANCOM_taxa.csv", sep=""))
ancom = ancom[ancom$Description_1.q <= 0.1, ]

taxa = c(maaslin$feature, aldex$X, ancom$X)
taxa = unique(taxa)
physeq_rare_d1 = prune_taxa(taxa, physeq_rare)
physeq_relabun_d1  = transform_sample_counts(physeq_rare_d1, function(x) (x / sum(x))*100 )

Plot DA single taxon

single_taxon = ""
#e.g. single_taxon="p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Beijerinckiaceae;g__Roseiarcus"

microbiome::boxplot_abundance(physeq_relabun_d1, x='Description_1', y=single_taxon, line = NULL, violin = FALSE, na.rm = FALSE, show.points = TRUE) + ggtitle(str_replace(single_taxon, ';f__', '\nf__')) + ylab('Relative abundance (%)')

Plot DA all taxa

for(i in 1:length(taxa)) {
  print(microbiome::boxplot_abundance(physeq_relabun_d1, x='Description_1', y=taxa[i]) + ggtitle(str_replace(taxa[i], ';f__', '\nf__')) + ylab('Relative abundance (%)'))
}

See if you can repeat these steps for Description_3

############
#only in answers
############
maaslin = read.csv(paste(folder, "MaAsLin2_out_taxa_Description_3/significant_results.tsv", sep=""), sep="\t")
maaslin$feature = gsub("\\g__Burkholderia.Caballeronia.Paraburkholderia", "g__Burkholderia-Caballeronia-Paraburkholderia", maaslin$feature)
maaslin$feature = gsub("\\RCP2.54", "RCP2-54", maaslin$feature)
maaslin$feature = gsub("\\.", ";", maaslin$feature)
maaslin = maaslin[maaslin$qval <= 0.1, ]
aldex = read.csv(paste(folder, "ALDEx2_taxa_Description_3.csv", sep=""))
aldex = aldex[aldex$kw.eBH <= 0.1, ]
ancom = read.csv(paste(folder, "ANCOM_taxa.csv", sep=""))
ancom = ancom[ancom$Description_3.q <= 0.1, ]

taxa = c(maaslin$feature, aldex$X, ancom$X)
taxa = unique(taxa)
physeq_rare_d3 = prune_taxa(taxa, physeq_rare)
physeq_relabun_d3 = transform_sample_counts(physeq_rare_d3, function(x) (x / sum(x))*100 )
############
#only in answers
############
single_taxon = ""
#e.g. single_taxon="p__Actinobacteriota;c__Actinobacteria;o__Catenulisporales;f__Actinospicaceae;g__Actinospica"

microbiome::boxplot_abundance(physeq_relabun_d3, x='Description_3', y=single_taxon, line = NULL, violin = FALSE, na.rm = FALSE, show.points = TRUE) + ggtitle(str_replace(single_taxon, ';f__', '\nf__')) + ylab('Relative abundance (%)')

HINT: have you made sure that you've changed Description_1 to Description_3 and d1 to d3 in every place that they occur?? HINT: make sure you change the taxon as it's probably different in this one!

############
#only in answers
############
for(i in 1:length(taxa)) {
  print(microbiome::boxplot_abundance(physeq_relabun_d3, x='Description_3', y=taxa[i]) + ggtitle(str_replace(taxa[i], ';f__', '\nf__')) + ylab('Relative abundance (%)'))
}

DA PICRUSt2

Now try to do the same with the PICRUSt2 file pathways_out/path_abun_unstrat.tsv. Hint: You'll need to unzip the file first, and you won't need the "TAX" option.

############
#only in answers
############
gunzip picrust2_out_pipeline_filtered/pathways_out/path_abun_unstrat.tsv.gz
############
#only in answers
############
folder = '/home/ubuntu/workspace/bmb_module4/'

pwy_table <- read.csv(paste(folder, "picrust2_out_pipeline_filtered/pathways_out/path_abun_unstrat.tsv", sep=""), sep='\t', header=T)
pwy_table_num = data.matrix(pwy_table[,2:13]) #convert the ASV table to a numeric matric
rownames(pwy_table_num) = pwy_table[,1] #give the matrix row names

metadata <- read.csv(paste(folder, "Blueberry_metadata_reduced.tsv", sep=""), sep='\t')
samples = metadata[,2:3] #get the metadata columns
rownames(samples) = metadata[,1] #and add the sample names as row names
samples = data.frame(samples, stringsAsFactors = FALSE) #convert this to a data frame

PWY = otu_table(pwy_table_num, taxa_are_rows = TRUE)
SAMPLE = sample_data(samples)
physeq_pwy = phyloseq(PWY, SAMPLE)
physeq_pwy_relabun = transform_sample_counts(physeq_pwy, function(x) (x / sum(x))*100 )
top_100_abun <- names(sort(taxa_sums(physeq_pwy_relabun), TRUE)[1:100]) #get most abundant ones
mode(pwy_table_num) = "integer"
PWY_int = otu_table(pwy_table_num, taxa_are_rows = TRUE)
physeq_pwy_int = phyloseq(PWY_int, SAMPLE)

HINT: this file doesn't have a junk first line

Alpha diversity:

plot_richness(physeq_pwy_int, x="Description_1", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
plot_richness(physeq_pwy_int, x="Description_3", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()

Beta diversity:

ps.ord <- ordinate(physeq_pwy_relabun, "PCoA", "bray")
plot_ordination(physeq_pwy_relabun, ps.ord, type="samples", color="Description_1", shape="Description_3") 

MaAsLIn2

############
#only in answers
############

#rarefy
physeq_rare_pwy = rarefy_even_depth(physeq_pwy, sample.size = min(sample_sums(physeq_pwy)), replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
physeq_rare_pwy_top_100 = prune_taxa(top_100_abun, physeq_rare_pwy)

feat_table = data.frame(t(otu_table(physeq_rare_pwy_top_100)), check.rows=F, check.names=F, stringsAsFactors=F)
metadata = data.frame(sample_data(physeq_rare), stringsAsFactors = F)
results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_pathways_Description_1", sep=""), transform = "AST", fixed_effects = c("Description_1"), reference=paste("Description_1", "Bulk", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)
results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_pathways_Description_3", sep=""), transform = "AST", fixed_effects = c("Description_3"), reference=paste("Description_3", "Forest", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)

HINT: make sure that you don't write over the previous folders and files that you made!

ALDEx2:

############
#only in answers
############
physeq_pwy_top_100 = prune_taxa(top_100_abun, physeq_pwy)
otu_table(physeq_pwy_top_100) = round(otu_table(physeq_pwy_top_100))

x <- aldex.clr(otu_table(physeq_pwy_top_100), sample_data(physeq_pwy_top_100)$Description_1, mc.samples = 128, verbose=F, denom="all")
kw.test.d1 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d1, paste(folder, "ALDEx2_pathways_Description_1.csv", sep=""))

x <- aldex.clr(otu_table(physeq_pwy_top_100), sample_data(physeq_pwy_top_100)$Description_3, mc.samples = 128, verbose=F, denom="all")
kw.test.d3 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d3, paste(folder, "ALDEx2_pathways_Description_3.csv", sep=""))

ANCOM2:

############
#only in answers
############
ancom_out = ancombc(phyloseq=physeq_pwy_top_100, formula="Description_1+Description_3", alpha=0.1)
w = ancom_out$res$W
q = ancom_out$res$q_val
p = ancom_out$res$p_val
rownames(w) = w[,1]
w = w[,-c(1:2)]
rownames(q) = q[,1]
q = q[,-c(1:2)]
rownames(p) = p[,1]
p = p[,-c(1:2)]
colnames(w) = c("Description_1 W", "Description_3 W")
colnames(q) = c("Description_1 q", "Description_3 q")
colnames(p) = c("Description_1 p", "Description_3 p")
a_out = merge(w, q, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
a_out = merge(a_out, p, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
write.csv(a_out, paste(folder, "ANCOM_pathways.csv", sep=""))

Combine:

############
#only in answers
############
maaslin = read.csv(paste(folder, "MaAsLin2_out_pathways_Description_1/significant_results.tsv", sep=""), sep="\t")
# maaslin$feature = gsub("\\g__Burkholderia.Caballeronia.Paraburkholderia", "g__Burkholderia-Caballeronia-Paraburkholderia", maaslin$feature)
# maaslin$feature = gsub("\\RCP2.54", "RCP2-54", maaslin$feature)
maaslin$feature = gsub("\\.", "-", maaslin$feature)
maaslin = maaslin[maaslin$qval <= 0.1, ]
aldex = read.csv(paste(folder, "ALDEx2_pathways_Description_1.csv", sep=""))
aldex = aldex[aldex$kw.eBH <= 0.1, ]
ancom = read.csv(paste(folder, "ANCOM_pathways.csv", sep=""))
ancom = ancom[ancom$Description_1.q <= 0.1, ]

pathways = c(ancom$X, aldex$X, maaslin$feature)
pathways = unique(pathways)
physeq_pwy_relabun_d1 = prune_taxa(pathways, physeq_pwy_relabun)
############
#only in answers
############
single_pathway = ""
#e.g. single_pathway="PWY-7222"

microbiome::boxplot_abundance(physeq_pwy_relabun_d1, x='Description_1', y=single_pathway, line = NULL, violin = FALSE, na.rm = FALSE, show.points = TRUE) + ggtitle(single_pathway) + ylab('Relative abundance (%)')
############
#only in answers
############
for(i in 1:length(pathways)) {
  print(microbiome::boxplot_abundance(physeq_pwy_relabun_d1, x='Description_1', y=pathways[i]) + ggtitle(pathways[i]) + ylab('Relative abundance (%)'))
}
Clone this wiki locally