CBW 2024 Beginner Module 4: Functional prediction and additional analyses
Robyn Wright edited this page May 1, 2024
·
5 revisions
cd workspace
mkdir bmb_module4
cd bmb_module4/
ln -s ~/workspace/bmb_module2/deblur_output/ .
ln -s ~/workspace/bmb_module2/taxa/ .
ln -s ~/workspace/bmb_module2/Blueberry_metadata_reduced.tsv .
ls #to check everything copied over ok
conda activate qiime2-amplicon-2024.2
qiime feature-table filter-features \
--i-table deblur_output/deblur_table_final.qza \
--p-min-frequency 50 \
--p-min-samples 2 \
--o-filtered-table final_table_filtered.qza
qiime feature-table filter-seqs \
--i-data deblur_output/representative_sequences.qza \
--i-table final_table_filtered.qza \
--o-filtered-data representative_sequences_final_filtered.qza
qiime tools export \
--input-path representative_sequences_final_filtered.qza \
--output-path exports_filtered
qiime tools export \
--input-path final_table_filtered.qza \
--output-path exports_filtered
biom convert \
-i exports_filtered/feature-table.biom \
-o exports_filtered/feature-table.txt \
--to-tsv
qiime tools export \
--input-path taxa/classification.qza \
--output-path taxa
sed -i -e '1 s/Feature/#Feature/' -e '1 s/Taxon/taxonomy/' taxa/taxonomy.tsv
conda deactivate
conda activate picrust2
picrust2_pipeline.py -s exports_filtered/dna-sequences.fasta -i exports_filtered/feature-table.biom -o picrust2_out_pipeline_filtered -p 4 -t sepp
library(reticulate)
use_condaenv('/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/')
library(phyloseq)
library(Maaslin2, lib.loc="/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/lib/R/library")
library(ANCOMBC, lib.loc="/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/lib/R/library")
library(microbiome, lib.loc="/home/ubuntu/CourseData/MIC_data/.conda/envs/r-env/lib/R/library")
library(ALDEx2)
library(ggplot2)
library(tidyr)
library(stringr)
library(vegan)
folder = '/home/ubuntu/workspace/bmb_module4/'
asv_table <- read.csv(paste(folder, "exports_filtered/feature-table.txt", sep=""), sep='\t', skip=1, header=T)
asv_table_num = data.matrix(asv_table[,2:13]) #convert the ASV table to a numeric matric
rownames(asv_table_num) = asv_table[,1] #give the matrix row names
taxonomy <- read.csv(paste(folder, "taxa/taxonomy.tsv", sep=""), sep='\t')
taxonomy_split <- separate(data = taxonomy, col = taxonomy, into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"), sep = "\\; ") #separate the taxonomy table so each phylogenetic level is its own column
taxonomy_split <- taxonomy_split[,-c(1,9)] #remove the Feature.ID and Confidence columns from the taxonomy table
rownames(taxonomy_split) <- taxonomy[,1] #and now give the taxonomy table the OTU IDs as row names
metadata <- read.csv(paste(folder, "Blueberry_metadata_reduced.tsv", sep=""), sep='\t')
samples = metadata[,2:3] #get the metadata columns
rownames(samples) = metadata[,1] #and add the sample names as row names
samples = data.frame(samples, stringsAsFactors = FALSE) #convert this to a data frame
ASV = otu_table(asv_table_num, taxa_are_rows = TRUE)
TAX = tax_table(taxonomy_split)
taxa_names(TAX) <- rownames(taxonomy_split)
SAMPLE = sample_data(samples)
physeq = phyloseq(ASV, TAX, SAMPLE)
physeq
physeq_genus = tax_glom(physeq, taxrank="ta6")
all_tax = paste(tax_table(physeq_genus)[,2], tax_table(physeq_genus)[,3], tax_table(physeq_genus)[,4], tax_table(physeq_genus)[,5], tax_table(physeq_genus)[,6], sep=';')
taxa_names(physeq_genus) = all_tax
otu_table(physeq_genus)
#rarefy
physeq_rare = rarefy_even_depth(physeq_genus, sample.size = min(sample_sums(physeq_genus)), replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
feat_table = data.frame(t(otu_table(physeq_rare)), check.rows=T, check.names=T, stringsAsFactors=T)
metadata = data.frame(sample_data(physeq_rare), stringsAsFactors = F)
results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_taxa_Description_1", sep=""), transform = "AST", fixed_effects = c("Description_1"), reference=paste("Description_1", "Bulk", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)
results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_taxa_Description_3", sep=""), transform = "AST", fixed_effects = c("Description_3"), reference=paste("Description_3", "Forest", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)
ancom_out = ancombc(phyloseq=physeq_genus, formula="Description_1+Description_3", alpha=0.1)
w = ancom_out$res$W
q = ancom_out$res$q_val
p = ancom_out$res$p_val
rownames(w) = w[,1]
w = w[,-c(1:2)]
rownames(q) = q[,1]
q = q[,-c(1:2)]
rownames(p) = p[,1]
p = p[,-c(1:2)]
colnames(w) = c("Description_1 W", "Description_3 W")
colnames(q) = c("Description_1 q", "Description_3 q")
colnames(p) = c("Description_1 p", "Description_3 p")
a_out = merge(w, q, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
a_out = merge(a_out, p, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
write.csv(a_out, paste(folder, "ANCOM_taxa.csv", sep=""))
x <- aldex.clr(otu_table(physeq_genus), sample_data(physeq)$Description_1, mc.samples = 128, verbose=F, denom="all")
kw.test.d1 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d1, paste(folder, "ALDEx2_taxa_Description_1.csv", sep=""))
x <- aldex.clr(otu_table(physeq_genus), sample_data(physeq)$Description_3, mc.samples = 128, verbose=F, denom="all")
kw.test.d3 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d3, paste(folder, "ALDEx2_taxa_Description_3.csv", sep=""))
Description_1:
maaslin = read.csv(paste(folder, "MaAsLin2_out_taxa_Description_1/significant_results.tsv", sep=""), sep="\t")
maaslin$feature = gsub("\\g__Burkholderia.Caballeronia.Paraburkholderia", "g__Burkholderia-Caballeronia-Paraburkholderia", maaslin$feature)
maaslin$feature = gsub("\\RCP2.54", "RCP2-54", maaslin$feature)
maaslin$feature = gsub("\\.", ";", maaslin$feature)
maaslin = maaslin[maaslin$qval <= 0.1, ]
aldex = read.csv(paste(folder, "ALDEx2_taxa_Description_1.csv", sep=""))
aldex = aldex[aldex$kw.eBH <= 0.1, ]
ancom = read.csv(paste(folder, "ANCOM_taxa.csv", sep=""))
ancom = ancom[ancom$Description_1.q <= 0.1, ]
taxa = c(maaslin$feature, aldex$X, ancom$X)
taxa = unique(taxa)
physeq_rare_d1 = prune_taxa(taxa, physeq_rare)
physeq_relabun_d1 = transform_sample_counts(physeq_rare_d1, function(x) (x / sum(x))*100 )
single_taxon = ""
#e.g. single_taxon="p__Proteobacteria;c__Alphaproteobacteria;o__Rhizobiales;f__Beijerinckiaceae;g__Roseiarcus"
microbiome::boxplot_abundance(physeq_relabun_d1, x='Description_1', y=single_taxon, line = NULL, violin = FALSE, na.rm = FALSE, show.points = TRUE) + ggtitle(str_replace(single_taxon, ';f__', '\nf__')) + ylab('Relative abundance (%)')
for(i in 1:length(taxa)) {
print(microbiome::boxplot_abundance(physeq_relabun_d1, x='Description_1', y=taxa[i]) + ggtitle(str_replace(taxa[i], ';f__', '\nf__')) + ylab('Relative abundance (%)'))
}
############
#only in answers
############
maaslin = read.csv(paste(folder, "MaAsLin2_out_taxa_Description_3/significant_results.tsv", sep=""), sep="\t")
maaslin$feature = gsub("\\g__Burkholderia.Caballeronia.Paraburkholderia", "g__Burkholderia-Caballeronia-Paraburkholderia", maaslin$feature)
maaslin$feature = gsub("\\RCP2.54", "RCP2-54", maaslin$feature)
maaslin$feature = gsub("\\.", ";", maaslin$feature)
maaslin = maaslin[maaslin$qval <= 0.1, ]
aldex = read.csv(paste(folder, "ALDEx2_taxa_Description_3.csv", sep=""))
aldex = aldex[aldex$kw.eBH <= 0.1, ]
ancom = read.csv(paste(folder, "ANCOM_taxa.csv", sep=""))
ancom = ancom[ancom$Description_3.q <= 0.1, ]
taxa = c(maaslin$feature, aldex$X, ancom$X)
taxa = unique(taxa)
physeq_rare_d3 = prune_taxa(taxa, physeq_rare)
physeq_relabun_d3 = transform_sample_counts(physeq_rare_d3, function(x) (x / sum(x))*100 )
############
#only in answers
############
single_taxon = ""
#e.g. single_taxon="p__Actinobacteriota;c__Actinobacteria;o__Catenulisporales;f__Actinospicaceae;g__Actinospica"
microbiome::boxplot_abundance(physeq_relabun_d3, x='Description_3', y=single_taxon, line = NULL, violin = FALSE, na.rm = FALSE, show.points = TRUE) + ggtitle(str_replace(single_taxon, ';f__', '\nf__')) + ylab('Relative abundance (%)')
HINT: have you made sure that you've changed Description_1 to Description_3 and d1 to d3 in every place that they occur?? HINT: make sure you change the taxon as it's probably different in this one!
############
#only in answers
############
for(i in 1:length(taxa)) {
print(microbiome::boxplot_abundance(physeq_relabun_d3, x='Description_3', y=taxa[i]) + ggtitle(str_replace(taxa[i], ';f__', '\nf__')) + ylab('Relative abundance (%)'))
}
Now try to do the same with the PICRUSt2 file pathways_out/path_abun_unstrat.tsv. Hint: You'll need to unzip the file first, and you won't need the "TAX" option.
############
#only in answers
############
gunzip picrust2_out_pipeline_filtered/pathways_out/path_abun_unstrat.tsv.gz
############
#only in answers
############
folder = '/home/ubuntu/workspace/bmb_module4/'
pwy_table <- read.csv(paste(folder, "picrust2_out_pipeline_filtered/pathways_out/path_abun_unstrat.tsv", sep=""), sep='\t', header=T)
pwy_table_num = data.matrix(pwy_table[,2:13]) #convert the ASV table to a numeric matric
rownames(pwy_table_num) = pwy_table[,1] #give the matrix row names
metadata <- read.csv(paste(folder, "Blueberry_metadata_reduced.tsv", sep=""), sep='\t')
samples = metadata[,2:3] #get the metadata columns
rownames(samples) = metadata[,1] #and add the sample names as row names
samples = data.frame(samples, stringsAsFactors = FALSE) #convert this to a data frame
PWY = otu_table(pwy_table_num, taxa_are_rows = TRUE)
SAMPLE = sample_data(samples)
physeq_pwy = phyloseq(PWY, SAMPLE)
physeq_pwy_relabun = transform_sample_counts(physeq_pwy, function(x) (x / sum(x))*100 )
top_100_abun <- names(sort(taxa_sums(physeq_pwy_relabun), TRUE)[1:100]) #get most abundant ones
mode(pwy_table_num) = "integer"
PWY_int = otu_table(pwy_table_num, taxa_are_rows = TRUE)
physeq_pwy_int = phyloseq(PWY_int, SAMPLE)
HINT: this file doesn't have a junk first line
Alpha diversity:
plot_richness(physeq_pwy_int, x="Description_1", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
plot_richness(physeq_pwy_int, x="Description_3", measures=c("Observed", "Chao1", "Simpson", "Shannon")) + geom_boxplot()
Beta diversity:
ps.ord <- ordinate(physeq_pwy_relabun, "PCoA", "bray")
plot_ordination(physeq_pwy_relabun, ps.ord, type="samples", color="Description_1", shape="Description_3")
MaAsLIn2
############
#only in answers
############
#rarefy
physeq_rare_pwy = rarefy_even_depth(physeq_pwy, sample.size = min(sample_sums(physeq_pwy)), replace = TRUE, trimOTUs = TRUE, verbose = TRUE)
physeq_rare_pwy_top_100 = prune_taxa(top_100_abun, physeq_rare_pwy)
feat_table = data.frame(t(otu_table(physeq_rare_pwy_top_100)), check.rows=F, check.names=F, stringsAsFactors=F)
metadata = data.frame(sample_data(physeq_rare), stringsAsFactors = F)
results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_pathways_Description_1", sep=""), transform = "AST", fixed_effects = c("Description_1"), reference=paste("Description_1", "Bulk", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)
results_all <- Maaslin2(feat_table, metadata, paste(folder, "MaAsLin2_out_pathways_Description_3", sep=""), transform = "AST", fixed_effects = c("Description_3"), reference=paste("Description_3", "Forest", sep=','), standardize = FALSE, plot_heatmap = T, plot_scatter = T)
HINT: make sure that you don't write over the previous folders and files that you made!
ALDEx2:
############
#only in answers
############
physeq_pwy_top_100 = prune_taxa(top_100_abun, physeq_pwy)
otu_table(physeq_pwy_top_100) = round(otu_table(physeq_pwy_top_100))
x <- aldex.clr(otu_table(physeq_pwy_top_100), sample_data(physeq_pwy_top_100)$Description_1, mc.samples = 128, verbose=F, denom="all")
kw.test.d1 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d1, paste(folder, "ALDEx2_pathways_Description_1.csv", sep=""))
x <- aldex.clr(otu_table(physeq_pwy_top_100), sample_data(physeq_pwy_top_100)$Description_3, mc.samples = 128, verbose=F, denom="all")
kw.test.d3 <- aldex.kw(x, useMC=2, verbose=FALSE)
write.csv(kw.test.d3, paste(folder, "ALDEx2_pathways_Description_3.csv", sep=""))
ANCOM2:
############
#only in answers
############
ancom_out = ancombc(phyloseq=physeq_pwy_top_100, formula="Description_1+Description_3", alpha=0.1)
w = ancom_out$res$W
q = ancom_out$res$q_val
p = ancom_out$res$p_val
rownames(w) = w[,1]
w = w[,-c(1:2)]
rownames(q) = q[,1]
q = q[,-c(1:2)]
rownames(p) = p[,1]
p = p[,-c(1:2)]
colnames(w) = c("Description_1 W", "Description_3 W")
colnames(q) = c("Description_1 q", "Description_3 q")
colnames(p) = c("Description_1 p", "Description_3 p")
a_out = merge(w, q, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
a_out = merge(a_out, p, by = 'row.names')
rownames(a_out) = rownames(w)
a_out = a_out[,-1]
write.csv(a_out, paste(folder, "ANCOM_pathways.csv", sep=""))
Combine:
############
#only in answers
############
maaslin = read.csv(paste(folder, "MaAsLin2_out_pathways_Description_1/significant_results.tsv", sep=""), sep="\t")
# maaslin$feature = gsub("\\g__Burkholderia.Caballeronia.Paraburkholderia", "g__Burkholderia-Caballeronia-Paraburkholderia", maaslin$feature)
# maaslin$feature = gsub("\\RCP2.54", "RCP2-54", maaslin$feature)
maaslin$feature = gsub("\\.", "-", maaslin$feature)
maaslin = maaslin[maaslin$qval <= 0.1, ]
aldex = read.csv(paste(folder, "ALDEx2_pathways_Description_1.csv", sep=""))
aldex = aldex[aldex$kw.eBH <= 0.1, ]
ancom = read.csv(paste(folder, "ANCOM_pathways.csv", sep=""))
ancom = ancom[ancom$Description_1.q <= 0.1, ]
pathways = c(ancom$X, aldex$X, maaslin$feature)
pathways = unique(pathways)
physeq_pwy_relabun_d1 = prune_taxa(pathways, physeq_pwy_relabun)
############
#only in answers
############
single_pathway = ""
#e.g. single_pathway="PWY-7222"
microbiome::boxplot_abundance(physeq_pwy_relabun_d1, x='Description_1', y=single_pathway, line = NULL, violin = FALSE, na.rm = FALSE, show.points = TRUE) + ggtitle(single_pathway) + ylab('Relative abundance (%)')
############
#only in answers
############
for(i in 1:length(pathways)) {
print(microbiome::boxplot_abundance(physeq_pwy_relabun_d1, x='Description_1', y=pathways[i]) + ggtitle(pathways[i]) + ylab('Relative abundance (%)'))
}
- Please feel free to post a question on the Microbiome Helper google group if you have any issues.
- General comments or inquires about Microbiome Helper can be sent to morgan.langille@dal.ca.
Useful Links
Main SOPs
Amplicon SOP v2 (virtual box workflow)
Amplicon SOP v2 (qiime2-2022.11)
PacBio Amplicon SOP v2 (qiime2-2022.2)
Tutorials