In [None]:
# Loading libraries:
suppressMessages({ 
    library(plyr)
    library(dplyr)
    library(data.table)
    library(ggplot2)
    library(IRdisplay)
    library(phyloseq)
    library(stringr)
    library(tidyverse)
    library(vegan)
    library(metagenomeSeq)
})

# Reading KOs table and transforming into a phyloseq object

In [None]:
# Load the KO table; the first file column supplies the row names
data <- read.table(
  "japan_kos_presabs.tsv",
  header = TRUE, sep = "\t", row.names = 1
)

# Remove rows holding only zeros/NA, then columns without any non-zero value
empty_rows <- apply(data, 1, function(x) all(x == 0 | is.na(x)))
data <- data[!empty_rows, ]
nonzero_cols <- apply(data, 2, function(x) any(x != 0))
data <- data[, nonzero_cols]

# data[-1] discards the first remaining column before the matrix conversion
# NOTE(review): confirm that this first column is not a sample
otumat <- as.matrix(data[-1])

# Pieces of the phyloseq object: feature table and per-sample metadata.
# The sample type is the column name with everything up to the first "_" removed.
OTU <- otu_table(otumat, taxa_are_rows = TRUE)
sampledata <- data.frame(sample_types = sub("^[^_]*_", "", colnames(otumat)))
rownames(sampledata) <- colnames(otumat)
SAM <- sample_data(sampledata)

# Assemble the phyloseq object
physeq <- phyloseq(OTU, SAM)

In [None]:
# Keep the pic, mfp and deep samples plus the 10_sho / 10_bwa subsamples
# (the subsampling chosen for shallowmapping-bwa and shogun)
physeq_filt <- subset_samples(
  physeq,
  sample_types %in% c("pic", "mfp", "deep", "10_sho", "10_bwa")
)
metadata <- data.frame(sample_data(physeq_filt))

# Statistical analysis of groups comparison for functions

## Global ANOSIM

In [None]:
# ANOSIM across all retained sample types, on Bray distances between samples
anosim(
  phyloseq::distance(physeq_filt, method = "bray"),
  metadata$sample_types
)

## Global betadisper and Adonis2

In [None]:
# Bray distance matrix shared by adonis2 and betadisper
css_beta <- distance(physeq_filt, method = "bray")

# PERMANOVA of the distances against sample type (result stored, not printed)
ado_result <- adonis2(
  css_beta ~ sample_types,
  data = metadata,
  permutations = 1e3
)
# ado_result

# Multivariate dispersion per sample type, its ANOVA and diagnostic plots
bd <- betadisper(css_beta, metadata$sample_types)
anova(bd)

boxplot(bd)
plot(bd)

## Tukey's HSD test on dispersion for functions

In [None]:
# Tukey's HSD on the betadisper fit
tukey_table <- TukeyHSD(bd)

# One data frame per component of the TukeyHSD result
tukey_df_list <- lapply(tukey_table, as.data.frame)

# Move the comparison labels from the row names into a column and record
# which component (factor) each table came from
tag_tukey_table <- function(name) {
  df <- tukey_df_list[[name]]
  df$Comparison <- rownames(df)
  rownames(df) <- NULL
  df$Factor <- name
  df
}

# Stack all tagged tables into a single data frame
tukey_df_combined <- do.call(
  rbind,
  lapply(names(tukey_df_list), tag_tukey_table)
)

# Export everything to one tab-separated file
write.table(
  tukey_df_combined, "new_mouse_tukey_bd_kos.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)


## Pairwise ANOSIM for functions

In [None]:
# Every unordered pair of sample types (one pair per column)
cbn <- combn(x = unique(metadata$sample_types), m = 2)
n_pairs <- ncol(cbn)
p <- numeric(n_pairs)  # ANOSIM p-values, one per pair
R <- numeric(n_pairs)  # ANOSIM R statistics, one per pair

# NOTE(review): the subset_samples() condition refers to `cbn` and `i` by name;
# keep this loop at top level unless that lookup has been verified inside a function.
for (i in seq_len(n_pairs)) {
    ps.subs <- subset_samples(physeq_filt, sample_types %in% cbn[, i])
    metadata_sub <- data.frame(sample_data(ps.subs))
    anosim_pair <- anosim(
        phyloseq::distance(ps.subs, method = "bray"),
        metadata_sub$sample_types
    )
    p[i] <- anosim_pair$signif[1]
    R[i] <- anosim_pair$statistic
}

# Benjamini-Hochberg correction across all pairs
p.adj <- p.adjust(p, method = "BH")

# One row per pair: the two group names, R, raw p and adjusted p
p.table <- cbind.data.frame(t(cbn), R = R, p = p, p.adj = p.adj)

write.table(
  p.table, "new_mouse_pairwise_anosim_kos.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)


### Pairwise ANOSIM heatmap

Criteria for R-value color codes:

R-value Near 0:
R ~ 0.0 to 0.2: This range typically suggests that there is little to no observable difference between the groups. The samples can be considered very similar in composition.

Moderate R-values:
R ~ 0.2 to 0.5: This range suggests moderate differentiation. Samples might be considered somewhat different, but the distinction isn't very strong. Interpretation in this range can depend on the sensitivity required in the study and the natural variability of the dataset.

High R-values (R-value close to 1):
R > 0.5: This range indicates strong differentiation between samples. The higher the R-value, especially approaching or exceeding 0.7, the more distinct the community compositions between the groups. Samples with these R-values can be considered to have different compositions.


In [None]:
library(pheatmap)

In [None]:
# Pairwise ANOSIM results written by the previous step
data <- read.table(
  "new_mouse_pairwise_anosim_kos.tsv",
  header = TRUE, sep = "\t"
)

# Square matrix of R values indexed by sample type, filled symmetrically
samples <- unique(c(data$X1, data$X2))
r_matrix <- matrix(
  NA,
  nrow = length(samples), ncol = length(samples),
  dimnames = list(samples, samples)
)
row_idx <- match(data$X1, samples)
col_idx <- match(data$X2, samples)
r_matrix[cbind(row_idx, col_idx)] <- data$R
r_matrix[cbind(col_idx, row_idx)] <- data$R

# Fixed display order of the groups
desired_order <- c("deep", "10_bwa", "10_sho", "mfp", "pic")
r_matrix <- r_matrix[desired_order, desired_order]

# Show the upper triangle only
r_matrix[lower.tri(r_matrix)] <- NA

# Three colour ramps joined at R = 0.2 and R = 0.5 (100 colours, 101 breaks)
colors <- c(
  colorRampPalette(c("#a80202", "#ff6403"))(20),  # 0 to 0.2
  colorRampPalette(c("#ff6403", "#ffda05"))(30),  # 0.2 to 0.5
  colorRampPalette(c("#ffda05", "black"))(50)     # 0.5 to 1.0
)
breaks <- c(
  seq(0, 0.2, length.out = 21),
  seq(0.2, 0.5, length.out = 31)[-1],  # drop the duplicated 0.2
  seq(0.5, 1.0, length.out = 51)[-1]   # drop the duplicated 0.5
)

# Draw the heatmap into a PDF; clustering is off so the order above is kept
pdf("new_mouse_rvalues_kos.pdf", width = 7, height = 5)
pheatmap(
  r_matrix,
  color = colors,
  breaks = breaks,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  show_rownames = TRUE,
  show_colnames = TRUE,
  display_numbers = FALSE,
  na_col = "white"  # colour of the blanked lower triangle
)
dev.off()


# Ordination plots for functions

In [None]:
# Load the KO table; the first file column supplies the row names
data <- read.table(
  "new_mouse_kos_presabs.tsv",
  header = TRUE, sep = "\t", row.names = 1
)

# Remove rows holding only zeros/NA, then columns without any non-zero value
empty_rows <- apply(data, 1, function(x) all(x == 0 | is.na(x)))
data <- data[!empty_rows, ]
nonzero_cols <- apply(data, 2, function(x) any(x != 0))
data <- data[, nonzero_cols]

# data[-1] discards the first remaining column before the matrix conversion
otumat <- as.matrix(data[-1])

# Feature table and per-sample metadata (sample type = column name with
# everything up to the first "_" removed)
OTU <- otu_table(otumat, taxa_are_rows = TRUE)
sampledata <- data.frame(sample_types = sub("^[^_]*_", "", colnames(otumat)))
rownames(sampledata) <- colnames(otumat)
SAM <- sample_data(sampledata)

# Assemble the phyloseq object
physeq <- phyloseq(OTU, SAM)

# Keep the pic, mfp and deep samples plus the 10_sho / 10_bwa subsamples
physeq_filt <- subset_samples(
  physeq,
  sample_types %in% c("pic", "mfp", "deep", "10_sho", "10_bwa")
)
metadata <- data.frame(sample_data(physeq_filt))


In [None]:
# Colour assigned to each sample type in the ordination plots
colorCode_sample <- c(
  "10_bwa" = "#3CB44B",
  "10_sho" = "#f77a13",
  "deep" = "#000000",
  "mfp" = "#780164",
  "pic" = "#099fe0"
)

# Run every ordination method on the same object with the same distance
dist <- "bray"
ord_meths <- c("DCA", "CCA", "RDA", "MDS", "PCoA")

# Ordinate with method `i` and return the sample-level ordination plot
plot_one_ordination <- function(i, physeq_obj, dist) {
  ordi <- ordinate(physeq_obj, method = i, distance = dist)
  plot_ordination(physeq_obj, ordi, "samples")
}
plist <- llply(as.list(ord_meths), plot_one_ordination, physeq_filt, dist)
names(plist) <- ord_meths

# First two columns of each plot's data, renamed, bound in front of the full data
axes_with_data <- function(x) {
  df <- x$data[, 1:2]
  colnames(df) <- c("Axis_1", "Axis_2")
  cbind(df, x$data)
}
pdataframe <- ldply(plist, axes_with_data)
names(pdataframe)[1] <- "method"

In [None]:
# Write one PDF per ordination method
for (index in seq_along(ord_meths)) {
    plot_type <- ord_meths[[index]]
    file_name <- paste0("new_mouse_", plot_type, "_kos.pdf")
    group_colour <- aes(color = metadata$sample_types)

    pdf(file_name, width = 5, height = 4)
    p <- plist[[index]] +
        geom_point(mapping = group_colour, size = 2, alpha = 1) +
        # 90% normal ellipses drawn as outlines (fill fully transparent)
        stat_ellipse(
            mapping = group_colour,
            level = 0.9, type = "norm", geom = "polygon", alpha = 0
        ) +
        theme_bw() +
        scale_color_manual(values = colorCode_sample) +
        labs(color = "Groups")
    print(p)
    dev.off()
}

In [None]:
# Show the plot of every ordination method inline so the best can be chosen
options(repr.plot.width = 7, repr.plot.height = 6)

# Print each ordination plot in `plist`, coloured by sample type with
# 90% normal ellipses. Reads `plist`, `metadata` and `colorCode_sample`
# from the calling environment.
print_plots <- function() {
    group_colour <- aes(color = metadata$sample_types)
    for (index in 1:5) {
        p <- plist[[index]] +
            geom_point(mapping = group_colour, size = 2, alpha = 1) +
            stat_ellipse(
                mapping = group_colour,
                level = 0.9, type = "norm", geom = "polygon", alpha = 0
            ) +
            theme_bw() +
            scale_color_manual(values = colorCode_sample) +
            labs(color = "Groups")
        print(p)
    }
}
print_plots()

# Statistical analysis of groups comparison for taxonomy

## Hierarchical clustering of taxonomy

In [None]:
library(dendextend)

In [None]:
# Species-level relative abundance table (first file column = row names).
# Keep rows with at least one value above 0.001 and columns with any non-zero value.
data <- read.table("jungle_relab_taxo_species.tsv", header = TRUE, sep = "\t", row.names = 1)
data <- data[apply(data, 1, function(x) any(x > 0.001)), ]
data <- data[, apply(data, 2, function(x) any(x != 0))]
# data[-1] discards the first remaining column before the matrix conversion
# NOTE(review): confirm that this first column is not a sample
otumat_all <- as.matrix(data[-1])

# Select the samples of interest from THIS table's own column names.
# The column mask must be derived from otumat_all itself; reusing a
# `sampledata` object left over from a different table would misalign
# (or mis-size) the selection.
sample_types_all <- sub("^[^_]*_", "", colnames(otumat_all))
selected_samples <- sample_types_all %in% c("amp", "deep", "10_sho", "10_bwa")
otumat <- otumat_all[, selected_samples, drop = FALSE]

# Metadata for the retained samples only
sampledata <- data.frame(sample_types = sub("^[^_]*_", "", colnames(otumat)))
rownames(sampledata) <- colnames(otumat)

# Label colour per sample type
colorCode <- c(
  "10_bwa" = "#3CB44B",
  "10_sho" = "#f77a13",
  "deep" = "#000000",
  "amp" = "#2004d4"
)

# Ward (ward.D2) clustering of samples on Bray distances
bc_dist <- vegan::vegdist(t(otumat), method = "bray")
ward <- as.dendrogram(hclust(bc_dist, method = "ward.D2"))
plot_data <- as.dendrogram(ward)

# Colour the leaf labels by sample type, following the dendrogram leaf order
label_colors <- colorCode[sampledata$sample_types[order.dendrogram(plot_data)]]
labels_colors(ward) <- label_colors

pdf("japan_species_hclus.pdf", width = 20, height = 8)
par(mar = c(6, 2, 1, 1))
plot(ward)
legend("topright", inset = c(0, 0), legend = names(colorCode), fill = colorCode, title = "Annotation method", cex = 1.5)
dev.off()

# Leaf labels in dendrogram order, one per line
write.table(labels(plot_data), "japan_species_labels.txt", row.names = FALSE, col.names = FALSE, quote = FALSE)


# Reading taxonomic tables

## Heatmaps

In [None]:
suppressMessages({ 
    library(dplyr)
    library(RColorBrewer)
    library(ComplexHeatmap)
})

In [None]:
# Species-level heatmap input (column names kept verbatim: check.names = FALSE)
data_1 <- read.csv(
  "jungle_relab_taxo_species.tsv",
  sep = "\t", header = TRUE, row.names = 1, check.names = FALSE
)

# Sample type = column name with everything up to the first "_" removed
sampledata <- data.frame(sample_types = sub("^[^_]*_", "", colnames(data_1)))
rownames(sampledata) <- colnames(data_1)

# Keep the selected sample types, then rows with at least one value above 0.1
# and columns with at least one non-zero value
selected_samples <- sampledata$sample_types %in% c("amp", "deep", "10_sho", "10_bwa")
data_1 <- data_1[, selected_samples]
abundant_rows <- apply(data_1, 1, function(x) any(x > 0.1))
data_1 <- data_1[abundant_rows, ]
nonzero_cols <- apply(data_1, 2, function(x) any(x != 0))
data_1 <- data_1[, nonzero_cols]
species_data <- as.matrix(data_1)

# Names of the retained samples, used to build the column annotation
sample_names <- names(data_1)

# Metadata skeleton: one row per sample, group still blank
new_sampledata <- data.frame(
  sample_name = sample_names,
  sample_group = character(length(sample_names)),
  stringsAsFactors = FALSE
)

# Return the hex colour for a sample, chosen from the suffix that follows the
# first "_" in its name: bwa subsamples, sho subsamples, "deep" or "amp".
# Any other suffix yields NA.
get_sample_type <- function(sample_name) {
  suffix <- sub("^[^_]*_", "", sample_name)
  depths <- c("1", "5", "10", "15", "20")

  if (suffix %in% paste0(depths, "_bwa")) {
    return("#3cb44b")
  }
  if (suffix %in% paste0(depths, "_sho")) {
    return("#f77a13")
  }
  if (suffix == "deep") {
    return("#000000")
  }
  if (suffix == "amp") {
    return("#2004d4")
  }
  NA
}

# Colour of every sample, then a name -> colour lookup for the annotation
new_sampledata$sample_group <- sapply(new_sampledata$sample_name, get_sample_type)
sampledata_list <- setNames(new_sampledata$sample_group, new_sampledata$sample_name)

# Top annotation: one cell per sample, coloured through the lookup above
annotation <- HeatmapAnnotation(
  Group = new_sampledata$sample_name,
  col = list(Group = sampledata_list),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = list(Group = list(at = NULL))
)

# Order sample names group by group: for each entry of `prefix_list`, in turn,
# collect the names that END with that entry (it is anchored with "$").
order_samples_by_prefix <- function(sample_names, prefix_list) {
  matches_per_prefix <- lapply(prefix_list, function(prefix) {
    grep(paste0(prefix, "$"), sample_names, value = TRUE)
  })
  unlist(matches_per_prefix)
}
# Suffix groups in the order they would be laid out
prefix_list <- c("_deep", "_20_bwa", "_20_sho", "_amp")
column_order <- order_samples_by_prefix(sample_names, prefix_list)

# Species heatmap; column_order is computed above but currently not applied
ht_1 <- Heatmap(
  species_data,
  column_title = "",
  row_title = "",
  row_names_gp = gpar(fontsize = 0),  # row-name text size
  col = c("#e0e0e0", "#02a8a2"),
  show_column_names = FALSE,
  top_annotation = annotation,
  show_heatmap_legend = FALSE
  # column_order = column_order
)

pdf("jungle_heatmap_taxonomy_species.pdf", width = 5, height = 5)
draw(ht_1)
dev.off()


## Combined heatmap domain and phylum

In [None]:
# Phylum-level table (column names kept verbatim: check.names = FALSE)
data_1 <- read.csv(
  "ranks_jungle_relab_taxo_phylum.tsv",
  sep = "\t", header = TRUE, row.names = 1, check.names = FALSE
)
sampledata <- data.frame(sample_types = sub("^[^_]*_", "", colnames(data_1)))
rownames(sampledata) <- colnames(data_1)

# Keep the selected sample types, rows with a value above 0.000001 and
# columns with at least one non-zero value
selected_samples <- sampledata$sample_types %in% c("amp", "deep", "20_sho", "20_bwa")
data_1 <- data_1[, selected_samples]
phylum_rows <- apply(data_1, 1, function(x) any(x > 0.000001))
data_1 <- data_1[phylum_rows, ]
phylum_cols <- apply(data_1, 2, function(x) any(x != 0))
data_1 <- data_1[, phylum_cols]
phylum_data <- as.matrix(data_1)

# Domain-level table, filtered the same way.
# NOTE(review): the column mask comes from the phylum table; this assumes both
# files list the samples in the same order - confirm.
data_2 <- read.csv(
  "ranks_jungle_relab_taxo_domain.tsv",
  sep = "\t", header = TRUE, row.names = 1, check.names = FALSE
)
data_2 <- data_2[, selected_samples]
domain_rows <- apply(data_2, 1, function(x) any(x > 0.000001))
data_2 <- data_2[domain_rows, ]
domain_cols <- apply(data_2, 2, function(x) any(x != 0))
data_2 <- data_2[, domain_cols]
dom_data <- as.matrix(data_2)


In [None]:
# Names of the retained samples, used to build the column annotation
sample_names <- names(data_1)

# Metadata skeleton: one row per sample, group still blank
new_sampledata <- data.frame(
  sample_name = sample_names,
  sample_group = character(length(sample_names)),
  stringsAsFactors = FALSE
)

# Return the hex colour for a sample, chosen from the suffix that follows the
# first "_" in its name: bwa subsamples, sho subsamples, "deep" or "amp".
# Any other suffix yields NA.
get_sample_type <- function(sample_name) {
  suffix <- sub("^[^_]*_", "", sample_name)
  depths <- c("1", "5", "10", "15", "20")

  if (suffix %in% paste0(depths, "_bwa")) {
    return("#3cb44b")
  }
  if (suffix %in% paste0(depths, "_sho")) {
    return("#f77a13")
  }
  if (suffix == "deep") {
    return("#000000")
  }
  if (suffix == "amp") {
    return("#2004d4")
  }
  NA
}

# Colour of every sample, then a name -> colour lookup for the annotation
new_sampledata$sample_group <- sapply(new_sampledata$sample_name, get_sample_type)
sampledata_list <- setNames(new_sampledata$sample_group, new_sampledata$sample_name)

# Top annotation: one cell per sample, coloured through the lookup above
annotation <- HeatmapAnnotation(
  Group = new_sampledata$sample_name,
  col = list(Group = sampledata_list),
  annotation_name_gp = gpar(fontsize = 8),
  annotation_legend_param = list(Group = list(at = NULL))
)

# Order sample names group by group: for each entry of `prefix_list`, in turn,
# collect the names that END with that entry (it is anchored with "$").
order_samples_by_prefix <- function(sample_names, prefix_list) {
  matches_per_prefix <- lapply(prefix_list, function(prefix) {
    grep(paste0(prefix, "$"), sample_names, value = TRUE)
  })
  unlist(matches_per_prefix)
}
# Suffix groups in the order they would be laid out
prefix_list <- c("_deep", "_20_bwa", "_20_sho", "_amp")
column_order <- order_samples_by_prefix(sample_names, prefix_list)

# Phylum heatmap (carries the sample annotation); column_order is not applied
ht_1 <- Heatmap(
  phylum_data,
  column_title = "",
  row_title = "",
  row_names_gp = gpar(fontsize = 7),  # row-name text size
  col = c("#e0e0e0", "#02a8a2"),
  show_column_names = FALSE,
  top_annotation = annotation,
  show_heatmap_legend = FALSE
  # column_order = column_order
)

# Domain heatmap, stacked underneath the phylum one
ht_2 <- Heatmap(
  dom_data,
  column_title = "",
  row_title = "",
  row_names_gp = gpar(fontsize = 7),  # row-name text size
  col = c("#e0e0e0", "#040285"),
  show_column_names = FALSE,
  show_heatmap_legend = FALSE
  # column_order = column_order
)

# Vertical concatenation of the two heatmaps
ht_list <- ht_1 %v% ht_2

pdf("jungle_heatmap_taxonomy.pdf", width = 5, height = 5)
draw(ht_list)
dev.off()


## Groups comparison and ordination plots for taxonomy

### Filtered matrix to phyloseq object

In [None]:
# Build a clean phyloseq object and keep only the chosen shallow-shotgun subsampling
# Load the species table; the first file column supplies the row names
data <- read.table(
  "new_mouse_relab_taxo_species.tsv",
  header = TRUE, sep = "\t", row.names = 1
)

# Remove rows holding only zeros/NA, then columns without any non-zero value
empty_rows <- apply(data, 1, function(x) all(x == 0 | is.na(x)))
data <- data[!empty_rows, ]
nonzero_cols <- apply(data, 2, function(x) any(x != 0))
data <- data[, nonzero_cols]

# data[-1] discards the first remaining column before the matrix conversion
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Sample type = column name with everything up to the first "_" removed
sampledata <- data.frame(sample_types = sub("^[^_]*_", "", colnames(otumat)))
rownames(sampledata) <- colnames(otumat)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)

# Keep amp and deep plus the 10_sho / 10_bwa subsamples
physeq_filt <- subset_samples(
  physeq,
  sample_types %in% c("amp", "deep", "10_sho", "10_bwa")
)
metadata <- data.frame(sample_data(physeq_filt))

### Global betadisper and Adonis2 for taxonomy

In [None]:
# Bray distances between samples, then PERMANOVA against sample type
css_beta <- distance(physeq_filt, method = "bray")
ado_result <- adonis2(
  css_beta ~ sample_types,
  data = metadata,
  permutations = 1e3
)
ado_result

In [None]:
# Multivariate dispersion per sample type, its ANOVA and diagnostic plots
bd <- betadisper(css_beta, metadata$sample_types)
anova(bd)

boxplot(bd)
plot(bd)

### Tukey's HSD test on dispersion for taxonomy (when betadisper global P-val < 0.01)

In [None]:
# Tukey's HSD on the betadisper fit
tukey_table <- TukeyHSD(bd)

# One data frame per component of the TukeyHSD result
tukey_df_list <- lapply(tukey_table, as.data.frame)

# Move the comparison labels from the row names into a column and record
# which component (factor) each table came from
tag_tukey_table <- function(name) {
  df <- tukey_df_list[[name]]
  df$Comparison <- rownames(df)
  rownames(df) <- NULL
  df$Factor <- name
  df
}

# Stack all tagged tables into a single data frame
tukey_df_combined <- do.call(
  rbind,
  lapply(names(tukey_df_list), tag_tukey_table)
)

# Export everything to one tab-separated file
write.table(
  tukey_df_combined, "new_mouse_tukey_bd_species.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)


### Pairwise ANOSIM for taxonomy

In [None]:
# Every unordered pair of sample types (one pair per column)
cbn <- combn(x = unique(metadata$sample_types), m = 2)
n_pairs <- ncol(cbn)
p <- numeric(n_pairs)  # ANOSIM p-values, one per pair
R <- numeric(n_pairs)  # ANOSIM R statistics, one per pair

# NOTE(review): the subset_samples() condition refers to `cbn` and `i` by name;
# keep this loop at top level unless that lookup has been verified inside a function.
for (i in seq_len(n_pairs)) {
    ps.subs <- subset_samples(physeq_filt, sample_types %in% cbn[, i])
    metadata_sub <- data.frame(sample_data(ps.subs))
    anosim_pair <- anosim(
        phyloseq::distance(ps.subs, method = "bray"),
        metadata_sub$sample_types
    )
    p[i] <- anosim_pair$signif[1]
    R[i] <- anosim_pair$statistic
}

# Benjamini-Hochberg correction across all pairs
p.adj <- p.adjust(p, method = "BH")

# One row per pair: the two group names, R, raw p and adjusted p
p.table <- cbind.data.frame(t(cbn), R = R, p = p, p.adj = p.adj)

write.table(
  p.table, "new_mouse_pairwise_anosim_species.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

### Heatmap of pairwise ANOSIM for taxonomy

In [None]:
library(pheatmap)

In [None]:
# Pairwise ANOSIM results written by the previous step
data <- read.table(
  "new_mouse_pairwise_anosim_species.tsv",
  header = TRUE, sep = "\t"
)

# Square matrix of R values indexed by sample type, filled symmetrically
samples <- unique(c(data$X1, data$X2))
r_matrix <- matrix(
  NA,
  nrow = length(samples), ncol = length(samples),
  dimnames = list(samples, samples)
)
row_idx <- match(data$X1, samples)
col_idx <- match(data$X2, samples)
r_matrix[cbind(row_idx, col_idx)] <- data$R
r_matrix[cbind(col_idx, row_idx)] <- data$R

# Fixed display order of the groups
desired_order <- c("deep", "10_bwa", "10_sho", "amp")
r_matrix <- r_matrix[desired_order, desired_order]

# Show the upper triangle only
r_matrix[lower.tri(r_matrix)] <- NA

# Three colour ramps joined at R = 0.2 and R = 0.5 (100 colours, 101 breaks)
colors <- c(
  colorRampPalette(c("#a80202", "#ff6403"))(20),  # 0 to 0.2
  colorRampPalette(c("#ff6403", "#ffda05"))(30),  # 0.2 to 0.5
  colorRampPalette(c("#ffda05", "black"))(50)     # 0.5 to 1.0
)
breaks <- c(
  seq(0, 0.2, length.out = 21),
  seq(0.2, 0.5, length.out = 31)[-1],  # drop the duplicated 0.2
  seq(0.5, 1.0, length.out = 51)[-1]   # drop the duplicated 0.5
)

# Draw the heatmap into a PDF; clustering is off so the order above is kept
pdf("new_mouse_rvalues_species.pdf", width = 7, height = 5)
pheatmap(
  r_matrix,
  color = colors,
  breaks = breaks,
  cluster_rows = FALSE,
  cluster_cols = FALSE,
  show_rownames = TRUE,
  show_colnames = TRUE,
  display_numbers = FALSE,
  na_col = "white"  # colour of the blanked lower triangle
)
dev.off()


### Ordination plots for taxonomy

In [None]:
# Build a clean phyloseq object and keep only the chosen shallow-shotgun subsampling
# Load the species table; the first file column supplies the row names
data <- read.table(
  "new_mouse_relab_taxo_species.tsv",
  header = TRUE, sep = "\t", row.names = 1
)

# Remove rows holding only zeros/NA, then columns without any non-zero value
empty_rows <- apply(data, 1, function(x) all(x == 0 | is.na(x)))
data <- data[!empty_rows, ]
nonzero_cols <- apply(data, 2, function(x) any(x != 0))
data <- data[, nonzero_cols]

# data[-1] discards the first remaining column before the matrix conversion
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Sample type = column name with everything up to the first "_" removed
sampledata <- data.frame(sample_types = sub("^[^_]*_", "", colnames(otumat)))
rownames(sampledata) <- colnames(otumat)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)

# Keep amp and deep plus the 10_sho / 10_bwa subsamples
physeq_filt <- subset_samples(
  physeq,
  sample_types %in% c("amp", "deep", "10_sho", "10_bwa")
)
metadata <- data.frame(sample_data(physeq_filt))

In [None]:
# Colour assigned to each sample type in the ordination plots
colorCode_sample <- c(
  "10_bwa" = "#3CB44B",
  "10_sho" = "#f77a13",
  "deep" = "#000000",
  "amp" = "#3402c9"
)

# Run every ordination method on the same object with the same distance
dist <- "bray"
ord_meths <- c("DCA", "CCA", "RDA", "MDS", "PCoA")

# Ordinate with method `i` and return the sample-level ordination plot
plot_one_ordination <- function(i, physeq_obj, dist) {
  ordi <- ordinate(physeq_obj, method = i, distance = dist)
  plot_ordination(physeq_obj, ordi, "samples")
}
plist <- llply(as.list(ord_meths), plot_one_ordination, physeq_filt, dist)
names(plist) <- ord_meths

# First two columns of each plot's data, renamed, bound in front of the full data
axes_with_data <- function(x) {
  df <- x$data[, 1:2]
  colnames(df) <- c("Axis_1", "Axis_2")
  cbind(df, x$data)
}
pdataframe <- ldply(plist, axes_with_data)
names(pdataframe)[1] <- "method"


In [None]:
# Write one PDF per ordination method
for (index in seq_along(ord_meths)) {
    plot_type <- ord_meths[[index]]
    file_name <- paste0("new_mouse_", plot_type, "_species.pdf")
    group_colour <- aes(color = metadata$sample_types)

    pdf(file_name, width = 5, height = 4)
    p <- plist[[index]] +
        geom_point(mapping = group_colour, size = 2, alpha = 1) +
        # 90% normal ellipses drawn as outlines (fill fully transparent)
        stat_ellipse(
            mapping = group_colour,
            level = 0.9, type = "norm", geom = "polygon", alpha = 0
        ) +
        theme_bw() +
        scale_color_manual(values = colorCode_sample) +
        labs(color = "Groups")
    print(p)
    dev.off()
}

In [None]:
# Show the plot of every ordination method inline so the best can be found
options(repr.plot.width = 7, repr.plot.height = 6)

# Print each ordination plot in `plist`, coloured by sample type with
# 90% normal ellipses. Reads `plist`, `metadata` and `colorCode_sample`
# from the calling environment.
print_plots <- function() {
    group_colour <- aes(color = metadata$sample_types)
    for (index in 1:5) {
        p <- plist[[index]] +
            geom_point(mapping = group_colour, size = 2, alpha = 1) +
            stat_ellipse(
                mapping = group_colour,
                level = 0.9, type = "norm", geom = "polygon", alpha = 0
            ) +
            theme_bw() +
            scale_color_manual(values = colorCode_sample) +
            labs(color = "Groups")
        print(p)
    }
}
print_plots()


In [None]:
# Final figure: the first ordination in `plist` (DCA), written to PDF
options(repr.plot.width = 6, repr.plot.height = 5)
pdf("new_mouse_DCA_species.pdf", width = 5, height = 4)
p <- plist[[1]] +
    geom_point(size = 2, alpha = 1, aes(color = metadata$sample_types)) +
    # 90% normal ellipses drawn as outlines (fill fully transparent)
    stat_ellipse(
        level = 0.9, type = "norm", geom = "polygon", alpha = 0,
        aes(color = metadata$sample_types)
    ) +
    theme_bw() +
    scale_color_manual(values = colorCode_sample) +
    labs(color = "Groups")
# Explicit print() so the plot is drawn on the open PDF device even when
# top-level auto-printing is not in effect (e.g. when the code is sourced)
print(p)
dev.off()


# Analysis of sample groups (TMAO paper)

## Features profiles analysis based on high and low TMAO producers

### Taxonomic annotation at species level analysis

In [None]:
# Build a clean phyloseq object from the taxonomic matrix
data <- read.table(
  "bwa_taxo_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
empty_rows <- apply(data, 1, function(x) all(x == 0 | is.na(x)))
data <- data[!empty_rows, ]
nonzero_cols <- apply(data, 2, function(x) any(x != 0))
data <- data[, nonzero_cols]
# data[-1] discards the first remaining column before the matrix conversion
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group of every individual, from the second "_"-separated field of the sample
# name: "High" when that field contains "HP", otherwise "Low"
second_elements <- vapply(strsplit(colnames(otumat), "_"), `[`, character(1), 2)
group_label <- ifelse(grepl("HP", second_elements), "High", "Low")
sampledata <- data.frame(sample_types = group_label)
rownames(sampledata) <- colnames(otumat)
# head(sampledata, 15)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
metadata <- data.frame(sample_data(physeq))

# Global ANOSIM, all individuals
anosim(
  phyloseq::distance(physeq, method = "bray"),
  metadata$sample_types
)

# Global ADONIS, all individuals
css_beta <- distance(physeq, method = "bray")
ado_result <- adonis2(
  css_beta ~ sample_types,
  data = metadata, permutations = 1e3, na.rm = TRUE
)
ado_result

# Betadisper, all individuals
bd <- betadisper(css_beta, metadata$sample_types)
anova(bd)

options(repr.plot.width = 10, repr.plot.height = 10)
plot(bd)

In [None]:
# Split the cohort by diet using the first letter of the sample names.
# sample_data_df stays a top-level object because the subset_samples()
# conditions below refer to it by name.
sample_data_df <- sample_data(physeq)

# --- Vegetarians: sample names starting with "V" ---
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))
metadata_veg <- data.frame(sample_data(physeq_veg))

# ANOSIM vegetarian
anosim(
  phyloseq::distance(physeq_veg, method = "bray"),
  metadata_veg$sample_types
)

# ADONIS vegetarian
v_css_beta <- distance(physeq_veg, method = "bray")
v_ado_result <- adonis2(
  v_css_beta ~ sample_types,
  data = metadata_veg, permutations = 1e3, na.rm = TRUE
)
v_ado_result

# Betadisper vegetarian
v_bd <- betadisper(v_css_beta, metadata_veg$sample_types)
anova(v_bd)
plot(v_bd)

# --- Omnivores: sample names starting with "O" ---
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))
metadata_omn <- data.frame(sample_data(physeq_omn))

# ANOSIM omnivores
anosim(
  phyloseq::distance(physeq_omn, method = "bray"),
  metadata_omn$sample_types
)

# ADONIS omnivores
o_css_beta <- distance(physeq_omn, method = "bray")
o_ado_result <- adonis2(
  o_css_beta ~ sample_types,
  data = metadata_omn, permutations = 1e3, na.rm = TRUE
)
o_ado_result

# Betadisper omnivores
o_bd <- betadisper(o_css_beta, metadata_omn$sample_types)
anova(o_bd)
plot(o_bd)

### Functional annotation KOs

In [None]:
# Build a clean phyloseq object from the KO matrix
data <- read.table(
  "bwa_kos_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
empty_rows <- apply(data, 1, function(x) all(x == 0 | is.na(x)))
data <- data[!empty_rows, ]
nonzero_cols <- apply(data, 2, function(x) any(x != 0))
data <- data[, nonzero_cols]
# data[-1] discards the first remaining column before the matrix conversion
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group of every individual, from the second "_"-separated field of the sample
# name: "High" when that field contains "HP", otherwise "Low"
second_elements <- vapply(strsplit(colnames(otumat), "_"), `[`, character(1), 2)
group_label <- ifelse(grepl("HP", second_elements), "High", "Low")
sampledata <- data.frame(sample_types = group_label)
rownames(sampledata) <- colnames(otumat)
# head(sampledata, 15)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
metadata <- data.frame(sample_data(physeq))

# ANOSIM, all individuals
anosim(
  phyloseq::distance(physeq, method = "bray"),
  metadata$sample_types
)

# ADONIS, all individuals
css_beta <- distance(physeq, method = "bray")
ado_result <- adonis2(
  css_beta ~ sample_types,
  data = metadata, permutations = 1e3, na.rm = TRUE
)
ado_result

# Betadisper, all individuals
bd <- betadisper(css_beta, metadata$sample_types)
anova(bd)

options(repr.plot.width = 10, repr.plot.height = 10)
plot(bd)

In [None]:
# Split the cohort by diet using the first letter of the sample names.
# sample_data_df stays a top-level object because the subset_samples()
# conditions below refer to it by name.
sample_data_df <- sample_data(physeq)

# --- Vegetarians: sample names starting with "V" ---
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))
metadata_veg <- data.frame(sample_data(physeq_veg))

# ANOSIM vegetarian
anosim(
  phyloseq::distance(physeq_veg, method = "bray"),
  metadata_veg$sample_types
)

# ADONIS vegetarian
v_css_beta <- distance(physeq_veg, method = "bray")
v_ado_result <- adonis2(
  v_css_beta ~ sample_types,
  data = metadata_veg, permutations = 1e3, na.rm = TRUE
)
v_ado_result

# Betadisper vegetarian
v_bd <- betadisper(v_css_beta, metadata_veg$sample_types)
anova(v_bd)
plot(v_bd)

# --- Omnivores: sample names starting with "O" ---
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))
metadata_omn <- data.frame(sample_data(physeq_omn))

# ANOSIM omnivores
anosim(
  phyloseq::distance(physeq_omn, method = "bray"),
  metadata_omn$sample_types
)

# ADONIS omnivores
o_css_beta <- distance(physeq_omn, method = "bray")
o_ado_result <- adonis2(
  o_css_beta ~ sample_types,
  data = metadata_omn, permutations = 1e3, na.rm = TRUE
)
o_ado_result

# Betadisper omnivores
o_bd <- betadisper(o_css_beta, metadata_omn$sample_types)
anova(o_bd)
plot(o_bd)

### Functional annotation Pfams

In [None]:
# Build a clean phyloseq object from the Pfam matrix
data <- read.table(
  "bwa_pfam_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
empty_rows <- apply(data, 1, function(x) all(x == 0 | is.na(x)))
data <- data[!empty_rows, ]
nonzero_cols <- apply(data, 2, function(x) any(x != 0))
data <- data[, nonzero_cols]
# data[-1] discards the first remaining column before the matrix conversion
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group of every individual, from the second "_"-separated field of the sample
# name: "High" when that field contains "HP", otherwise "Low"
second_elements <- vapply(strsplit(colnames(otumat), "_"), `[`, character(1), 2)
group_label <- ifelse(grepl("HP", second_elements), "High", "Low")
sampledata <- data.frame(sample_types = group_label)
rownames(sampledata) <- colnames(otumat)
# head(sampledata, 15)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
metadata <- data.frame(sample_data(physeq))

# ANOSIM, all individuals
anosim(
  phyloseq::distance(physeq, method = "bray"),
  metadata$sample_types
)

# ADONIS, all individuals
css_beta <- distance(physeq, method = "bray")
ado_result <- adonis2(
  css_beta ~ sample_types,
  data = metadata, permutations = 1e3, na.rm = TRUE
)
ado_result

# Betadisper, all individuals
bd <- betadisper(css_beta, metadata$sample_types)
anova(bd)

options(repr.plot.width = 10, repr.plot.height = 10)
plot(bd)

In [None]:
# Split the cohort by diet using the first letter of the sample names.
# sample_data_df stays a top-level object because the subset_samples()
# conditions below refer to it by name.
sample_data_df <- sample_data(physeq)

# --- Vegetarians: sample names starting with "V" ---
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))
metadata_veg <- data.frame(sample_data(physeq_veg))

# ANOSIM vegetarian
anosim(
  phyloseq::distance(physeq_veg, method = "bray"),
  metadata_veg$sample_types
)

# ADONIS vegetarian
v_css_beta <- distance(physeq_veg, method = "bray")
v_ado_result <- adonis2(
  v_css_beta ~ sample_types,
  data = metadata_veg, permutations = 1e3, na.rm = TRUE
)
v_ado_result

# Betadisper vegetarian
v_bd <- betadisper(v_css_beta, metadata_veg$sample_types)
anova(v_bd)
plot(v_bd)

# --- Omnivores: sample names starting with "O" ---
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))
metadata_omn <- data.frame(sample_data(physeq_omn))

# ANOSIM omnivores
anosim(
  phyloseq::distance(physeq_omn, method = "bray"),
  metadata_omn$sample_types
)

# ADONIS omnivores
o_css_beta <- distance(physeq_omn, method = "bray")
o_ado_result <- adonis2(
  o_css_beta ~ sample_types,
  data = metadata_omn, permutations = 1e3, na.rm = TRUE
)
o_ado_result

# Betadisper omnivores
o_bd <- betadisper(o_css_beta, metadata_omn$sample_types)
anova(o_bd)
plot(o_bd)

## Features profiles analysis based on L-carnitine intervention on health cohort 1 (all, omnivores, vegetarian)

### Taxonomic annotation at species level analysis

In [None]:
# Build a clean phyloseq object from the taxonomic matrix
data <- read.table("bwa_taxo_matrix.tsv", header = TRUE, sep = "\t", row.names = 1)
data <- data[!apply(data, 1, function(x) all(x == 0 | is.na(x))), ]
data <- data[, apply(data, 2, function(x) any(x != 0))]
# data[-1] discards the first remaining column before the matrix conversion
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group of every individual, from the second "_"-separated field of the sample
# name: "Post" when that field starts with "C", otherwise "Pre"
second_elements <- sapply(strsplit(colnames(otumat), "_"), function(x) x[2])
sampledata <- data.frame(sample_types = ifelse(grepl("^C", second_elements), "Post", "Pre"))
rownames(sampledata) <- colnames(otumat)
# head(sampledata, 3)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
metadata <- data.frame(sample_data(physeq))

# ANOSIM, all individuals: must use the object and metadata built in this cell.
# The vegetarian subset from the previous analysis still carries the old
# High/Low grouping and covers only part of the cohort.
anosim(phyloseq::distance(physeq, method = "bray"), metadata$sample_types)

# ADONIS, all individuals
css_beta <- distance(physeq, method = "bray")
ado_result <- adonis2(css_beta ~ sample_types, data = metadata, permutations = 1e3, na.rm = TRUE)
ado_result

In [None]:
# Split the species-level object by diet and repeat the group tests per diet.
# The sample metadata is kept so its row names can drive the subsetting.
sample_data_df <- sample_data(physeq)

# Vegetarians: samples whose names start with "V"
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))
metadata_veg <- data.frame(sample_data(physeq_veg))

# Vegetarians: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq_veg, method = "bray"),
  metadata_veg$sample_types
)

# Vegetarians: adonis2 with 1000 permutations
v_css_beta <- distance(physeq_veg, method = "bray")
v_ado_result <- adonis2(
  v_css_beta ~ sample_types,
  data = metadata_veg,
  permutations = 1e3,
  na.rm = TRUE
)
v_ado_result

# Omnivores: samples whose names start with "O"
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))
metadata_omn <- data.frame(sample_data(physeq_omn))

# Omnivores: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq_omn, method = "bray"),
  metadata_omn$sample_types
)

# Omnivores: adonis2 with 1000 permutations
o_css_beta <- distance(physeq_omn, method = "bray")
o_ado_result <- adonis2(
  o_css_beta ~ sample_types,
  data = metadata_omn,
  permutations = 1e3,
  na.rm = TRUE
)
o_ado_result

### Functional annotation KOs

In [None]:
# Build a clean phyloseq object from the KO matrix and test Pre vs Post
# separation across all individuals.
data <- read.table(
  "bwa_kos_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
# Remove rows that are all zero/NA, then keep columns with a non-zero value
data <- data[!apply(data, 1, function(row) all(row == 0 | is.na(row))), ]
data <- data[, apply(data, 2, function(col) any(col != 0))]
# NOTE(review): the first remaining column is dropped here, while the
# ANCOM-BC cell reading this same file keeps every column -- confirm.
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group label per sample: "Post" when the second "_"-separated field of the
# sample name starts with "C", otherwise "Pre"
second_elements <- vapply(
  strsplit(colnames(otumat), "_"),
  function(parts) parts[2],
  character(1)
)
sampledata <- data.frame(
  sample_types = ifelse(grepl("^C", second_elements), "Post", "Pre"),
  row.names = colnames(otumat)
)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
metadata <- data.frame(sample_data(physeq))

# All individuals: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq, method = "bray"),
  metadata$sample_types
)

# All individuals: adonis2 with 1000 permutations
css_beta <- distance(physeq, method = "bray")
ado_result <- adonis2(
  css_beta ~ sample_types,
  data = metadata,
  permutations = 1e3,
  na.rm = TRUE
)
ado_result

In [None]:
# Split the KO object by diet and repeat the group tests per diet.
# The sample metadata is kept so its row names can drive the subsetting.
sample_data_df <- sample_data(physeq)

# Vegetarians: samples whose names start with "V"
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))
metadata_veg <- data.frame(sample_data(physeq_veg))

# Vegetarians: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq_veg, method = "bray"),
  metadata_veg$sample_types
)

# Vegetarians: adonis2 with 1000 permutations
v_css_beta <- distance(physeq_veg, method = "bray")
v_ado_result <- adonis2(
  v_css_beta ~ sample_types,
  data = metadata_veg,
  permutations = 1e3,
  na.rm = TRUE
)
v_ado_result

# Omnivores: samples whose names start with "O"
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))
metadata_omn <- data.frame(sample_data(physeq_omn))

# Omnivores: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq_omn, method = "bray"),
  metadata_omn$sample_types
)

# Omnivores: adonis2 with 1000 permutations
o_css_beta <- distance(physeq_omn, method = "bray")
o_ado_result <- adonis2(
  o_css_beta ~ sample_types,
  data = metadata_omn,
  permutations = 1e3,
  na.rm = TRUE
)
o_ado_result

### Functional annotation Pfams

In [None]:
# Build a clean phyloseq object from the Pfam matrix and test Pre vs Post
# separation across all individuals.
data <- read.table(
  "bwa_pfam_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
# Remove rows that are all zero/NA, then keep columns with a non-zero value
data <- data[!apply(data, 1, function(row) all(row == 0 | is.na(row))), ]
data <- data[, apply(data, 2, function(col) any(col != 0))]
# The first remaining column is dropped before building the matrix
otumat <- as.matrix(data[-1])
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group label per sample: "Post" when the second "_"-separated field of the
# sample name starts with "C", otherwise "Pre"
second_elements <- vapply(
  strsplit(colnames(otumat), "_"),
  function(parts) parts[2],
  character(1)
)
sampledata <- data.frame(
  sample_types = ifelse(grepl("^C", second_elements), "Post", "Pre"),
  row.names = colnames(otumat)
)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
metadata <- data.frame(sample_data(physeq))

# All individuals: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq, method = "bray"),
  metadata$sample_types
)

# All individuals: adonis2 with 1000 permutations
css_beta <- distance(physeq, method = "bray")
ado_result <- adonis2(
  css_beta ~ sample_types,
  data = metadata,
  permutations = 1e3,
  na.rm = TRUE
)
ado_result

In [None]:
# Split the Pfam object by diet and repeat the group tests per diet.
# The sample metadata is kept so its row names can drive the subsetting.
sample_data_df <- sample_data(physeq)

# Vegetarians: samples whose names start with "V"
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))
metadata_veg <- data.frame(sample_data(physeq_veg))

# Vegetarians: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq_veg, method = "bray"),
  metadata_veg$sample_types
)

# Vegetarians: adonis2 with 1000 permutations (result stored, not printed)
v_css_beta <- distance(physeq_veg, method = "bray")
v_ado_result <- adonis2(
  v_css_beta ~ sample_types,
  data = metadata_veg,
  permutations = 1e3,
  na.rm = TRUE
)

# Omnivores: samples whose names start with "O"
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))
metadata_omn <- data.frame(sample_data(physeq_omn))

# Omnivores: ANOSIM on Bray-Curtis distances
anosim(
  phyloseq::distance(physeq_omn, method = "bray"),
  metadata_omn$sample_types
)

# Omnivores: adonis2 with 1000 permutations (result stored, not printed)
o_css_beta <- distance(physeq_omn, method = "bray")
o_ado_result <- adonis2(
  o_css_beta ~ sample_types,
  data = metadata_omn,
  permutations = 1e3,
  na.rm = TRUE
)

## Differentially abundant species between high and low TMAO producer groups
### All individuals (species)

In [None]:
suppressMessages({
    library(ANCOMBC)
})

In [None]:
# Build the phyloseq object used as ANCOM-BC input (species-level matrix).
data <- read.table(
  "bwa_taxo_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
# Remove rows that are all zero/NA, then keep columns with a non-zero value
data <- data[!apply(data, 1, function(row) all(row == 0 | is.na(row))), ]
data <- data[, apply(data, 2, function(col) any(col != 0))]
# All columns are kept here (no data[-1] as in the beta-diversity cells)
otumat <- as.matrix(data)
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group label per sample: "Post" when the second "_"-separated field of the
# sample name starts with "C", otherwise "Pre".
# NOTE(review): the section heading mentions high vs low TMAO producers, but
# the groups built here are Pre/Post -- confirm which comparison is intended.
second_elements <- vapply(
  strsplit(colnames(otumat), "_"),
  function(parts) parts[2],
  character(1)
)
sampledata <- data.frame(
  sample_types = ifelse(grepl("^C", second_elements), "Post", "Pre"),
  row.names = colnames(otumat)
)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
physeq

In [None]:
# ANCOM-BC2 on all individuals: sample_types is both the fixed effect and the
# grouping variable; p-values are Bonferroni-adjusted.
results <- ancombc2(
  physeq,
  p_adj_method = "bonferroni",
  fix_formula = "sample_types",
  group = "sample_types"
)

In [None]:
# Keep the ANCOM-BC2 result table and export it as TSV
res_prim <- results$res
write.table(
  res_prim,
  file = "ancombc2_species_all.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# Species with p < 0.05 for the "Pre" term, ordered by decreasing |LFC|.
# NOTE(review): the filter reads the p_ column although a Bonferroni
# adjustment was requested -- confirm the adjusted column is not intended.
df_lfc <- res_prim %>%
  dplyr::select(
    taxon, lfc_sample_typesPre, p_sample_typesPre, se_sample_typesPre
  ) %>%
  dplyr::filter(!is.na(lfc_sample_typesPre), p_sample_typesPre < 0.05) %>%
  dplyr::arrange(desc(abs(lfc_sample_typesPre))) %>%
  dplyr::mutate(
    direct = ifelse(lfc_sample_typesPre > 0, "Positive LFC", "Negative LFC"),
    color = ifelse(p_sample_typesPre < 0.05, "aquamarine3", "black"),
    # Label each taxon with the penultimate ";"-separated field of its name
    taxon = sapply(
      strsplit(taxon, ";"),
      function(x) ifelse(length(x) > 1, x[length(x) - 1], NA)
    )
  )

# Freeze the current row order as the plotting order
df_lfc$taxon <- factor(df_lfc$taxon, levels = df_lfc$taxon)
df_lfc$direct <- factor(df_lfc$direct, levels = c("Positive LFC", "Negative LFC"))

# Bar plot of the log fold changes with +/- one standard error
fig_lfc <- ggplot(
  df_lfc,
  aes(x = taxon, y = lfc_sample_typesPre, fill = direct)
) +
  geom_col(
    width = 0.7, color = "black",
    position = position_dodge(width = 0.4)
  ) +
  geom_errorbar(
    aes(
      ymin = lfc_sample_typesPre - se_sample_typesPre,
      ymax = lfc_sample_typesPre + se_sample_typesPre
    ),
    width = 0.2, position = position_dodge(0.05), color = "black"
  ) +
  labs(
    x = NULL, y = "Log fold change",
    title = "Differentially abundant species (all individuals)"
  ) +
  scale_fill_discrete(name = NULL) +
  scale_color_discrete(name = NULL) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_text(angle = 60, hjust = 1, color = df_lfc$color)
  )

# Render the figure into a 5 x 5 inch PDF
pdf("ancombc2_species_all.pdf", width = 5, height = 5)
print(fig_lfc)
dev.off()

### Separating vegetarian and omnivores (species)

In [None]:
# Run ANCOM-BC2 separately for vegetarians and omnivores (species level).
# The sample metadata is kept so its row names can drive the subsetting.
sample_data_df <- sample_data(physeq)

# Vegetarians: samples whose names start with "V"
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))

results_veg <- ancombc2(
  physeq_veg,
  p_adj_method = "bonferroni",
  fix_formula = "sample_types",
  group = "sample_types"
)

res_prim_veg <- results_veg$res
write.table(
  res_prim_veg,
  file = "ancombc2_species_veg.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# Omnivores: samples whose names start with "O"
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))

results_omn <- ancombc2(
  physeq_omn,
  p_adj_method = "bonferroni",
  fix_formula = "sample_types",
  group = "sample_types"
)

res_prim_omn <- results_omn$res
write.table(
  res_prim_omn,
  file = "ancombc2_species_omn.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

In [None]:
# Vegetarians: plot the differentially abundant species.
# Species with p < 0.05 for the "Pre" term, ordered by decreasing |LFC|.
df_lfc <- res_prim_veg %>%
  dplyr::select(
    taxon, lfc_sample_typesPre, p_sample_typesPre, se_sample_typesPre
  ) %>%
  dplyr::filter(!is.na(lfc_sample_typesPre), p_sample_typesPre < 0.05) %>%
  dplyr::arrange(desc(abs(lfc_sample_typesPre))) %>%
  dplyr::mutate(
    direct = ifelse(lfc_sample_typesPre > 0, "Positive LFC", "Negative LFC"),
    color = ifelse(p_sample_typesPre < 0.05, "aquamarine3", "black"),
    # Label each taxon with the penultimate ";"-separated field of its name
    taxon = sapply(
      strsplit(taxon, ";"),
      function(x) ifelse(length(x) > 1, x[length(x) - 1], NA)
    )
  )

# Freeze the current row order as the plotting order
df_lfc$taxon <- factor(df_lfc$taxon, levels = df_lfc$taxon)
df_lfc$direct <- factor(df_lfc$direct, levels = c("Positive LFC", "Negative LFC"))

# Bar plot of the log fold changes with +/- one standard error
fig_lfc <- ggplot(
  df_lfc,
  aes(x = taxon, y = lfc_sample_typesPre, fill = direct)
) +
  geom_col(
    width = 0.7, color = "black",
    position = position_dodge(width = 0.4)
  ) +
  geom_errorbar(
    aes(
      ymin = lfc_sample_typesPre - se_sample_typesPre,
      ymax = lfc_sample_typesPre + se_sample_typesPre
    ),
    width = 0.2, position = position_dodge(0.05), color = "black"
  ) +
  labs(
    x = NULL, y = "Log fold change",
    title = "Differentially abundant species (vegetarians)"
  ) +
  scale_fill_discrete(name = NULL) +
  scale_color_discrete(name = NULL) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_text(angle = 60, hjust = 1, color = df_lfc$color)
  )

# Render the figure into a 5 x 5 inch PDF
pdf("ancombc2_species_veg.pdf", width = 5, height = 5)
print(fig_lfc)
dev.off()

In [None]:
# Omnivores: plot the differentially abundant species.
# Species with p < 0.05 for the "Pre" term, ordered by decreasing |LFC|.
df_lfc <- res_prim_omn %>%
  dplyr::select(
    taxon, lfc_sample_typesPre, p_sample_typesPre, se_sample_typesPre
  ) %>%
  dplyr::filter(!is.na(lfc_sample_typesPre), p_sample_typesPre < 0.05) %>%
  dplyr::arrange(desc(abs(lfc_sample_typesPre))) %>%
  dplyr::mutate(
    direct = ifelse(lfc_sample_typesPre > 0, "Positive LFC", "Negative LFC"),
    color = ifelse(p_sample_typesPre < 0.05, "aquamarine3", "black"),
    # Label each taxon with the penultimate ";"-separated field of its name
    taxon = sapply(
      strsplit(taxon, ";"),
      function(x) ifelse(length(x) > 1, x[length(x) - 1], NA)
    )
  )

# Freeze the current row order as the plotting order
df_lfc$taxon <- factor(df_lfc$taxon, levels = df_lfc$taxon)
df_lfc$direct <- factor(df_lfc$direct, levels = c("Positive LFC", "Negative LFC"))

# Bar plot of the log fold changes with +/- one standard error
fig_lfc <- ggplot(
  df_lfc,
  aes(x = taxon, y = lfc_sample_typesPre, fill = direct)
) +
  geom_col(
    width = 0.7, color = "black",
    position = position_dodge(width = 0.4)
  ) +
  geom_errorbar(
    aes(
      ymin = lfc_sample_typesPre - se_sample_typesPre,
      ymax = lfc_sample_typesPre + se_sample_typesPre
    ),
    width = 0.2, position = position_dodge(0.05), color = "black"
  ) +
  labs(
    x = NULL, y = "Log fold change",
    title = "Differentially abundant species (omnivores)"
  ) +
  scale_fill_discrete(name = NULL) +
  scale_color_discrete(name = NULL) +
  theme_bw() +
  theme(
    plot.title = element_text(hjust = 0.5),
    panel.grid.minor.y = element_blank(),
    axis.text.x = element_text(angle = 60, hjust = 1, color = df_lfc$color)
  )

# Render the figure into a 5 x 5 inch PDF
pdf("ancombc2_species_omn.pdf", width = 5, height = 5)
print(fig_lfc)
dev.off()

## Differentially abundant KOs between high and low TMAO producers using ANCOM-BC
### All individuals (KOs)

In [None]:
# Build the phyloseq object used as ANCOM-BC input (KO matrix, High vs Low).
data <- read.table(
  "bwa_kos_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
# Remove rows that are all zero/NA, then keep columns with a non-zero value
data <- data[!apply(data, 1, function(row) all(row == 0 | is.na(row))), ]
data <- data[, apply(data, 2, function(col) any(col != 0))]
# All columns are kept here (no data[-1] as in the beta-diversity cells)
otumat <- as.matrix(data)
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group label per sample: "High" when the second "_"-separated field of the
# sample name contains "HP", otherwise "Low"
second_elements <- vapply(
  strsplit(colnames(otumat), "_"),
  function(parts) parts[2],
  character(1)
)
sampledata <- data.frame(
  sample_types = ifelse(grepl("HP", second_elements), "High", "Low"),
  row.names = colnames(otumat)
)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
physeq


In [None]:
# ANCOM-BC2 on all individuals (KOs): sample_types is both the fixed effect
# and the grouping variable; p-values are Bonferroni-adjusted.
results <- ancombc2(
  physeq,
  p_adj_method = "bonferroni",
  fix_formula = "sample_types",
  group = "sample_types"
)

# Keep the result table and export it as TSV
res_prim <- results$res
write.table(
  res_prim,
  file = "ancombc2_kos_all_LH.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)
                          

In [None]:
# Plot the top 20 differentially abundant KOs (all individuals, High vs Low).
#
# In this section sample_types takes the values "High"/"Low", so the model
# term in the ANCOM-BC2 result table is `sample_typesLow` ("High" is the
# reference level). The `*_sample_typesPre` columns only exist for the
# Pre/Post comparison and would make dplyr::select() fail here.
# NOTE(review): the filter reads the p_ column although a Bonferroni
# adjustment was requested -- confirm the adjusted column is not intended.
df_lfc <- res_prim %>%
  dplyr::select(taxon, lfc_sample_typesLow, p_sample_typesLow, se_sample_typesLow) %>%
  dplyr::filter(!is.na(lfc_sample_typesLow) & p_sample_typesLow < 0.05) %>%  # Significant results only
  dplyr::arrange(desc(abs(lfc_sample_typesLow))) %>%  # Largest absolute LFC first
  head(20) %>%  # Take top 20 KOs
  dplyr::mutate(direct = ifelse(lfc_sample_typesLow > 0, "Positive LFC", "Negative LFC"),
                color = ifelse(p_sample_typesLow < 0.05, "aquamarine3", "black"),
                # Keep only the last ";"-separated field of the feature name
                taxon = sapply(strsplit(taxon, ";"), function(x) tail(x, 1)))

# Convert 'taxon' to factor with the order of appearance for plotting
df_lfc$taxon <- factor(df_lfc$taxon, levels = df_lfc$taxon)
df_lfc$direct <- factor(df_lfc$direct, levels = c("Positive LFC", "Negative LFC"))

# Bar plot of the log fold changes with +/- one standard error
fig_lfc <- df_lfc %>%
  ggplot(aes(x = taxon, y = lfc_sample_typesLow, fill = direct)) +
  geom_bar(stat = "identity", width = 0.7, color = "black",
           position = position_dodge(width = 0.4)) +
  geom_errorbar(aes(ymin = lfc_sample_typesLow - se_sample_typesLow,
                    ymax = lfc_sample_typesLow + se_sample_typesLow),
                width = 0.2, position = position_dodge(0.05), color = "black") +
  labs(x = NULL, y = "Log fold change",
       title = "Differentially abundant KOs (all individuals)") +
  scale_fill_discrete(name = NULL) +
  scale_color_discrete(name = NULL) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        panel.grid.minor.y = element_blank(),
        axis.text.x = element_text(angle = 60, hjust = 1,
                                   color = df_lfc$color))

# Render the figure into a 5 x 5 inch PDF
pdf("ancombc2_kos_all_LH.pdf", width = 5, height = 5)
fig_lfc
dev.off()


### Separating vegetarian and omnivores (KOs)

In [None]:
# Run ANCOM-BC2 on the vegetarian subset (KOs, High vs Low).
# The sample metadata is kept so its row names can drive the subsetting.
sample_data_df <- sample_data(physeq)

# Subset phyloseq_veg for samples with names starting with 'V'
physeq_veg <- subset_samples(physeq, grepl("^V", rownames(sample_data_df)))

# Running ANCOMBC
results_veg <- ancombc2(physeq_veg,
                p_adj_method = "bonferroni",
                fix_formula = "sample_types",
                group = "sample_types")

res_prim_veg <- results_veg$res
# The "_LH" suffix matches the other KO outputs of this section
# (ancombc2_kos_all_LH.tsv, ancombc2_kos_omn_LH.tsv, ancombc2_kos_veg_LH.pdf).
write.table(res_prim_veg, "ancombc2_kos_veg_LH.tsv", row.names = FALSE, col.names = TRUE, quote = FALSE, sep = "\t")



In [None]:

# Omnivores (KOs): samples whose names start with "O"
physeq_omn <- subset_samples(physeq, grepl("^O", rownames(sample_data_df)))

# ANCOM-BC2 with sample_types as fixed effect and group, Bonferroni-adjusted
results_omn <- ancombc2(
  physeq_omn,
  p_adj_method = "bonferroni",
  fix_formula = "sample_types",
  group = "sample_types"
)

# Keep the result table and export it as TSV
res_prim_omn <- results_omn$res
write.table(
  res_prim_omn,
  file = "ancombc2_kos_omn_LH.tsv",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

In [None]:
# Plot the top 20 differentially abundant KOs for vegetarians (High vs Low).
#
# In this section sample_types takes the values "High"/"Low", so the model
# term in the ANCOM-BC2 result table is `sample_typesLow` ("High" is the
# reference level). The `*_sample_typesPre` columns only exist for the
# Pre/Post comparison and would make dplyr::select() fail here.
# NOTE(review): the filter reads the p_ column although a Bonferroni
# adjustment was requested -- confirm the adjusted column is not intended.
df_lfc <- res_prim_veg %>%
  dplyr::select(taxon, lfc_sample_typesLow, p_sample_typesLow, se_sample_typesLow) %>%
  dplyr::filter(!is.na(lfc_sample_typesLow) & p_sample_typesLow < 0.05) %>%  # Significant results only
  dplyr::arrange(desc(abs(lfc_sample_typesLow))) %>%  # Largest absolute LFC first
  head(20) %>%  # Take top 20 KOs
  dplyr::mutate(direct = ifelse(lfc_sample_typesLow > 0, "Positive LFC", "Negative LFC"),
                color = ifelse(p_sample_typesLow < 0.05, "aquamarine3", "black"),
                # Keep only the last ";"-separated field of the feature name
                taxon = sapply(strsplit(taxon, ";"), function(x) tail(x, 1)))

# Convert 'taxon' to factor with the order of appearance for plotting
df_lfc$taxon <- factor(df_lfc$taxon, levels = df_lfc$taxon)
df_lfc$direct <- factor(df_lfc$direct, levels = c("Positive LFC", "Negative LFC"))

# Bar plot of the log fold changes with +/- one standard error
fig_lfc <- df_lfc %>%
  ggplot(aes(x = taxon, y = lfc_sample_typesLow, fill = direct)) +
  geom_bar(stat = "identity", width = 0.7, color = "black",
           position = position_dodge(width = 0.4)) +
  geom_errorbar(aes(ymin = lfc_sample_typesLow - se_sample_typesLow,
                    ymax = lfc_sample_typesLow + se_sample_typesLow),
                width = 0.2, position = position_dodge(0.05), color = "black") +
  labs(x = NULL, y = "Log fold change",
       title = "Differentially abundant KOs (vegetarian)") +
  scale_fill_discrete(name = NULL) +
  scale_color_discrete(name = NULL) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        panel.grid.minor.y = element_blank(),
        axis.text.x = element_text(angle = 60, hjust = 1,
                                   color = df_lfc$color))

# Render the figure into a 5 x 5 inch PDF
pdf("ancombc2_kos_veg_LH.pdf", width = 5, height = 5)
fig_lfc
dev.off()


In [None]:
# Plot the top 20 differentially abundant KOs for omnivores (High vs Low).
#
# In this section sample_types takes the values "High"/"Low", so the model
# term in the ANCOM-BC2 result table is `sample_typesLow` ("High" is the
# reference level). The `*_sample_typesPre` columns only exist for the
# Pre/Post comparison and would make dplyr::select() fail here.
# NOTE(review): the filter reads the p_ column although a Bonferroni
# adjustment was requested -- confirm the adjusted column is not intended.
df_lfc <- res_prim_omn %>%
  dplyr::select(taxon, lfc_sample_typesLow, p_sample_typesLow, se_sample_typesLow) %>%
  dplyr::filter(!is.na(lfc_sample_typesLow) & p_sample_typesLow < 0.05) %>%  # Significant results only
  dplyr::arrange(desc(abs(lfc_sample_typesLow))) %>%  # Largest absolute LFC first
  head(20) %>%  # Take top 20 KOs
  dplyr::mutate(direct = ifelse(lfc_sample_typesLow > 0, "Positive LFC", "Negative LFC"),
                color = ifelse(p_sample_typesLow < 0.05, "aquamarine3", "black"),
                # Keep only the last ";"-separated field of the feature name
                taxon = sapply(strsplit(taxon, ";"), function(x) tail(x, 1)))

# Convert 'taxon' to factor with the order of appearance for plotting
df_lfc$taxon <- factor(df_lfc$taxon, levels = df_lfc$taxon)
df_lfc$direct <- factor(df_lfc$direct, levels = c("Positive LFC", "Negative LFC"))

# Bar plot of the log fold changes with +/- one standard error
fig_lfc <- df_lfc %>%
  ggplot(aes(x = taxon, y = lfc_sample_typesLow, fill = direct)) +
  geom_bar(stat = "identity", width = 0.7, color = "black",
           position = position_dodge(width = 0.4)) +
  geom_errorbar(aes(ymin = lfc_sample_typesLow - se_sample_typesLow,
                    ymax = lfc_sample_typesLow + se_sample_typesLow),
                width = 0.2, position = position_dodge(0.05), color = "black") +
  labs(x = NULL, y = "Log fold change",
       title = "Differentially abundant KOs (omnivorous)") +
  scale_fill_discrete(name = NULL) +
  scale_color_discrete(name = NULL) +
  theme_bw() +
  theme(plot.title = element_text(hjust = 0.5),
        panel.grid.minor.y = element_blank(),
        axis.text.x = element_text(angle = 60, hjust = 1,
                                   color = df_lfc$color))

# Render the figure into a 5 x 5 inch PDF
pdf("ancombc2_kos_omn_LH.pdf", width = 5, height = 5)
fig_lfc
dev.off()


## Differentially abundant KOs between high and low TMAO producers using metagenomeSeq
### All individuals (KOs)

In [None]:
# Build the phyloseq object used as metagenomeSeq input (KO matrix).
data <- read.table(
  "bwa_kos_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
# Remove rows that are all zero/NA, then keep columns with a non-zero value
data <- data[!apply(data, 1, function(row) all(row == 0 | is.na(row))), ]
data <- data[, apply(data, 2, function(col) any(col != 0))]
# All columns are kept here (no data[-1] as in the beta-diversity cells)
otumat <- as.matrix(data)
OTU <- otu_table(otumat, taxa_are_rows = TRUE)

# Group label per sample: "High" when the second "_"-separated field of the
# sample name contains "HP", otherwise "Low"
second_elements <- vapply(
  strsplit(colnames(otumat), "_"),
  function(parts) parts[2],
  character(1)
)
sampledata <- data.frame(
  sample_types = ifelse(grepl("HP", second_elements), "High", "Low"),
  row.names = colnames(otumat)
)

SAM <- sample_data(sampledata)
physeq <- phyloseq(OTU, SAM)
physeq

In [None]:
# Hand the KO phyloseq object over to metagenomeSeq
mgseq <- phyloseq_to_metagenomeSeq(physeq)

In [None]:
# Normalise: cumNormStatFast() picks the percentile that cumNorm() then uses
p <- cumNormStatFast(mgseq)
mgseq_norm <- cumNorm(mgseq, p = p)

In [None]:
# Differential abundance with fitFeatureModel(); the design matrix is built
# from sample_types in the object's phenotype data. Warnings are silenced.
suppressWarnings({
  differential_test <- fitFeatureModel(
    mgseq_norm,
    mod = model.matrix(~sample_types, data = pData(mgseq_norm))
  )
})

In [None]:
# Coefficient table of the fitFeatureModel fit.
# NOTE(review): MRcoefs() is called with its defaults; if it only returns the
# top-ranked features, the table written later is truncated -- confirm.
coef_results <- MRcoefs(differential_test)

In [None]:
# Export the coefficient table as TSV, keeping the feature names as row names
write.table(
  coef_results,
  file = "metagenomeseq_kos_all_LH.tsv",
  sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE
)

In [None]:
# Second model on the same normalised data: fitZig() with the same
# sample_types design matrix. Warnings are silenced.
suppressWarnings({
  differential_test_fz <- fitZig(
    mgseq_norm,
    mod = model.matrix(~sample_types, data = pData(mgseq_norm))
  )
})

In [None]:
# Coefficient table of the fitZig fit, exported as TSV with row names.
# NOTE(review): MRcoefs() is called with its defaults; if it only returns the
# top-ranked features, the exported table is truncated -- confirm.
coef_results <- MRcoefs(differential_test_fz)
write.table(
  coef_results,
  file = "fz_metagenomeseq_kos_all_LH.tsv",
  sep = "\t", quote = FALSE, row.names = TRUE, col.names = TRUE
)

## Differentially abundant KOs between high and low TMAO producers using maaslin3
### All individuals (KOs)

In [None]:
library(maaslin3)

In [None]:
# Read the KO table for maaslin3 and clean it the same way as above
data <- read.table(
  "bwa_kos_matrix.tsv",
  header = TRUE, sep = "\t", row.names = 1
)
# Remove rows that are all zero/NA, then keep columns with a non-zero value
data <- data[!apply(data, 1, function(row) all(row == 0 | is.na(row))), ]
data <- data[, apply(data, 2, function(col) any(col != 0))]

# Group label per sample: "High" when the second "_"-separated field of the
# sample name contains "HP", otherwise "Low"
second_elements <- vapply(
  strsplit(colnames(data), "_"),
  function(parts) parts[2],
  character(1)
)
sampledata <- data.frame(
  sample_types = ifelse(grepl("HP", second_elements), "High", "Low"),
  row.names = colnames(data)
)
# Every sample gets the same read count
sampledata$reads <- 1000000

In [None]:
# Fit maaslin3 with a fixed seed; results are written to 'maaslin3_output'.
# NOTE(review): `reads` is set to the same constant for every sample where
# sampledata is built, so it carries no between-sample information --
# confirm it belongs in the formula.
set.seed(1)
fit_out <- maaslin3(
  input_data = data,
  input_metadata = sampledata,
  output = 'maaslin3_output',
  formula = '~ sample_types + reads',
  median_comparison_abundance = FALSE,
  max_pngs = 50
)