## spatial - viral analysis
- Compare viral community composition to spatial distance
- Compare viral community composition to soil biochemical data


In [None]:
# Load the libraries and functions needed for the analysis
library(vegan)
library(ggplot2)
library(ape)
library(ggrepel)
library(tidyverse)
library(cowplot)
library(measurements)
library(geosphere)
library(broom)

In [2]:
# Read the sample metadata
map <- read.csv(file = "../data/metadata.csv", header = TRUE)

# Read the vOTU coverage table; its first column supplies the row names
otu <- read.table(
  file = "../data/coveragetable.csv",
  header = TRUE,
  sep = ",",
  row.names = 1
)

# Transpose the table (vegdist compares rows), Hellinger-transform it and
# compute the pairwise dissimilarity matrix.
# NOTE: the index is vegan's Jaccard (method = "jaccard"), not Bray-Curtis,
# even though downstream objects and columns carry "bray" in their names.
otu <- t(otu)
otu.xform <- decostand(otu, method = "hellinger")
otu.dist <- as.matrix(vegdist(otu.xform, method = "jaccard"))


In [5]:
# Coordinate matrix with sample IDs as row names. The column order
# (longitude first, then latitude) is the order geosphere::distm expects.
mtx <- as.data.frame(select(map, SampleID, lon_d, lat_d))
row.names(mtx) <- mtx$SampleID
mtx <- as.matrix(mtx[, -1])

# All-against-all distance between sample coordinates
# (distm's default distance function reports metres)
spatial.dist <- as.data.frame(geosphere::distm(mtx, mtx))
rownames(spatial.dist) <- row.names(mtx)
colnames(spatial.dist) <- row.names(mtx)

# Long table: one row per ordered sample pair, distance converted to km.
# Both (a, b) and (b, a) are present, as are the self-comparisons.
spatial.dist.tidy <- mutate(spatial.dist, SampleID.x = row.names(spatial.dist)) %>%
  gather(key = "SampleID.y", value = "SpatialDistance", -SampleID.x) %>%
  mutate(SpatialDistance = SpatialDistance / 1000)

# Save the long table to disk for manual inspection
write.csv(spatial.dist.tidy, file = "../data/spatial_dist_matrix.csv")

In [6]:
# Reduce the metadata to the sample ID plus the plot and timepoint columns
meta <- as.data.frame(select(map, SampleID, plot, plot_detail, timepoint))

In [7]:
# Reshape the community dissimilarity matrix into a long table with one row
# per ordered sample pair. The value column is called "BrayDistance" for
# historical reasons; it holds the Jaccard dissimilarity computed above.
bray.dist <- as.data.frame(otu.dist) %>%
  mutate(SampleID.x = row.names(.)) %>%
  gather(key = "SampleID.y", value = "BrayDistance", -SampleID.x)

In [9]:
# Combine community dissimilarity and spatial distance for each sample pair;
# only pairs present in both tables are kept
distance.both <- inner_join(
  bray.dist,
  spatial.dist.tidy,
  by = c("SampleID.x", "SampleID.y")
)

# Show the first rows as a sanity check
head(distance.both)

Unnamed: 0_level_0,SampleID.x,SampleID.y,BrayDistance,SpatialDistance
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>
1,BMLB1_T1_V,BMLB1_T1_V,0.0,0.0
2,BMLB1_T2_V,BMLB1_T1_V,0.8637022,0.020652167
3,BMLB1_T3_V,BMLB1_T1_V,0.9013672,0.007399428
4,BMLB2_T1_V,BMLB1_T1_V,0.9315496,0.018727481
5,BMLB2_T2_V,BMLB1_T1_V,0.929635,0.034443434
6,BMLB2_T3_V,BMLB1_T1_V,0.9265447,0.044068026


In [11]:
# Attach the metadata of both samples of every pair. The metadata columns
# appear twice after the second join and receive dplyr's default
# ".x" / ".y" suffixes (first and second sample respectively).
df <- inner_join(distance.both, meta, by = c("SampleID.x" = "SampleID"))
df <- inner_join(df, meta, by = c("SampleID.y" = "SampleID"))


In [17]:
# Drop comparisons of a sample with itself. The sample IDs are compared
# directly (as the nutrient section of this notebook does) instead of testing
# BrayDistance > 0, which would also discard two distinct samples that happen
# to have a dissimilarity of exactly zero.
df <- subset(df, SampleID.x != SampleID.y)

# SpatialDistance is stored in km; add a column in metres for plotting
df$SpatialDistance_m <- df$SpatialDistance * 1000


In [19]:
# Pearson correlation between community dissimilarity and spatial distance (m).
# NOTE(review): df holds every pair twice, as (a, b) and (b, a), because both
# distance matrices were reshaped in full, and pairwise distances are not
# independent observations. The P value is therefore optimistic; consider a
# Mantel test (vegan::mantel) on the two matrices instead.
cor <- cor.test(
  df$BrayDistance,
  df$SpatialDistance_m,
  method = "pearson",
  alternative = "two.sided"
) %>%
  tidy()

# Slope of the linear regression, fitted on `df`. The original call used
# `data = df1`, an object that is never created in this notebook.
# NOTE(review): the response here is the spatial distance, whereas the plot
# below puts the dissimilarity on the y axis - confirm which slope is wanted.
lm <- lm(SpatialDistance_m ~ BrayDistance, data = df) %>%
  tidy() %>%
  filter(term == "BrayDistance")

# One-row data frame holding the text label that is drawn on the plot
dist.stats <- data.frame(label = paste("r = ", signif(cor$estimate, 3),
                                       "\nslope = ", signif(lm$estimate, 3),
                                       "\nP = ", signif(cor$p.value, 3)))
dist.stats

label
<chr>
r = 0.38 slope = 2320 P = 2.7e-134


In [24]:
# Scatter plot of pairwise Jaccard dissimilarity against spatial distance
# (log10 x axis), with a linear fit and the correlation statistics as text.
# Because of scale_x_log10(), geom_smooth() fits the line on log10(distance).
#
# To export the figure, uncomment BOTH the pdf() line and the dev.off() line.
# dev.off() is disabled together with pdf(): calling it without an open PDF
# device closes whatever device is current, or fails on the null device.

#pdf('../plots/221201_spatial_jaccard_dist_no.pdf', width = 11, height =7)
dist.p <- df %>%
  ggplot(aes(SpatialDistance_m, BrayDistance)) +
  geom_text(data = dist.stats, aes(x = 200, y = 0.5, label = label), hjust = 0, size = 5) +
  geom_point(alpha = 0.5, color = "gray25") +
  geom_smooth(color = "gray25", method = "lm", se = FALSE) +
  scale_color_brewer(name = "", palette = "Dark2") +
  scale_fill_brewer(name = "", palette = "Dark2") +
  ylab("vOTUs (Virome)\nPairwise Jaccard Dissimilarity ") +
  xlab("Spatial distance (m)") +
  theme_light() +
  #scale_y_log10() +
  scale_x_log10() +
  theme(text = element_text(size = 15),
        legend.position = "left")
dist.p
#dev.off()

`geom_smooth()` using formula 'y ~ x'



## Now comparing the biochemical data to viral community composition

In [12]:
# Lookup table from the nutrient column names (Variable) to the
# human-readable labels (Variable2) used as facet titles in the plots.
# The two vectors are parallel: entry i of one belongs to entry i of the other.
alt.names <- tibble(
  Variable = c(
    "soil_ph",
    "perc_moisture",
    "Salts_mmho_cm1_1_S",
    "org_matter",
    "Nitrate_N_ppm_N",
    "Olsen_P_ppm_P",
    "Potassium_ppm_K",
    "Sulfate_S_ppm_S",
    "Zinc_ppm_Zn",
    "Iron_ppm_Fe",
    "Manganese_ppm_Mn",
    "Copper_ppm_Cu",
    "Calcium_ppm_Ca",
    "Magnesium_ppm_Mg",
    "Sodium_ppm_Na",
    "CEC_Sum_of_Cations_me_100g"
  ),
  Variable2 = c(
    "Soil pH",
    "Soil moisture (%)",
    "Soluble salts (mmho/cm)",
    "Organic Matter (LOI %)",
    "Nitrate (ppm)",
    "Phosphorus (ppm)",
    "Potassium (ppm)",
    "Sulfate (ppm)",
    "Zinc (ppm)",
    "Iron (ppm)",
    "Manganese (ppm)",
    "Copper (ppm)",
    "Calcium (ppm)",
    "Magnesium (ppm)",
    "Sodium (ppm)",
    "CEC (meq/100g)"
  )
)


In [12]:
# Nutrient analysis: read the soil nutrient measurements (comma-separated)
ward <- read.table(file = "../data/nutrients.csv", sep = ",", header = TRUE)


In [13]:
# Re-read the vOTU coverage table so this section can run on its own;
# the first column supplies the row names
otu <- read.table(
  file = "../data/coveragetable.csv",
  header = TRUE,
  sep = ",",
  row.names = 1
)

# Transpose, Hellinger-transform and compute Jaccard dissimilarities
otu <- t(otu)
otu.xform <- decostand(otu, method = "hellinger")
otu.dist <- as.matrix(vegdist(otu.xform, method = "jaccard"))

# Long table with one row per ordered sample pair. "BrayDistance" is a
# historical column name; the values are Jaccard dissimilarities.
bray.dist <- as.data.frame(otu.dist) %>%
  mutate(SampleID.x = row.names(.)) %>%
  gather(key = "SampleID.y", value = "BrayDistance", -SampleID.x)

In [10]:
# Build the z-scored nutrient matrix used for the environmental distance:
#   1. drop the excluded columns (described in this notebook as the variables
#      with no variation),
#   2. reshape to long format and standardise every variable across samples
#      as (value - mean) / sd,
#   3. reshape back to one row per sample and one column per variable.
nut.mtx <- ward %>%
  select(-c(Excess_Lime, Texture_No, H_Sat, Ca_Sat, K_Sat, Na_Sat, Mg_Sat)) %>%
  gather(key = "Variable", value = "Value", -SampleID) %>%
  group_by(Variable) %>%
  mutate(zValue = (Value - mean(Value)) / sd(Value)) %>%
  select(-Value) %>%
  spread(key = Variable, value = zValue) %>%
  as.data.frame()

# Move the sample IDs into the row names and keep only the numeric columns
row.names(nut.mtx) <- nut.mtx$SampleID
nut.mtx <- as.matrix(nut.mtx[, -1])

In [15]:
# Euclidean distance between samples over all z-scored nutrient variables,
# stored as a full square matrix
nut.dist <- dist(nut.mtx, method = "euclidian") %>% as.matrix()


“NAs introduced by coercion”


In [32]:
# Save the nutrient distance matrix to disk
write.csv(nut.dist, file = "../data/220830_nutrient_dist_matrix.csv")

In [16]:
# Long table of nutrient distances: one row per ordered sample pair, without
# missing distances and without comparisons of a sample with itself
nut.dist.tidy <- as.data.frame(nut.dist) %>%
  mutate(SampleID.x = row.names(.)) %>%
  gather(key = "SampleID.y", value = "EucDist", -SampleID.x) %>%
  filter(!is.na(EucDist), SampleID.x != SampleID.y)


In [17]:
# Join community dissimilarity and nutrient distance per sample pair;
# only pairs present in both tables are kept
both.dist <- inner_join(
  bray.dist,
  nut.dist.tidy,
  by = c("SampleID.x", "SampleID.y")
)

In [64]:
# Compare viral community dissimilarity with the nutrient (environmental)
# distance between samples.
# NOTE(review): both.dist contains each pair twice, as (a, b) and (b, a), and
# pairwise distances are not independent, so the P value is optimistic.

# Pearson correlation between the two distances
cor <- cor.test(
  both.dist$BrayDistance,
  both.dist$EucDist,
  method = "pearson",
  alternative = "two.sided"
) %>%
  tidy()

# Slope of the linear regression of nutrient distance on dissimilarity
lm <- lm(EucDist ~ BrayDistance, data = both.dist) %>%
  tidy() %>%
  filter(term == "BrayDistance")

# One-row data frame holding the text label for the plot
dist.stats <- data.frame(label = paste("r = ", signif(cor$estimate, 3),
                                       "\nslope = ", signif(lm$estimate, 3),
                                       "\nP = ", signif(cor$p.value, 3)))

dist.stats

label
<chr>
r = 0.297 slope = 8.4 P = 4.67e-41


In [2]:
# Scatter plot with a linear fit of Jaccard dissimilarity against the
# Euclidean nutrient distance, for all sample pairs.
#
# To export the figure, uncomment BOTH the pdf() line and the dev.off() line.
# dev.off() is disabled together with pdf(): calling it without an open PDF
# device closes whatever device is current, or fails on the null device.

#pdf('../plots/220826_euclidian_jaccard_dist_no1.pdf')
dist.p <- both.dist %>%
  ggplot(aes(EucDist, BrayDistance)) +
  #geom_text(data = dist.stats, aes(x = 0, y = 0.5, label = label), hjust = 0, size = 5) +
  geom_point(alpha = 0.5) +
  geom_smooth(color = "gray25", method = "lm", se = FALSE) +
  scale_color_brewer(name = "", palette = "Dark2") +
  scale_fill_brewer(name = "", palette = "Dark2") +
  ylab("vOTUs (Virome)\nPairwise jaccard Dissimilarity") +
  xlab("Difference between Samples (Euclidian distance)") +
  theme_light() +
  #scale_x_log10() +
  theme(text = element_text(size = 15),
        legend.position = "left")
dist.p
#dev.off()

## Now comparing each soil biochemical variable individually

In [67]:
# Matrix of the raw (untransformed) chemical measurements, with the same
# columns excluded as for the z-scored matrix and sample IDs as row names.
# Dropping column 1 assumes SampleID is the first column of the nutrient table.
nut.mtx2 <- as.data.frame(
  select(ward, -c(Excess_Lime, Texture_No, H_Sat, Ca_Sat, K_Sat, Na_Sat, Mg_Sat))
)
row.names(nut.mtx2) <- nut.mtx2$SampleID
nut.mtx2 <- as.matrix(nut.mtx2[, -1])

In [68]:
# Pairwise absolute differences between samples for one soil variable.
#
# x: column index (or column name) into the global matrix `nut.mtx2`.
# Returns a long data frame with the columns SampleID.x, SampleID.y and
# VarDist. Only the lower triangle and the diagonal of the distance matrix
# are returned, so every pair appears once and self-comparisons have
# VarDist = 0 (callers filter those out).
get_differences <- function(x){
  vec <- nut.mtx2[,x]
  # Fail early on non-numeric columns. The original evaluated the unused outer
  # product `vec %*% t(vec)`, whose only observable effect was this same failure.
  stopifnot(is.numeric(vec))
  # The constant column x = 0 adds nothing to the Euclidean distance, so each
  # entry equals |vec[i] - vec[j]|; row names carry over from the names of vec.
  var.dist <- dist(as.matrix(data.frame(x = 0, y = vec))) %>% as.matrix()
  # Blank the upper triangle so each unordered pair is reported only once
  var.dist[upper.tri(var.dist)] <- NA
  var.dist %>%
    as.data.frame() %>%
    mutate(SampleID.x = row.names(.)) %>%
    gather(key = "SampleID.y", value = "VarDist", -SampleID.x) %>%
    filter(!is.na(VarDist))
}

In [69]:
# Run get_differences() for every biochemical measurement and store each
# result in a list keyed by the variable (column) name.
# seq_len() yields an empty sequence for a matrix without columns, whereas
# 1:ncol() would iterate over c(1, 0) and fail.
var.list <- list()
for (i in seq_len(ncol(nut.mtx2))) {
  var.name <- colnames(nut.mtx2)[i]
  var.list[[var.name]] <- get_differences(i)
}

In [70]:
# Stack the per-variable tables into one data frame (ldply puts the list
# names into the first column), drop self-comparisons, and attach the
# community dissimilarity and nutrient distance of every sample pair
var.tidy <- plyr::ldply(var.list, identity) %>%
  filter(SampleID.x != SampleID.y) %>%
  inner_join(both.dist, by = c("SampleID.x", "SampleID.y"))
  #mutate(SpatialDistance = sqrt((Position.x.x - Position.x.y)^2 + (Position.y.x - Position.y.y)^2))

# The first column holds the list names, i.e. the variable names
colnames(var.tidy)[1] <- "Variable"

In [71]:
# Two-sided Pearson correlation between viral community dissimilarity and the
# difference in one biochemical variable.
#
# x: data frame with the numeric columns BrayDistance and VarDist.
# Returns the cor.test() result as a one-row data frame (broom::tidy).
run_cor <- function(x) {
  test <- cor.test(
    x$BrayDistance,
    x$VarDist,
    method = "pearson",
    alternative = "two.sided"
  )
  tidy(test)
}

In [73]:
# Per-variable correlation statistics and the text label drawn in each facet.
# The P values are Holm-adjusted across all variables (after ungrouping).
# purrr::map is spelled out because `map` is also the metadata data frame.
stats <- var.tidy %>%
  group_by(Variable) %>%
  nest() %>%
  mutate(cor = purrr::map(data, run_cor)) %>%
  unnest(cor) %>%
  ungroup() %>%
  mutate(
    p.adj = p.adjust(p.value, method = "holm"),
    label = paste0("r = ", signif(estimate, 2), ", P = ", signif(p.adj, 3))
  ) %>%
  inner_join(alt.names, by = "Variable")

In [74]:
# Plain data frame of the statistics without the nested per-variable data
stats.cn <- as.data.frame(select(stats, -data))

In [76]:
# One facet per biochemical measurement: Jaccard dissimilarity against the
# pairwise difference in that measurement, each with its own regression line
# and its own correlation label. The figure is written to a PDF file.
pdf('../plots/220826_euclidian_jaccard_dist.pdf')
variables.p <- var.tidy %>%
  inner_join(stats.cn, by = "Variable") %>%
  ggplot(aes(VarDist, BrayDistance)) +
  geom_point(shape = 16, size = 1, alpha = 0.5, color = "gray25") +
  # Correlation label, one row of `stats` per facet
  geom_text(
    data = stats,
    aes(x = 0.1, y = 0.4, label = label),
    hjust = 0,
    size = 3
  ) +
  geom_smooth(color = "black", se = FALSE, method = "lm") +
  scale_color_brewer(name = "Block", palette = "Set1", direction = -1) +
  ylab("vOTUs (Virome)\nPairwise jaccard Dissimilarity") +
  xlab("Difference between samples") +
  #scale_x_continuous(breaks = seq(0, 18, by = 3)) +
  facet_wrap(~ Variable2, scales = "free", ncol = 4) +
  theme_bw() +
  theme(
    text = element_text(size = 11),
    legend.position = "top",
    strip.background = element_rect(fill = "gray25"),
    strip.text = element_text(colour = "white")
  )
variables.p
# Close the PDF device opened above so the file is flushed to disk
dev.off()

`geom_smooth()` using formula 'y ~ x'

